pertpy 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +2 -1
- pertpy/data/__init__.py +61 -0
- pertpy/data/_dataloader.py +27 -23
- pertpy/data/_datasets.py +58 -0
- pertpy/metadata/__init__.py +2 -0
- pertpy/metadata/_cell_line.py +39 -70
- pertpy/metadata/_compound.py +3 -4
- pertpy/metadata/_drug.py +2 -6
- pertpy/metadata/_look_up.py +38 -51
- pertpy/metadata/_metadata.py +7 -10
- pertpy/metadata/_moa.py +2 -6
- pertpy/plot/__init__.py +0 -5
- pertpy/preprocessing/__init__.py +2 -0
- pertpy/preprocessing/_guide_rna.py +2 -3
- pertpy/tools/__init__.py +42 -4
- pertpy/tools/_augur.py +14 -15
- pertpy/tools/_cinemaot.py +2 -2
- pertpy/tools/_coda/_base_coda.py +118 -142
- pertpy/tools/_coda/_sccoda.py +16 -15
- pertpy/tools/_coda/_tasccoda.py +21 -22
- pertpy/tools/_dialogue.py +18 -23
- pertpy/tools/_differential_gene_expression/__init__.py +20 -0
- pertpy/tools/_differential_gene_expression/_base.py +657 -0
- pertpy/tools/_differential_gene_expression/_checks.py +41 -0
- pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
- pertpy/tools/_differential_gene_expression/_edger.py +125 -0
- pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
- pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
- pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
- pertpy/tools/_distances/_distance_tests.py +21 -16
- pertpy/tools/_distances/_distances.py +406 -70
- pertpy/tools/_enrichment.py +10 -15
- pertpy/tools/_kernel_pca.py +1 -1
- pertpy/tools/_milo.py +76 -53
- pertpy/tools/_mixscape.py +15 -11
- pertpy/tools/_perturbation_space/_clustering.py +5 -2
- pertpy/tools/_perturbation_space/_comparison.py +112 -0
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +20 -22
- pertpy/tools/_perturbation_space/_perturbation_space.py +23 -21
- pertpy/tools/_perturbation_space/_simple.py +3 -3
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_base_components.py +2 -3
- pertpy/tools/_scgen/_scgen.py +33 -28
- pertpy/tools/_scgen/_utils.py +2 -2
- {pertpy-0.7.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +22 -13
- pertpy-0.8.0.dist-info/RECORD +57 -0
- {pertpy-0.7.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
- pertpy/plot/_augur.py +0 -171
- pertpy/plot/_coda.py +0 -601
- pertpy/plot/_guide_rna.py +0 -64
- pertpy/plot/_milopy.py +0 -209
- pertpy/plot/_mixscape.py +0 -355
- pertpy/tools/_differential_gene_expression.py +0 -325
- pertpy-0.7.0.dist-info/RECORD +0 -53
- {pertpy-0.7.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
pertpy/__init__.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
__author__ = "Lukas Heumos"
|
4
4
|
__email__ = "lukas.heumos@posteo.net"
|
5
|
-
__version__ = "0.7.0"
|
5
|
+
__version__ = "0.8.0"
|
6
6
|
|
7
7
|
import warnings
|
8
8
|
|
@@ -11,6 +11,7 @@ from numba import NumbaDeprecationWarning
|
|
11
11
|
|
12
12
|
warnings.filterwarnings("ignore", category=NumbaDeprecationWarning)
|
13
13
|
warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning)
|
14
|
+
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
14
15
|
warnings.filterwarnings("ignore", category=UserWarning, module="scvi._settings")
|
15
16
|
|
16
17
|
from . import data as dt
|
pertpy/data/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from pertpy.data._datasets import (
|
|
24
24
|
gasperini_2019_lowmoi,
|
25
25
|
gehring_2019,
|
26
26
|
haber_2017_regions,
|
27
|
+
hagai_2018,
|
27
28
|
kang_2018,
|
28
29
|
mcfarland_2020,
|
29
30
|
norman_2019,
|
@@ -52,5 +53,65 @@ from pertpy.data._datasets import (
|
|
52
53
|
tian_2021_crispri,
|
53
54
|
weinreb_2020,
|
54
55
|
xie_2017,
|
56
|
+
zhang_2021,
|
55
57
|
zhao_2021,
|
56
58
|
)
|
59
|
+
|
60
|
+
__all__ = [
|
61
|
+
"adamson_2016_pilot",
|
62
|
+
"adamson_2016_upr_epistasis",
|
63
|
+
"adamson_2016_upr_perturb_seq",
|
64
|
+
"aissa_2021",
|
65
|
+
"bhattacherjee",
|
66
|
+
"burczynski_crohn",
|
67
|
+
"chang_2021",
|
68
|
+
"cinemaot_example",
|
69
|
+
"combosciplex",
|
70
|
+
"datlinger_2017",
|
71
|
+
"datlinger_2021",
|
72
|
+
"dialogue_example",
|
73
|
+
"distance_example",
|
74
|
+
"dixit_2016",
|
75
|
+
"dixit_2016_raw",
|
76
|
+
"dong_2023",
|
77
|
+
"frangieh_2021",
|
78
|
+
"frangieh_2021_protein",
|
79
|
+
"frangieh_2021_raw",
|
80
|
+
"frangieh_2021_rna",
|
81
|
+
"gasperini_2019_atscale",
|
82
|
+
"gasperini_2019_highmoi",
|
83
|
+
"gasperini_2019_lowmoi",
|
84
|
+
"gehring_2019",
|
85
|
+
"haber_2017_regions",
|
86
|
+
"hagai_2018",
|
87
|
+
"kang_2018",
|
88
|
+
"mcfarland_2020",
|
89
|
+
"norman_2019",
|
90
|
+
"norman_2019_raw",
|
91
|
+
"papalexi_2021",
|
92
|
+
"replogle_2022_k562_essential",
|
93
|
+
"replogle_2022_k562_gwps",
|
94
|
+
"replogle_2022_rpe1",
|
95
|
+
"sc_sim_augur",
|
96
|
+
"schiebinger_2019_16day",
|
97
|
+
"schiebinger_2019_18day",
|
98
|
+
"schraivogel_2020_tap_screen_chr8",
|
99
|
+
"schraivogel_2020_tap_screen_chr11",
|
100
|
+
"sciplex3_raw",
|
101
|
+
"sciplex_gxe1",
|
102
|
+
"shifrut_2018",
|
103
|
+
"smillie_2019",
|
104
|
+
"srivatsan_2020_sciplex2",
|
105
|
+
"srivatsan_2020_sciplex3",
|
106
|
+
"srivatsan_2020_sciplex4",
|
107
|
+
"stephenson_2021_subsampled",
|
108
|
+
"tasccoda_example",
|
109
|
+
"tian_2019_day7neuron",
|
110
|
+
"tian_2019_ipsc",
|
111
|
+
"tian_2021_crispra",
|
112
|
+
"tian_2021_crispri",
|
113
|
+
"weinreb_2020",
|
114
|
+
"xie_2017",
|
115
|
+
"zhao_2021",
|
116
|
+
"zhang_2021",
|
117
|
+
]
|
pertpy/data/_dataloader.py
CHANGED
@@ -5,7 +5,8 @@ from string import ascii_lowercase
|
|
5
5
|
from zipfile import ZipFile
|
6
6
|
|
7
7
|
import requests
|
8
|
-
from
|
8
|
+
from filelock import FileLock
|
9
|
+
from lamin_utils import logger
|
9
10
|
from rich.progress import Progress
|
10
11
|
|
11
12
|
|
@@ -37,30 +38,33 @@ def _download( # pragma: no cover
|
|
37
38
|
download_to_path = (
|
38
39
|
f"{output_path}{output_file_name}" if str(output_path).endswith("/") else f"{output_path}/{output_file_name}"
|
39
40
|
)
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
41
|
+
|
42
|
+
Path(output_path).mkdir(parents=True, exist_ok=True)
|
43
|
+
lock_path = f"{output_path}/{output_file_name}.lock"
|
44
|
+
with FileLock(lock_path):
|
45
|
+
if Path(download_to_path).exists() and not overwrite:
|
46
|
+
logger.warning(f"File {download_to_path} already exists!")
|
44
47
|
return
|
45
|
-
else:
|
46
|
-
print(f"{warning} Overwriting...")
|
47
48
|
|
48
|
-
|
49
|
-
|
49
|
+
temp_file_name = f"{download_to_path}.part"
|
50
|
+
|
51
|
+
response = requests.get(url, stream=True)
|
52
|
+
total = int(response.headers.get("content-length", 0))
|
53
|
+
|
54
|
+
with Progress(refresh_per_second=100) as progress:
|
55
|
+
task = progress.add_task("[red]Downloading...", total=total)
|
56
|
+
with Path(temp_file_name).open("wb") as file:
|
57
|
+
for data in response.iter_content(block_size):
|
58
|
+
file.write(data)
|
59
|
+
progress.update(task, advance=block_size)
|
60
|
+
progress.update(task, completed=total, refresh=True)
|
50
61
|
|
51
|
-
|
52
|
-
task = progress.add_task("[red]Downloading...", total=total)
|
53
|
-
Path(output_path).mkdir(parents=True, exist_ok=True)
|
54
|
-
with Path(download_to_path).open("wb") as file:
|
55
|
-
for data in response.iter_content(block_size):
|
56
|
-
file.write(data)
|
57
|
-
progress.update(task, advance=block_size)
|
62
|
+
Path(temp_file_name).replace(download_to_path)
|
58
63
|
|
59
|
-
|
60
|
-
|
64
|
+
if is_zip:
|
65
|
+
output_path = output_path or tempfile.gettempdir()
|
66
|
+
with ZipFile(download_to_path, "r") as zip_obj:
|
67
|
+
zip_obj.extractall(path=output_path)
|
68
|
+
zip_obj.namelist()
|
61
69
|
|
62
|
-
|
63
|
-
output_path = output_path or tempfile.gettempdir()
|
64
|
-
with ZipFile(download_to_path, "r") as zip_obj:
|
65
|
-
zip_obj.extractall(path=output_path)
|
66
|
-
zip_obj.namelist()
|
70
|
+
Path(lock_path).unlink()
|
pertpy/data/_datasets.py
CHANGED
@@ -1540,3 +1540,61 @@ def sciplex_gxe1() -> AnnData: # pragma: no cover
|
|
1540
1540
|
adata = sc.read_h5ad(output_file_path)
|
1541
1541
|
|
1542
1542
|
return adata
|
1543
|
+
|
1544
|
+
|
1545
|
+
def zhang_2021() -> AnnData: # pragma: no cover
|
1546
|
+
"""Single-cell RNA-seq of TNBC patients' immune cells exposed to paclitaxel alone or combined with the anti-PD-L1 atezolizumab.
|
1547
|
+
|
1548
|
+
This analysis, involving 22 patients, identifies immune subtypes predictive of therapeutic
|
1549
|
+
responses and underscores potential limitations of combining paclitaxel with atezolizumab in treatment protocols.
|
1550
|
+
|
1551
|
+
The script that generated this specific AnnData object:
|
1552
|
+
https://github.com/tessadgreen/ThesisCode/blob/main/Chapter3/drug_response/import_zhang_data.ipynb
|
1553
|
+
|
1554
|
+
This dataset does not contain the single-cell ATAC-seq data that was also measured for the paper.
|
1555
|
+
|
1556
|
+
References:
|
1557
|
+
Zhang Y et al., Liu Z. Single-cell analyses reveal key immune cell subsets associated with response to PD-L1 blockade in triple-negative breast cancer.
|
1558
|
+
Cancer Cell. 2021 Volume 39, Issue 12. doi: https://doi.org/10.1016/j.ccell.2021.09.010
|
1559
|
+
|
1560
|
+
Returns:
|
1561
|
+
:class:`~anndata.AnnData` object of the dataset.
|
1562
|
+
"""
|
1563
|
+
output_file_name = "zhang_2021.h5ad"
|
1564
|
+
output_file_path = settings.datasetdir / output_file_name
|
1565
|
+
if not Path(output_file_path).exists():
|
1566
|
+
_download(
|
1567
|
+
url="https://figshare.com/ndownloader/files/46457872",
|
1568
|
+
output_file_name=output_file_name,
|
1569
|
+
output_path=settings.datasetdir,
|
1570
|
+
is_zip=False,
|
1571
|
+
)
|
1572
|
+
adata = sc.read_h5ad(output_file_path)
|
1573
|
+
|
1574
|
+
return adata
|
1575
|
+
|
1576
|
+
|
1577
|
+
def hagai_2018() -> AnnData: # pragma: no cover
|
1578
|
+
"""Cross-species analysis of primary dermal fibroblasts and bone marrow-derived phagocytes, stimulated with dsRNA and IFNB.
|
1579
|
+
|
1580
|
+
The study explores immune response variations across humans, macaques, mice, and rats.
|
1581
|
+
|
1582
|
+
References:
|
1583
|
+
Hagai, T., Chen, X., Miragaia, R.J. et al. Gene expression variability across cells and species shapes innate immunity.
|
1584
|
+
Nature 563, 197–202 (2018). https://doi.org/10.1038/s41586-018-0657-2
|
1585
|
+
|
1586
|
+
Returns:
|
1587
|
+
:class:`~anndata.AnnData` object of the dataset.
|
1588
|
+
"""
|
1589
|
+
output_file_name = "hagai_2018.h5ad"
|
1590
|
+
output_file_path = settings.datasetdir / output_file_name
|
1591
|
+
if not Path(output_file_path).exists():
|
1592
|
+
_download(
|
1593
|
+
url="https://figshare.com/ndownloader/files/46978846",
|
1594
|
+
output_file_name=output_file_name,
|
1595
|
+
output_path=settings.datasetdir,
|
1596
|
+
is_zip=False,
|
1597
|
+
)
|
1598
|
+
adata = sc.read_h5ad(output_file_path)
|
1599
|
+
|
1600
|
+
return adata
|
pertpy/metadata/__init__.py
CHANGED
pertpy/metadata/_cell_line.py
CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import TYPE_CHECKING, Literal
|
5
5
|
|
6
|
+
from lamin_utils import logger
|
7
|
+
|
6
8
|
if TYPE_CHECKING:
|
7
9
|
from collections.abc import Iterable
|
8
10
|
|
9
11
|
import matplotlib.pyplot as plt
|
10
12
|
import numpy as np
|
11
13
|
import pandas as pd
|
12
|
-
from rich import print
|
13
14
|
from scanpy import settings
|
14
15
|
from scipy import stats
|
15
16
|
|
@@ -42,7 +43,6 @@ class CellLine(MetaData):
|
|
42
43
|
# Source: https://depmap.org/portal/download/all/ (DepMap Public 23Q4)
|
43
44
|
depmap_cell_line_path = Path(settings.cachedir) / "depmap_23Q4_info.csv"
|
44
45
|
if not Path(depmap_cell_line_path).exists():
|
45
|
-
print("[bold yellow]No DepMap metadata file found. Starting download now.")
|
46
46
|
_download(
|
47
47
|
url="https://ndownloader.figshare.com/files/43746708",
|
48
48
|
output_file_name="depmap_23Q4_info.csv",
|
@@ -59,10 +59,6 @@ class CellLine(MetaData):
|
|
59
59
|
|
60
60
|
if not Path(transformed_cancerxgene_cell_line_path).exists():
|
61
61
|
if not Path(cancerxgene_cell_line_path).exists():
|
62
|
-
print(
|
63
|
-
"[bold yellow]No cell line metadata file from The Genomics of Drug Sensitivity "
|
64
|
-
"in Cancer Project found. Starting download now."
|
65
|
-
)
|
66
62
|
_download(
|
67
63
|
url="https://www.cancerrxgene.org/api/celllines?list=all&sEcho=1&iColumns=7&sColumns=&"
|
68
64
|
"iDisplayStart=0&iDisplayLength=25&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&"
|
@@ -102,7 +98,6 @@ class CellLine(MetaData):
|
|
102
98
|
# Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
|
103
99
|
gene_annotation_file_path = Path(settings.cachedir) / "genes_info.csv"
|
104
100
|
if not Path(gene_annotation_file_path).exists():
|
105
|
-
print("[bold yellow]No metadata file was found for gene annotation. Starting download now.")
|
106
101
|
_download(
|
107
102
|
url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv",
|
108
103
|
output_file_name="genes_info.csv",
|
@@ -120,10 +115,6 @@ class CellLine(MetaData):
|
|
120
115
|
# solution: remove the white space and convert to int before depmap updates the metadata
|
121
116
|
bulk_rna_sanger_file_path = Path(settings.cachedir) / "rnaseq_sanger_info.csv"
|
122
117
|
if not Path(bulk_rna_sanger_file_path).exists():
|
123
|
-
print(
|
124
|
-
"[bold yellow]No metadata file was found for bulk RNA-seq data of Sanger cell line."
|
125
|
-
" Starting download now."
|
126
|
-
)
|
127
118
|
_download(
|
128
119
|
url="https://figshare.com/ndownloader/files/42467103",
|
129
120
|
output_file_name="rnaseq_sanger_info.csv",
|
@@ -137,7 +128,6 @@ class CellLine(MetaData):
|
|
137
128
|
# Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
|
138
129
|
bulk_rna_broad_file_path = Path(settings.cachedir) / "rnaseq_depmap_info.csv"
|
139
130
|
if not Path(bulk_rna_broad_file_path).exists():
|
140
|
-
print("[bold yellow]No metadata file was found for CCLE expression data. Starting download now.")
|
141
131
|
_download(
|
142
132
|
url="https://figshare.com/ndownloader/files/34989922",
|
143
133
|
output_file_name="rnaseq_depmap_info.csv",
|
@@ -152,7 +142,6 @@ class CellLine(MetaData):
|
|
152
142
|
# Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics)
|
153
143
|
proteomics_file_path = Path(settings.cachedir) / "proteomics_info.csv"
|
154
144
|
if not Path(proteomics_file_path).exists():
|
155
|
-
print("[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger). Starting download now.")
|
156
145
|
_download(
|
157
146
|
url="https://figshare.com/ndownloader/files/42468393",
|
158
147
|
output_file_name="proteomics_info.csv",
|
@@ -169,10 +158,6 @@ class CellLine(MetaData):
|
|
169
158
|
# URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
|
170
159
|
drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
|
171
160
|
if not Path(drug_response_gdsc1_file_path).exists():
|
172
|
-
print(
|
173
|
-
"[bold yellow]No metadata file was found for drug response data of GDSC1 dataset."
|
174
|
-
" Starting download now."
|
175
|
-
)
|
176
161
|
_download(
|
177
162
|
url="https://figshare.com/ndownloader/files/43757235",
|
178
163
|
output_file_name="gdsc1_info.csv",
|
@@ -184,10 +169,6 @@ class CellLine(MetaData):
|
|
184
169
|
if gdsc_dataset == 2:
|
185
170
|
drug_response_gdsc2_file_path = Path(settings.cachedir) / "gdsc2_info.csv"
|
186
171
|
if not Path(drug_response_gdsc2_file_path).exists():
|
187
|
-
print(
|
188
|
-
"[bold yellow]No metadata file was found for drug response data of GDSC2 dataset."
|
189
|
-
" Starting download now."
|
190
|
-
)
|
191
172
|
_download(
|
192
173
|
url="https://figshare.com/ndownloader/files/43757232",
|
193
174
|
output_file_name="gdsc2_info.csv",
|
@@ -213,15 +194,13 @@ class CellLine(MetaData):
|
|
213
194
|
|
214
195
|
Args:
|
215
196
|
adata: The data object to annotate.
|
216
|
-
query_id: The column of `.obs` with cell line information.
|
217
|
-
reference_id: The type of cell line identifier in the
|
218
|
-
If fetching cell line metadata from Cancerrxgene, it is recommended to choose
|
219
|
-
|
220
|
-
|
221
|
-
cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene. Defaults to "DepMap".
|
197
|
+
query_id: The column of `.obs` with cell line information.
|
198
|
+
reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName or StrippedCellLineName.
|
199
|
+
If fetching cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
|
200
|
+
fetch: The metadata to fetch.
|
201
|
+
cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
|
222
202
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
223
|
-
|
224
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
203
|
+
copy: Determines whether a copy of the `adata` is returned.
|
225
204
|
|
226
205
|
Returns:
|
227
206
|
Returns an AnnData object with cell line annotation.
|
@@ -248,11 +227,9 @@ class CellLine(MetaData):
|
|
248
227
|
reference_id = "stripped_cell_line_name"
|
249
228
|
if query_id == "DepMap_ID":
|
250
229
|
query_id = "stripped_cell_line_name"
|
251
|
-
|
252
|
-
"
|
253
|
-
"
|
254
|
-
"Ensure that stripped cell line names are available in 'adata.obs.' ",
|
255
|
-
"or use the DepMap as `cell_line_source` to annotate the cell line first ",
|
230
|
+
logger.error(
|
231
|
+
"`stripped_cell_line_name` is used as reference and query identifier to annotate cell line metadata from Cancerrxgene. "
|
232
|
+
"Ensure that stripped cell line names are available in 'adata.obs.' or use the DepMap as `cell_line_source` to annotate the cell line first."
|
256
233
|
)
|
257
234
|
if self.cancerxgene is None:
|
258
235
|
self._download_cell_line(cell_line_source="Cancerrxgene")
|
@@ -337,9 +314,9 @@ class CellLine(MetaData):
|
|
337
314
|
Args:
|
338
315
|
adata: The data object to annotate.
|
339
316
|
query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
|
340
|
-
cell_line_source: The bulk rna expression data from either broad or sanger cell line.
|
341
|
-
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
342
|
-
copy: Determines whether a copy of the `adata` is returned.
|
317
|
+
cell_line_source: The bulk rna expression data from either broad or sanger cell line.
|
318
|
+
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
319
|
+
copy: Determines whether a copy of the `adata` is returned.
|
343
320
|
|
344
321
|
Returns:
|
345
322
|
Returns an AnnData object with bulk rna expression annotation.
|
@@ -378,11 +355,10 @@ class CellLine(MetaData):
|
|
378
355
|
not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_sanger.index))
|
379
356
|
else:
|
380
357
|
reference_id = "DepMap_ID"
|
381
|
-
|
382
|
-
"To annotate bulk RNA data from Broad Institue, "
|
383
|
-
"`DepMap_ID` is
|
384
|
-
"
|
385
|
-
"Alternatively, use `annotate()` to annotate the cell line first ",
|
358
|
+
logger.warning(
|
359
|
+
"To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `reference_id` is given.\n"
|
360
|
+
"Ensure that `DepMap_ID` is available in 'adata.obs'.\n"
|
361
|
+
"Alternatively, use `annotate()` to annotate the cell line first "
|
386
362
|
)
|
387
363
|
if self.bulk_rna_broad is None:
|
388
364
|
self._download_bulk_rna(cell_line_source="broad")
|
@@ -438,16 +414,12 @@ class CellLine(MetaData):
|
|
438
414
|
|
439
415
|
Args:
|
440
416
|
adata: The data object to annotate.
|
441
|
-
query_id: The column of `.obs` with cell line information.
|
417
|
+
query_id: The column of `.obs` with cell line information.
|
442
418
|
reference_id: The type of cell line identifier in the meta data, model_name or model_id.
|
443
|
-
Defaults to "model_name".
|
444
419
|
protein_information: The type of protein expression data to fetch, protein_intensity or zscore.
|
445
|
-
Defaults to "protein_intensity".
|
446
420
|
protein_id: The protein identifier saved in the fetched meta data, uniprot_id or symbol.
|
447
|
-
Defaults to "uniprot_id".
|
448
421
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
449
|
-
|
450
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
422
|
+
copy: Determines whether a copy of the `adata` is returned.
|
451
423
|
|
452
424
|
Returns:
|
453
425
|
Returns an AnnData object with protein expression annotation.
|
@@ -481,7 +453,7 @@ class CellLine(MetaData):
|
|
481
453
|
raise ValueError(
|
482
454
|
f"The specified `reference_id`{reference_id} can't be found in the protein expression data. \n"
|
483
455
|
"To solve the issue, please use the reference identifier available in the metadata. \n"
|
484
|
-
"Alternatively, create a `CellLineMetaData.lookup()` object to obtain the available reference identifiers in the metadata.
|
456
|
+
"Alternatively, create a `CellLineMetaData.lookup()` object to obtain the available reference identifiers in the metadata."
|
485
457
|
)
|
486
458
|
|
487
459
|
identifier_num_all = len(adata.obs[query_id].unique())
|
@@ -511,7 +483,7 @@ class CellLine(MetaData):
|
|
511
483
|
reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
|
512
484
|
query_perturbation: str = "perturbation",
|
513
485
|
reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
|
514
|
-
gdsc_dataset: Literal[
|
486
|
+
gdsc_dataset: Literal["gdsc_1", "gdsc_2"] = "gdsc_1",
|
515
487
|
verbosity: int | str = 5,
|
516
488
|
copy: bool = False,
|
517
489
|
) -> AnnData:
|
@@ -522,22 +494,17 @@ class CellLine(MetaData):
|
|
522
494
|
|
523
495
|
Args:
|
524
496
|
adata: The data object to annotate.
|
525
|
-
query_id: The column of `.obs` with cell line information.
|
526
|
-
reference_id: The type of cell line identifier in the
|
527
|
-
Defaults to "cell_line_name".
|
497
|
+
query_id: The column of `.obs` with cell line information.
|
498
|
+
reference_id: The type of cell line identifier in the metadata, cell_line_name, sanger_model_id or cosmic_id.
|
528
499
|
query_perturbation: The column of `.obs` with perturbation information.
|
529
|
-
|
530
|
-
|
531
|
-
Defaults to 'drug_name'.
|
532
|
-
gdsc_dataset: The GDSC dataset, 1 or 2.
|
500
|
+
reference_perturbation: The type of perturbation in the metadata, drug_name or drug_id.
|
501
|
+
gdsc_dataset: The GDSC dataset, 1 or 2, specified as 'gdsc_1' or 'gdsc_2'.
|
533
502
|
The GDSC1 dataset updates previous releases with additional drug screening data from the
|
534
503
|
Sanger Institute and Massachusetts General Hospital.
|
535
504
|
It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
|
536
505
|
GDSC2 is new and has 243,466 IC50 results from the latest screening at the Sanger Institute.
|
537
|
-
Defaults to 1.
|
538
506
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
|
539
|
-
|
540
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
507
|
+
copy: Determines whether a copy of the `adata` is returned.
|
541
508
|
|
542
509
|
Returns:
|
543
510
|
Returns an AnnData object with drug response annotation.
|
@@ -559,14 +526,16 @@ class CellLine(MetaData):
|
|
559
526
|
"This ensures that the required query ID is included in your data."
|
560
527
|
)
|
561
528
|
# Lazily download the GDSC data
|
562
|
-
if gdsc_dataset ==
|
529
|
+
if gdsc_dataset == "gdsc_1":
|
563
530
|
if self.drug_response_gdsc1 is None:
|
564
531
|
self._download_gdsc(gdsc_dataset=1)
|
565
532
|
gdsc_data = self.drug_response_gdsc1
|
566
|
-
|
533
|
+
elif gdsc_dataset == "gdsc_2":
|
567
534
|
if self.drug_response_gdsc2 is None:
|
568
535
|
self._download_gdsc(gdsc_dataset=2)
|
569
536
|
gdsc_data = self.drug_response_gdsc2
|
537
|
+
else:
|
538
|
+
raise ValueError("The GDSC dataset specified in `gdsc_dataset` must be either 'gdsc_1' or 'gdsc_2'.")
|
570
539
|
|
571
540
|
identifier_num_all = len(adata.obs[query_id].unique())
|
572
541
|
not_matched_identifiers = list(set(adata.obs[query_id]) - set(gdsc_data[reference_id]))
|
@@ -583,7 +552,7 @@ class CellLine(MetaData):
|
|
583
552
|
adata.obs = (
|
584
553
|
adata.obs.reset_index()
|
585
554
|
.set_index([query_id, query_perturbation])
|
586
|
-
.assign(ln_ic50=
|
555
|
+
.assign(ln_ic50=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
|
587
556
|
.reset_index()
|
588
557
|
.set_index(old_index_name)
|
589
558
|
)
|
@@ -678,8 +647,8 @@ class CellLine(MetaData):
|
|
678
647
|
|
679
648
|
Args:
|
680
649
|
adata: Input data object.
|
681
|
-
identifier: Column in `.obs` containing cell line identifiers.
|
682
|
-
metadata_key: Key of the AnnData obsm for comparison with the X matrix.
|
650
|
+
identifier: Column in `.obs` containing cell line identifiers.
|
651
|
+
metadata_key: Key of the AnnData obsm for comparison with the X matrix.
|
683
652
|
|
684
653
|
Returns:
|
685
654
|
Returns pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately.
|
@@ -695,7 +664,7 @@ class CellLine(MetaData):
|
|
695
664
|
if isinstance(adata.obsm[metadata_key], pd.DataFrame):
|
696
665
|
# Give warning if the genes are not the same
|
697
666
|
if sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0:
|
698
|
-
|
667
|
+
logger.warning(
|
699
668
|
"Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
|
700
669
|
)
|
701
670
|
|
@@ -726,6 +695,7 @@ class CellLine(MetaData):
|
|
726
695
|
adata: AnnData,
|
727
696
|
corr: pd.DataFrame,
|
728
697
|
pval: pd.DataFrame,
|
698
|
+
*,
|
729
699
|
identifier: str = "DepMap_ID",
|
730
700
|
metadata_key: str = "bulk_rna_broad",
|
731
701
|
category: str = "cell line",
|
@@ -737,13 +707,12 @@ class CellLine(MetaData):
|
|
737
707
|
adata: Input data object.
|
738
708
|
corr: Pearson correlation scores.
|
739
709
|
pval: P-values for pearson correlation.
|
740
|
-
identifier: Column in `.obs` containing the identifiers.
|
741
|
-
metadata_key: Key of the AnnData obsm for comparison with the X matrix.
|
742
|
-
category: The category for correlation comparison.
|
710
|
+
identifier: Column in `.obs` containing the identifiers.
|
711
|
+
metadata_key: Key of the AnnData obsm for comparison with the X matrix.
|
712
|
+
category: The category for correlation comparison.
|
743
713
|
subset_identifier: Selected identifiers for scatter plot visualization between the X matrix and `metadata_key`.
|
744
714
|
If not None, only the chosen cell line will be plotted, either specified as a value in `identifier` (string) or as an index number.
|
745
715
|
If None, all cell lines will be plotted.
|
746
|
-
Defaults to None.
|
747
716
|
Returns:
|
748
717
|
Pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately.
|
749
718
|
"""
|
pertpy/metadata/_compound.py
CHANGED
@@ -30,11 +30,10 @@ class Compound(MetaData):
|
|
30
30
|
|
31
31
|
Args:
|
32
32
|
adata: The data object to annotate.
|
33
|
-
query_id: The column of `.obs` with compound identifiers.
|
34
|
-
query_id_type: The type of compound identifiers, 'name' or 'cid'.
|
33
|
+
query_id: The column of `.obs` with compound identifiers.
|
34
|
+
query_id_type: The type of compound identifiers, 'name' or 'cid'.
|
35
35
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
36
|
-
|
37
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
36
|
+
copy: Determines whether a copy of the `adata` is returned.
|
38
37
|
|
39
38
|
Returns:
|
40
39
|
Returns an AnnData object with compound annotation.
|
pertpy/metadata/_drug.py
CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
|
|
6
6
|
from typing import TYPE_CHECKING, Literal
|
7
7
|
|
8
8
|
import pandas as pd
|
9
|
-
from rich import print
|
10
9
|
from scanpy import settings
|
11
10
|
|
12
11
|
from pertpy.data._dataloader import _download
|
@@ -25,7 +24,6 @@ def _download_drug_annotation(
|
|
25
24
|
# Prepared in https://github.com/theislab/pertpy-datasets/blob/main/chembl_data.ipynb
|
26
25
|
chembl_path = Path(settings.cachedir) / "chembl.json"
|
27
26
|
if not Path(chembl_path).exists():
|
28
|
-
print("[bold yellow]No metadata file was found for chembl. Starting download now.")
|
29
27
|
_download(
|
30
28
|
url="https://figshare.com/ndownloader/files/43871718",
|
31
29
|
output_file_name="chembl.json",
|
@@ -40,7 +38,6 @@ def _download_drug_annotation(
|
|
40
38
|
elif source == "dgidb":
|
41
39
|
dgidb_path = Path(settings.cachedir) / "dgidb.tsv"
|
42
40
|
if not Path(dgidb_path).exists():
|
43
|
-
print("[bold yellow]No metadata file was found for dgidb. Starting download now.")
|
44
41
|
_download(
|
45
42
|
url="https://www.dgidb.org/data/latest/interactions.tsv",
|
46
43
|
output_file_name="dgidb.tsv",
|
@@ -54,7 +51,6 @@ def _download_drug_annotation(
|
|
54
51
|
else:
|
55
52
|
pharmgkb_path = Path(settings.cachedir) / "pharmgkb.tsv"
|
56
53
|
if not Path(pharmgkb_path).exists():
|
57
|
-
print("[bold yellow]No metadata file was found for pharmGKB. Starting download now.")
|
58
54
|
_download(
|
59
55
|
url="https://api.pharmgkb.org/v1/download/file/data/relationships.zip",
|
60
56
|
output_file_name="pharmgkb.zip",
|
@@ -103,8 +99,8 @@ class Drug(MetaData):
|
|
103
99
|
|
104
100
|
Args:
|
105
101
|
adata: AnnData object containing log-normalised data.
|
106
|
-
source: Source of the metadata, chembl, dgidb or pharmgkb.
|
107
|
-
copy: Determines whether a copy of the `adata` is returned.
|
102
|
+
source: Source of the metadata, chembl, dgidb or pharmgkb.
|
103
|
+
copy: Determines whether a copy of the `adata` is returned.
|
108
104
|
|
109
105
|
Returns:
|
110
106
|
An AnnData object with a new column `drug` in the var slot.
|