pertpy 0.7.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- pertpy/__init__.py +2 -1
- pertpy/data/__init__.py +61 -0
- pertpy/data/_dataloader.py +27 -23
- pertpy/data/_datasets.py +58 -0
- pertpy/metadata/__init__.py +2 -0
- pertpy/metadata/_cell_line.py +39 -70
- pertpy/metadata/_compound.py +3 -4
- pertpy/metadata/_drug.py +2 -6
- pertpy/metadata/_look_up.py +38 -51
- pertpy/metadata/_metadata.py +7 -10
- pertpy/metadata/_moa.py +2 -6
- pertpy/plot/__init__.py +0 -5
- pertpy/preprocessing/__init__.py +2 -0
- pertpy/preprocessing/_guide_rna.py +6 -7
- pertpy/tools/__init__.py +67 -6
- pertpy/tools/_augur.py +14 -15
- pertpy/tools/_cinemaot.py +2 -2
- pertpy/tools/_coda/_base_coda.py +118 -142
- pertpy/tools/_coda/_sccoda.py +16 -15
- pertpy/tools/_coda/_tasccoda.py +21 -22
- pertpy/tools/_dialogue.py +18 -23
- pertpy/tools/_differential_gene_expression/__init__.py +20 -0
- pertpy/tools/_differential_gene_expression/_base.py +657 -0
- pertpy/tools/_differential_gene_expression/_checks.py +41 -0
- pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
- pertpy/tools/_differential_gene_expression/_edger.py +125 -0
- pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
- pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
- pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
- pertpy/tools/_distances/_distance_tests.py +21 -16
- pertpy/tools/_distances/_distances.py +406 -70
- pertpy/tools/_enrichment.py +10 -15
- pertpy/tools/_kernel_pca.py +1 -1
- pertpy/tools/_milo.py +77 -54
- pertpy/tools/_mixscape.py +15 -11
- pertpy/tools/_perturbation_space/_clustering.py +5 -2
- pertpy/tools/_perturbation_space/_comparison.py +112 -0
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +21 -23
- pertpy/tools/_perturbation_space/_perturbation_space.py +23 -21
- pertpy/tools/_perturbation_space/_simple.py +3 -3
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_base_components.py +2 -3
- pertpy/tools/_scgen/_scgen.py +33 -28
- pertpy/tools/_scgen/_utils.py +2 -2
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/METADATA +32 -14
- pertpy-0.9.1.dist-info/RECORD +57 -0
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/WHEEL +1 -1
- pertpy/plot/_augur.py +0 -171
- pertpy/plot/_coda.py +0 -601
- pertpy/plot/_guide_rna.py +0 -64
- pertpy/plot/_milopy.py +0 -209
- pertpy/plot/_mixscape.py +0 -355
- pertpy/tools/_differential_gene_expression.py +0 -325
- pertpy-0.7.0.dist-info/RECORD +0 -53
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/licenses/LICENSE +0 -0
pertpy/__init__.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
__author__ = "Lukas Heumos"
|
4
4
|
__email__ = "lukas.heumos@posteo.net"
|
5
|
-
__version__ = "0.7.0"
|
5
|
+
__version__ = "0.9.1"
|
6
6
|
|
7
7
|
import warnings
|
8
8
|
|
@@ -11,6 +11,7 @@ from numba import NumbaDeprecationWarning
|
|
11
11
|
|
12
12
|
warnings.filterwarnings("ignore", category=NumbaDeprecationWarning)
|
13
13
|
warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning)
|
14
|
+
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
14
15
|
warnings.filterwarnings("ignore", category=UserWarning, module="scvi._settings")
|
15
16
|
|
16
17
|
from . import data as dt
|
pertpy/data/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from pertpy.data._datasets import (
|
|
24
24
|
gasperini_2019_lowmoi,
|
25
25
|
gehring_2019,
|
26
26
|
haber_2017_regions,
|
27
|
+
hagai_2018,
|
27
28
|
kang_2018,
|
28
29
|
mcfarland_2020,
|
29
30
|
norman_2019,
|
@@ -52,5 +53,65 @@ from pertpy.data._datasets import (
|
|
52
53
|
tian_2021_crispri,
|
53
54
|
weinreb_2020,
|
54
55
|
xie_2017,
|
56
|
+
zhang_2021,
|
55
57
|
zhao_2021,
|
56
58
|
)
|
59
|
+
|
60
|
+
__all__ = [
|
61
|
+
"adamson_2016_pilot",
|
62
|
+
"adamson_2016_upr_epistasis",
|
63
|
+
"adamson_2016_upr_perturb_seq",
|
64
|
+
"aissa_2021",
|
65
|
+
"bhattacherjee",
|
66
|
+
"burczynski_crohn",
|
67
|
+
"chang_2021",
|
68
|
+
"cinemaot_example",
|
69
|
+
"combosciplex",
|
70
|
+
"datlinger_2017",
|
71
|
+
"datlinger_2021",
|
72
|
+
"dialogue_example",
|
73
|
+
"distance_example",
|
74
|
+
"dixit_2016",
|
75
|
+
"dixit_2016_raw",
|
76
|
+
"dong_2023",
|
77
|
+
"frangieh_2021",
|
78
|
+
"frangieh_2021_protein",
|
79
|
+
"frangieh_2021_raw",
|
80
|
+
"frangieh_2021_rna",
|
81
|
+
"gasperini_2019_atscale",
|
82
|
+
"gasperini_2019_highmoi",
|
83
|
+
"gasperini_2019_lowmoi",
|
84
|
+
"gehring_2019",
|
85
|
+
"haber_2017_regions",
|
86
|
+
"hagai_2018",
|
87
|
+
"kang_2018",
|
88
|
+
"mcfarland_2020",
|
89
|
+
"norman_2019",
|
90
|
+
"norman_2019_raw",
|
91
|
+
"papalexi_2021",
|
92
|
+
"replogle_2022_k562_essential",
|
93
|
+
"replogle_2022_k562_gwps",
|
94
|
+
"replogle_2022_rpe1",
|
95
|
+
"sc_sim_augur",
|
96
|
+
"schiebinger_2019_16day",
|
97
|
+
"schiebinger_2019_18day",
|
98
|
+
"schraivogel_2020_tap_screen_chr8",
|
99
|
+
"schraivogel_2020_tap_screen_chr11",
|
100
|
+
"sciplex3_raw",
|
101
|
+
"sciplex_gxe1",
|
102
|
+
"shifrut_2018",
|
103
|
+
"smillie_2019",
|
104
|
+
"srivatsan_2020_sciplex2",
|
105
|
+
"srivatsan_2020_sciplex3",
|
106
|
+
"srivatsan_2020_sciplex4",
|
107
|
+
"stephenson_2021_subsampled",
|
108
|
+
"tasccoda_example",
|
109
|
+
"tian_2019_day7neuron",
|
110
|
+
"tian_2019_ipsc",
|
111
|
+
"tian_2021_crispra",
|
112
|
+
"tian_2021_crispri",
|
113
|
+
"weinreb_2020",
|
114
|
+
"xie_2017",
|
115
|
+
"zhao_2021",
|
116
|
+
"zhang_2021",
|
117
|
+
]
|
pertpy/data/_dataloader.py
CHANGED
@@ -5,7 +5,8 @@ from string import ascii_lowercase
|
|
5
5
|
from zipfile import ZipFile
|
6
6
|
|
7
7
|
import requests
|
8
|
-
from
|
8
|
+
from filelock import FileLock
|
9
|
+
from lamin_utils import logger
|
9
10
|
from rich.progress import Progress
|
10
11
|
|
11
12
|
|
@@ -37,30 +38,33 @@ def _download( # pragma: no cover
|
|
37
38
|
download_to_path = (
|
38
39
|
f"{output_path}{output_file_name}" if str(output_path).endswith("/") else f"{output_path}/{output_file_name}"
|
39
40
|
)
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
41
|
+
|
42
|
+
Path(output_path).mkdir(parents=True, exist_ok=True)
|
43
|
+
lock_path = f"{output_path}/{output_file_name}.lock"
|
44
|
+
with FileLock(lock_path):
|
45
|
+
if Path(download_to_path).exists() and not overwrite:
|
46
|
+
logger.warning(f"File {download_to_path} already exists!")
|
44
47
|
return
|
45
|
-
else:
|
46
|
-
print(f"{warning} Overwriting...")
|
47
48
|
|
48
|
-
|
49
|
-
|
49
|
+
temp_file_name = f"{download_to_path}.part"
|
50
|
+
|
51
|
+
response = requests.get(url, stream=True)
|
52
|
+
total = int(response.headers.get("content-length", 0))
|
53
|
+
|
54
|
+
with Progress(refresh_per_second=100) as progress:
|
55
|
+
task = progress.add_task("[red]Downloading...", total=total)
|
56
|
+
with Path(temp_file_name).open("wb") as file:
|
57
|
+
for data in response.iter_content(block_size):
|
58
|
+
file.write(data)
|
59
|
+
progress.update(task, advance=block_size)
|
60
|
+
progress.update(task, completed=total, refresh=True)
|
50
61
|
|
51
|
-
|
52
|
-
task = progress.add_task("[red]Downloading...", total=total)
|
53
|
-
Path(output_path).mkdir(parents=True, exist_ok=True)
|
54
|
-
with Path(download_to_path).open("wb") as file:
|
55
|
-
for data in response.iter_content(block_size):
|
56
|
-
file.write(data)
|
57
|
-
progress.update(task, advance=block_size)
|
62
|
+
Path(temp_file_name).replace(download_to_path)
|
58
63
|
|
59
|
-
|
60
|
-
|
64
|
+
if is_zip:
|
65
|
+
output_path = output_path or tempfile.gettempdir()
|
66
|
+
with ZipFile(download_to_path, "r") as zip_obj:
|
67
|
+
zip_obj.extractall(path=output_path)
|
68
|
+
zip_obj.namelist()
|
61
69
|
|
62
|
-
|
63
|
-
output_path = output_path or tempfile.gettempdir()
|
64
|
-
with ZipFile(download_to_path, "r") as zip_obj:
|
65
|
-
zip_obj.extractall(path=output_path)
|
66
|
-
zip_obj.namelist()
|
70
|
+
Path(lock_path).unlink()
|
pertpy/data/_datasets.py
CHANGED
@@ -1540,3 +1540,61 @@ def sciplex_gxe1() -> AnnData: # pragma: no cover
|
|
1540
1540
|
adata = sc.read_h5ad(output_file_path)
|
1541
1541
|
|
1542
1542
|
return adata
|
1543
|
+
|
1544
|
+
|
1545
|
+
def zhang_2021() -> AnnData: # pragma: no cover
|
1546
|
+
"""Single-cell RNA-seq of TNBC patients' immune cells exposed to paclitaxel alone or combined with the anti-PD-L1 atezolizumab.
|
1547
|
+
|
1548
|
+
This analysis, involving 22 patients, identifies immune subtypes predictive of therapeutic
|
1549
|
+
responses and underscores potential limitations of combining paclitaxel with atezolizumab in treatment protocols.
|
1550
|
+
|
1551
|
+
The script that generated this specific AnnData object:
|
1552
|
+
https://github.com/tessadgreen/ThesisCode/blob/main/Chapter3/drug_response/import_zhang_data.ipynb
|
1553
|
+
|
1554
|
+
This dataset does not contain the single-cell ATAC-seq data that was also measured for the paper.
|
1555
|
+
|
1556
|
+
References:
|
1557
|
+
Zhang Y et al., Liu Z. Single-cell analyses reveal key immune cell subsets associated with response to PD-L1 blockade in triple-negative breast cancer.
|
1558
|
+
Cancer Cell. 2021 Volume 39, Issue 12. doi: https://doi.org/10.1016/j.ccell.2021.09.010
|
1559
|
+
|
1560
|
+
Returns:
|
1561
|
+
:class:`~anndata.AnnData` object of the dataset.
|
1562
|
+
"""
|
1563
|
+
output_file_name = "zhang_2021.h5ad"
|
1564
|
+
output_file_path = settings.datasetdir / output_file_name
|
1565
|
+
if not Path(output_file_path).exists():
|
1566
|
+
_download(
|
1567
|
+
url="https://figshare.com/ndownloader/files/46457872",
|
1568
|
+
output_file_name=output_file_name,
|
1569
|
+
output_path=settings.datasetdir,
|
1570
|
+
is_zip=False,
|
1571
|
+
)
|
1572
|
+
adata = sc.read_h5ad(output_file_path)
|
1573
|
+
|
1574
|
+
return adata
|
1575
|
+
|
1576
|
+
|
1577
|
+
def hagai_2018() -> AnnData: # pragma: no cover
|
1578
|
+
"""Cross-species analysis of primary dermal fibroblasts and bone marrow-derived phagocytes, stimulated with dsRNA and IFNB.
|
1579
|
+
|
1580
|
+
The study explores immune response variations across humans, macaques, mice, and rats.
|
1581
|
+
|
1582
|
+
References:
|
1583
|
+
Hagai, T., Chen, X., Miragaia, R.J. et al. Gene expression variability across cells and species shapes innate immunity.
|
1584
|
+
Nature 563, 197–202 (2018). https://doi.org/10.1038/s41586-018-0657-2
|
1585
|
+
|
1586
|
+
Returns:
|
1587
|
+
:class:`~anndata.AnnData` object of the dataset.
|
1588
|
+
"""
|
1589
|
+
output_file_name = "hagai_2018.h5ad"
|
1590
|
+
output_file_path = settings.datasetdir / output_file_name
|
1591
|
+
if not Path(output_file_path).exists():
|
1592
|
+
_download(
|
1593
|
+
url="https://figshare.com/ndownloader/files/46978846",
|
1594
|
+
output_file_name=output_file_name,
|
1595
|
+
output_path=settings.datasetdir,
|
1596
|
+
is_zip=False,
|
1597
|
+
)
|
1598
|
+
adata = sc.read_h5ad(output_file_path)
|
1599
|
+
|
1600
|
+
return adata
|
pertpy/metadata/__init__.py
CHANGED
pertpy/metadata/_cell_line.py
CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import TYPE_CHECKING, Literal
|
5
5
|
|
6
|
+
from lamin_utils import logger
|
7
|
+
|
6
8
|
if TYPE_CHECKING:
|
7
9
|
from collections.abc import Iterable
|
8
10
|
|
9
11
|
import matplotlib.pyplot as plt
|
10
12
|
import numpy as np
|
11
13
|
import pandas as pd
|
12
|
-
from rich import print
|
13
14
|
from scanpy import settings
|
14
15
|
from scipy import stats
|
15
16
|
|
@@ -42,7 +43,6 @@ class CellLine(MetaData):
|
|
42
43
|
# Source: https://depmap.org/portal/download/all/ (DepMap Public 23Q4)
|
43
44
|
depmap_cell_line_path = Path(settings.cachedir) / "depmap_23Q4_info.csv"
|
44
45
|
if not Path(depmap_cell_line_path).exists():
|
45
|
-
print("[bold yellow]No DepMap metadata file found. Starting download now.")
|
46
46
|
_download(
|
47
47
|
url="https://ndownloader.figshare.com/files/43746708",
|
48
48
|
output_file_name="depmap_23Q4_info.csv",
|
@@ -59,10 +59,6 @@ class CellLine(MetaData):
|
|
59
59
|
|
60
60
|
if not Path(transformed_cancerxgene_cell_line_path).exists():
|
61
61
|
if not Path(cancerxgene_cell_line_path).exists():
|
62
|
-
print(
|
63
|
-
"[bold yellow]No cell line metadata file from The Genomics of Drug Sensitivity "
|
64
|
-
"in Cancer Project found. Starting download now."
|
65
|
-
)
|
66
62
|
_download(
|
67
63
|
url="https://www.cancerrxgene.org/api/celllines?list=all&sEcho=1&iColumns=7&sColumns=&"
|
68
64
|
"iDisplayStart=0&iDisplayLength=25&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&"
|
@@ -102,7 +98,6 @@ class CellLine(MetaData):
|
|
102
98
|
# Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
|
103
99
|
gene_annotation_file_path = Path(settings.cachedir) / "genes_info.csv"
|
104
100
|
if not Path(gene_annotation_file_path).exists():
|
105
|
-
print("[bold yellow]No metadata file was found for gene annotation. Starting download now.")
|
106
101
|
_download(
|
107
102
|
url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv",
|
108
103
|
output_file_name="genes_info.csv",
|
@@ -120,10 +115,6 @@ class CellLine(MetaData):
|
|
120
115
|
# solution: remove the white space and convert to int before depmap updates the metadata
|
121
116
|
bulk_rna_sanger_file_path = Path(settings.cachedir) / "rnaseq_sanger_info.csv"
|
122
117
|
if not Path(bulk_rna_sanger_file_path).exists():
|
123
|
-
print(
|
124
|
-
"[bold yellow]No metadata file was found for bulk RNA-seq data of Sanger cell line."
|
125
|
-
" Starting download now."
|
126
|
-
)
|
127
118
|
_download(
|
128
119
|
url="https://figshare.com/ndownloader/files/42467103",
|
129
120
|
output_file_name="rnaseq_sanger_info.csv",
|
@@ -137,7 +128,6 @@ class CellLine(MetaData):
|
|
137
128
|
# Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
|
138
129
|
bulk_rna_broad_file_path = Path(settings.cachedir) / "rnaseq_depmap_info.csv"
|
139
130
|
if not Path(bulk_rna_broad_file_path).exists():
|
140
|
-
print("[bold yellow]No metadata file was found for CCLE expression data. Starting download now.")
|
141
131
|
_download(
|
142
132
|
url="https://figshare.com/ndownloader/files/34989922",
|
143
133
|
output_file_name="rnaseq_depmap_info.csv",
|
@@ -152,7 +142,6 @@ class CellLine(MetaData):
|
|
152
142
|
# Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics)
|
153
143
|
proteomics_file_path = Path(settings.cachedir) / "proteomics_info.csv"
|
154
144
|
if not Path(proteomics_file_path).exists():
|
155
|
-
print("[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger). Starting download now.")
|
156
145
|
_download(
|
157
146
|
url="https://figshare.com/ndownloader/files/42468393",
|
158
147
|
output_file_name="proteomics_info.csv",
|
@@ -169,10 +158,6 @@ class CellLine(MetaData):
|
|
169
158
|
# URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
|
170
159
|
drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
|
171
160
|
if not Path(drug_response_gdsc1_file_path).exists():
|
172
|
-
print(
|
173
|
-
"[bold yellow]No metadata file was found for drug response data of GDSC1 dataset."
|
174
|
-
" Starting download now."
|
175
|
-
)
|
176
161
|
_download(
|
177
162
|
url="https://figshare.com/ndownloader/files/43757235",
|
178
163
|
output_file_name="gdsc1_info.csv",
|
@@ -184,10 +169,6 @@ class CellLine(MetaData):
|
|
184
169
|
if gdsc_dataset == 2:
|
185
170
|
drug_response_gdsc2_file_path = Path(settings.cachedir) / "gdsc2_info.csv"
|
186
171
|
if not Path(drug_response_gdsc2_file_path).exists():
|
187
|
-
print(
|
188
|
-
"[bold yellow]No metadata file was found for drug response data of GDSC2 dataset."
|
189
|
-
" Starting download now."
|
190
|
-
)
|
191
172
|
_download(
|
192
173
|
url="https://figshare.com/ndownloader/files/43757232",
|
193
174
|
output_file_name="gdsc2_info.csv",
|
@@ -213,15 +194,13 @@ class CellLine(MetaData):
|
|
213
194
|
|
214
195
|
Args:
|
215
196
|
adata: The data object to annotate.
|
216
|
-
query_id: The column of `.obs` with cell line information.
|
217
|
-
reference_id: The type of cell line identifier in the
|
218
|
-
If fetching cell line metadata from Cancerrxgene, it is recommended to choose
|
219
|
-
|
220
|
-
|
221
|
-
cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene. Defaults to "DepMap".
|
197
|
+
query_id: The column of `.obs` with cell line information.
|
198
|
+
reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName or StrippedCellLineName.
|
199
|
+
If fetching cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
|
200
|
+
fetch: The metadata to fetch.
|
201
|
+
cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
|
222
202
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
223
|
-
|
224
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
203
|
+
copy: Determines whether a copy of the `adata` is returned.
|
225
204
|
|
226
205
|
Returns:
|
227
206
|
Returns an AnnData object with cell line annotation.
|
@@ -248,11 +227,9 @@ class CellLine(MetaData):
|
|
248
227
|
reference_id = "stripped_cell_line_name"
|
249
228
|
if query_id == "DepMap_ID":
|
250
229
|
query_id = "stripped_cell_line_name"
|
251
|
-
|
252
|
-
"
|
253
|
-
"
|
254
|
-
"Ensure that stripped cell line names are available in 'adata.obs.' ",
|
255
|
-
"or use the DepMap as `cell_line_source` to annotate the cell line first ",
|
230
|
+
logger.error(
|
231
|
+
"`stripped_cell_line_name` is used as reference and query identifier to annotate cell line metadata from Cancerrxgene. "
|
232
|
+
"Ensure that stripped cell line names are available in 'adata.obs.' or use the DepMap as `cell_line_source` to annotate the cell line first."
|
256
233
|
)
|
257
234
|
if self.cancerxgene is None:
|
258
235
|
self._download_cell_line(cell_line_source="Cancerrxgene")
|
@@ -337,9 +314,9 @@ class CellLine(MetaData):
|
|
337
314
|
Args:
|
338
315
|
adata: The data object to annotate.
|
339
316
|
query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
|
340
|
-
cell_line_source: The bulk rna expression data from either broad or sanger cell line.
|
341
|
-
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
342
|
-
copy: Determines whether a copy of the `adata` is returned.
|
317
|
+
cell_line_source: The bulk rna expression data from either broad or sanger cell line.
|
318
|
+
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
319
|
+
copy: Determines whether a copy of the `adata` is returned.
|
343
320
|
|
344
321
|
Returns:
|
345
322
|
Returns an AnnData object with bulk rna expression annotation.
|
@@ -378,11 +355,10 @@ class CellLine(MetaData):
|
|
378
355
|
not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_sanger.index))
|
379
356
|
else:
|
380
357
|
reference_id = "DepMap_ID"
|
381
|
-
|
382
|
-
"To annotate bulk RNA data from Broad Institue, "
|
383
|
-
"`DepMap_ID` is
|
384
|
-
"
|
385
|
-
"Alternatively, use `annotate()` to annotate the cell line first ",
|
358
|
+
logger.warning(
|
359
|
+
"To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `reference_id` is given.\n"
|
360
|
+
"Ensure that `DepMap_ID` is available in 'adata.obs'.\n"
|
361
|
+
"Alternatively, use `annotate()` to annotate the cell line first "
|
386
362
|
)
|
387
363
|
if self.bulk_rna_broad is None:
|
388
364
|
self._download_bulk_rna(cell_line_source="broad")
|
@@ -438,16 +414,12 @@ class CellLine(MetaData):
|
|
438
414
|
|
439
415
|
Args:
|
440
416
|
adata: The data object to annotate.
|
441
|
-
query_id: The column of `.obs` with cell line information.
|
417
|
+
query_id: The column of `.obs` with cell line information.
|
442
418
|
reference_id: The type of cell line identifier in the meta data, model_name or model_id.
|
443
|
-
Defaults to "model_name".
|
444
419
|
protein_information: The type of protein expression data to fetch, protein_intensity or zscore.
|
445
|
-
Defaults to "protein_intensity".
|
446
420
|
protein_id: The protein identifier saved in the fetched meta data, uniprot_id or symbol.
|
447
|
-
Defaults to "uniprot_id".
|
448
421
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
449
|
-
|
450
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
422
|
+
copy: Determines whether a copy of the `adata` is returned.
|
451
423
|
|
452
424
|
Returns:
|
453
425
|
Returns an AnnData object with protein expression annotation.
|
@@ -481,7 +453,7 @@ class CellLine(MetaData):
|
|
481
453
|
raise ValueError(
|
482
454
|
f"The specified `reference_id`{reference_id} can't be found in the protein expression data. \n"
|
483
455
|
"To solve the issue, please use the reference identifier available in the metadata. \n"
|
484
|
-
"Alternatively, create a `CellLineMetaData.lookup()` object to obtain the available reference identifiers in the metadata.
|
456
|
+
"Alternatively, create a `CellLineMetaData.lookup()` object to obtain the available reference identifiers in the metadata."
|
485
457
|
)
|
486
458
|
|
487
459
|
identifier_num_all = len(adata.obs[query_id].unique())
|
@@ -511,7 +483,7 @@ class CellLine(MetaData):
|
|
511
483
|
reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
|
512
484
|
query_perturbation: str = "perturbation",
|
513
485
|
reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
|
514
|
-
gdsc_dataset: Literal[
|
486
|
+
gdsc_dataset: Literal["gdsc_1", "gdsc_2"] = "gdsc_1",
|
515
487
|
verbosity: int | str = 5,
|
516
488
|
copy: bool = False,
|
517
489
|
) -> AnnData:
|
@@ -522,22 +494,17 @@ class CellLine(MetaData):
|
|
522
494
|
|
523
495
|
Args:
|
524
496
|
adata: The data object to annotate.
|
525
|
-
query_id: The column of `.obs` with cell line information.
|
526
|
-
reference_id: The type of cell line identifier in the
|
527
|
-
Defaults to "cell_line_name".
|
497
|
+
query_id: The column of `.obs` with cell line information.
|
498
|
+
reference_id: The type of cell line identifier in the metadata, cell_line_name, sanger_model_id or cosmic_id.
|
528
499
|
query_perturbation: The column of `.obs` with perturbation information.
|
529
|
-
|
530
|
-
|
531
|
-
Defaults to 'drug_name'.
|
532
|
-
gdsc_dataset: The GDSC dataset, 1 or 2.
|
500
|
+
reference_perturbation: The type of perturbation in the metadata, drug_name or drug_id.
|
501
|
+
gdsc_dataset: The GDSC dataset, 1 or 2, specified as 'gdsc_1' or 'gdsc_2'.
|
533
502
|
The GDSC1 dataset updates previous releases with additional drug screening data from the
|
534
503
|
Sanger Institute and Massachusetts General Hospital.
|
535
504
|
It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
|
536
505
|
GDSC2 is new and has 243,466 IC50 results from the latest screening at the Sanger Institute.
|
537
|
-
Defaults to 1.
|
538
506
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
|
539
|
-
|
540
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
507
|
+
copy: Determines whether a copy of the `adata` is returned.
|
541
508
|
|
542
509
|
Returns:
|
543
510
|
Returns an AnnData object with drug response annotation.
|
@@ -559,14 +526,16 @@ class CellLine(MetaData):
|
|
559
526
|
"This ensures that the required query ID is included in your data."
|
560
527
|
)
|
561
528
|
# Lazily download the GDSC data
|
562
|
-
if gdsc_dataset ==
|
529
|
+
if gdsc_dataset == "gdsc_1":
|
563
530
|
if self.drug_response_gdsc1 is None:
|
564
531
|
self._download_gdsc(gdsc_dataset=1)
|
565
532
|
gdsc_data = self.drug_response_gdsc1
|
566
|
-
|
533
|
+
elif gdsc_dataset == "gdsc_2":
|
567
534
|
if self.drug_response_gdsc2 is None:
|
568
535
|
self._download_gdsc(gdsc_dataset=2)
|
569
536
|
gdsc_data = self.drug_response_gdsc2
|
537
|
+
else:
|
538
|
+
raise ValueError("The GDSC dataset specified in `gdsc_dataset` must be either 'gdsc_1' or 'gdsc_2'.")
|
570
539
|
|
571
540
|
identifier_num_all = len(adata.obs[query_id].unique())
|
572
541
|
not_matched_identifiers = list(set(adata.obs[query_id]) - set(gdsc_data[reference_id]))
|
@@ -583,7 +552,7 @@ class CellLine(MetaData):
|
|
583
552
|
adata.obs = (
|
584
553
|
adata.obs.reset_index()
|
585
554
|
.set_index([query_id, query_perturbation])
|
586
|
-
.assign(ln_ic50=
|
555
|
+
.assign(ln_ic50=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
|
587
556
|
.reset_index()
|
588
557
|
.set_index(old_index_name)
|
589
558
|
)
|
@@ -678,8 +647,8 @@ class CellLine(MetaData):
|
|
678
647
|
|
679
648
|
Args:
|
680
649
|
adata: Input data object.
|
681
|
-
identifier: Column in `.obs` containing cell line identifiers.
|
682
|
-
metadata_key: Key of the AnnData obsm for comparison with the X matrix.
|
650
|
+
identifier: Column in `.obs` containing cell line identifiers.
|
651
|
+
metadata_key: Key of the AnnData obsm for comparison with the X matrix.
|
683
652
|
|
684
653
|
Returns:
|
685
654
|
Returns pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately.
|
@@ -695,7 +664,7 @@ class CellLine(MetaData):
|
|
695
664
|
if isinstance(adata.obsm[metadata_key], pd.DataFrame):
|
696
665
|
# Give warning if the genes are not the same
|
697
666
|
if sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0:
|
698
|
-
|
667
|
+
logger.warning(
|
699
668
|
"Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
|
700
669
|
)
|
701
670
|
|
@@ -726,6 +695,7 @@ class CellLine(MetaData):
|
|
726
695
|
adata: AnnData,
|
727
696
|
corr: pd.DataFrame,
|
728
697
|
pval: pd.DataFrame,
|
698
|
+
*,
|
729
699
|
identifier: str = "DepMap_ID",
|
730
700
|
metadata_key: str = "bulk_rna_broad",
|
731
701
|
category: str = "cell line",
|
@@ -737,13 +707,12 @@ class CellLine(MetaData):
|
|
737
707
|
adata: Input data object.
|
738
708
|
corr: Pearson correlation scores.
|
739
709
|
pval: P-values for pearson correlation.
|
740
|
-
identifier: Column in `.obs` containing the identifiers.
|
741
|
-
metadata_key: Key of the AnnData obsm for comparison with the X matrix.
|
742
|
-
category: The category for correlation comparison.
|
710
|
+
identifier: Column in `.obs` containing the identifiers.
|
711
|
+
metadata_key: Key of the AnnData obsm for comparison with the X matrix.
|
712
|
+
category: The category for correlation comparison.
|
743
713
|
subset_identifier: Selected identifiers for scatter plot visualization between the X matrix and `metadata_key`.
|
744
714
|
If not None, only the chosen cell line will be plotted, either specified as a value in `identifier` (string) or as an index number.
|
745
715
|
If None, all cell lines will be plotted.
|
746
|
-
Defaults to None.
|
747
716
|
Returns:
|
748
717
|
Pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately.
|
749
718
|
"""
|
pertpy/metadata/_compound.py
CHANGED
@@ -30,11 +30,10 @@ class Compound(MetaData):
|
|
30
30
|
|
31
31
|
Args:
|
32
32
|
adata: The data object to annotate.
|
33
|
-
query_id: The column of `.obs` with compound identifiers.
|
34
|
-
query_id_type: The type of compound identifiers, 'name' or 'cid'.
|
33
|
+
query_id: The column of `.obs` with compound identifiers.
|
34
|
+
query_id_type: The type of compound identifiers, 'name' or 'cid'.
|
35
35
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
36
|
-
|
37
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
36
|
+
copy: Determines whether a copy of the `adata` is returned.
|
38
37
|
|
39
38
|
Returns:
|
40
39
|
Returns an AnnData object with compound annotation.
|
pertpy/metadata/_drug.py
CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
|
|
6
6
|
from typing import TYPE_CHECKING, Literal
|
7
7
|
|
8
8
|
import pandas as pd
|
9
|
-
from rich import print
|
10
9
|
from scanpy import settings
|
11
10
|
|
12
11
|
from pertpy.data._dataloader import _download
|
@@ -25,7 +24,6 @@ def _download_drug_annotation(
|
|
25
24
|
# Prepared in https://github.com/theislab/pertpy-datasets/blob/main/chembl_data.ipynb
|
26
25
|
chembl_path = Path(settings.cachedir) / "chembl.json"
|
27
26
|
if not Path(chembl_path).exists():
|
28
|
-
print("[bold yellow]No metadata file was found for chembl. Starting download now.")
|
29
27
|
_download(
|
30
28
|
url="https://figshare.com/ndownloader/files/43871718",
|
31
29
|
output_file_name="chembl.json",
|
@@ -40,7 +38,6 @@ def _download_drug_annotation(
|
|
40
38
|
elif source == "dgidb":
|
41
39
|
dgidb_path = Path(settings.cachedir) / "dgidb.tsv"
|
42
40
|
if not Path(dgidb_path).exists():
|
43
|
-
print("[bold yellow]No metadata file was found for dgidb. Starting download now.")
|
44
41
|
_download(
|
45
42
|
url="https://www.dgidb.org/data/latest/interactions.tsv",
|
46
43
|
output_file_name="dgidb.tsv",
|
@@ -54,7 +51,6 @@ def _download_drug_annotation(
|
|
54
51
|
else:
|
55
52
|
pharmgkb_path = Path(settings.cachedir) / "pharmgkb.tsv"
|
56
53
|
if not Path(pharmgkb_path).exists():
|
57
|
-
print("[bold yellow]No metadata file was found for pharmGKB. Starting download now.")
|
58
54
|
_download(
|
59
55
|
url="https://api.pharmgkb.org/v1/download/file/data/relationships.zip",
|
60
56
|
output_file_name="pharmgkb.zip",
|
@@ -103,8 +99,8 @@ class Drug(MetaData):
|
|
103
99
|
|
104
100
|
Args:
|
105
101
|
adata: AnnData object containing log-normalised data.
|
106
|
-
source: Source of the metadata, chembl, dgidb or pharmgkb.
|
107
|
-
copy: Determines whether a copy of the `adata` is returned.
|
102
|
+
source: Source of the metadata, chembl, dgidb or pharmgkb.
|
103
|
+
copy: Determines whether a copy of the `adata` is returned.
|
108
104
|
|
109
105
|
Returns:
|
110
106
|
An AnnData object with a new column `drug` in the var slot.
|