pertpy 0.9.5__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +5 -1
- pertpy/_doc.py +2 -5
- pertpy/_types.py +6 -0
- pertpy/data/_dataloader.py +68 -24
- pertpy/data/_datasets.py +9 -9
- pertpy/metadata/__init__.py +2 -1
- pertpy/metadata/_cell_line.py +136 -30
- pertpy/metadata/_look_up.py +13 -19
- pertpy/metadata/_moa.py +1 -1
- pertpy/preprocessing/_guide_rna.py +221 -39
- pertpy/preprocessing/_guide_rna_mixture.py +177 -0
- pertpy/tools/__init__.py +1 -1
- pertpy/tools/_augur.py +138 -142
- pertpy/tools/_cinemaot.py +75 -117
- pertpy/tools/_coda/_base_coda.py +150 -174
- pertpy/tools/_coda/_sccoda.py +66 -69
- pertpy/tools/_coda/_tasccoda.py +71 -79
- pertpy/tools/_dialogue.py +60 -56
- pertpy/tools/_differential_gene_expression/_base.py +25 -43
- pertpy/tools/_differential_gene_expression/_checks.py +4 -6
- pertpy/tools/_differential_gene_expression/_dge_comparison.py +5 -6
- pertpy/tools/_differential_gene_expression/_edger.py +6 -10
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +1 -1
- pertpy/tools/_differential_gene_expression/_simple_tests.py +3 -3
- pertpy/tools/_differential_gene_expression/_statsmodels.py +8 -5
- pertpy/tools/_distances/_distance_tests.py +1 -2
- pertpy/tools/_distances/_distances.py +86 -92
- pertpy/tools/_enrichment.py +8 -25
- pertpy/tools/_milo.py +23 -27
- pertpy/tools/_mixscape.py +261 -175
- pertpy/tools/_perturbation_space/_clustering.py +4 -4
- pertpy/tools/_perturbation_space/_comparison.py +4 -4
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +83 -32
- pertpy/tools/_perturbation_space/_perturbation_space.py +10 -10
- pertpy/tools/_perturbation_space/_simple.py +13 -17
- pertpy/tools/_scgen/_scgen.py +17 -20
- pertpy/tools/_scgen/_scgenvae.py +2 -2
- pertpy/tools/_scgen/_utils.py +3 -1
- {pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/METADATA +37 -21
- pertpy-0.11.0.dist-info/RECORD +58 -0
- {pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/licenses/LICENSE +1 -0
- pertpy/tools/_kernel_pca.py +0 -50
- pertpy-0.9.5.dist-info/RECORD +0 -57
- {pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/WHEEL +0 -0
pertpy/__init__.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
__author__ = "Lukas Heumos"
|
4
4
|
__email__ = "lukas.heumos@posteo.net"
|
5
|
-
__version__ = "0.9.5"
|
5
|
+
__version__ = "0.11.0"
|
6
6
|
|
7
7
|
import warnings
|
8
8
|
|
@@ -14,6 +14,10 @@ warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning)
|
|
14
14
|
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
15
15
|
warnings.filterwarnings("ignore", category=UserWarning, module="scvi._settings")
|
16
16
|
|
17
|
+
import mudata
|
18
|
+
|
19
|
+
mudata.set_options(pull_on_update=False)
|
20
|
+
|
17
21
|
from . import data as dt
|
18
22
|
from . import metadata as md
|
19
23
|
from . import plot as pl
|
pertpy/_doc.py
CHANGED
@@ -2,9 +2,7 @@ from textwrap import dedent
|
|
2
2
|
|
3
3
|
|
4
4
|
def _doc_params(**kwds): # pragma: no cover
|
5
|
-
"""\
|
6
|
-
Docstrings should start with "\" in the first line for proper formatting.
|
7
|
-
"""
|
5
|
+
r"""Docstrings should start with "\" in the first line for proper formatting."""
|
8
6
|
|
9
7
|
def dec(obj):
|
10
8
|
obj.__orig_doc__ = obj.__doc__
|
@@ -15,6 +13,5 @@ def _doc_params(**kwds): # pragma: no cover
|
|
15
13
|
|
16
14
|
|
17
15
|
doc_common_plot_args = """\
|
18
|
-
|
19
|
-
return_fig: if `True`, returns figure of the plot.\
|
16
|
+
return_fig: if `True`, returns figure of the plot, that can be used for saving.\
|
20
17
|
"""
|
pertpy/_types.py
ADDED
pertpy/data/_dataloader.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
+
import shutil
|
1
2
|
import tempfile
|
3
|
+
import time
|
2
4
|
from pathlib import Path
|
3
5
|
from random import choice
|
4
6
|
from string import ascii_lowercase
|
@@ -7,6 +9,7 @@ from zipfile import ZipFile
|
|
7
9
|
import requests
|
8
10
|
from filelock import FileLock
|
9
11
|
from lamin_utils import logger
|
12
|
+
from requests.exceptions import RequestException
|
10
13
|
from rich.progress import Progress
|
11
14
|
|
12
15
|
|
@@ -17,7 +20,10 @@ def _download( # pragma: no cover
|
|
17
20
|
block_size: int = 1024,
|
18
21
|
overwrite: bool = False,
|
19
22
|
is_zip: bool = False,
|
20
|
-
|
23
|
+
timeout: int = 30,
|
24
|
+
max_retries: int = 3,
|
25
|
+
retry_delay: int = 5,
|
26
|
+
) -> Path:
|
21
27
|
"""Downloads a dataset irrespective of the format.
|
22
28
|
|
23
29
|
Args:
|
@@ -27,6 +33,9 @@ def _download( # pragma: no cover
|
|
27
33
|
block_size: Block size for downloads in bytes.
|
28
34
|
overwrite: Whether to overwrite existing files.
|
29
35
|
is_zip: Whether the downloaded file needs to be unzipped.
|
36
|
+
timeout: Request timeout in seconds.
|
37
|
+
max_retries: Maximum number of retry attempts.
|
38
|
+
retry_delay: Delay between retries in seconds.
|
30
39
|
"""
|
31
40
|
if output_file_name is None:
|
32
41
|
letters = ascii_lowercase
|
@@ -35,36 +44,71 @@ def _download( # pragma: no cover
|
|
35
44
|
if output_path is None:
|
36
45
|
output_path = tempfile.gettempdir()
|
37
46
|
|
38
|
-
download_to_path = (
|
39
|
-
f"{output_path}{output_file_name}" if str(output_path).endswith("/") else f"{output_path}/{output_file_name}"
|
40
|
-
)
|
47
|
+
download_to_path = Path(output_path) / output_file_name
|
41
48
|
|
42
49
|
Path(output_path).mkdir(parents=True, exist_ok=True)
|
43
|
-
lock_path = f"{
|
44
|
-
|
50
|
+
lock_path = Path(output_path) / f"{output_file_name}.lock"
|
51
|
+
|
52
|
+
with FileLock(lock_path, timeout=300):
|
45
53
|
if Path(download_to_path).exists() and not overwrite:
|
46
54
|
logger.warning(f"File {download_to_path} already exists!")
|
47
|
-
return
|
55
|
+
return download_to_path
|
56
|
+
|
57
|
+
temp_file_name = Path(f"{download_to_path}.part")
|
58
|
+
|
59
|
+
retry_count = 0
|
60
|
+
while retry_count <= max_retries:
|
61
|
+
try:
|
62
|
+
head_response = requests.head(url, timeout=timeout)
|
63
|
+
head_response.raise_for_status()
|
64
|
+
content_length = int(head_response.headers.get("content-length", 0))
|
65
|
+
|
66
|
+
free_space = shutil.disk_usage(output_path).free
|
67
|
+
if content_length > free_space:
|
68
|
+
raise OSError(
|
69
|
+
f"Insufficient disk space. Need {content_length} bytes, but only {free_space} available."
|
70
|
+
)
|
71
|
+
|
72
|
+
response = requests.get(url, stream=True)
|
73
|
+
response.raise_for_status()
|
74
|
+
total = int(response.headers.get("content-length", 0))
|
48
75
|
|
49
|
-
|
76
|
+
with Progress(refresh_per_second=5) as progress:
|
77
|
+
task = progress.add_task("[red]Downloading...", total=total)
|
78
|
+
with Path(temp_file_name).open("wb") as file:
|
79
|
+
for data in response.iter_content(block_size):
|
80
|
+
file.write(data)
|
81
|
+
progress.update(task, advance=len(data))
|
82
|
+
progress.update(task, completed=total, refresh=True)
|
50
83
|
|
51
|
-
|
52
|
-
total = int(response.headers.get("content-length", 0))
|
84
|
+
Path(temp_file_name).replace(download_to_path)
|
53
85
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
file.write(data)
|
59
|
-
progress.update(task, advance=block_size)
|
60
|
-
progress.update(task, completed=total, refresh=True)
|
86
|
+
if is_zip:
|
87
|
+
with ZipFile(download_to_path, "r") as zip_obj:
|
88
|
+
zip_obj.extractall(path=output_path)
|
89
|
+
return Path(output_path)
|
61
90
|
|
62
|
-
|
91
|
+
return download_to_path
|
92
|
+
except (OSError, RequestException) as e:
|
93
|
+
retry_count += 1
|
94
|
+
if retry_count <= max_retries:
|
95
|
+
logger.warning(
|
96
|
+
f"Download attempt {retry_count}/{max_retries} failed: {str(e)}. Retrying in {retry_delay} seconds..."
|
97
|
+
)
|
98
|
+
time.sleep(retry_delay)
|
99
|
+
else:
|
100
|
+
logger.error(f"Download failed after {max_retries} attempts: {str(e)}")
|
101
|
+
if Path(temp_file_name).exists():
|
102
|
+
Path(temp_file_name).unlink(missing_ok=True)
|
103
|
+
raise
|
63
104
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
105
|
+
except Exception as e:
|
106
|
+
logger.error(f"Download failed: {str(e)}")
|
107
|
+
if Path(temp_file_name).exists():
|
108
|
+
Path(temp_file_name).unlink(missing_ok=True)
|
109
|
+
raise
|
110
|
+
finally:
|
111
|
+
if Path(temp_file_name).exists():
|
112
|
+
Path(temp_file_name).unlink(missing_ok=True)
|
69
113
|
|
70
|
-
|
114
|
+
return Path(download_to_path)
|
pertpy/data/_datasets.py
CHANGED
@@ -37,7 +37,7 @@ def papalexi_2021() -> MuData: # pragma: no cover
|
|
37
37
|
Returns:
|
38
38
|
:class:`~mudata.MuData` object of the ECCITE-seq dataset
|
39
39
|
"""
|
40
|
-
import
|
40
|
+
import mudata as md
|
41
41
|
|
42
42
|
output_file_name = "papalexi_2021.h5mu"
|
43
43
|
output_file_path = settings.datasetdir / output_file_name
|
@@ -48,9 +48,11 @@ def papalexi_2021() -> MuData: # pragma: no cover
|
|
48
48
|
output_path=settings.datasetdir,
|
49
49
|
is_zip=False,
|
50
50
|
)
|
51
|
-
|
51
|
+
mdata = md.read_h5mu(output_file_path)
|
52
|
+
mdata.pull_obs()
|
53
|
+
mdata.pull_var()
|
52
54
|
|
53
|
-
return
|
55
|
+
return mdata
|
54
56
|
|
55
57
|
|
56
58
|
def sc_sim_augur() -> AnnData: # pragma: no cover
|
@@ -408,7 +410,7 @@ def kang_2018() -> AnnData: # pragma: no cover
|
|
408
410
|
|
409
411
|
|
410
412
|
def stephenson_2021_subsampled() -> AnnData: # pragma: no cover
|
411
|
-
"""Processed 10X 5' scRNA-seq data from PBMC of COVID-19 patients and healthy donors
|
413
|
+
"""Processed 10X 5' scRNA-seq data from PBMC of COVID-19 patients and healthy donors.
|
412
414
|
|
413
415
|
The study profiled peripheral blood mononuclear cells from 90 COVID-19 patients with different disease severity and 23 healthy control donors.
|
414
416
|
Here the dataset was downsampled to approx. 500 cells per donor and cells were mapped to a reference atlas of healthy PBMCs from 12 studies
|
@@ -453,7 +455,7 @@ def haber_2017_regions() -> AnnData: # pragma: no cover
|
|
453
455
|
output_file_path = settings.datasetdir / output_file_name
|
454
456
|
if not Path(output_file_path).exists():
|
455
457
|
_download(
|
456
|
-
url="https://figshare.com/ndownloader/files/
|
458
|
+
url="https://figshare.com/ndownloader/files/54169301",
|
457
459
|
output_file_name=output_file_name,
|
458
460
|
output_path=settings.datasetdir,
|
459
461
|
is_zip=False,
|
@@ -650,7 +652,7 @@ def datlinger_2021() -> AnnData: # pragma: no cover
|
|
650
652
|
Publication: https://doi.org/10.1038/s41592-021-01153-z \
|
651
653
|
Obtained from scperturb: http://projects.sanderlab.org/scperturb/
|
652
654
|
|
653
|
-
|
655
|
+
Returns:
|
654
656
|
:class:`~anndata.AnnData` object of scPerturb prepared single-cell perturbation data
|
655
657
|
"""
|
656
658
|
output_file_name = "datlinger_2021.h5ad"
|
@@ -1516,9 +1518,7 @@ def combosciplex() -> AnnData: # pragma: no cover
|
|
1516
1518
|
|
1517
1519
|
|
1518
1520
|
def sciplex_gxe1() -> AnnData: # pragma: no cover
|
1519
|
-
"""sci-Plex-GxE
|
1520
|
-
genetically perturbed for HPRT1 or mismtach repair genes exposed to 6-thioguanine and temozolomide,
|
1521
|
-
respectively, and A172 dCas9-SunTag cells genetically perturbed for HPRT1 exposed to 6-thioguanine.
|
1521
|
+
"""sci-Plex-GxE profiling of A172 dCas9-KRAB (HPRT1 or MMR knockout) with 6-TG/TMZ and A172 dCas9-SunTag (HPRT1 knockout) with 6-TG.
|
1522
1522
|
|
1523
1523
|
References:
|
1524
1524
|
McFaline-Figueroa JL et al., Trapnell C. Multiplex single-cell chemical genomics reveals
|
pertpy/metadata/__init__.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from pertpy.metadata._cell_line import CellLine
|
2
2
|
from pertpy.metadata._compound import Compound
|
3
3
|
from pertpy.metadata._drug import Drug
|
4
|
+
from pertpy.metadata._look_up import LookUp
|
4
5
|
from pertpy.metadata._moa import Moa
|
5
6
|
|
6
|
-
__all__ = ["CellLine", "Compound", "Drug", "Moa"]
|
7
|
+
__all__ = ["CellLine", "Compound", "Drug", "Moa", "LookUp"]
|
pertpy/metadata/_cell_line.py
CHANGED
@@ -39,6 +39,7 @@ class CellLine(MetaData):
|
|
39
39
|
self.proteomics = None
|
40
40
|
self.drug_response_gdsc1 = None
|
41
41
|
self.drug_response_gdsc2 = None
|
42
|
+
self.drug_response_prism = None
|
42
43
|
|
43
44
|
def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap") -> None:
|
44
45
|
if cell_line_source == "DepMap":
|
@@ -54,6 +55,7 @@ class CellLine(MetaData):
|
|
54
55
|
is_zip=False,
|
55
56
|
)
|
56
57
|
self.depmap = pd.read_csv(depmap_cell_line_path)
|
58
|
+
self.depmap = self.depmap.reset_index().rename(columns={"CellLineName": "cell_line_name"})
|
57
59
|
else:
|
58
60
|
# Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project
|
59
61
|
# Source: https://www.cancerrxgene.org/celllines
|
@@ -157,7 +159,7 @@ class CellLine(MetaData):
|
|
157
159
|
def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
|
158
160
|
if gdsc_dataset == 1:
|
159
161
|
# Download GDSC drug response data
|
160
|
-
# Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)
|
162
|
+
# Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s and AUC)
|
161
163
|
# URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
|
162
164
|
drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
|
163
165
|
if not Path(drug_response_gdsc1_file_path).exists():
|
@@ -181,6 +183,23 @@ class CellLine(MetaData):
|
|
181
183
|
)
|
182
184
|
self.drug_response_gdsc2 = pd.read_csv(drug_response_gdsc2_file_path, index_col=0)
|
183
185
|
|
186
|
+
def _download_prism(self) -> None:
|
187
|
+
# Download PRISM drug response data
|
188
|
+
# Source: DepMap PRISM Repurposing 19Q4 secondary screen dose response curve parameters
|
189
|
+
drug_response_prism_file_path = Path(settings.cachedir) / "prism_info.csv"
|
190
|
+
if not Path(drug_response_prism_file_path).exists():
|
191
|
+
_download(
|
192
|
+
url="https://figshare.com/ndownloader/files/20237739",
|
193
|
+
output_file_name="prism_info.csv",
|
194
|
+
output_path=settings.cachedir,
|
195
|
+
block_size=4096,
|
196
|
+
is_zip=False,
|
197
|
+
)
|
198
|
+
df = pd.read_csv(drug_response_prism_file_path, index_col=0)[["depmap_id", "name", "ic50", "ec50", "auc"]]
|
199
|
+
df = df.dropna(subset=["depmap_id", "name"])
|
200
|
+
df = df.groupby(["depmap_id", "name"]).mean().reset_index()
|
201
|
+
self.drug_response_prism = df
|
202
|
+
|
184
203
|
def annotate(
|
185
204
|
self,
|
186
205
|
adata: AnnData,
|
@@ -197,13 +216,13 @@ class CellLine(MetaData):
|
|
197
216
|
|
198
217
|
Args:
|
199
218
|
adata: The data object to annotate.
|
200
|
-
query_id: The column of
|
219
|
+
query_id: The column of ``.obs`` with cell line information.
|
201
220
|
reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName or StrippedCellLineName.
|
202
221
|
If fetching cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
|
203
222
|
fetch: The metadata to fetch.
|
204
223
|
cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
|
205
224
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
206
|
-
copy: Determines whether a copy of
|
225
|
+
copy: Determines whether a copy of ``adata`` is returned.
|
207
226
|
|
208
227
|
Returns:
|
209
228
|
Returns an AnnData object with cell line annotation.
|
@@ -216,7 +235,7 @@ class CellLine(MetaData):
|
|
216
235
|
>>> adata_annotated = pt_metadata.annotate(adata=adata,
|
217
236
|
>>> reference_id='cell_line_name',
|
218
237
|
>>> query_id='cell_line_name',
|
219
|
-
>>> fetch=["cell_line_name", "
|
238
|
+
>>> fetch=["cell_line_name", "Age", "OncotreePrimaryDisease"],
|
220
239
|
>>> copy=True)
|
221
240
|
"""
|
222
241
|
if copy:
|
@@ -304,7 +323,7 @@ class CellLine(MetaData):
|
|
304
323
|
def annotate_bulk_rna(
|
305
324
|
self,
|
306
325
|
adata: AnnData,
|
307
|
-
query_id: str =
|
326
|
+
query_id: str = None,
|
308
327
|
cell_line_source: Literal["broad", "sanger"] = "sanger",
|
309
328
|
verbosity: int | str = 5,
|
310
329
|
gene_identifier: Literal["gene_name", "gene_ID", "both"] = "gene_ID",
|
@@ -316,9 +335,11 @@ class CellLine(MetaData):
|
|
316
335
|
|
317
336
|
Args:
|
318
337
|
adata: The data object to annotate.
|
319
|
-
query_id: The column of `.obs` with cell line information.
|
338
|
+
query_id: The column of `.obs` with cell line information.
|
339
|
+
Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
|
320
340
|
cell_line_source: The bulk rna expression data from either broad or sanger cell line.
|
321
341
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
|
342
|
+
gene_identifier: The type of gene identifier saved in the fetched meta data, 'gene_name', 'gene_ID' or 'both'.
|
322
343
|
copy: Determines whether a copy of the `adata` is returned.
|
323
344
|
|
324
345
|
Returns:
|
@@ -339,7 +360,7 @@ class CellLine(MetaData):
|
|
339
360
|
|
340
361
|
# Make sure that the specified `cell_line_type` can be found in the bulk rna expression data,
|
341
362
|
# then we can compare these keys and fetch the corresponding metadata.
|
342
|
-
if query_id not in adata.obs.columns:
|
363
|
+
if query_id not in adata.obs.columns and query_id is not None:
|
343
364
|
raise ValueError(
|
344
365
|
f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
|
345
366
|
"Ensure that you are using one of the available query IDs present in the adata.obs for the annotation."
|
@@ -347,25 +368,33 @@ class CellLine(MetaData):
|
|
347
368
|
"using the `annotate()` function before calling 'annotate_bulk_rna()'. "
|
348
369
|
"This ensures that the required query ID is included in your data, e.g. stripped_cell_line_name, DepMap ID."
|
349
370
|
)
|
350
|
-
|
371
|
+
if query_id is None:
|
372
|
+
query_id = "cell_line_name" if cell_line_source == "sanger" else "DepMap_ID"
|
351
373
|
identifier_num_all = len(adata.obs[query_id].unique())
|
352
374
|
|
353
375
|
# Lazily download the bulk rna expression data
|
354
376
|
if cell_line_source == "sanger":
|
377
|
+
if query_id not in adata.obs.columns:
|
378
|
+
raise ValueError(
|
379
|
+
"To annotate bulk RNA data from Wellcome Sanger Institute, `cell_line_name` is used as default reference and query identifier if no `query_id` is given."
|
380
|
+
"Ensure that you have column `cell_line_name` in `adata.obs` or specify column name in which cell line name is stored."
|
381
|
+
"If cell line name isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
|
382
|
+
)
|
355
383
|
if self.bulk_rna_sanger is None:
|
356
384
|
self._download_bulk_rna(cell_line_source="sanger")
|
357
385
|
reference_id = "model_name"
|
358
386
|
not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_sanger.index))
|
359
387
|
else:
|
388
|
+
if query_id not in adata.obs.columns:
|
389
|
+
raise ValueError(
|
390
|
+
"To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `query_id` is given."
|
391
|
+
"Ensure that you have column `DepMap_ID` in `adata.obs` or specify column name in which DepMap ID is stored."
|
392
|
+
"If DepMap ID isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
|
393
|
+
)
|
360
394
|
reference_id = "DepMap_ID"
|
361
|
-
|
362
|
-
"To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `reference_id` is given."
|
363
|
-
"If `DepMap_ID` isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
|
364
|
-
)
|
395
|
+
|
365
396
|
if self.bulk_rna_broad is None:
|
366
397
|
self._download_bulk_rna(cell_line_source="broad")
|
367
|
-
if query_id == "cell_line_name":
|
368
|
-
query_id = "DepMap_ID"
|
369
398
|
not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_broad.index))
|
370
399
|
|
371
400
|
self._warn_unmatch(
|
@@ -474,7 +503,8 @@ class CellLine(MetaData):
|
|
474
503
|
adata.obsm["proteomics_" + protein_information] = (
|
475
504
|
self.proteomics[[reference_id, protein_id, protein_information]]
|
476
505
|
.pivot(index=reference_id, columns=protein_id, values=protein_information)
|
477
|
-
.reindex(adata.obs
|
506
|
+
.reindex(adata.obs[query_id])
|
507
|
+
.set_index(adata.obs.index)
|
478
508
|
)
|
479
509
|
return adata
|
480
510
|
|
@@ -491,7 +521,7 @@ class CellLine(MetaData):
|
|
491
521
|
) -> AnnData:
|
492
522
|
"""Fetch drug response data from GDSC.
|
493
523
|
|
494
|
-
For each cell, we fetch drug response data as natural log of the fitted IC50 for its
|
524
|
+
For each cell, we fetch drug response data as natural log of the fitted IC50 and AUC for its
|
495
525
|
corresponding cell line and perturbation from GDSC fitted data results file.
|
496
526
|
|
497
527
|
Args:
|
@@ -554,13 +584,86 @@ class CellLine(MetaData):
|
|
554
584
|
adata.obs = (
|
555
585
|
adata.obs.reset_index()
|
556
586
|
.set_index([query_id, query_perturbation])
|
557
|
-
.assign(
|
587
|
+
.assign(ln_ic50_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
|
588
|
+
.assign(auc_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).auc)
|
558
589
|
.reset_index()
|
559
590
|
.set_index(old_index_name)
|
560
591
|
)
|
561
592
|
|
562
593
|
return adata
|
563
594
|
|
595
|
+
def annotate_from_prism(
|
596
|
+
self,
|
597
|
+
adata: AnnData,
|
598
|
+
query_id: str = "DepMap_ID",
|
599
|
+
query_perturbation: str = "perturbation",
|
600
|
+
verbosity: int | str = 5,
|
601
|
+
copy: bool = False,
|
602
|
+
) -> AnnData:
|
603
|
+
"""Fetch drug response data from PRISM.
|
604
|
+
|
605
|
+
For each cell, we fetch drug response data as IC50, EC50 and AUC for its
|
606
|
+
corresponding cell line and perturbation from PRISM fitted data results file.
|
607
|
+
Note that all rows where either `depmap_id` or `name` is missing will be dropped.
|
608
|
+
|
609
|
+
Args:
|
610
|
+
adata: The data object to annotate.
|
611
|
+
query_id: The column of `.obs` with cell line information.
|
612
|
+
query_perturbation: The column of `.obs` with perturbation information.
|
613
|
+
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
|
614
|
+
copy: Determines whether a copy of the `adata` is returned.
|
615
|
+
|
616
|
+
Returns:
|
617
|
+
Returns an AnnData object with drug response annotation.
|
618
|
+
|
619
|
+
Examples:
|
620
|
+
>>> import pertpy as pt
|
621
|
+
>>> adata = pt.dt.mcfarland_2020()
|
622
|
+
>>> pt_metadata = pt.md.CellLine()
|
623
|
+
>>> pt_metadata.annotate_from_prism(adata, query_id="DepMap_ID")
|
624
|
+
"""
|
625
|
+
if copy:
|
626
|
+
adata = adata.copy()
|
627
|
+
if query_id not in adata.obs.columns:
|
628
|
+
raise ValueError(
|
629
|
+
f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
|
630
|
+
"Ensure that you are using one of the available query IDs present in 'adata.obs' for the annotation.\n"
|
631
|
+
"If the desired query ID is not available, you can fetch the cell line metadata "
|
632
|
+
"using the `annotate()` function before calling `annotate_from_prism()`. "
|
633
|
+
"This ensures that the required query ID is included in your data."
|
634
|
+
)
|
635
|
+
if self.drug_response_prism is None:
|
636
|
+
self._download_prism()
|
637
|
+
prism_data = self.drug_response_prism
|
638
|
+
# PRISM starts most drug names with a lowercase letter, so we want to make it case-insensitive
|
639
|
+
prism_data["name_lower"] = prism_data["name"].str.lower()
|
640
|
+
adata.obs["perturbation_lower"] = adata.obs[query_perturbation].str.lower()
|
641
|
+
|
642
|
+
identifier_num_all = len(adata.obs[query_id].unique())
|
643
|
+
not_matched_identifiers = list(set(adata.obs[query_id]) - set(prism_data["depmap_id"]))
|
644
|
+
self._warn_unmatch(
|
645
|
+
total_identifiers=identifier_num_all,
|
646
|
+
unmatched_identifiers=not_matched_identifiers,
|
647
|
+
query_id=query_id,
|
648
|
+
reference_id="depmap_id",
|
649
|
+
metadata_type="drug response",
|
650
|
+
verbosity=verbosity,
|
651
|
+
)
|
652
|
+
|
653
|
+
old_index_name = "index" if adata.obs.index.name is None else adata.obs.index.name
|
654
|
+
adata.obs = (
|
655
|
+
adata.obs.reset_index()
|
656
|
+
.set_index([query_id, "perturbation_lower"])
|
657
|
+
.assign(ic50_prism=prism_data.set_index(["depmap_id", "name"]).ic50)
|
658
|
+
.assign(ec50_prism=prism_data.set_index(["depmap_id", "name"]).ec50)
|
659
|
+
.assign(auc_prism=prism_data.set_index(["depmap_id", "name"]).auc)
|
660
|
+
.reset_index()
|
661
|
+
.set_index(old_index_name)
|
662
|
+
.drop(columns="perturbation_lower")
|
663
|
+
)
|
664
|
+
|
665
|
+
return adata
|
666
|
+
|
564
667
|
def lookup(self) -> LookUp:
|
565
668
|
"""Generate LookUp object for CellLineMetaData.
|
566
669
|
|
@@ -577,7 +680,7 @@ class CellLine(MetaData):
|
|
577
680
|
>>> pt_metadata = pt.md.CellLine()
|
578
681
|
>>> lookup = pt_metadata.lookup()
|
579
682
|
"""
|
580
|
-
# Fetch the metadata if it hasn't
|
683
|
+
# Fetch the metadata if it hasn't been downloaded yet
|
581
684
|
if self.depmap is None:
|
582
685
|
self._download_cell_line(cell_line_source="DepMap")
|
583
686
|
if self.cancerxgene is None:
|
@@ -594,6 +697,8 @@ class CellLine(MetaData):
|
|
594
697
|
self._download_gdsc(gdsc_dataset=1)
|
595
698
|
if self.drug_response_gdsc2 is None:
|
596
699
|
self._download_gdsc(gdsc_dataset=2)
|
700
|
+
if self.drug_response_prism is None:
|
701
|
+
self._download_prism()
|
597
702
|
|
598
703
|
# Transfer the data
|
599
704
|
return LookUp(
|
@@ -607,6 +712,7 @@ class CellLine(MetaData):
|
|
607
712
|
self.proteomics,
|
608
713
|
self.drug_response_gdsc1,
|
609
714
|
self.drug_response_gdsc2,
|
715
|
+
self.drug_response_prism,
|
610
716
|
],
|
611
717
|
)
|
612
718
|
|
@@ -663,12 +769,14 @@ class CellLine(MetaData):
|
|
663
769
|
raise ValueError(
|
664
770
|
"Dimensions of adata.X do not match those of metadata. Ensure that they have the same gene list."
|
665
771
|
)
|
666
|
-
if
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
772
|
+
# Raise error if the genes are not the same
|
773
|
+
if (
|
774
|
+
isinstance(adata.obsm[metadata_key], pd.DataFrame)
|
775
|
+
and sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0
|
776
|
+
):
|
777
|
+
raise ValueError(
|
778
|
+
"Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
|
779
|
+
)
|
672
780
|
|
673
781
|
# Divide cell lines into those are present and not present in the metadata
|
674
782
|
overlapped_cl = adata[~adata.obsm[metadata_key].isna().all(axis=1), :]
|
@@ -693,7 +801,7 @@ class CellLine(MetaData):
|
|
693
801
|
return corr, pvals, new_corr, new_pvals
|
694
802
|
|
695
803
|
@_doc_params(common_plot_args=doc_common_plot_args)
|
696
|
-
def plot_correlation(
|
804
|
+
def plot_correlation( # noqa: D417
|
697
805
|
self,
|
698
806
|
adata: AnnData,
|
699
807
|
corr: pd.DataFrame,
|
@@ -703,7 +811,6 @@ class CellLine(MetaData):
|
|
703
811
|
metadata_key: str = "bulk_rna_broad",
|
704
812
|
category: str = "cell line",
|
705
813
|
subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None,
|
706
|
-
show: bool = True,
|
707
814
|
return_fig: bool = False,
|
708
815
|
) -> Figure | None:
|
709
816
|
"""Visualise the correlation of cell lines with annotated metadata.
|
@@ -747,7 +854,7 @@ class CellLine(MetaData):
|
|
747
854
|
if all(isinstance(id, str) for id in subset_identifier_list):
|
748
855
|
if set(subset_identifier_list).issubset(adata.obs[identifier].unique()):
|
749
856
|
subset_identifier_list = np.where(
|
750
|
-
np.
|
857
|
+
np.isin(adata.obs[identifier].values, subset_identifier_list)
|
751
858
|
)[0]
|
752
859
|
else:
|
753
860
|
raise ValueError("`Subset_identifier` must be found in adata.obs.`identifier`.")
|
@@ -798,10 +905,9 @@ class CellLine(MetaData):
|
|
798
905
|
},
|
799
906
|
)
|
800
907
|
|
801
|
-
if show:
|
802
|
-
plt.show()
|
803
908
|
if return_fig:
|
804
909
|
return plt.gcf()
|
910
|
+
plt.show()
|
805
911
|
return None
|
806
912
|
else:
|
807
|
-
raise NotImplementedError
|
913
|
+
raise NotImplementedError("Only 'cell line' category is supported for correlation comparison.")
|
pertpy/metadata/_look_up.py
CHANGED
@@ -22,11 +22,12 @@ class LookUp:
|
|
22
22
|
type: Literal["cell_line", "moa", "compound", "drug"] = "cell_line",
|
23
23
|
transfer_metadata: Sequence[pd.DataFrame] | None = None,
|
24
24
|
):
|
25
|
-
"""
|
25
|
+
"""Lookup object for different type of metadata.
|
26
|
+
|
26
27
|
Args:
|
27
28
|
type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug.
|
28
29
|
transfer_metadata: DataFrames used to generate Lookup object.
|
29
|
-
|
30
|
+
This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
|
30
31
|
"""
|
31
32
|
self.type = type
|
32
33
|
if type == "cell_line":
|
@@ -329,10 +330,7 @@ class LookUp:
|
|
329
330
|
if self.type != "cell_line":
|
330
331
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
331
332
|
|
332
|
-
if cell_line_source == "broad":
|
333
|
-
bulk_rna = self.bulk_rna_broad
|
334
|
-
else:
|
335
|
-
bulk_rna = self.bulk_rna_sanger
|
333
|
+
bulk_rna = self.bulk_rna_broad if cell_line_source == "broad" else self.bulk_rna_sanger
|
336
334
|
|
337
335
|
if query_id_list is not None:
|
338
336
|
identifier_num_all = len(query_id_list)
|
@@ -391,10 +389,7 @@ class LookUp:
|
|
391
389
|
"""
|
392
390
|
if self.type != "cell_line":
|
393
391
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
394
|
-
if gdsc_dataset == 1:
|
395
|
-
gdsc_data = self.drug_response_gdsc1
|
396
|
-
else:
|
397
|
-
gdsc_data = self.drug_response_gdsc2
|
392
|
+
gdsc_data = self.drug_response_gdsc1 if gdsc_dataset == 1 else self.drug_response_gdsc2
|
398
393
|
|
399
394
|
if query_id_list is not None:
|
400
395
|
if reference_id not in gdsc_data.columns:
|
@@ -421,7 +416,7 @@ class LookUp:
|
|
421
416
|
reference_id: Literal["gene_id", "ensembl_gene_id", "hgnc_id", "hgnc_symbol"] = "ensembl_gene_id",
|
422
417
|
query_id_list: Sequence[str] | None = None,
|
423
418
|
) -> None:
|
424
|
-
"""A brief summary of gene annotation metadata
|
419
|
+
"""A brief summary of gene annotation metadata.
|
425
420
|
|
426
421
|
Args:
|
427
422
|
reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
|
@@ -555,15 +550,14 @@ class LookUp:
|
|
555
550
|
raise ValueError(
|
556
551
|
"Gene-disease association is not available in dgidb dataset, please try with pharmgkb."
|
557
552
|
)
|
553
|
+
elif query_id_type == "target":
|
554
|
+
not_matched_identifiers = list(set(query_id_list) - set(self.pharmgkb["Gene"]))
|
555
|
+
elif query_id_type == "compound":
|
556
|
+
compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
|
557
|
+
not_matched_identifiers = list(set(query_id_list) - set(compounds["Compound|Disease"]))
|
558
558
|
else:
|
559
|
-
|
560
|
-
|
561
|
-
elif query_id_type == "compound":
|
562
|
-
compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
|
563
|
-
not_matched_identifiers = list(set(query_id_list) - set(compounds["Compound|Disease"]))
|
564
|
-
else:
|
565
|
-
diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
|
566
|
-
not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))
|
559
|
+
diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
|
560
|
+
not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))
|
567
561
|
|
568
562
|
logger.info(f"{len(not_matched_identifiers)} {query_id_type}s are not found in the metadata.")
|
569
563
|
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} {query_id_type}s are found! ")
|
pertpy/metadata/_moa.py
CHANGED
@@ -61,7 +61,7 @@ class Moa(MetaData):
|
|
61
61
|
adata = adata.copy()
|
62
62
|
|
63
63
|
if query_id not in adata.obs.columns:
|
64
|
-
raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\
|
64
|
+
raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\nPlease check again.")
|
65
65
|
|
66
66
|
if self.clue is None:
|
67
67
|
self._download_clue()
|