pertpy 0.9.5__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. pertpy/__init__.py +5 -1
  2. pertpy/_doc.py +2 -5
  3. pertpy/_types.py +6 -0
  4. pertpy/data/_dataloader.py +68 -24
  5. pertpy/data/_datasets.py +9 -9
  6. pertpy/metadata/__init__.py +2 -1
  7. pertpy/metadata/_cell_line.py +136 -30
  8. pertpy/metadata/_look_up.py +13 -19
  9. pertpy/metadata/_moa.py +1 -1
  10. pertpy/preprocessing/_guide_rna.py +221 -39
  11. pertpy/preprocessing/_guide_rna_mixture.py +177 -0
  12. pertpy/tools/__init__.py +1 -1
  13. pertpy/tools/_augur.py +138 -142
  14. pertpy/tools/_cinemaot.py +75 -117
  15. pertpy/tools/_coda/_base_coda.py +150 -174
  16. pertpy/tools/_coda/_sccoda.py +66 -69
  17. pertpy/tools/_coda/_tasccoda.py +71 -79
  18. pertpy/tools/_dialogue.py +60 -56
  19. pertpy/tools/_differential_gene_expression/_base.py +25 -43
  20. pertpy/tools/_differential_gene_expression/_checks.py +4 -6
  21. pertpy/tools/_differential_gene_expression/_dge_comparison.py +5 -6
  22. pertpy/tools/_differential_gene_expression/_edger.py +6 -10
  23. pertpy/tools/_differential_gene_expression/_pydeseq2.py +1 -1
  24. pertpy/tools/_differential_gene_expression/_simple_tests.py +3 -3
  25. pertpy/tools/_differential_gene_expression/_statsmodels.py +8 -5
  26. pertpy/tools/_distances/_distance_tests.py +1 -2
  27. pertpy/tools/_distances/_distances.py +86 -92
  28. pertpy/tools/_enrichment.py +8 -25
  29. pertpy/tools/_milo.py +23 -27
  30. pertpy/tools/_mixscape.py +261 -175
  31. pertpy/tools/_perturbation_space/_clustering.py +4 -4
  32. pertpy/tools/_perturbation_space/_comparison.py +4 -4
  33. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +83 -32
  34. pertpy/tools/_perturbation_space/_perturbation_space.py +10 -10
  35. pertpy/tools/_perturbation_space/_simple.py +13 -17
  36. pertpy/tools/_scgen/_scgen.py +17 -20
  37. pertpy/tools/_scgen/_scgenvae.py +2 -2
  38. pertpy/tools/_scgen/_utils.py +3 -1
  39. {pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/METADATA +37 -21
  40. pertpy-0.11.0.dist-info/RECORD +58 -0
  41. {pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/licenses/LICENSE +1 -0
  42. pertpy/tools/_kernel_pca.py +0 -50
  43. pertpy-0.9.5.dist-info/RECORD +0 -57
  44. {pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/WHEEL +0 -0
pertpy/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  __author__ = "Lukas Heumos"
4
4
  __email__ = "lukas.heumos@posteo.net"
5
- __version__ = "0.9.5"
5
+ __version__ = "0.11.0"
6
6
 
7
7
  import warnings
8
8
 
@@ -14,6 +14,10 @@ warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning)
14
14
  warnings.filterwarnings("ignore", category=SyntaxWarning)
15
15
  warnings.filterwarnings("ignore", category=UserWarning, module="scvi._settings")
16
16
 
17
+ import mudata
18
+
19
+ mudata.set_options(pull_on_update=False)
20
+
17
21
  from . import data as dt
18
22
  from . import metadata as md
19
23
  from . import plot as pl
pertpy/_doc.py CHANGED
@@ -2,9 +2,7 @@ from textwrap import dedent
2
2
 
3
3
 
4
4
  def _doc_params(**kwds): # pragma: no cover
5
- """\
6
- Docstrings should start with "\" in the first line for proper formatting.
7
- """
5
+ r"""Docstrings should start with "\" in the first line for proper formatting."""
8
6
 
9
7
  def dec(obj):
10
8
  obj.__orig_doc__ = obj.__doc__
@@ -15,6 +13,5 @@ def _doc_params(**kwds): # pragma: no cover
15
13
 
16
14
 
17
15
  doc_common_plot_args = """\
18
- show: if `True`, shows the plot.
19
- return_fig: if `True`, returns figure of the plot.\
16
+ return_fig: if `True`, returns figure of the plot, that can be used for saving.\
20
17
  """
pertpy/_types.py ADDED
@@ -0,0 +1,6 @@
1
+ from scipy import sparse
2
+
3
+ CSBase = sparse.csr_matrix | sparse.csc_matrix
4
+ CSRBase = sparse.csr_matrix
5
+ CSCBase = sparse.csc_matrix
6
+ SpBase = sparse.spmatrix
pertpy/data/_dataloader.py CHANGED
@@ -1,4 +1,6 @@
1
+ import shutil
1
2
  import tempfile
3
+ import time
2
4
  from pathlib import Path
3
5
  from random import choice
4
6
  from string import ascii_lowercase
@@ -7,6 +9,7 @@ from zipfile import ZipFile
7
9
  import requests
8
10
  from filelock import FileLock
9
11
  from lamin_utils import logger
12
+ from requests.exceptions import RequestException
10
13
  from rich.progress import Progress
11
14
 
12
15
 
@@ -17,7 +20,10 @@ def _download( # pragma: no cover
17
20
  block_size: int = 1024,
18
21
  overwrite: bool = False,
19
22
  is_zip: bool = False,
20
- ) -> None:
23
+ timeout: int = 30,
24
+ max_retries: int = 3,
25
+ retry_delay: int = 5,
26
+ ) -> Path:
21
27
  """Downloads a dataset irrespective of the format.
22
28
 
23
29
  Args:
@@ -27,6 +33,9 @@ def _download( # pragma: no cover
27
33
  block_size: Block size for downloads in bytes.
28
34
  overwrite: Whether to overwrite existing files.
29
35
  is_zip: Whether the downloaded file needs to be unzipped.
36
+ timeout: Request timeout in seconds.
37
+ max_retries: Maximum number of retry attempts.
38
+ retry_delay: Delay between retries in seconds.
30
39
  """
31
40
  if output_file_name is None:
32
41
  letters = ascii_lowercase
@@ -35,36 +44,71 @@ def _download( # pragma: no cover
35
44
  if output_path is None:
36
45
  output_path = tempfile.gettempdir()
37
46
 
38
- download_to_path = (
39
- f"{output_path}{output_file_name}" if str(output_path).endswith("/") else f"{output_path}/{output_file_name}"
40
- )
47
+ download_to_path = Path(output_path) / output_file_name
41
48
 
42
49
  Path(output_path).mkdir(parents=True, exist_ok=True)
43
- lock_path = f"{output_path}/{output_file_name}.lock"
44
- with FileLock(lock_path):
50
+ lock_path = Path(output_path) / f"{output_file_name}.lock"
51
+
52
+ with FileLock(lock_path, timeout=300):
45
53
  if Path(download_to_path).exists() and not overwrite:
46
54
  logger.warning(f"File {download_to_path} already exists!")
47
- return
55
+ return download_to_path
56
+
57
+ temp_file_name = Path(f"{download_to_path}.part")
58
+
59
+ retry_count = 0
60
+ while retry_count <= max_retries:
61
+ try:
62
+ head_response = requests.head(url, timeout=timeout)
63
+ head_response.raise_for_status()
64
+ content_length = int(head_response.headers.get("content-length", 0))
65
+
66
+ free_space = shutil.disk_usage(output_path).free
67
+ if content_length > free_space:
68
+ raise OSError(
69
+ f"Insufficient disk space. Need {content_length} bytes, but only {free_space} available."
70
+ )
71
+
72
+ response = requests.get(url, stream=True)
73
+ response.raise_for_status()
74
+ total = int(response.headers.get("content-length", 0))
48
75
 
49
- temp_file_name = f"{download_to_path}.part"
76
+ with Progress(refresh_per_second=5) as progress:
77
+ task = progress.add_task("[red]Downloading...", total=total)
78
+ with Path(temp_file_name).open("wb") as file:
79
+ for data in response.iter_content(block_size):
80
+ file.write(data)
81
+ progress.update(task, advance=len(data))
82
+ progress.update(task, completed=total, refresh=True)
50
83
 
51
- response = requests.get(url, stream=True)
52
- total = int(response.headers.get("content-length", 0))
84
+ Path(temp_file_name).replace(download_to_path)
53
85
 
54
- with Progress(refresh_per_second=100) as progress:
55
- task = progress.add_task("[red]Downloading...", total=total)
56
- with Path(temp_file_name).open("wb") as file:
57
- for data in response.iter_content(block_size):
58
- file.write(data)
59
- progress.update(task, advance=block_size)
60
- progress.update(task, completed=total, refresh=True)
86
+ if is_zip:
87
+ with ZipFile(download_to_path, "r") as zip_obj:
88
+ zip_obj.extractall(path=output_path)
89
+ return Path(output_path)
61
90
 
62
- Path(temp_file_name).replace(download_to_path)
91
+ return download_to_path
92
+ except (OSError, RequestException) as e:
93
+ retry_count += 1
94
+ if retry_count <= max_retries:
95
+ logger.warning(
96
+ f"Download attempt {retry_count}/{max_retries} failed: {str(e)}. Retrying in {retry_delay} seconds..."
97
+ )
98
+ time.sleep(retry_delay)
99
+ else:
100
+ logger.error(f"Download failed after {max_retries} attempts: {str(e)}")
101
+ if Path(temp_file_name).exists():
102
+ Path(temp_file_name).unlink(missing_ok=True)
103
+ raise
63
104
 
64
- if is_zip:
65
- output_path = output_path or tempfile.gettempdir()
66
- with ZipFile(download_to_path, "r") as zip_obj:
67
- zip_obj.extractall(path=output_path)
68
- zip_obj.namelist()
105
+ except Exception as e:
106
+ logger.error(f"Download failed: {str(e)}")
107
+ if Path(temp_file_name).exists():
108
+ Path(temp_file_name).unlink(missing_ok=True)
109
+ raise
110
+ finally:
111
+ if Path(temp_file_name).exists():
112
+ Path(temp_file_name).unlink(missing_ok=True)
69
113
 
70
- Path(lock_path).unlink()
114
+ return Path(download_to_path)
pertpy/data/_datasets.py CHANGED
@@ -37,7 +37,7 @@ def papalexi_2021() -> MuData: # pragma: no cover
37
37
  Returns:
38
38
  :class:`~mudata.MuData` object of the ECCITE-seq dataset
39
39
  """
40
- import muon as mu
40
+ import mudata as md
41
41
 
42
42
  output_file_name = "papalexi_2021.h5mu"
43
43
  output_file_path = settings.datasetdir / output_file_name
@@ -48,9 +48,11 @@ def papalexi_2021() -> MuData: # pragma: no cover
48
48
  output_path=settings.datasetdir,
49
49
  is_zip=False,
50
50
  )
51
- mudata = mu.read(output_file_path)
51
+ mdata = md.read_h5mu(output_file_path)
52
+ mdata.pull_obs()
53
+ mdata.pull_var()
52
54
 
53
- return mudata
55
+ return mdata
54
56
 
55
57
 
56
58
  def sc_sim_augur() -> AnnData: # pragma: no cover
@@ -408,7 +410,7 @@ def kang_2018() -> AnnData: # pragma: no cover
408
410
 
409
411
 
410
412
  def stephenson_2021_subsampled() -> AnnData: # pragma: no cover
411
- """Processed 10X 5' scRNA-seq data from PBMC of COVID-19 patients and healthy donors
413
+ """Processed 10X 5' scRNA-seq data from PBMC of COVID-19 patients and healthy donors.
412
414
 
413
415
  The study profiled peripheral blood mononuclear cells from 90 COVID-19 patients with different disease severity and 23 healthy control donors.
414
416
  Here the dataset was downsampled to approx. 500 cells per donor and cells were mapped to a reference atlas of healthy PBMCs from 12 studies
@@ -453,7 +455,7 @@ def haber_2017_regions() -> AnnData: # pragma: no cover
453
455
  output_file_path = settings.datasetdir / output_file_name
454
456
  if not Path(output_file_path).exists():
455
457
  _download(
456
- url="https://figshare.com/ndownloader/files/38169900",
458
+ url="https://figshare.com/ndownloader/files/54169301",
457
459
  output_file_name=output_file_name,
458
460
  output_path=settings.datasetdir,
459
461
  is_zip=False,
@@ -650,7 +652,7 @@ def datlinger_2021() -> AnnData: # pragma: no cover
650
652
  Publication: https://doi.org/10.1038/s41592-021-01153-z \
651
653
  Obtained from scperturb: http://projects.sanderlab.org/scperturb/
652
654
 
653
- Returns:
655
+ Returns:
654
656
  :class:`~anndata.AnnData` object of scPerturb prepared single-cell perturbation data
655
657
  """
656
658
  output_file_name = "datlinger_2021.h5ad"
@@ -1516,9 +1518,7 @@ def combosciplex() -> AnnData: # pragma: no cover
1516
1518
 
1517
1519
 
1518
1520
  def sciplex_gxe1() -> AnnData: # pragma: no cover
1519
- """sci-Plex-GxE combined chemical and genetic profiling of A172 dCas9-KRAB cells
1520
- genetically perturbed for HPRT1 or mismtach repair genes exposed to 6-thioguanine and temozolomide,
1521
- respectively, and A172 dCas9-SunTag cells genetically perturbed for HPRT1 exposed to 6-thioguanine.
1521
+ """sci-Plex-GxE profiling of A172 dCas9-KRAB (HPRT1 or MMR knockout) with 6-TG/TMZ and A172 dCas9-SunTag (HPRT1 knockout) with 6-TG.
1522
1522
 
1523
1523
  References:
1524
1524
  McFaline-Figueroa JL et al., Trapnell C. Multiplex single-cell chemical genomics reveals
pertpy/metadata/__init__.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from pertpy.metadata._cell_line import CellLine
2
2
  from pertpy.metadata._compound import Compound
3
3
  from pertpy.metadata._drug import Drug
4
+ from pertpy.metadata._look_up import LookUp
4
5
  from pertpy.metadata._moa import Moa
5
6
 
6
- __all__ = ["CellLine", "Compound", "Drug", "Moa"]
7
+ __all__ = ["CellLine", "Compound", "Drug", "Moa", "LookUp"]
pertpy/metadata/_cell_line.py CHANGED
@@ -39,6 +39,7 @@ class CellLine(MetaData):
39
39
  self.proteomics = None
40
40
  self.drug_response_gdsc1 = None
41
41
  self.drug_response_gdsc2 = None
42
+ self.drug_response_prism = None
42
43
 
43
44
  def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap") -> None:
44
45
  if cell_line_source == "DepMap":
@@ -54,6 +55,7 @@ class CellLine(MetaData):
54
55
  is_zip=False,
55
56
  )
56
57
  self.depmap = pd.read_csv(depmap_cell_line_path)
58
+ self.depmap = self.depmap.reset_index().rename(columns={"CellLineName": "cell_line_name"})
57
59
  else:
58
60
  # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project
59
61
  # Source: https://www.cancerrxgene.org/celllines
@@ -157,7 +159,7 @@ class CellLine(MetaData):
157
159
  def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
158
160
  if gdsc_dataset == 1:
159
161
  # Download GDSC drug response data
160
- # Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)
162
+ # Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s and AUC)
161
163
  # URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
162
164
  drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
163
165
  if not Path(drug_response_gdsc1_file_path).exists():
@@ -181,6 +183,23 @@ class CellLine(MetaData):
181
183
  )
182
184
  self.drug_response_gdsc2 = pd.read_csv(drug_response_gdsc2_file_path, index_col=0)
183
185
 
186
+ def _download_prism(self) -> None:
187
+ # Download PRISM drug response data
188
+ # Source: DepMap PRISM Repurposing 19Q4 secondary screen dose response curve parameters
189
+ drug_response_prism_file_path = Path(settings.cachedir) / "prism_info.csv"
190
+ if not Path(drug_response_prism_file_path).exists():
191
+ _download(
192
+ url="https://figshare.com/ndownloader/files/20237739",
193
+ output_file_name="prism_info.csv",
194
+ output_path=settings.cachedir,
195
+ block_size=4096,
196
+ is_zip=False,
197
+ )
198
+ df = pd.read_csv(drug_response_prism_file_path, index_col=0)[["depmap_id", "name", "ic50", "ec50", "auc"]]
199
+ df = df.dropna(subset=["depmap_id", "name"])
200
+ df = df.groupby(["depmap_id", "name"]).mean().reset_index()
201
+ self.drug_response_prism = df
202
+
184
203
  def annotate(
185
204
  self,
186
205
  adata: AnnData,
@@ -197,13 +216,13 @@ class CellLine(MetaData):
197
216
 
198
217
  Args:
199
218
  adata: The data object to annotate.
200
- query_id: The column of `.obs` with cell line information.
219
+ query_id: The column of ``.obs`` with cell line information.
201
220
  reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName or StrippedCellLineName.
202
221
  If fetching cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
203
222
  fetch: The metadata to fetch.
204
223
  cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
205
224
  verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
206
- copy: Determines whether a copy of the `adata` is returned.
225
+ copy: Determines whether a copy of ``adata`` is returned.
207
226
 
208
227
  Returns:
209
228
  Returns an AnnData object with cell line annotation.
@@ -216,7 +235,7 @@ class CellLine(MetaData):
216
235
  >>> adata_annotated = pt_metadata.annotate(adata=adata,
217
236
  >>> reference_id='cell_line_name',
218
237
  >>> query_id='cell_line_name',
219
- >>> fetch=["cell_line_name", "age", "primary_disease"],
238
+ >>> fetch=["cell_line_name", "Age", "OncotreePrimaryDisease"],
220
239
  >>> copy=True)
221
240
  """
222
241
  if copy:
@@ -304,7 +323,7 @@ class CellLine(MetaData):
304
323
  def annotate_bulk_rna(
305
324
  self,
306
325
  adata: AnnData,
307
- query_id: str = "cell_line_name",
326
+ query_id: str = None,
308
327
  cell_line_source: Literal["broad", "sanger"] = "sanger",
309
328
  verbosity: int | str = 5,
310
329
  gene_identifier: Literal["gene_name", "gene_ID", "both"] = "gene_ID",
@@ -316,9 +335,11 @@ class CellLine(MetaData):
316
335
 
317
336
  Args:
318
337
  adata: The data object to annotate.
319
- query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
338
+ query_id: The column of `.obs` with cell line information.
339
+ Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
320
340
  cell_line_source: The bulk rna expression data from either broad or sanger cell line.
321
341
  verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
342
+ gene_identifier: The type of gene identifier saved in the fetched meta data, 'gene_name', 'gene_ID' or 'both'.
322
343
  copy: Determines whether a copy of the `adata` is returned.
323
344
 
324
345
  Returns:
@@ -339,7 +360,7 @@ class CellLine(MetaData):
339
360
 
340
361
  # Make sure that the specified `cell_line_type` can be found in the bulk rna expression data,
341
362
  # then we can compare these keys and fetch the corresponding metadata.
342
- if query_id not in adata.obs.columns:
363
+ if query_id not in adata.obs.columns and query_id is not None:
343
364
  raise ValueError(
344
365
  f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
345
366
  "Ensure that you are using one of the available query IDs present in the adata.obs for the annotation."
@@ -347,25 +368,33 @@ class CellLine(MetaData):
347
368
  "using the `annotate()` function before calling 'annotate_bulk_rna()'. "
348
369
  "This ensures that the required query ID is included in your data, e.g. stripped_cell_line_name, DepMap ID."
349
370
  )
350
-
371
+ if query_id is None:
372
+ query_id = "cell_line_name" if cell_line_source == "sanger" else "DepMap_ID"
351
373
  identifier_num_all = len(adata.obs[query_id].unique())
352
374
 
353
375
  # Lazily download the bulk rna expression data
354
376
  if cell_line_source == "sanger":
377
+ if query_id not in adata.obs.columns:
378
+ raise ValueError(
379
+ "To annotate bulk RNA data from Wellcome Sanger Institute, `cell_line_name` is used as default reference and query identifier if no `query_id` is given."
380
+ "Ensure that you have column `cell_line_name` in `adata.obs` or specify column name in which cell line name is stored."
381
+ "If cell line name isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
382
+ )
355
383
  if self.bulk_rna_sanger is None:
356
384
  self._download_bulk_rna(cell_line_source="sanger")
357
385
  reference_id = "model_name"
358
386
  not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_sanger.index))
359
387
  else:
388
+ if query_id not in adata.obs.columns:
389
+ raise ValueError(
390
+ "To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `query_id` is given."
391
+ "Ensure that you have column `DepMap_ID` in `adata.obs` or specify column name in which DepMap ID is stored."
392
+ "If DepMap ID isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
393
+ )
360
394
  reference_id = "DepMap_ID"
361
- logger.warning(
362
- "To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `reference_id` is given."
363
- "If `DepMap_ID` isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
364
- )
395
+
365
396
  if self.bulk_rna_broad is None:
366
397
  self._download_bulk_rna(cell_line_source="broad")
367
- if query_id == "cell_line_name":
368
- query_id = "DepMap_ID"
369
398
  not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_broad.index))
370
399
 
371
400
  self._warn_unmatch(
@@ -474,7 +503,8 @@ class CellLine(MetaData):
474
503
  adata.obsm["proteomics_" + protein_information] = (
475
504
  self.proteomics[[reference_id, protein_id, protein_information]]
476
505
  .pivot(index=reference_id, columns=protein_id, values=protein_information)
477
- .reindex(adata.obs.index)
506
+ .reindex(adata.obs[query_id])
507
+ .set_index(adata.obs.index)
478
508
  )
479
509
  return adata
480
510
 
@@ -491,7 +521,7 @@ class CellLine(MetaData):
491
521
  ) -> AnnData:
492
522
  """Fetch drug response data from GDSC.
493
523
 
494
- For each cell, we fetch drug response data as natural log of the fitted IC50 for its
524
+ For each cell, we fetch drug response data as natural log of the fitted IC50 and AUC for its
495
525
  corresponding cell line and perturbation from GDSC fitted data results file.
496
526
 
497
527
  Args:
@@ -554,13 +584,86 @@ class CellLine(MetaData):
554
584
  adata.obs = (
555
585
  adata.obs.reset_index()
556
586
  .set_index([query_id, query_perturbation])
557
- .assign(ln_ic50=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
587
+ .assign(ln_ic50_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
588
+ .assign(auc_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).auc)
558
589
  .reset_index()
559
590
  .set_index(old_index_name)
560
591
  )
561
592
 
562
593
  return adata
563
594
 
595
+ def annotate_from_prism(
596
+ self,
597
+ adata: AnnData,
598
+ query_id: str = "DepMap_ID",
599
+ query_perturbation: str = "perturbation",
600
+ verbosity: int | str = 5,
601
+ copy: bool = False,
602
+ ) -> AnnData:
603
+ """Fetch drug response data from PRISM.
604
+
605
+ For each cell, we fetch drug response data as IC50, EC50 and AUC for its
606
+ corresponding cell line and perturbation from PRISM fitted data results file.
607
+ Note that all rows where either `depmap_id` or `name` is missing will be dropped.
608
+
609
+ Args:
610
+ adata: The data object to annotate.
611
+ query_id: The column of `.obs` with cell line information.
612
+ query_perturbation: The column of `.obs` with perturbation information.
613
+ verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
614
+ copy: Determines whether a copy of the `adata` is returned.
615
+
616
+ Returns:
617
+ Returns an AnnData object with drug response annotation.
618
+
619
+ Examples:
620
+ >>> import pertpy as pt
621
+ >>> adata = pt.dt.mcfarland_2020()
622
+ >>> pt_metadata = pt.md.CellLine()
623
+ >>> pt_metadata.annotate_from_prism(adata, query_id="DepMap_ID")
624
+ """
625
+ if copy:
626
+ adata = adata.copy()
627
+ if query_id not in adata.obs.columns:
628
+ raise ValueError(
629
+ f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
630
+ "Ensure that you are using one of the available query IDs present in 'adata.obs' for the annotation.\n"
631
+ "If the desired query ID is not available, you can fetch the cell line metadata "
632
+ "using the `annotate()` function before calling `annotate_from_prism()`. "
633
+ "This ensures that the required query ID is included in your data."
634
+ )
635
+ if self.drug_response_prism is None:
636
+ self._download_prism()
637
+ prism_data = self.drug_response_prism
638
+ # PRISM starts most drug names with a lowercase letter, so we want to make it case-insensitive
639
+ prism_data["name_lower"] = prism_data["name"].str.lower()
640
+ adata.obs["perturbation_lower"] = adata.obs[query_perturbation].str.lower()
641
+
642
+ identifier_num_all = len(adata.obs[query_id].unique())
643
+ not_matched_identifiers = list(set(adata.obs[query_id]) - set(prism_data["depmap_id"]))
644
+ self._warn_unmatch(
645
+ total_identifiers=identifier_num_all,
646
+ unmatched_identifiers=not_matched_identifiers,
647
+ query_id=query_id,
648
+ reference_id="depmap_id",
649
+ metadata_type="drug response",
650
+ verbosity=verbosity,
651
+ )
652
+
653
+ old_index_name = "index" if adata.obs.index.name is None else adata.obs.index.name
654
+ adata.obs = (
655
+ adata.obs.reset_index()
656
+ .set_index([query_id, "perturbation_lower"])
657
+ .assign(ic50_prism=prism_data.set_index(["depmap_id", "name"]).ic50)
658
+ .assign(ec50_prism=prism_data.set_index(["depmap_id", "name"]).ec50)
659
+ .assign(auc_prism=prism_data.set_index(["depmap_id", "name"]).auc)
660
+ .reset_index()
661
+ .set_index(old_index_name)
662
+ .drop(columns="perturbation_lower")
663
+ )
664
+
665
+ return adata
666
+
564
667
  def lookup(self) -> LookUp:
565
668
  """Generate LookUp object for CellLineMetaData.
566
669
 
@@ -577,7 +680,7 @@ class CellLine(MetaData):
577
680
  >>> pt_metadata = pt.md.CellLine()
578
681
  >>> lookup = pt_metadata.lookup()
579
682
  """
580
- # Fetch the metadata if it hasn't beed downloaded yet
683
+ # Fetch the metadata if it hasn't been downloaded yet
581
684
  if self.depmap is None:
582
685
  self._download_cell_line(cell_line_source="DepMap")
583
686
  if self.cancerxgene is None:
@@ -594,6 +697,8 @@ class CellLine(MetaData):
594
697
  self._download_gdsc(gdsc_dataset=1)
595
698
  if self.drug_response_gdsc2 is None:
596
699
  self._download_gdsc(gdsc_dataset=2)
700
+ if self.drug_response_prism is None:
701
+ self._download_prism()
597
702
 
598
703
  # Transfer the data
599
704
  return LookUp(
@@ -607,6 +712,7 @@ class CellLine(MetaData):
607
712
  self.proteomics,
608
713
  self.drug_response_gdsc1,
609
714
  self.drug_response_gdsc2,
715
+ self.drug_response_prism,
610
716
  ],
611
717
  )
612
718
 
@@ -663,12 +769,14 @@ class CellLine(MetaData):
663
769
  raise ValueError(
664
770
  "Dimensions of adata.X do not match those of metadata. Ensure that they have the same gene list."
665
771
  )
666
- if isinstance(adata.obsm[metadata_key], pd.DataFrame):
667
- # Give warning if the genes are not the same
668
- if sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0:
669
- logger.warning(
670
- "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
671
- )
772
+ # Raise error if the genes are not the same
773
+ if (
774
+ isinstance(adata.obsm[metadata_key], pd.DataFrame)
775
+ and sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0
776
+ ):
777
+ raise ValueError(
778
+ "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
779
+ )
672
780
 
673
781
  # Divide cell lines into those are present and not present in the metadata
674
782
  overlapped_cl = adata[~adata.obsm[metadata_key].isna().all(axis=1), :]
@@ -693,7 +801,7 @@ class CellLine(MetaData):
693
801
  return corr, pvals, new_corr, new_pvals
694
802
 
695
803
  @_doc_params(common_plot_args=doc_common_plot_args)
696
- def plot_correlation(
804
+ def plot_correlation( # noqa: D417
697
805
  self,
698
806
  adata: AnnData,
699
807
  corr: pd.DataFrame,
@@ -703,7 +811,6 @@ class CellLine(MetaData):
703
811
  metadata_key: str = "bulk_rna_broad",
704
812
  category: str = "cell line",
705
813
  subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None,
706
- show: bool = True,
707
814
  return_fig: bool = False,
708
815
  ) -> Figure | None:
709
816
  """Visualise the correlation of cell lines with annotated metadata.
@@ -747,7 +854,7 @@ class CellLine(MetaData):
747
854
  if all(isinstance(id, str) for id in subset_identifier_list):
748
855
  if set(subset_identifier_list).issubset(adata.obs[identifier].unique()):
749
856
  subset_identifier_list = np.where(
750
- np.in1d(adata.obs[identifier].values, subset_identifier_list)
857
+ np.isin(adata.obs[identifier].values, subset_identifier_list)
751
858
  )[0]
752
859
  else:
753
860
  raise ValueError("`Subset_identifier` must be found in adata.obs.`identifier`.")
@@ -798,10 +905,9 @@ class CellLine(MetaData):
798
905
  },
799
906
  )
800
907
 
801
- if show:
802
- plt.show()
803
908
  if return_fig:
804
909
  return plt.gcf()
910
+ plt.show()
805
911
  return None
806
912
  else:
807
- raise NotImplementedError
913
+ raise NotImplementedError("Only 'cell line' category is supported for correlation comparison.")
pertpy/metadata/_look_up.py CHANGED
@@ -22,11 +22,12 @@ class LookUp:
22
22
  type: Literal["cell_line", "moa", "compound", "drug"] = "cell_line",
23
23
  transfer_metadata: Sequence[pd.DataFrame] | None = None,
24
24
  ):
25
- """
25
+ """Lookup object for different type of metadata.
26
+
26
27
  Args:
27
28
  type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug.
28
29
  transfer_metadata: DataFrames used to generate Lookup object.
29
- This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
30
+ This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
30
31
  """
31
32
  self.type = type
32
33
  if type == "cell_line":
@@ -329,10 +330,7 @@ class LookUp:
329
330
  if self.type != "cell_line":
330
331
  raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
331
332
 
332
- if cell_line_source == "broad":
333
- bulk_rna = self.bulk_rna_broad
334
- else:
335
- bulk_rna = self.bulk_rna_sanger
333
+ bulk_rna = self.bulk_rna_broad if cell_line_source == "broad" else self.bulk_rna_sanger
336
334
 
337
335
  if query_id_list is not None:
338
336
  identifier_num_all = len(query_id_list)
@@ -391,10 +389,7 @@ class LookUp:
391
389
  """
392
390
  if self.type != "cell_line":
393
391
  raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
394
- if gdsc_dataset == 1:
395
- gdsc_data = self.drug_response_gdsc1
396
- else:
397
- gdsc_data = self.drug_response_gdsc2
392
+ gdsc_data = self.drug_response_gdsc1 if gdsc_dataset == 1 else self.drug_response_gdsc2
398
393
 
399
394
  if query_id_list is not None:
400
395
  if reference_id not in gdsc_data.columns:
@@ -421,7 +416,7 @@ class LookUp:
421
416
  reference_id: Literal["gene_id", "ensembl_gene_id", "hgnc_id", "hgnc_symbol"] = "ensembl_gene_id",
422
417
  query_id_list: Sequence[str] | None = None,
423
418
  ) -> None:
424
- """A brief summary of gene annotation metadata
419
+ """A brief summary of gene annotation metadata.
425
420
 
426
421
  Args:
427
422
  reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
@@ -555,15 +550,14 @@ class LookUp:
555
550
  raise ValueError(
556
551
  "Gene-disease association is not available in dgidb dataset, please try with pharmgkb."
557
552
  )
553
+ elif query_id_type == "target":
554
+ not_matched_identifiers = list(set(query_id_list) - set(self.pharmgkb["Gene"]))
555
+ elif query_id_type == "compound":
556
+ compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
557
+ not_matched_identifiers = list(set(query_id_list) - set(compounds["Compound|Disease"]))
558
558
  else:
559
- if query_id_type == "target":
560
- not_matched_identifiers = list(set(query_id_list) - set(self.pharmgkb["Gene"]))
561
- elif query_id_type == "compound":
562
- compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
563
- not_matched_identifiers = list(set(query_id_list) - set(compounds["Compound|Disease"]))
564
- else:
565
- diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
566
- not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))
559
+ diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
560
+ not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))
567
561
 
568
562
  logger.info(f"{len(not_matched_identifiers)} {query_id_type}s are not found in the metadata.")
569
563
  logger.info(f"{identifier_num_all - len(not_matched_identifiers)} {query_id_type}s are found! ")
pertpy/metadata/_moa.py CHANGED
@@ -61,7 +61,7 @@ class Moa(MetaData):
61
61
  adata = adata.copy()
62
62
 
63
63
  if query_id not in adata.obs.columns:
64
- raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\n" "Please check again.")
64
+ raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\nPlease check again.")
65
65
 
66
66
  if self.clue is None:
67
67
  self._download_clue()