pertpy 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. pertpy/__init__.py +2 -1
  2. pertpy/data/__init__.py +61 -0
  3. pertpy/data/_dataloader.py +27 -23
  4. pertpy/data/_datasets.py +58 -0
  5. pertpy/metadata/__init__.py +2 -0
  6. pertpy/metadata/_cell_line.py +39 -70
  7. pertpy/metadata/_compound.py +3 -4
  8. pertpy/metadata/_drug.py +2 -6
  9. pertpy/metadata/_look_up.py +38 -51
  10. pertpy/metadata/_metadata.py +7 -10
  11. pertpy/metadata/_moa.py +2 -6
  12. pertpy/plot/__init__.py +0 -5
  13. pertpy/preprocessing/__init__.py +2 -0
  14. pertpy/preprocessing/_guide_rna.py +2 -3
  15. pertpy/tools/__init__.py +42 -4
  16. pertpy/tools/_augur.py +14 -15
  17. pertpy/tools/_cinemaot.py +2 -2
  18. pertpy/tools/_coda/_base_coda.py +118 -142
  19. pertpy/tools/_coda/_sccoda.py +16 -15
  20. pertpy/tools/_coda/_tasccoda.py +21 -22
  21. pertpy/tools/_dialogue.py +18 -23
  22. pertpy/tools/_differential_gene_expression/__init__.py +20 -0
  23. pertpy/tools/_differential_gene_expression/_base.py +657 -0
  24. pertpy/tools/_differential_gene_expression/_checks.py +41 -0
  25. pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
  26. pertpy/tools/_differential_gene_expression/_edger.py +125 -0
  27. pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
  28. pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
  29. pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
  30. pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
  31. pertpy/tools/_distances/_distance_tests.py +21 -16
  32. pertpy/tools/_distances/_distances.py +406 -70
  33. pertpy/tools/_enrichment.py +10 -15
  34. pertpy/tools/_kernel_pca.py +1 -1
  35. pertpy/tools/_milo.py +76 -53
  36. pertpy/tools/_mixscape.py +15 -11
  37. pertpy/tools/_perturbation_space/_clustering.py +5 -2
  38. pertpy/tools/_perturbation_space/_comparison.py +112 -0
  39. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +20 -22
  40. pertpy/tools/_perturbation_space/_perturbation_space.py +23 -21
  41. pertpy/tools/_perturbation_space/_simple.py +3 -3
  42. pertpy/tools/_scgen/__init__.py +1 -1
  43. pertpy/tools/_scgen/_base_components.py +2 -3
  44. pertpy/tools/_scgen/_scgen.py +33 -28
  45. pertpy/tools/_scgen/_utils.py +2 -2
  46. {pertpy-0.7.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +22 -13
  47. pertpy-0.8.0.dist-info/RECORD +57 -0
  48. {pertpy-0.7.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
  49. pertpy/plot/_augur.py +0 -171
  50. pertpy/plot/_coda.py +0 -601
  51. pertpy/plot/_guide_rna.py +0 -64
  52. pertpy/plot/_milopy.py +0 -209
  53. pertpy/plot/_mixscape.py +0 -355
  54. pertpy/tools/_differential_gene_expression.py +0 -325
  55. pertpy-0.7.0.dist-info/RECORD +0 -53
  56. {pertpy-0.7.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
pertpy/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  __author__ = "Lukas Heumos"
4
4
  __email__ = "lukas.heumos@posteo.net"
5
- __version__ = "0.7.0"
5
+ __version__ = "0.8.0"
6
6
 
7
7
  import warnings
8
8
 
@@ -11,6 +11,7 @@ from numba import NumbaDeprecationWarning
11
11
 
12
12
  warnings.filterwarnings("ignore", category=NumbaDeprecationWarning)
13
13
  warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning)
14
+ warnings.filterwarnings("ignore", category=SyntaxWarning)
14
15
  warnings.filterwarnings("ignore", category=UserWarning, module="scvi._settings")
15
16
 
16
17
  from . import data as dt
pertpy/data/__init__.py CHANGED
@@ -24,6 +24,7 @@ from pertpy.data._datasets import (
24
24
  gasperini_2019_lowmoi,
25
25
  gehring_2019,
26
26
  haber_2017_regions,
27
+ hagai_2018,
27
28
  kang_2018,
28
29
  mcfarland_2020,
29
30
  norman_2019,
@@ -52,5 +53,65 @@ from pertpy.data._datasets import (
52
53
  tian_2021_crispri,
53
54
  weinreb_2020,
54
55
  xie_2017,
56
+ zhang_2021,
55
57
  zhao_2021,
56
58
  )
59
+
60
+ __all__ = [
61
+ "adamson_2016_pilot",
62
+ "adamson_2016_upr_epistasis",
63
+ "adamson_2016_upr_perturb_seq",
64
+ "aissa_2021",
65
+ "bhattacherjee",
66
+ "burczynski_crohn",
67
+ "chang_2021",
68
+ "cinemaot_example",
69
+ "combosciplex",
70
+ "datlinger_2017",
71
+ "datlinger_2021",
72
+ "dialogue_example",
73
+ "distance_example",
74
+ "dixit_2016",
75
+ "dixit_2016_raw",
76
+ "dong_2023",
77
+ "frangieh_2021",
78
+ "frangieh_2021_protein",
79
+ "frangieh_2021_raw",
80
+ "frangieh_2021_rna",
81
+ "gasperini_2019_atscale",
82
+ "gasperini_2019_highmoi",
83
+ "gasperini_2019_lowmoi",
84
+ "gehring_2019",
85
+ "haber_2017_regions",
86
+ "hagai_2018",
87
+ "kang_2018",
88
+ "mcfarland_2020",
89
+ "norman_2019",
90
+ "norman_2019_raw",
91
+ "papalexi_2021",
92
+ "replogle_2022_k562_essential",
93
+ "replogle_2022_k562_gwps",
94
+ "replogle_2022_rpe1",
95
+ "sc_sim_augur",
96
+ "schiebinger_2019_16day",
97
+ "schiebinger_2019_18day",
98
+ "schraivogel_2020_tap_screen_chr8",
99
+ "schraivogel_2020_tap_screen_chr11",
100
+ "sciplex3_raw",
101
+ "sciplex_gxe1",
102
+ "shifrut_2018",
103
+ "smillie_2019",
104
+ "srivatsan_2020_sciplex2",
105
+ "srivatsan_2020_sciplex3",
106
+ "srivatsan_2020_sciplex4",
107
+ "stephenson_2021_subsampled",
108
+ "tasccoda_example",
109
+ "tian_2019_day7neuron",
110
+ "tian_2019_ipsc",
111
+ "tian_2021_crispra",
112
+ "tian_2021_crispri",
113
+ "weinreb_2020",
114
+ "xie_2017",
115
+ "zhao_2021",
116
+ "zhang_2021",
117
+ ]
pertpy/data/_dataloader.py CHANGED
@@ -5,7 +5,8 @@ from string import ascii_lowercase
5
5
  from zipfile import ZipFile
6
6
 
7
7
  import requests
8
- from rich import print
8
+ from filelock import FileLock
9
+ from lamin_utils import logger
9
10
  from rich.progress import Progress
10
11
 
11
12
 
@@ -37,30 +38,33 @@ def _download( # pragma: no cover
37
38
  download_to_path = (
38
39
  f"{output_path}{output_file_name}" if str(output_path).endswith("/") else f"{output_path}/{output_file_name}"
39
40
  )
40
- if Path(download_to_path).exists():
41
- warning = f"[bold red]File {download_to_path} already exists!"
42
- if not overwrite:
43
- print(warning)
41
+
42
+ Path(output_path).mkdir(parents=True, exist_ok=True)
43
+ lock_path = f"{output_path}/{output_file_name}.lock"
44
+ with FileLock(lock_path):
45
+ if Path(download_to_path).exists() and not overwrite:
46
+ logger.warning(f"File {download_to_path} already exists!")
44
47
  return
45
- else:
46
- print(f"{warning} Overwriting...")
47
48
 
48
- response = requests.get(url, stream=True)
49
- total = int(response.headers.get("content-length", 0))
49
+ temp_file_name = f"{download_to_path}.part"
50
+
51
+ response = requests.get(url, stream=True)
52
+ total = int(response.headers.get("content-length", 0))
53
+
54
+ with Progress(refresh_per_second=100) as progress:
55
+ task = progress.add_task("[red]Downloading...", total=total)
56
+ with Path(temp_file_name).open("wb") as file:
57
+ for data in response.iter_content(block_size):
58
+ file.write(data)
59
+ progress.update(task, advance=block_size)
60
+ progress.update(task, completed=total, refresh=True)
50
61
 
51
- with Progress(refresh_per_second=100) as progress:
52
- task = progress.add_task("[red]Downloading...", total=total)
53
- Path(output_path).mkdir(parents=True, exist_ok=True)
54
- with Path(download_to_path).open("wb") as file:
55
- for data in response.iter_content(block_size):
56
- file.write(data)
57
- progress.update(task, advance=block_size)
62
+ Path(temp_file_name).replace(download_to_path)
58
63
 
59
- # force the progress bar to 100% at the end
60
- progress.update(task, completed=total, refresh=True)
64
+ if is_zip:
65
+ output_path = output_path or tempfile.gettempdir()
66
+ with ZipFile(download_to_path, "r") as zip_obj:
67
+ zip_obj.extractall(path=output_path)
68
+ zip_obj.namelist()
61
69
 
62
- if is_zip:
63
- output_path = output_path or tempfile.gettempdir()
64
- with ZipFile(download_to_path, "r") as zip_obj:
65
- zip_obj.extractall(path=output_path)
66
- zip_obj.namelist()
70
+ Path(lock_path).unlink()
pertpy/data/_datasets.py CHANGED
@@ -1540,3 +1540,61 @@ def sciplex_gxe1() -> AnnData: # pragma: no cover
1540
1540
  adata = sc.read_h5ad(output_file_path)
1541
1541
 
1542
1542
  return adata
1543
+
1544
+
1545
+ def zhang_2021() -> AnnData: # pragma: no cover
1546
+ """Single-cell RNA-seq of TNBC patients' immune cells exposed to paclitaxel alone or combined with the anti-PD-L1 atezolizumab.
1547
+
1548
+ This analysis, involving 22 patients, identifies immune subtypes predictive of therapeutic
1549
+ responses and underscores potential limitations of combining paclitaxel with atezolizumab in treatment protocols.
1550
+
1551
+ The script that generated this specific AnnData object:
1552
+ https://github.com/tessadgreen/ThesisCode/blob/main/Chapter3/drug_response/import_zhang_data.ipynb
1553
+
1554
+ This dataset does not contain the single-cell ATAC-seq data that was also measured for the paper.
1555
+
1556
+ References:
1557
+ Zhang Y et al., Liu Z. Single-cell analyses reveal key immune cell subsets associated with response to PD-L1 blockade in triple-negative breast cancer.
1558
+ Cancer Cell. 2021 Volume 39, Issue 12. doi: https://doi.org/10.1016/j.ccell.2021.09.010
1559
+
1560
+ Returns:
1561
+ :class:`~anndata.AnnData` object of the dataset.
1562
+ """
1563
+ output_file_name = "zhang_2021.h5ad"
1564
+ output_file_path = settings.datasetdir / output_file_name
1565
+ if not Path(output_file_path).exists():
1566
+ _download(
1567
+ url="https://figshare.com/ndownloader/files/46457872",
1568
+ output_file_name=output_file_name,
1569
+ output_path=settings.datasetdir,
1570
+ is_zip=False,
1571
+ )
1572
+ adata = sc.read_h5ad(output_file_path)
1573
+
1574
+ return adata
1575
+
1576
+
1577
+ def hagai_2018() -> AnnData: # pragma: no cover
1578
+ """Cross-species analysis of primary dermal fibroblasts and bone marrow-derived phagocytes, stimulated with dsRNA and IFNB.
1579
+
1580
+ The study explores immune response variations across humans, macaques, mice, and rats.
1581
+
1582
+ References:
1583
+ Hagai, T., Chen, X., Miragaia, R.J. et al. Gene expression variability across cells and species shapes innate immunity.
1584
+ Nature 563, 197–202 (2018). https://doi.org/10.1038/s41586-018-0657-2
1585
+
1586
+ Returns:
1587
+ :class:`~anndata.AnnData` object of the dataset.
1588
+ """
1589
+ output_file_name = "hagai_2018.h5ad"
1590
+ output_file_path = settings.datasetdir / output_file_name
1591
+ if not Path(output_file_path).exists():
1592
+ _download(
1593
+ url="https://figshare.com/ndownloader/files/46978846",
1594
+ output_file_name=output_file_name,
1595
+ output_path=settings.datasetdir,
1596
+ is_zip=False,
1597
+ )
1598
+ adata = sc.read_h5ad(output_file_path)
1599
+
1600
+ return adata
pertpy/metadata/__init__.py CHANGED
@@ -2,3 +2,5 @@ from pertpy.metadata._cell_line import CellLine
2
2
  from pertpy.metadata._compound import Compound
3
3
  from pertpy.metadata._drug import Drug
4
4
  from pertpy.metadata._moa import Moa
5
+
6
+ __all__ = ["CellLine", "Compound", "Drug", "Moa"]
pertpy/metadata/_cell_line.py CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
3
3
  from pathlib import Path
4
4
  from typing import TYPE_CHECKING, Literal
5
5
 
6
+ from lamin_utils import logger
7
+
6
8
  if TYPE_CHECKING:
7
9
  from collections.abc import Iterable
8
10
 
9
11
  import matplotlib.pyplot as plt
10
12
  import numpy as np
11
13
  import pandas as pd
12
- from rich import print
13
14
  from scanpy import settings
14
15
  from scipy import stats
15
16
 
@@ -42,7 +43,6 @@ class CellLine(MetaData):
42
43
  # Source: https://depmap.org/portal/download/all/ (DepMap Public 23Q4)
43
44
  depmap_cell_line_path = Path(settings.cachedir) / "depmap_23Q4_info.csv"
44
45
  if not Path(depmap_cell_line_path).exists():
45
- print("[bold yellow]No DepMap metadata file found. Starting download now.")
46
46
  _download(
47
47
  url="https://ndownloader.figshare.com/files/43746708",
48
48
  output_file_name="depmap_23Q4_info.csv",
@@ -59,10 +59,6 @@ class CellLine(MetaData):
59
59
 
60
60
  if not Path(transformed_cancerxgene_cell_line_path).exists():
61
61
  if not Path(cancerxgene_cell_line_path).exists():
62
- print(
63
- "[bold yellow]No cell line metadata file from The Genomics of Drug Sensitivity "
64
- "in Cancer Project found. Starting download now."
65
- )
66
62
  _download(
67
63
  url="https://www.cancerrxgene.org/api/celllines?list=all&sEcho=1&iColumns=7&sColumns=&"
68
64
  "iDisplayStart=0&iDisplayLength=25&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&"
@@ -102,7 +98,6 @@ class CellLine(MetaData):
102
98
  # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
103
99
  gene_annotation_file_path = Path(settings.cachedir) / "genes_info.csv"
104
100
  if not Path(gene_annotation_file_path).exists():
105
- print("[bold yellow]No metadata file was found for gene annotation. Starting download now.")
106
101
  _download(
107
102
  url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv",
108
103
  output_file_name="genes_info.csv",
@@ -120,10 +115,6 @@ class CellLine(MetaData):
120
115
  # solution: remove the white space and convert to int before depmap updates the metadata
121
116
  bulk_rna_sanger_file_path = Path(settings.cachedir) / "rnaseq_sanger_info.csv"
122
117
  if not Path(bulk_rna_sanger_file_path).exists():
123
- print(
124
- "[bold yellow]No metadata file was found for bulk RNA-seq data of Sanger cell line."
125
- " Starting download now."
126
- )
127
118
  _download(
128
119
  url="https://figshare.com/ndownloader/files/42467103",
129
120
  output_file_name="rnaseq_sanger_info.csv",
@@ -137,7 +128,6 @@ class CellLine(MetaData):
137
128
  # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
138
129
  bulk_rna_broad_file_path = Path(settings.cachedir) / "rnaseq_depmap_info.csv"
139
130
  if not Path(bulk_rna_broad_file_path).exists():
140
- print("[bold yellow]No metadata file was found for CCLE expression data. Starting download now.")
141
131
  _download(
142
132
  url="https://figshare.com/ndownloader/files/34989922",
143
133
  output_file_name="rnaseq_depmap_info.csv",
@@ -152,7 +142,6 @@ class CellLine(MetaData):
152
142
  # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics)
153
143
  proteomics_file_path = Path(settings.cachedir) / "proteomics_info.csv"
154
144
  if not Path(proteomics_file_path).exists():
155
- print("[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger). Starting download now.")
156
145
  _download(
157
146
  url="https://figshare.com/ndownloader/files/42468393",
158
147
  output_file_name="proteomics_info.csv",
@@ -169,10 +158,6 @@ class CellLine(MetaData):
169
158
  # URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
170
159
  drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
171
160
  if not Path(drug_response_gdsc1_file_path).exists():
172
- print(
173
- "[bold yellow]No metadata file was found for drug response data of GDSC1 dataset."
174
- " Starting download now."
175
- )
176
161
  _download(
177
162
  url="https://figshare.com/ndownloader/files/43757235",
178
163
  output_file_name="gdsc1_info.csv",
@@ -184,10 +169,6 @@ class CellLine(MetaData):
184
169
  if gdsc_dataset == 2:
185
170
  drug_response_gdsc2_file_path = Path(settings.cachedir) / "gdsc2_info.csv"
186
171
  if not Path(drug_response_gdsc2_file_path).exists():
187
- print(
188
- "[bold yellow]No metadata file was found for drug response data of GDSC2 dataset."
189
- " Starting download now."
190
- )
191
172
  _download(
192
173
  url="https://figshare.com/ndownloader/files/43757232",
193
174
  output_file_name="gdsc2_info.csv",
@@ -213,15 +194,13 @@ class CellLine(MetaData):
213
194
 
214
195
  Args:
215
196
  adata: The data object to annotate.
216
- query_id: The column of `.obs` with cell line information. Defaults to "DepMap_ID".
217
- reference_id: The type of cell line identifier in the meta data, e.g. ModelID, CellLineName or StrippedCellLineName.
218
- If fetching cell line metadata from Cancerrxgene, it is recommended to choose
219
- "stripped_cell_line_name". Defaults to "ModelID".
220
- fetch: The metadata to fetch. Defaults to None (=all).
221
- cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene. Defaults to "DepMap".
197
+ query_id: The column of `.obs` with cell line information.
198
+ reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName or StrippedCellLineName.
199
+ If fetching cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
200
+ fetch: The metadata to fetch.
201
+ cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
222
202
  verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
223
- Defaults to 5.
224
- copy: Determines whether a copy of the `adata` is returned. Defaults to False.
203
+ copy: Determines whether a copy of the `adata` is returned.
225
204
 
226
205
  Returns:
227
206
  Returns an AnnData object with cell line annotation.
@@ -248,11 +227,9 @@ class CellLine(MetaData):
248
227
  reference_id = "stripped_cell_line_name"
249
228
  if query_id == "DepMap_ID":
250
229
  query_id = "stripped_cell_line_name"
251
- print(
252
- "[bold blue]`stripped_cell_line_name` is used as reference and query identifier ",
253
- " to annotate cell line metadata from Cancerrxgene. "
254
- "Ensure that stripped cell line names are available in 'adata.obs.' ",
255
- "or use the DepMap as `cell_line_source` to annotate the cell line first ",
230
+ logger.error(
231
+ "`stripped_cell_line_name` is used as reference and query identifier to annotate cell line metadata from Cancerrxgene. "
232
+ "Ensure that stripped cell line names are available in 'adata.obs.' or use the DepMap as `cell_line_source` to annotate the cell line first."
256
233
  )
257
234
  if self.cancerxgene is None:
258
235
  self._download_cell_line(cell_line_source="Cancerrxgene")
@@ -337,9 +314,9 @@ class CellLine(MetaData):
337
314
  Args:
338
315
  adata: The data object to annotate.
339
316
  query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
340
- cell_line_source: The bulk rna expression data from either broad or sanger cell line. Defaults to "sanger".
341
- verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all". Defaults to 5.
342
- copy: Determines whether a copy of the `adata` is returned. Defaults to False.
317
+ cell_line_source: The bulk rna expression data from either broad or sanger cell line.
318
+ verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
319
+ copy: Determines whether a copy of the `adata` is returned.
343
320
 
344
321
  Returns:
345
322
  Returns an AnnData object with bulk rna expression annotation.
@@ -378,11 +355,10 @@ class CellLine(MetaData):
378
355
  not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_sanger.index))
379
356
  else:
380
357
  reference_id = "DepMap_ID"
381
- print(
382
- "To annotate bulk RNA data from Broad Institue, ",
383
- "`DepMap_ID` is used as default reference and query identifier if no `reference_id` is given. ",
384
- "Ensure that `DepMap_ID` is available in 'adata.obs'. ",
385
- "Alternatively, use `annotate()` to annotate the cell line first ",
358
+ logger.warning(
359
+ "To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `reference_id` is given.\n"
360
+ "Ensure that `DepMap_ID` is available in 'adata.obs'.\n"
361
+ "Alternatively, use `annotate()` to annotate the cell line first "
386
362
  )
387
363
  if self.bulk_rna_broad is None:
388
364
  self._download_bulk_rna(cell_line_source="broad")
@@ -438,16 +414,12 @@ class CellLine(MetaData):
438
414
 
439
415
  Args:
440
416
  adata: The data object to annotate.
441
- query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name".
417
+ query_id: The column of `.obs` with cell line information.
442
418
  reference_id: The type of cell line identifier in the meta data, model_name or model_id.
443
- Defaults to "model_name".
444
419
  protein_information: The type of protein expression data to fetch, protein_intensity or zscore.
445
- Defaults to "protein_intensity".
446
420
  protein_id: The protein identifier saved in the fetched meta data, uniprot_id or symbol.
447
- Defaults to "uniprot_id".
448
421
  verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
449
- Defaults to 5.
450
- copy: Determines whether a copy of the `adata` is returned. Defaults to False.
422
+ copy: Determines whether a copy of the `adata` is returned.
451
423
 
452
424
  Returns:
453
425
  Returns an AnnData object with protein expression annotation.
@@ -481,7 +453,7 @@ class CellLine(MetaData):
481
453
  raise ValueError(
482
454
  f"The specified `reference_id`{reference_id} can't be found in the protein expression data. \n"
483
455
  "To solve the issue, please use the reference identifier available in the metadata. \n"
484
- "Alternatively, create a `CellLineMetaData.lookup()` object to obtain the available reference identifiers in the metadata. "
456
+ "Alternatively, create a `CellLineMetaData.lookup()` object to obtain the available reference identifiers in the metadata."
485
457
  )
486
458
 
487
459
  identifier_num_all = len(adata.obs[query_id].unique())
@@ -511,7 +483,7 @@ class CellLine(MetaData):
511
483
  reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
512
484
  query_perturbation: str = "perturbation",
513
485
  reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
514
- gdsc_dataset: Literal[1, 2] = 1,
486
+ gdsc_dataset: Literal["gdsc_1", "gdsc_2"] = "gdsc_1",
515
487
  verbosity: int | str = 5,
516
488
  copy: bool = False,
517
489
  ) -> AnnData:
@@ -522,22 +494,17 @@ class CellLine(MetaData):
522
494
 
523
495
  Args:
524
496
  adata: The data object to annotate.
525
- query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name".
526
- reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id.
527
- Defaults to "cell_line_name".
497
+ query_id: The column of `.obs` with cell line information.
498
+ reference_id: The type of cell line identifier in the metadata, cell_line_name, sanger_model_id or cosmic_id.
528
499
  query_perturbation: The column of `.obs` with perturbation information.
529
- Defaults to "perturbation".
530
- reference_perturbation: The type of perturbation in the meta data, drug_name or drug_id.
531
- Defaults to 'drug_name'.
532
- gdsc_dataset: The GDSC dataset, 1 or 2.
500
+ reference_perturbation: The type of perturbation in the metadata, drug_name or drug_id.
501
+ gdsc_dataset: The GDSC dataset, 1 or 2, specified as 'gdsc_1' or 'gdsc_2'.
533
502
  The GDSC1 dataset updates previous releases with additional drug screening data from the
534
503
  Sanger Institute and Massachusetts General Hospital.
535
504
  It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
536
505
  GDSC2 is new and has 243,466 IC50 results from the latest screening at the Sanger Institute.
537
- Defaults to 1.
538
506
  verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
539
- Defaults to 5.
540
- copy: Determines whether a copy of the `adata` is returned. Defaults to False.
507
+ copy: Determines whether a copy of the `adata` is returned.
541
508
 
542
509
  Returns:
543
510
  Returns an AnnData object with drug response annotation.
@@ -559,14 +526,16 @@ class CellLine(MetaData):
559
526
  "This ensures that the required query ID is included in your data."
560
527
  )
561
528
  # Lazily download the GDSC data
562
- if gdsc_dataset == 1:
529
+ if gdsc_dataset == "gdsc_1":
563
530
  if self.drug_response_gdsc1 is None:
564
531
  self._download_gdsc(gdsc_dataset=1)
565
532
  gdsc_data = self.drug_response_gdsc1
566
- else:
533
+ elif gdsc_dataset == "gdsc_2":
567
534
  if self.drug_response_gdsc2 is None:
568
535
  self._download_gdsc(gdsc_dataset=2)
569
536
  gdsc_data = self.drug_response_gdsc2
537
+ else:
538
+ raise ValueError("The GDSC dataset specified in `gdsc_dataset` must be either 'gdsc_1' or 'gdsc_2'.")
570
539
 
571
540
  identifier_num_all = len(adata.obs[query_id].unique())
572
541
  not_matched_identifiers = list(set(adata.obs[query_id]) - set(gdsc_data[reference_id]))
@@ -583,7 +552,7 @@ class CellLine(MetaData):
583
552
  adata.obs = (
584
553
  adata.obs.reset_index()
585
554
  .set_index([query_id, query_perturbation])
586
- .assign(ln_ic50=self.drug_response_gdsc1.set_index([reference_id, reference_perturbation]).ln_ic50)
555
+ .assign(ln_ic50=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
587
556
  .reset_index()
588
557
  .set_index(old_index_name)
589
558
  )
@@ -678,8 +647,8 @@ class CellLine(MetaData):
678
647
 
679
648
  Args:
680
649
  adata: Input data object.
681
- identifier: Column in `.obs` containing cell line identifiers. Defaults to "DepMap_ID".
682
- metadata_key: Key of the AnnData obsm for comparison with the X matrix. Defaults to "bulk_rna_broad".
650
+ identifier: Column in `.obs` containing cell line identifiers.
651
+ metadata_key: Key of the AnnData obsm for comparison with the X matrix.
683
652
 
684
653
  Returns:
685
654
  Returns pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately.
@@ -695,7 +664,7 @@ class CellLine(MetaData):
695
664
  if isinstance(adata.obsm[metadata_key], pd.DataFrame):
696
665
  # Give warning if the genes are not the same
697
666
  if sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0:
698
- print(
667
+ logger.warning(
699
668
  "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
700
669
  )
701
670
 
@@ -726,6 +695,7 @@ class CellLine(MetaData):
726
695
  adata: AnnData,
727
696
  corr: pd.DataFrame,
728
697
  pval: pd.DataFrame,
698
+ *,
729
699
  identifier: str = "DepMap_ID",
730
700
  metadata_key: str = "bulk_rna_broad",
731
701
  category: str = "cell line",
@@ -737,13 +707,12 @@ class CellLine(MetaData):
737
707
  adata: Input data object.
738
708
  corr: Pearson correlation scores.
739
709
  pval: P-values for pearson correlation.
740
- identifier: Column in `.obs` containing the identifiers. Defaults to 'DepMap_ID'.
741
- metadata_key: Key of the AnnData obsm for comparison with the X matrix. Defaults to 'bulk_rna_broad'.
742
- category: The category for correlation comparison. Defaults to "cell line".
710
+ identifier: Column in `.obs` containing the identifiers.
711
+ metadata_key: Key of the AnnData obsm for comparison with the X matrix.
712
+ category: The category for correlation comparison.
743
713
  subset_identifier: Selected identifiers for scatter plot visualization between the X matrix and `metadata_key`.
744
714
  If not None, only the chosen cell line will be plotted, either specified as a value in `identifier` (string) or as an index number.
745
715
  If None, all cell lines will be plotted.
746
- Defaults to None.
747
716
  Returns:
748
717
  Pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately.
749
718
  """
pertpy/metadata/_compound.py CHANGED
@@ -30,11 +30,10 @@ class Compound(MetaData):
30
30
 
31
31
  Args:
32
32
  adata: The data object to annotate.
33
- query_id: The column of `.obs` with compound identifiers. Defaults to 'perturbation'.
34
- query_id_type: The type of compound identifiers, 'name' or 'cid'. Defaults to 'name'.
33
+ query_id: The column of `.obs` with compound identifiers.
34
+ query_id_type: The type of compound identifiers, 'name' or 'cid'.
35
35
  verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
36
- Defaults to 5.
37
- copy: Determines whether a copy of the `adata` is returned. Defaults to False.
36
+ copy: Determines whether a copy of the `adata` is returned.
38
37
 
39
38
  Returns:
40
39
  Returns an AnnData object with compound annotation.
pertpy/metadata/_drug.py CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
6
6
  from typing import TYPE_CHECKING, Literal
7
7
 
8
8
  import pandas as pd
9
- from rich import print
10
9
  from scanpy import settings
11
10
 
12
11
  from pertpy.data._dataloader import _download
@@ -25,7 +24,6 @@ def _download_drug_annotation(
25
24
  # Prepared in https://github.com/theislab/pertpy-datasets/blob/main/chembl_data.ipynb
26
25
  chembl_path = Path(settings.cachedir) / "chembl.json"
27
26
  if not Path(chembl_path).exists():
28
- print("[bold yellow]No metadata file was found for chembl. Starting download now.")
29
27
  _download(
30
28
  url="https://figshare.com/ndownloader/files/43871718",
31
29
  output_file_name="chembl.json",
@@ -40,7 +38,6 @@ def _download_drug_annotation(
40
38
  elif source == "dgidb":
41
39
  dgidb_path = Path(settings.cachedir) / "dgidb.tsv"
42
40
  if not Path(dgidb_path).exists():
43
- print("[bold yellow]No metadata file was found for dgidb. Starting download now.")
44
41
  _download(
45
42
  url="https://www.dgidb.org/data/latest/interactions.tsv",
46
43
  output_file_name="dgidb.tsv",
@@ -54,7 +51,6 @@ def _download_drug_annotation(
54
51
  else:
55
52
  pharmgkb_path = Path(settings.cachedir) / "pharmgkb.tsv"
56
53
  if not Path(pharmgkb_path).exists():
57
- print("[bold yellow]No metadata file was found for pharmGKB. Starting download now.")
58
54
  _download(
59
55
  url="https://api.pharmgkb.org/v1/download/file/data/relationships.zip",
60
56
  output_file_name="pharmgkb.zip",
@@ -103,8 +99,8 @@ class Drug(MetaData):
103
99
 
104
100
  Args:
105
101
  adata: AnnData object containing log-normalised data.
106
- source: Source of the metadata, chembl, dgidb or pharmgkb. Defaults to chembl.
107
- copy: Determines whether a copy of the `adata` is returned. Defaults to False.
102
+ source: Source of the metadata, chembl, dgidb or pharmgkb.
103
+ copy: Determines whether a copy of the `adata` is returned.
108
104
 
109
105
  Returns:
110
106
  An AnnData object with a new column `drug` in the var slot.