pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. pertpy/__init__.py +4 -2
  2. pertpy/data/__init__.py +66 -1
  3. pertpy/data/_dataloader.py +28 -26
  4. pertpy/data/_datasets.py +261 -92
  5. pertpy/metadata/__init__.py +6 -0
  6. pertpy/metadata/_cell_line.py +795 -0
  7. pertpy/metadata/_compound.py +128 -0
  8. pertpy/metadata/_drug.py +238 -0
  9. pertpy/metadata/_look_up.py +569 -0
  10. pertpy/metadata/_metadata.py +70 -0
  11. pertpy/metadata/_moa.py +125 -0
  12. pertpy/plot/__init__.py +0 -13
  13. pertpy/preprocessing/__init__.py +2 -0
  14. pertpy/preprocessing/_guide_rna.py +89 -6
  15. pertpy/tools/__init__.py +48 -15
  16. pertpy/tools/_augur.py +329 -32
  17. pertpy/tools/_cinemaot.py +145 -6
  18. pertpy/tools/_coda/_base_coda.py +1237 -116
  19. pertpy/tools/_coda/_sccoda.py +66 -36
  20. pertpy/tools/_coda/_tasccoda.py +46 -39
  21. pertpy/tools/_dialogue.py +180 -77
  22. pertpy/tools/_differential_gene_expression/__init__.py +20 -0
  23. pertpy/tools/_differential_gene_expression/_base.py +657 -0
  24. pertpy/tools/_differential_gene_expression/_checks.py +41 -0
  25. pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
  26. pertpy/tools/_differential_gene_expression/_edger.py +125 -0
  27. pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
  28. pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
  29. pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
  30. pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
  31. pertpy/tools/_distances/_distance_tests.py +29 -24
  32. pertpy/tools/_distances/_distances.py +584 -98
  33. pertpy/tools/_enrichment.py +460 -0
  34. pertpy/tools/_kernel_pca.py +1 -1
  35. pertpy/tools/_milo.py +406 -49
  36. pertpy/tools/_mixscape.py +677 -55
  37. pertpy/tools/_perturbation_space/_clustering.py +10 -3
  38. pertpy/tools/_perturbation_space/_comparison.py +112 -0
  39. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
  40. pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
  41. pertpy/tools/_perturbation_space/_simple.py +52 -11
  42. pertpy/tools/_scgen/__init__.py +1 -1
  43. pertpy/tools/_scgen/_base_components.py +2 -3
  44. pertpy/tools/_scgen/_scgen.py +706 -0
  45. pertpy/tools/_scgen/_utils.py +3 -5
  46. pertpy/tools/decoupler_LICENSE +674 -0
  47. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
  48. pertpy-0.8.0.dist-info/RECORD +57 -0
  49. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
  50. pertpy/plot/_augur.py +0 -234
  51. pertpy/plot/_cinemaot.py +0 -81
  52. pertpy/plot/_coda.py +0 -1001
  53. pertpy/plot/_dialogue.py +0 -91
  54. pertpy/plot/_guide_rna.py +0 -82
  55. pertpy/plot/_milopy.py +0 -284
  56. pertpy/plot/_mixscape.py +0 -594
  57. pertpy/plot/_scgen.py +0 -337
  58. pertpy/tools/_differential_gene_expression.py +0 -99
  59. pertpy/tools/_metadata/__init__.py +0 -0
  60. pertpy/tools/_metadata/_cell_line.py +0 -613
  61. pertpy/tools/_metadata/_look_up.py +0 -342
  62. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  63. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  64. pertpy-0.6.0.dist-info/RECORD +0 -50
  65. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  66. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,795 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Literal
5
+
6
+ from lamin_utils import logger
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Iterable
10
+
11
+ import matplotlib.pyplot as plt
12
+ import numpy as np
13
+ import pandas as pd
14
+ from scanpy import settings
15
+ from scipy import stats
16
+
17
+ from pertpy.data._dataloader import _download
18
+
19
+ from ._look_up import LookUp
20
+ from ._metadata import MetaData
21
+
22
+ if TYPE_CHECKING:
23
+ from anndata import AnnData
24
+
25
+
26
+ class CellLine(MetaData):
27
+ """Utilities to fetch cell line metadata."""
28
+
29
def __init__(self):
    """Create the container with every metadata table unset.

    Each attribute starts as ``None`` and is filled by the matching
    ``_download_*`` method the first time it is needed.
    """
    super().__init__()
    # Cell line metadata tables, filled by `_download_cell_line`.
    self.depmap: pd.DataFrame | None = None
    self.cancerxgene: pd.DataFrame | None = None
    # Gene annotation table, filled by `_download_gene_annotation`.
    self.gene_annotation: pd.DataFrame | None = None
    # Bulk RNA expression tables, filled by `_download_bulk_rna`.
    self.bulk_rna_sanger: pd.DataFrame | None = None
    self.bulk_rna_broad: pd.DataFrame | None = None
    # Proteomics table, filled by `_download_proteomics`.
    self.proteomics: pd.DataFrame | None = None
    # GDSC drug response tables, filled by `_download_gdsc`.
    self.drug_response_gdsc1: pd.DataFrame | None = None
    self.drug_response_gdsc2: pd.DataFrame | None = None
39
+
40
def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap") -> None:
    """Load cell line metadata, downloading it into the scanpy cache directory on a cache miss.

    Args:
        cell_line_source: "DepMap" fills `self.depmap`; any other value fills `self.cancerxgene`.
    """
    if cell_line_source == "DepMap":
        # Download cell line metadata from DepMap
        # Source: https://depmap.org/portal/download/all/ (DepMap Public 23Q4)
        depmap_cell_line_path = Path(settings.cachedir) / "depmap_23Q4_info.csv"
        if not depmap_cell_line_path.exists():
            _download(
                url="https://ndownloader.figshare.com/files/43746708",
                output_file_name="depmap_23Q4_info.csv",
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        self.depmap = pd.read_csv(depmap_cell_line_path)
        return

    # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project
    # Source: https://www.cancerrxgene.org/celllines
    cancerxgene_cell_line_path = Path(settings.cachedir) / "cell_line_cancer_project.csv"
    transformed_cancerxgene_cell_line_path = Path(settings.cachedir) / "cancerrxgene_info.csv"

    # The pivoted table is cached separately, so the raw export is only processed once.
    if transformed_cancerxgene_cell_line_path.exists():
        self.cancerxgene = pd.read_csv(transformed_cancerxgene_cell_line_path, index_col=0)
        return

    if not cancerxgene_cell_line_path.exists():
        _download(
            url="https://www.cancerrxgene.org/api/celllines?list=all&sEcho=1&iColumns=7&sColumns=&"
            "iDisplayStart=0&iDisplayLength=25&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&"
            "mDataProp_4=4&mDataProp_5=5&mDataProp_6=6&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&"
            "bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&"
            "bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&"
            "bSearchable_4=true&sSearch_5=&bRegex_5=false&bSearchable_5=true&sSearch_6=&bRegex_6=false&"
            "bSearchable_6=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&"
            "bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true&bSortable_6=true&export=csv",
            output_file_name="cell_line_cancer_project.csv",
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )

    # Work on a local table and publish it on `self` only once it is complete,
    # so a failure half-way never leaves a partially processed table behind.
    table = pd.read_csv(cancerxgene_cell_line_path)
    table.columns = table.columns.str.strip()
    table["stripped_cell_line_name"] = (
        table["Cell line Name"].str.replace(r"\-|\.", "", regex=True).str.upper().astype("category")
    )
    # pivot the data frame so that each cell line has only one row of metadata
    # A list (not a set) keeps the index columns in file order, which makes the
    # column order of the cached table reproducible between runs.
    pivoted_away = {"Datasets", "number of drugs"}
    index_col = [column for column in table.columns if column not in pivoted_away]
    table = table.pivot(index=index_col, columns="Datasets", values="number of drugs")
    table.columns.name = None
    self.cancerxgene = table.reset_index().rename(columns={"Cell line Name": "cell_line_name"})
    self.cancerxgene.to_csv(transformed_cancerxgene_cell_line_path)
95
+
96
def _download_gene_annotation(self) -> None:
    """Load the gene annotation table into `self.gene_annotation`, downloading it on a cache miss."""
    # Metadata for driver genes from DepMap.Sanger
    # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
    file_name = "genes_info.csv"
    cached_copy = Path(settings.cachedir) / file_name
    if not cached_copy.exists():
        _download(
            url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv",
            output_file_name=file_name,
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )
    self.gene_annotation = pd.read_table(cached_copy, delimiter=",")
109
+
110
def _download_bulk_rna(self, cell_line_source: Literal["broad", "sanger"] = "broad") -> None:
    """Load a bulk RNA expression table, downloading it on a cache miss.

    Args:
        cell_line_source: "sanger" fills `self.bulk_rna_sanger`; any other value fills `self.bulk_rna_broad`.
    """
    if cell_line_source == "sanger":
        # Bulk RNA-seq data collated by the Wellcome Sanger Institute and the Broad Institute from DepMap.Sanger
        # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Expression data)
        # issue: read count values contain random whitespace
        # solution: remove the white space and convert to int before depmap updates the metadata
        url = "https://figshare.com/ndownloader/files/42467103"
        file_name = "rnaseq_sanger_info.csv"
        target_attribute = "bulk_rna_sanger"
        # Every value is read as text for this table.
        read_options = {"index_col": 0, "dtype": "unicode"}
    else:
        # CCLE expression data from DepMap
        # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
        url = "https://figshare.com/ndownloader/files/34989922"
        file_name = "rnaseq_depmap_info.csv"
        target_attribute = "bulk_rna_broad"
        read_options = {"index_col": 0}

    cached_copy = Path(settings.cachedir) / file_name
    if not cached_copy.exists():
        _download(
            url=url,
            output_file_name=file_name,
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )
    setattr(self, target_attribute, pd.read_csv(cached_copy, **read_options))
139
+
140
def _download_proteomics(self) -> None:
    """Load the proteomics table into `self.proteomics`, downloading it on a cache miss."""
    # Proteomics data processed by DepMap.Sanger
    # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics)
    file_name = "proteomics_info.csv"
    cached_copy = Path(settings.cachedir) / file_name
    if not cached_copy.exists():
        _download(
            url="https://figshare.com/ndownloader/files/42468393",
            output_file_name=file_name,
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )
    self.proteomics = pd.read_csv(cached_copy, index_col=0)
153
+
154
def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
    """Load a GDSC drug response table, downloading it on a cache miss.

    Args:
        gdsc_dataset: 1 fills `self.drug_response_gdsc1`, 2 fills `self.drug_response_gdsc2`.
            Any other value leaves both attributes untouched.
    """
    if gdsc_dataset == 1:
        # GDSC drug response data
        # Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)
        # URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
        url = "https://figshare.com/ndownloader/files/43757235"
        file_name = "gdsc1_info.csv"
        target_attribute = "drug_response_gdsc1"
    elif gdsc_dataset == 2:
        url = "https://figshare.com/ndownloader/files/43757232"
        file_name = "gdsc2_info.csv"
        target_attribute = "drug_response_gdsc2"
    else:
        return

    cached_copy = Path(settings.cachedir) / file_name
    if not cached_copy.exists():
        _download(
            url=url,
            output_file_name=file_name,
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )
    setattr(self, target_attribute, pd.read_csv(cached_copy, index_col=0))
180
+
181
def annotate(
    self,
    adata: AnnData,
    query_id: str = "DepMap_ID",
    reference_id: str = "ModelID",
    fetch: list[str] | None = None,
    cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap",
    verbosity: int | str = 5,
    copy: bool = False,
) -> AnnData:
    """Annotate cell lines.

    For each cell, we fetch cell line annotation from either the Dependency Map (DepMap) or The Genomics of Drug Sensitivity in Cancer Project (Cancerxgene).

    Args:
        adata: The data object to annotate.
        query_id: The column of `.obs` with cell line information.
        reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName or StrippedCellLineName.
            If fetching cell line metadata from Cancerrxgene, "stripped_cell_line_name" is always used.
        fetch: The metadata to fetch. The passed list is not modified.
        cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
        verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
        copy: Determines whether a copy of the `adata` is returned.

    Returns:
        Returns an AnnData object with cell line annotation.

    Raises:
        ValueError: If `query_id` is not a column of `adata.obs`, if `reference_id` is not a column of
            the chosen metadata, or if `fetch` names columns that are not in the metadata.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.dialogue_example()
        >>> adata.obs["cell_line_name"] = "MCF7"
        >>> pt_metadata = pt.md.CellLine()
        >>> adata_annotated = pt_metadata.annotate(
        ...     adata=adata,
        ...     reference_id="cell_line_name",
        ...     query_id="cell_line_name",
        ...     fetch=["cell_line_name", "age", "primary_disease"],
        ...     copy=True,
        ... )
    """
    if copy:
        adata = adata.copy()

    if cell_line_source == "DepMap":
        if self.depmap is None:
            self._download_cell_line(cell_line_source="DepMap")
        cell_line_meta = self.depmap
    else:
        reference_id = "stripped_cell_line_name"
        if query_id == "DepMap_ID":
            query_id = "stripped_cell_line_name"
            # Informational only (annotation continues), so it is logged as a warning,
            # matching the equivalent message in `annotate_bulk_rna`.
            logger.warning(
                "`stripped_cell_line_name` is used as reference and query identifier to annotate cell line metadata from Cancerrxgene. "
                "Ensure that stripped cell line names are available in 'adata.obs.' or use the DepMap as `cell_line_source` to annotate the cell line first."
            )
        if self.cancerxgene is None:
            self._download_cell_line(cell_line_source="Cancerrxgene")
        cell_line_meta = self.cancerxgene

    if query_id not in adata.obs.columns:
        raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.")

    if reference_id not in cell_line_meta.columns:
        raise ValueError(
            f"The requested cell line type {reference_id} is currently unavailable in the database.\n"
            "Refer to the available reference identifier in the chosen database.\n"
            "DepMap_ID is compared by default.\n"
            "Alternatively, create a `CellLine.lookup()` object to "
            "obtain the available reference identifiers in the metadata."
        )

    # The specified cell line type can be found in the database,
    # so we can compare these keys and fetch the corresponding metadata.
    identifier_num_all = len(adata.obs[query_id].unique())
    not_matched_identifiers = list(set(adata.obs[query_id]) - set(cell_line_meta[reference_id]))

    self._warn_unmatch(
        total_identifiers=identifier_num_all,
        unmatched_identifiers=not_matched_identifiers,
        query_id=query_id,
        reference_id=reference_id,
        metadata_type="cell line",
        verbosity=verbosity,
    )

    if fetch is not None:
        # If fetch is specified and can be found in the metadata,
        # we subset the metadata dataframe correspondingly before merging.
        if not set(fetch).issubset(set(cell_line_meta.columns)):
            raise ValueError(
                "Selected cell line information is not present in the metadata.\n"
                "Please create a `CellLine.lookup()` object to obtain the available cell line information in the metadata."
            )
        # Work on a copy so the caller's list is never mutated.
        fetch = list(fetch)
        if reference_id not in fetch:
            # The join key has to be part of the subset.
            fetch.append(reference_id)
        cell_line_meta = cell_line_meta[fetch]

    # If `adata.obs` already owns a column named like the reference key, the merge suffixes the
    # metadata copy (which is then filtered out) and the column left behind is the user's own.
    reference_already_in_obs = reference_id in adata.obs.columns

    # If no fetch is specified, all metadata is fetched by default.
    # Sometimes there is already different cell line information in the AnnData object.
    # To avoid redundant information we remove duplicate information from metadata after merging.
    adata.obs = (
        adata.obs.merge(
            cell_line_meta,
            left_on=query_id,
            right_on=reference_id,
            how="left",
            suffixes=("", "_fromMeta"),
        )
        .filter(regex="^(?!.*_fromMeta)")
        .set_index(adata.obs.index)
    )
    # If query_id and reference_id have different names, the merge adds a column for the
    # reference key that duplicates the query column, so it is removed again.
    # A pre-existing user column with that name is kept.
    if query_id != reference_id and not reference_already_in_obs:
        del adata.obs[reference_id]

    return adata
300
+
301
def annotate_bulk_rna(
    self,
    adata: AnnData,
    query_id: str = "cell_line_name",
    cell_line_source: Literal["broad", "sanger"] = "sanger",
    verbosity: int | str = 5,
    gene_identifier: Literal["gene_name", "gene_ID", "both"] = "gene_ID",
    copy: bool = False,
) -> AnnData:
    """Fetch bulk rna expression from the Broad or Sanger.

    For each cell, we fetch bulk rna expression from either Broad or Sanger cell line.
    The result is stored in `adata.obsm["bulk_rna_sanger"]` or `adata.obsm["bulk_rna_broad"]`.

    Args:
        adata: The data object to annotate.
        query_id: The column of `.obs` with cell line information.
            For the Broad source, the default "cell_line_name" is replaced by "DepMap_ID".
        cell_line_source: The bulk rna expression data from either broad or sanger cell line.
        verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
        gene_identifier: Only used for the Broad source, whose columns look like "NAME (ID)".
            "gene_ID" keeps the part in parentheses, "gene_name" keeps the part before it,
            and any other value keeps the column labels unchanged.
        copy: Determines whether a copy of the `adata` is returned.

    Returns:
        Returns an AnnData object with bulk rna expression annotation.

    Raises:
        ValueError: If the effective `query_id` is not a column of `adata.obs`.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.dialogue_example()
        >>> adata.obs["cell_line_name"] = "MCF7"
        >>> pt_metadata = pt.md.CellLine()
        >>> adata_annotated = pt_metadata.annotate(
        ...     adata=adata, reference_id="cell_line_name", query_id="cell_line_name", copy=True
        ... )
        >>> pt_metadata.annotate_bulk_rna(adata_annotated)
    """
    if copy:
        adata = adata.copy()

    # Resolve the effective query column first, so that validation and the unmatched-identifier
    # statistics below all refer to the column that is actually used for the lookup.
    if cell_line_source == "sanger":
        reference_id = "model_name"
    else:
        reference_id = "DepMap_ID"
        logger.warning(
            "To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `reference_id` is given.\n"
            "Ensure that `DepMap_ID` is available in 'adata.obs'.\n"
            "Alternatively, use `annotate()` to annotate the cell line first "
        )
        if query_id == "cell_line_name":
            query_id = "DepMap_ID"

    # Make sure that the query column exists before comparing its keys with the expression data.
    if query_id not in adata.obs.columns:
        raise ValueError(
            f"The specified `query_id` {query_id} can't be found in the `adata.obs`.\n"
            "Ensure that you are using one of the available query IDs present in the adata.obs for the annotation.\n"
            "If the desired query ID is not available, you can fetch the cell line metadata "
            "using the `annotate()` function before calling 'annotate_bulk_rna()'. "
            "This ensures that the required query ID is included in your data, e.g. stripped_cell_line_name, DepMap ID."
        )

    # Lazily download the bulk rna expression data
    if cell_line_source == "sanger":
        if self.bulk_rna_sanger is None:
            self._download_bulk_rna(cell_line_source="sanger")
        expression = self.bulk_rna_sanger
        obsm_key = "bulk_rna_sanger"
    else:
        if self.bulk_rna_broad is None:
            self._download_bulk_rna(cell_line_source="broad")
        expression = self.bulk_rna_broad
        obsm_key = "bulk_rna_broad"

    identifier_num_all = len(adata.obs[query_id].unique())
    not_matched_identifiers = list(set(adata.obs[query_id]) - set(expression.index))

    self._warn_unmatch(
        total_identifiers=identifier_num_all,
        unmatched_identifiers=not_matched_identifiers,
        query_id=query_id,
        reference_id=reference_id,
        metadata_type="bulk RNA",
        verbosity=verbosity,
    )

    # One expression row per cell, in the order of `adata.obs`; unmatched cells become NaN rows.
    per_cell_expression = expression[expression.index.isin(adata.obs[query_id])]
    per_cell_expression = per_cell_expression.reindex(adata.obs[query_id])
    per_cell_expression.index = adata.obs.index

    if cell_line_source != "sanger":
        # Rename the columns on the per-cell result only. The cached table keeps its original
        # "NAME (ID)" labels, so later calls with another `gene_identifier` still work.
        if gene_identifier == "gene_ID":
            per_cell_expression.columns = [
                (gene_name.split(" (")[1].split(")")[0] if "(" in gene_name else gene_name)
                for gene_name in per_cell_expression.columns
            ]
        elif gene_identifier == "gene_name":
            per_cell_expression.columns = [
                gene_name.split(" (")[0] if "(" in gene_name else gene_name
                for gene_name in per_cell_expression.columns
            ]

    adata.obsm[obsm_key] = per_cell_expression

    return adata
400
+
401
def annotate_protein_expression(
    self,
    adata: AnnData,
    query_id: str = "cell_line_name",
    reference_id: Literal["model_name", "model_id"] = "model_name",
    protein_information: Literal["protein_intensity", "zscore"] = "protein_intensity",
    protein_id: Literal["uniprot_id", "symbol"] = "uniprot_id",
    verbosity: int | str = 5,
    copy: bool = False,
) -> AnnData:
    """Fetch protein expression.

    For each cell, we fetch protein intensity values acquired using data-independent acquisition mass spectrometry (DIA-MS).
    The result is stored in `adata.obsm["proteomics_" + protein_information]`.

    Args:
        adata: The data object to annotate.
        query_id: The column of `.obs` with cell line information.
        reference_id: The type of cell line identifier in the meta data, model_name or model_id.
        protein_information: The type of protein expression data to fetch, protein_intensity or zscore.
        protein_id: The protein identifier saved in the fetched meta data, uniprot_id or symbol.
        verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
        copy: Determines whether a copy of the `adata` is returned.

    Returns:
        Returns an AnnData object with protein expression annotation.

    Raises:
        ValueError: If `query_id` is not a column of `adata.obs` or `reference_id` is not a column
            of the proteomics table.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.dialogue_example()
        >>> adata.obs["cell_line_name"] = "MCF7"
        >>> pt_metadata = pt.md.CellLine()
        >>> adata_annotated = pt_metadata.annotate(
        ...     adata=adata, reference_id="cell_line_name", query_id="cell_line_name", copy=True
        ... )
        >>> pt_metadata.annotate_protein_expression(adata_annotated)
    """
    if copy:
        adata = adata.copy()

    # Make sure that the query column exists before comparing its keys with the proteomics data.
    if query_id not in adata.obs.columns:
        raise ValueError(
            f"The specified `query_id` {query_id} can't be found in `adata.obs`. \n"
            "If the desired query ID is not available, you can fetch the cell line metadata \n"
            "using the `annotate()` function before calling annotate_protein_expression(). \n"
            "This ensures that the required query ID is included in your data."
        )
    # Lazily download the proteomics data
    if self.proteomics is None:
        self._download_proteomics()
    if reference_id not in self.proteomics.columns:
        raise ValueError(
            f"The specified `reference_id` {reference_id} can't be found in the protein expression data. \n"
            "To solve the issue, please use the reference identifier available in the metadata. \n"
            "Alternatively, create a `CellLine.lookup()` object to obtain the available reference identifiers in the metadata."
        )

    identifier_num_all = len(adata.obs[query_id].unique())
    not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.proteomics[reference_id]))

    self._warn_unmatch(
        total_identifiers=identifier_num_all,
        unmatched_identifiers=not_matched_identifiers,
        query_id=query_id,
        reference_id=reference_id,
        metadata_type="protein expression",
        verbosity=verbosity,
    )

    # convert the original protein intensities table from long format to wide format, group by the cell lines
    per_cell_line = self.proteomics[[reference_id, protein_id, protein_information]].pivot(
        index=reference_id, columns=protein_id, values=protein_information
    )
    # The pivoted table is keyed by cell line, so each cell's row is selected through its
    # cell line identifier. Only afterwards is the result relabelled with the cell index.
    # Cells whose cell line is not in the proteomics data get a NaN row.
    per_cell = per_cell_line.reindex(adata.obs[query_id])
    per_cell.index = adata.obs.index
    adata.obsm["proteomics_" + protein_information] = per_cell
    return adata
478
+
479
def annotate_from_gdsc(
    self,
    adata: AnnData,
    query_id: str = "cell_line_name",
    reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
    query_perturbation: str = "perturbation",
    reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
    gdsc_dataset: Literal["gdsc_1", "gdsc_2"] = "gdsc_1",
    verbosity: int | str = 5,
    copy: bool = False,
) -> AnnData:
    """Fetch drug response data from GDSC.

    For each cell, we fetch drug response data as natural log of the fitted IC50 for its
    corresponding cell line and perturbation from GDSC fitted data results file.
    The values are written to `adata.obs["ln_ic50"]`; the index, index name and column order
    of `adata.obs` are left unchanged.

    Args:
        adata: The data object to annotate.
        query_id: The column of `.obs` with cell line information.
        reference_id: The type of cell line identifier in the metadata, cell_line_name, sanger_model_id or cosmic_id.
        query_perturbation: The column of `.obs` with perturbation information.
        reference_perturbation: The type of perturbation in the metadata, drug_name or drug_id.
        gdsc_dataset: The GDSC dataset, 1 or 2, specified as 'gdsc_1' or 'gdsc_2'.
            The GDSC1 dataset updates previous releases with additional drug screening data from the
            Sanger Institute and Massachusetts General Hospital.
            It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
            GDSC2 is new and has 243,466 IC50 results from the latest screening at the Sanger Institute.
        verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
        copy: Determines whether a copy of the `adata` is returned.

    Returns:
        Returns an AnnData object with drug response annotation.

    Raises:
        ValueError: If `query_id` is not a column of `adata.obs` or `gdsc_dataset` is not
            'gdsc_1' or 'gdsc_2'.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.mcfarland_2020()
        >>> pt_metadata = pt.md.CellLine()
        >>> pt_metadata.annotate_from_gdsc(adata, query_id="cell_line")
    """
    if copy:
        adata = adata.copy()
    if query_id not in adata.obs.columns:
        raise ValueError(
            f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
            "Ensure that you are using one of the available query IDs present in 'adata.obs' for the annotation.\n"
            "If the desired query ID is not available, you can fetch the cell line metadata "
            "using the `annotate()` function before calling `annotate_from_gdsc()`. "
            "This ensures that the required query ID is included in your data."
        )
    # Lazily download the GDSC data
    if gdsc_dataset == "gdsc_1":
        if self.drug_response_gdsc1 is None:
            self._download_gdsc(gdsc_dataset=1)
        gdsc_data = self.drug_response_gdsc1
    elif gdsc_dataset == "gdsc_2":
        if self.drug_response_gdsc2 is None:
            self._download_gdsc(gdsc_dataset=2)
        gdsc_data = self.drug_response_gdsc2
    else:
        raise ValueError("The GDSC dataset specified in `gdsc_dataset` must be either 'gdsc_1' or 'gdsc_2'.")

    identifier_num_all = len(adata.obs[query_id].unique())
    not_matched_identifiers = list(set(adata.obs[query_id]) - set(gdsc_data[reference_id]))
    self._warn_unmatch(
        total_identifiers=identifier_num_all,
        unmatched_identifiers=not_matched_identifiers,
        query_id=query_id,
        reference_id=reference_id,
        metadata_type="drug response",
        verbosity=verbosity,
    )

    # Look up ln(IC50) by (cell line, perturbation) pair. Reindexing the lookup series with the
    # per-cell key pairs avoids resetting and restoring the index of `adata.obs`, which would
    # rename an unnamed index to "index", move the key columns to the front, and pick the
    # wrong column when `.obs` already has a column called "index".
    ic50_by_pair = gdsc_data.set_index([reference_id, reference_perturbation])["ln_ic50"]
    cell_pairs = pd.MultiIndex.from_frame(adata.obs[[query_id, query_perturbation]])
    # Pairs without a GDSC entry become NaN.
    adata.obs["ln_ic50"] = ic50_by_pair.reindex(cell_pairs).to_numpy()

    return adata
561
+
562
def lookup(self) -> LookUp:
    """Generate LookUp object for CellLineMetaData.

    The LookUp object provides an overview of the metadata to annotate.
    Each annotate_{metadata} function has a corresponding lookup function in the LookUp object,
    where users can search the reference_id in the metadata and
    compare with the query_id in their own data.

    Returns:
        A LookUp object specific for cell line annotation.

    Examples:
        >>> import pertpy as pt
        >>> pt_metadata = pt.md.CellLine()
        >>> lookup = pt_metadata.lookup()
    """
    # (attribute, loader, loader arguments), in the order the tables are fetched.
    lazy_tables = (
        ("depmap", self._download_cell_line, {"cell_line_source": "DepMap"}),
        ("cancerxgene", self._download_cell_line, {"cell_line_source": "Cancerrxgene"}),
        ("gene_annotation", self._download_gene_annotation, {}),
        ("bulk_rna_broad", self._download_bulk_rna, {"cell_line_source": "broad"}),
        ("bulk_rna_sanger", self._download_bulk_rna, {"cell_line_source": "sanger"}),
        ("proteomics", self._download_proteomics, {}),
        ("drug_response_gdsc1", self._download_gdsc, {"gdsc_dataset": 1}),
        ("drug_response_gdsc2", self._download_gdsc, {"gdsc_dataset": 2}),
    )
    # Fetch every table that has not been loaded yet.
    for attribute, loader, loader_arguments in lazy_tables:
        if getattr(self, attribute) is None:
            loader(**loader_arguments)

    # Hand all tables over to the LookUp object.
    tables = [
        self.depmap,
        self.cancerxgene,
        self.gene_annotation,
        self.bulk_rna_sanger,
        self.bulk_rna_broad,
        self.proteomics,
        self.drug_response_gdsc1,
        self.drug_response_gdsc2,
    ]
    return LookUp(type="cell_line", transfer_metadata=tables)
610
+
611
def _pairwise_correlation(
    self, mat1: np.array, mat2: np.array, row_name: Iterable, col_name: Iterable
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Calculate the row-wise pearson correlation between two matrices.

    Entry ``[i, j]`` of the result correlates row ``i`` of `mat1` with row ``j`` of `mat2`.
    Every pair is computed explicitly: the two matrices are different data sets, so the result
    is not symmetric and may not even be square.

    Args:
        mat1: Input array, one observation per row. Sparse matrices are converted to dense.
        mat2: Input array with the same number of columns as `mat1`. Sparse matrices are converted to dense.
        row_name: Row names of the output dataframes, one per row of `mat1`.
        col_name: Column names of the output dataframes, one per row of `mat2`.

    Returns:
        Returns DataFrames for both the Pearson correlation coefficients and their associated p-values.
    """
    # `stats.pearsonr` needs 1-D dense vectors; indexing a sparse matrix row would yield a 2-D object.
    if hasattr(mat1, "toarray"):
        mat1 = mat1.toarray()
    if hasattr(mat2, "toarray"):
        mat2 = mat2.toarray()
    mat1 = np.asarray(mat1)
    mat2 = np.asarray(mat2)

    corr = np.empty((mat1.shape[0], mat2.shape[0]))
    pvals = np.empty((mat1.shape[0], mat2.shape[0]))

    for i, row in enumerate(mat1):
        for j, other_row in enumerate(mat2):
            corr[i, j], pvals[i, j] = stats.pearsonr(row, other_row)
    corr = pd.DataFrame(corr, index=row_name, columns=col_name)
    pvals = pd.DataFrame(pvals, index=row_name, columns=col_name)

    return corr, pvals
639
+
640
def correlate(
    self,
    adata: AnnData,
    identifier: str = "DepMap_ID",
    metadata_key: str = "bulk_rna_broad",
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]:
    """Correlate cell lines with annotated metadata.

    Rows of `adata.X` are correlated with the rows of `adata.obsm[metadata_key]` that contain
    at least one non-missing value.

    Args:
        adata: Input data object.
        identifier: Column in `.obs` containing cell line identifiers.
        metadata_key: Key of the AnnData obsm for comparison with the X matrix.

    Returns:
        Returns pearson correlation coefficients and their corresponding p-values for matched and unmatched cell lines separately.

    Raises:
        ValueError: If `metadata_key` is not in `adata.obsm`, `identifier` is not in `adata.obs`,
            or the metadata has a different number of columns than `adata.X`.
    """
    if metadata_key not in adata.obsm:
        raise ValueError("The metadata can not be found in adata.obsm")
    if identifier not in adata.obs:
        raise ValueError("The identifier can not be found in adata.obs")

    metadata = adata.obsm[metadata_key]
    if adata.X.shape[1] != metadata.shape[1]:
        raise ValueError(
            "Dimensions of adata.X do not match those of metadata. Ensure that they have the same gene list."
        )
    # Give warning if the genes are not the same
    if isinstance(metadata, pd.DataFrame) and (metadata.columns != adata.var.index.values).any():
        logger.warning(
            "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
        )

    # A row that is entirely NaN marks a cell whose cell line is absent from the metadata.
    lacks_metadata = metadata.isna().all(axis=1)
    overlapped_cl = adata[~lacks_metadata, :]
    missing_cl = adata[lacks_metadata, :]

    reference_values = overlapped_cl.obsm[metadata_key].values
    reference_names = overlapped_cl.obs[identifier]

    corr, pvals = self._pairwise_correlation(
        overlapped_cl.X,
        reference_values,
        row_name=reference_names,
        col_name=reference_names,
    )
    if missing_cl is None:
        return corr, pvals, None, None

    # Cells without metadata are compared against the metadata of the matched cells.
    new_corr, new_pvals = self._pairwise_correlation(
        missing_cl.X,
        reference_values,
        row_name=missing_cl.obs[identifier],
        col_name=reference_names,
    )
    return corr, pvals, new_corr, new_pvals
692
+
693
def plot_correlation(
    self,
    adata: AnnData,
    corr: pd.DataFrame,
    pval: pd.DataFrame,
    *,
    identifier: str = "DepMap_ID",
    metadata_key: str = "bulk_rna_broad",
    category: str = "cell line",
    subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None,
) -> None:
    """Visualise the correlation of cell lines with annotated metadata.

    Draws a scatter plot of `adata.obsm[metadata_key]` against `adata.X`, annotates it with the
    mean of the diagonal of `corr` and `pval`, and shows the figure.

    Args:
        adata: Input data object.
        corr: Pearson correlation scores.
        pval: P-values for pearson correlation.
        identifier: Column in `.obs` containing the identifiers.
        metadata_key: Key of the AnnData obsm for comparison with the X matrix.
        category: The category for correlation comparison. Only "cell line" is implemented.
        subset_identifier: Selected identifiers for scatter plot visualization between the X matrix and `metadata_key`.
            If not None, only the chosen cell line will be plotted, either specified as a value in `identifier` (string) or as an index number.
            If None, all cell lines will be plotted.

    Returns:
        None. The plot is displayed via `plt.show()`.

    Raises:
        ValueError: If `corr` or `pval` is None, if a string identifier is not present in
            `adata.obs[identifier]`, if an integer index is outside `[0, adata.n_obs)`, or if
            `subset_identifier` mixes strings and integers.
        NotImplementedError: If `category` is anything other than "cell line".
    """
    if corr is None or pval is None:
        raise ValueError(
            "Missing required input parameter: 'corr' or 'pval'. Please call the function `pt.md.CellLine.correlate()` to generate these outputs before proceeding."
        )

    if category != "cell line":
        raise NotImplementedError

    if subset_identifier is None:
        annotation = "\n".join(
            (
                f"Mean pearson correlation: {np.mean(np.diag(corr)):.4f}",
                f"Mean p-value: {np.mean(np.diag(pval)):.4f}",
            )
        )
        plt.scatter(x=adata.obsm[metadata_key], y=adata.X)
        plt.xlabel(metadata_key)
        plt.ylabel("Baseline")
    else:
        subset_identifier_list = (
            [subset_identifier] if isinstance(subset_identifier, (str, int)) else list(subset_identifier)
        )
        # Convert the valid identifiers to the index list
        if all(isinstance(item, str) for item in subset_identifier_list):
            if not set(subset_identifier_list).issubset(adata.obs[identifier].unique()):
                raise ValueError("`Subset_identifier` must be found in adata.obs.`identifier`.")
            subset_identifier_list = np.where(np.isin(adata.obs[identifier].values, subset_identifier_list))[0]
        elif all(isinstance(item, int) for item in subset_identifier_list):
            # Any single out-of-range index is an indexing error, even when the
            # remaining indices are valid.
            if any(item < 0 or item >= adata.n_obs for item in subset_identifier_list):
                raise ValueError("`Subset_identifier` out of index.")
        else:
            raise ValueError("`Subset_identifier` must contain either all strings or all integers.")

        plt.scatter(
            x=adata.obsm[metadata_key].iloc[subset_identifier_list],
            y=adata[subset_identifier_list].X,
        )

        # Name the cell line in the axis labels only when exactly one is shown
        if len(subset_identifier_list) == 1:
            cell_line_name = adata.obs[identifier].values[subset_identifier_list[0]]
            x_label = f"{metadata_key}: {cell_line_name}"
            y_label = f"Baseline: {cell_line_name}"
        else:
            x_label = f"{metadata_key}"
            y_label = "Baseline"
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        # Annotate with the correlation coefficient and p-value of the chosen cell lines
        # NOTE(review): `corr`/`pval` are indexed with positions taken from `adata`; this assumes
        # their rows and columns follow the order of `adata.obs` - confirm against the caller.
        subset_cor = np.mean(np.diag(corr.iloc[subset_identifier_list, subset_identifier_list]))
        subset_pval = np.mean(np.diag(pval.iloc[subset_identifier_list, subset_identifier_list]))
        annotation = "\n".join(
            (
                f"Pearson correlation: {subset_cor:.4f}",
                f"P-value: {subset_pval:.4f}",
            )
        )

    plt.text(
        0.05,
        0.95,
        annotation,
        fontsize=10,
        transform=plt.gca().transAxes,
        verticalalignment="top",
        bbox={
            "boxstyle": "round",
            "alpha": 0.5,
            "facecolor": "white",
            "edgecolor": "black",
        },
    )
    plt.show()