pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. pertpy/__init__.py +3 -2
  2. pertpy/data/__init__.py +5 -1
  3. pertpy/data/_dataloader.py +2 -4
  4. pertpy/data/_datasets.py +203 -92
  5. pertpy/metadata/__init__.py +4 -0
  6. pertpy/metadata/_cell_line.py +826 -0
  7. pertpy/metadata/_compound.py +129 -0
  8. pertpy/metadata/_drug.py +242 -0
  9. pertpy/metadata/_look_up.py +582 -0
  10. pertpy/metadata/_metadata.py +73 -0
  11. pertpy/metadata/_moa.py +129 -0
  12. pertpy/plot/__init__.py +1 -9
  13. pertpy/plot/_augur.py +53 -116
  14. pertpy/plot/_coda.py +277 -677
  15. pertpy/plot/_guide_rna.py +17 -35
  16. pertpy/plot/_milopy.py +59 -134
  17. pertpy/plot/_mixscape.py +152 -391
  18. pertpy/preprocessing/_guide_rna.py +88 -4
  19. pertpy/tools/__init__.py +8 -13
  20. pertpy/tools/_augur.py +315 -17
  21. pertpy/tools/_cinemaot.py +143 -4
  22. pertpy/tools/_coda/_base_coda.py +1210 -65
  23. pertpy/tools/_coda/_sccoda.py +50 -21
  24. pertpy/tools/_coda/_tasccoda.py +27 -19
  25. pertpy/tools/_dialogue.py +164 -56
  26. pertpy/tools/_differential_gene_expression.py +240 -14
  27. pertpy/tools/_distances/_distance_tests.py +8 -8
  28. pertpy/tools/_distances/_distances.py +184 -34
  29. pertpy/tools/_enrichment.py +465 -0
  30. pertpy/tools/_milo.py +345 -11
  31. pertpy/tools/_mixscape.py +668 -50
  32. pertpy/tools/_perturbation_space/_clustering.py +5 -1
  33. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
  34. pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
  35. pertpy/tools/_perturbation_space/_simple.py +51 -10
  36. pertpy/tools/_scgen/__init__.py +1 -1
  37. pertpy/tools/_scgen/_scgen.py +701 -0
  38. pertpy/tools/_scgen/_utils.py +1 -3
  39. pertpy/tools/decoupler_LICENSE +674 -0
  40. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
  41. pertpy-0.7.0.dist-info/RECORD +53 -0
  42. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
  43. pertpy/plot/_cinemaot.py +0 -81
  44. pertpy/plot/_dialogue.py +0 -91
  45. pertpy/plot/_scgen.py +0 -337
  46. pertpy/tools/_metadata/__init__.py +0 -0
  47. pertpy/tools/_metadata/_cell_line.py +0 -613
  48. pertpy/tools/_metadata/_look_up.py +0 -342
  49. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  50. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  51. pertpy-0.6.0.dist-info/RECORD +0 -50
  52. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  53. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,826 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Literal
5
+
6
+ if TYPE_CHECKING:
7
+ from collections.abc import Iterable
8
+
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+ import pandas as pd
12
+ from rich import print
13
+ from scanpy import settings
14
+ from scipy import stats
15
+
16
+ from pertpy.data._dataloader import _download
17
+
18
+ from ._look_up import LookUp
19
+ from ._metadata import MetaData
20
+
21
+ if TYPE_CHECKING:
22
+ from anndata import AnnData
23
+
24
+
25
+ class CellLine(MetaData):
26
+ """Utilities to fetch cell line metadata."""
27
+
28
def __init__(self):
    """Create the helper with every metadata table unloaded.

    All tables start as ``None``; the ``_download_*`` methods of this class
    assign them when they are called.
    """
    super().__init__()
    # Cell line metadata tables and the gene annotation table.
    self.depmap = self.cancerxgene = self.gene_annotation = None
    # Bulk RNA expression tables.
    self.bulk_rna_sanger = self.bulk_rna_broad = None
    # Proteomics table.
    self.proteomics = None
    # GDSC drug response tables.
    self.drug_response_gdsc1 = self.drug_response_gdsc2 = None
38
+
39
def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap") -> None:
    """Load cell line metadata, downloading it into the scanpy cache directory when missing.

    Args:
        cell_line_source: "DepMap" fills `self.depmap`. Any other value is handled as
            "Cancerrxgene" and fills `self.cancerxgene`.
    """
    if cell_line_source == "DepMap":
        # Download cell line metadata from DepMap
        # Source: https://depmap.org/portal/download/all/ (DepMap Public 23Q4)
        depmap_cell_line_path = Path(settings.cachedir) / "depmap_23Q4_info.csv"
        if not depmap_cell_line_path.exists():
            print("[bold yellow]No DepMap metadata file found. Starting download now.")
            _download(
                url="https://ndownloader.figshare.com/files/43746708",
                output_file_name="depmap_23Q4_info.csv",
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        self.depmap = pd.read_csv(depmap_cell_line_path)
        return

    # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project
    # Source: https://www.cancerrxgene.org/celllines
    cancerxgene_cell_line_path = Path(settings.cachedir) / "cell_line_cancer_project.csv"
    transformed_cancerxgene_cell_line_path = Path(settings.cachedir) / "cancerrxgene_info.csv"

    # The transformed table is cached separately, so the pivot below only runs once.
    if transformed_cancerxgene_cell_line_path.exists():
        self.cancerxgene = pd.read_csv(transformed_cancerxgene_cell_line_path, index_col=0)
        return

    if not cancerxgene_cell_line_path.exists():
        print(
            "[bold yellow]No cell line metadata file from The Genomics of Drug Sensitivity "
            "in Cancer Project found. Starting download now."
        )
        _download(
            url="https://www.cancerrxgene.org/api/celllines?list=all&sEcho=1&iColumns=7&sColumns=&"
            "iDisplayStart=0&iDisplayLength=25&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&"
            "mDataProp_4=4&mDataProp_5=5&mDataProp_6=6&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&"
            "bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&"
            "bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&"
            "bSearchable_4=true&sSearch_5=&bRegex_5=false&bSearchable_5=true&sSearch_6=&bRegex_6=false&"
            "bSearchable_6=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&"
            "bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true&bSortable_6=true&export=csv",
            output_file_name="cell_line_cancer_project.csv",
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )

    self.cancerxgene = pd.read_csv(cancerxgene_cell_line_path)
    self.cancerxgene.columns = self.cancerxgene.columns.str.strip()
    self.cancerxgene["stripped_cell_line_name"] = (
        self.cancerxgene["Cell line Name"].str.replace(r"\-|\.", "", regex=True).str.upper().astype("category")
    )

    # Pivot the data frame so that each cell line has only one row of metadata.
    # The index columns are kept in their original order (an ordered list, not a set),
    # so the layout of the cached file is the same on every run.
    pivoted_away = {"Datasets", "number of drugs"}
    index_col = [column for column in self.cancerxgene.columns if column not in pivoted_away]
    self.cancerxgene = self.cancerxgene.pivot(index=index_col, columns="Datasets", values="number of drugs")
    self.cancerxgene.columns.name = None
    self.cancerxgene = self.cancerxgene.reset_index().rename(columns={"Cell line Name": "cell_line_name"})
    self.cancerxgene.to_csv(transformed_cancerxgene_cell_line_path)
99
+
100
def _download_gene_annotation(self) -> None:
    """Load the gene annotation table into `self.gene_annotation`.

    The file is downloaded into the scanpy cache directory when it is not there yet.
    Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
    """
    file_name = "genes_info.csv"
    annotation_csv = Path(settings.cachedir) / file_name

    if not annotation_csv.exists():
        print("[bold yellow]No metadata file was found for gene annotation. Starting download now.")
        _download(
            url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv",
            output_file_name=file_name,
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )

    self.gene_annotation = pd.read_table(annotation_csv, delimiter=",")
114
+
115
def _download_bulk_rna(self, cell_line_source: Literal["broad", "sanger"] = "broad") -> None:
    """Load a bulk RNA expression table, downloading it into the scanpy cache directory when missing.

    Args:
        cell_line_source: "sanger" fills `self.bulk_rna_sanger`; any other value is
            handled as "broad" and fills `self.bulk_rna_broad`.
    """
    if cell_line_source == "sanger":
        # Bulk RNA-seq data collated by the Wellcome Sanger Institute and the Broad Institute (DepMap.Sanger)
        # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Expression data)
        # The read counts contain stray whitespace, so every column is read as text.
        sanger_name = "rnaseq_sanger_info.csv"
        sanger_csv = Path(settings.cachedir) / sanger_name
        if not sanger_csv.exists():
            print(
                "[bold yellow]No metadata file was found for bulk RNA-seq data of Sanger cell line."
                " Starting download now."
            )
            _download(
                url="https://figshare.com/ndownloader/files/42467103",
                output_file_name=sanger_name,
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        self.bulk_rna_sanger = pd.read_csv(sanger_csv, index_col=0, dtype="unicode")
        return

    # CCLE expression data from DepMap
    # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
    broad_name = "rnaseq_depmap_info.csv"
    broad_csv = Path(settings.cachedir) / broad_name
    if not broad_csv.exists():
        print("[bold yellow]No metadata file was found for CCLE expression data. Starting download now.")
        _download(
            url="https://figshare.com/ndownloader/files/34989922",
            output_file_name=broad_name,
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )
    self.bulk_rna_broad = pd.read_csv(broad_csv, index_col=0)
149
+
150
def _download_proteomics(self) -> None:
    """Load the proteomics table into `self.proteomics`.

    The file is downloaded into the scanpy cache directory when it is not there yet.
    Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics)
    """
    file_name = "proteomics_info.csv"
    proteomics_csv = Path(settings.cachedir) / file_name

    if not proteomics_csv.exists():
        print("[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger). Starting download now.")
        _download(
            url="https://figshare.com/ndownloader/files/42468393",
            output_file_name=file_name,
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=False,
        )

    self.proteomics = pd.read_csv(proteomics_csv, index_col=0)
164
+
165
def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
    """Load a GDSC drug response table, downloading it into the scanpy cache directory when missing.

    Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)

    Args:
        gdsc_dataset: 1 fills `self.drug_response_gdsc1`, 2 fills `self.drug_response_gdsc2`.
            Any other value leaves both attributes untouched.
    """
    if gdsc_dataset == 1:
        # URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
        gdsc1_name = "gdsc1_info.csv"
        gdsc1_csv = Path(settings.cachedir) / gdsc1_name
        if not gdsc1_csv.exists():
            print(
                "[bold yellow]No metadata file was found for drug response data of GDSC1 dataset."
                " Starting download now."
            )
            _download(
                url="https://figshare.com/ndownloader/files/43757235",
                output_file_name=gdsc1_name,
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        self.drug_response_gdsc1 = pd.read_csv(gdsc1_csv, index_col=0)
    elif gdsc_dataset == 2:
        gdsc2_name = "gdsc2_info.csv"
        gdsc2_csv = Path(settings.cachedir) / gdsc2_name
        if not gdsc2_csv.exists():
            print(
                "[bold yellow]No metadata file was found for drug response data of GDSC2 dataset."
                " Starting download now."
            )
            _download(
                url="https://figshare.com/ndownloader/files/43757232",
                output_file_name=gdsc2_name,
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        self.drug_response_gdsc2 = pd.read_csv(gdsc2_csv, index_col=0)
199
+
200
def annotate(
    self,
    adata: AnnData,
    query_id: str = "DepMap_ID",
    reference_id: str = "ModelID",
    fetch: list[str] | None = None,
    cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap",
    verbosity: int | str = 5,
    copy: bool = False,
) -> AnnData:
    """Annotate cell lines.

    For each cell, we fetch cell line annotation from either the Dependency Map (DepMap) or
    The Genomics of Drug Sensitivity in Cancer Project (Cancerrxgene).

    Args:
        adata: The data object to annotate.
        query_id: The column of `.obs` with cell line information. Defaults to "DepMap_ID".
        reference_id: The type of cell line identifier in the meta data, e.g. ModelID, CellLineName or
            StrippedCellLineName. When `cell_line_source` is Cancerrxgene this argument is ignored and
            "stripped_cell_line_name" is used. Defaults to "ModelID".
        fetch: The metadata to fetch. The passed list is not modified. Defaults to None (=all).
        cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene. Defaults to "DepMap".
        verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
            Defaults to 5.
        copy: Determines whether a copy of the `adata` is returned. Defaults to False.

    Returns:
        Returns an AnnData object with cell line annotation.

    Raises:
        ValueError: If `query_id` is not a column of `adata.obs`, if `reference_id` is not a column
            of the chosen metadata, or if `fetch` contains columns that the metadata does not have.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.dialogue_example()
        >>> adata.obs["cell_line_name"] = "MCF7"
        >>> pt_metadata = pt.md.CellLine()
        >>> adata_annotated = pt_metadata.annotate(
        ...     adata=adata,
        ...     reference_id="cell_line_name",
        ...     query_id="cell_line_name",
        ...     fetch=["cell_line_name", "age", "primary_disease"],
        ...     copy=True,
        ... )
    """
    if copy:
        adata = adata.copy()

    if cell_line_source == "DepMap":
        if self.depmap is None:
            self._download_cell_line(cell_line_source="DepMap")
        cell_line_meta = self.depmap
    else:
        reference_id = "stripped_cell_line_name"
        if query_id == "DepMap_ID":
            query_id = "stripped_cell_line_name"
            print(
                "[bold blue]`stripped_cell_line_name` is used as reference and query identifier ",
                " to annotate cell line metadata from Cancerrxgene. "
                "Ensure that stripped cell line names are available in 'adata.obs.' ",
                "or use the DepMap as `cell_line_source` to annotate the cell line first ",
            )
        if self.cancerxgene is None:
            self._download_cell_line(cell_line_source="Cancerrxgene")
        cell_line_meta = self.cancerxgene

    if query_id not in adata.obs.columns:
        raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.")

    if reference_id not in cell_line_meta.columns:
        raise ValueError(
            f"The requested cell line type {reference_id} is currently unavailable in the database.\n"
            "Refer to the available reference identifier in the chosen database.\n"
            "DepMap_ID is compared by default.\n"
            "Alternatively, create a `CellLine.lookup()` object to "
            "obtain the available reference identifiers in the metadata."
        )

    # The specified cell line type can be found in the database,
    # so we can compare these keys and fetch the corresponding metadata.
    identifier_num_all = len(adata.obs[query_id].unique())
    not_matched_identifiers = list(set(adata.obs[query_id]) - set(cell_line_meta[reference_id]))

    self._warn_unmatch(
        total_identifiers=identifier_num_all,
        unmatched_identifiers=not_matched_identifiers,
        query_id=query_id,
        reference_id=reference_id,
        metadata_type="cell line",
        verbosity=verbosity,
    )

    if fetch is not None:
        # Only columns that exist in the metadata can be fetched.
        if not set(fetch).issubset(cell_line_meta.columns):
            raise ValueError(
                "Selected cell line information is not present in the metadata.\n"
                "Please create a `CellLine.lookup()` object to obtain the available cell line information in the metadata."
            )
        # Work on a copy so that the caller's list is never modified.
        fetch = list(fetch)
        # The reference identifier is the merge key and therefore always required.
        if reference_id not in fetch:
            fetch.append(reference_id)

    # If no fetch is specified, all metadata is fetched by default.
    # Sometimes there is already different cell line information in the AnnData object.
    # To avoid redundant information we remove duplicate information from metadata after merging.
    adata.obs = (
        adata.obs.merge(
            cell_line_meta if fetch is None else cell_line_meta[fetch],
            left_on=query_id,
            right_on=reference_id,
            how="left",
            suffixes=("", "_fromMeta"),
        )
        .filter(regex="^(?!.*_fromMeta)")
        .set_index(adata.obs.index)
    )
    # If query_id and reference_id have different names,
    # there will be a column for each of them after merging,
    # which is redundant as they refer to the same information.
    # We remove the reference_id column.
    if query_id != reference_id:
        del adata.obs[reference_id]

    return adata
323
+
324
def annotate_bulk_rna(
    self,
    adata: AnnData,
    query_id: str = "cell_line_name",
    cell_line_source: Literal["broad", "sanger"] = "sanger",
    verbosity: int | str = 5,
    gene_identifier: Literal["gene_name", "gene_ID", "both"] = "gene_ID",
    copy: bool = False,
) -> AnnData:
    """Fetch bulk rna expression from the Broad or Sanger.

    For each cell, we fetch bulk rna expression from either Broad or Sanger cell line.
    The result is stored in `adata.obsm["bulk_rna_sanger"]` or `adata.obsm["bulk_rna_broad"]`.

    Args:
        adata: The data object to annotate.
        query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name".
            For the Broad source the default is replaced by "DepMap_ID".
        cell_line_source: The bulk rna expression data from either broad or sanger cell line. Defaults to "sanger".
        verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all". Defaults to 5.
        gene_identifier: Only used for the Broad source, whose column labels are parsed as `name (ID)`.
            "gene_ID" keeps the part in parentheses, "gene_name" keeps the part before it and
            "both" keeps the label unchanged. Defaults to "gene_ID".
        copy: Determines whether a copy of the `adata` is returned. Defaults to False.

    Returns:
        Returns an AnnData object with bulk rna expression annotation.

    Raises:
        ValueError: If the query identifier column is not present in `adata.obs`.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.dialogue_example()
        >>> adata.obs["cell_line_name"] = "MCF7"
        >>> pt_metadata = pt.md.CellLine()
        >>> adata_annotated = pt_metadata.annotate(
        ...     adata=adata, reference_id="cell_line_name", query_id="cell_line_name", copy=True
        ... )
        >>> pt_metadata.annotate_bulk_rna(adata_annotated)
    """
    if copy:
        adata = adata.copy()

    # Resolve the effective query identifier first, so that the validation and the
    # identifier count below refer to the column that is actually used for matching.
    if cell_line_source != "sanger" and query_id == "cell_line_name":
        query_id = "DepMap_ID"

    # Make sure that the query identifier can be found in `adata.obs`,
    # then we can compare these keys and fetch the corresponding metadata.
    if query_id not in adata.obs.columns:
        raise ValueError(
            f"The specified `query_id` {query_id} can't be found in the `adata.obs`.\n"
            "Ensure that you are using one of the available query IDs present in the adata.obs for the annotation.\n"
            "If the desired query ID is not available, you can fetch the cell line metadata "
            "using the `annotate()` function before calling 'annotate_bulk_rna()'. "
            "This ensures that the required query ID is included in your data, e.g. stripped_cell_line_name, DepMap ID."
        )

    # Lazily download the bulk rna expression data
    if cell_line_source == "sanger":
        if self.bulk_rna_sanger is None:
            self._download_bulk_rna(cell_line_source="sanger")
        reference_id = "model_name"
        bulk_rna = self.bulk_rna_sanger
        obsm_key = "bulk_rna_sanger"
    else:
        reference_id = "DepMap_ID"
        print(
            "To annotate bulk RNA data from Broad Institue, ",
            "`DepMap_ID` is used as default reference and query identifier if no `reference_id` is given. ",
            "Ensure that `DepMap_ID` is available in 'adata.obs'. ",
            "Alternatively, use `annotate()` to annotate the cell line first ",
        )
        if self.bulk_rna_broad is None:
            self._download_bulk_rna(cell_line_source="broad")
        bulk_rna = self.bulk_rna_broad
        obsm_key = "bulk_rna_broad"

    identifier_num_all = len(adata.obs[query_id].unique())
    not_matched_identifiers = list(set(adata.obs[query_id]) - set(bulk_rna.index))

    self._warn_unmatch(
        total_identifiers=identifier_num_all,
        unmatched_identifiers=not_matched_identifiers,
        query_id=query_id,
        reference_id=reference_id,
        metadata_type="bulk RNA",
        verbosity=verbosity,
    )

    # Boolean indexing returns a new frame, so the cached table is left untouched below.
    expression = bulk_rna[bulk_rna.index.isin(adata.obs[query_id])]

    if cell_line_source != "sanger":
        # Relabel only the extracted subset. Renaming the cached table would make a later
        # call with a different `gene_identifier` return the wrong labels.
        if gene_identifier == "gene_ID":
            expression.columns = [
                (gene_name.split(" (")[1].split(")")[0] if "(" in gene_name else gene_name)
                for gene_name in expression.columns
            ]
        elif gene_identifier == "gene_name":
            expression.columns = [
                gene_name.split(" (")[0] if "(" in gene_name else gene_name for gene_name in expression.columns
            ]

    # One row per cell: look up each cell's cell line, then label the rows with the cell index.
    expression = expression.reindex(adata.obs[query_id])
    expression.index = adata.obs.index
    adata.obsm[obsm_key] = expression

    return adata
424
+
425
def annotate_protein_expression(
    self,
    adata: AnnData,
    query_id: str = "cell_line_name",
    reference_id: Literal["model_name", "model_id"] = "model_name",
    protein_information: Literal["protein_intensity", "zscore"] = "protein_intensity",
    protein_id: Literal["uniprot_id", "symbol"] = "uniprot_id",
    verbosity: int | str = 5,
    copy: bool = False,
) -> AnnData:
    """Fetch protein expression.

    For each cell, we fetch protein intensity values acquired using data-independent acquisition
    mass spectrometry (DIA-MS). The result is stored in `adata.obsm["proteomics_" + protein_information]`.

    Args:
        adata: The data object to annotate.
        query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name".
        reference_id: The type of cell line identifier in the meta data, model_name or model_id.
            Defaults to "model_name".
        protein_information: The type of protein expression data to fetch, protein_intensity or zscore.
            Defaults to "protein_intensity".
        protein_id: The protein identifier saved in the fetched meta data, uniprot_id or symbol.
            Defaults to "uniprot_id".
        verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
            Defaults to 5.
        copy: Determines whether a copy of the `adata` is returned. Defaults to False.

    Returns:
        Returns an AnnData object with protein expression annotation.

    Raises:
        ValueError: If `query_id` is not a column of `adata.obs` or `reference_id` is not a
            column of the proteomics table.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.dialogue_example()
        >>> adata.obs["cell_line_name"] = "MCF7"
        >>> pt_metadata = pt.md.CellLine()
        >>> adata_annotated = pt_metadata.annotate(
        ...     adata=adata, reference_id="cell_line_name", query_id="cell_line_name", copy=True
        ... )
        >>> pt_metadata.annotate_protein_expression(adata_annotated)
    """
    if copy:
        adata = adata.copy()

    # Make sure that the query identifier can be found in `adata.obs`,
    # then we can compare these keys and fetch the corresponding metadata.
    if query_id not in adata.obs.columns:
        raise ValueError(
            f"The specified `query_id` {query_id} can't be found in `adata.obs`. \n"
            "If the desired query ID is not available, you can fetch the cell line metadata \n"
            "using the `annotate()` function before calling annotate_protein_expression(). \n"
            "This ensures that the required query ID is included in your data."
        )
    # Lazily download the proteomics data
    if self.proteomics is None:
        self._download_proteomics()
    if reference_id not in self.proteomics.columns:
        raise ValueError(
            f"The specified `reference_id` {reference_id} can't be found in the protein expression data. \n"
            "To solve the issue, please use the reference identifier available in the metadata. \n"
            "Alternatively, create a `CellLine.lookup()` object to obtain the available reference identifiers in the metadata. "
        )

    identifier_num_all = len(adata.obs[query_id].unique())
    not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.proteomics[reference_id]))

    self._warn_unmatch(
        total_identifiers=identifier_num_all,
        unmatched_identifiers=not_matched_identifiers,
        query_id=query_id,
        reference_id=reference_id,
        metadata_type="protein expression",
        verbosity=verbosity,
    )

    # Convert the original protein table from long format to wide format, one row per cell line.
    protein_table = self.proteomics[[reference_id, protein_id, protein_information]].pivot(
        index=reference_id, columns=protein_id, values=protein_information
    )
    # The wide table is indexed by cell line, so each cell's row is looked up through its
    # cell line identifier, and only then labelled with the cell index
    # (the same pattern `annotate_bulk_rna` uses).
    protein_table = protein_table.reindex(adata.obs[query_id])
    protein_table.index = adata.obs.index
    adata.obsm["proteomics_" + protein_information] = protein_table

    return adata
506
+
507
def annotate_from_gdsc(
    self,
    adata: AnnData,
    query_id: str = "cell_line_name",
    reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
    query_perturbation: str = "perturbation",
    reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
    gdsc_dataset: Literal[1, 2] = 1,
    verbosity: int | str = 5,
    copy: bool = False,
) -> AnnData:
    """Fetch drug response data from GDSC.

    For each cell, we fetch drug response data as natural log of the fitted IC50 for its
    corresponding cell line and perturbation from GDSC fitted data results file.
    The values are written to the `ln_ic50` column of `adata.obs`.

    Args:
        adata: The data object to annotate.
        query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name".
        reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id.
            Defaults to "cell_line_name".
        query_perturbation: The column of `.obs` with perturbation information.
            Defaults to "perturbation".
        reference_perturbation: The type of perturbation in the meta data, drug_name or drug_id.
            Defaults to 'drug_name'.
        gdsc_dataset: The GDSC dataset, 1 or 2.
            The GDSC1 dataset updates previous releases with additional drug screening data from the
            Sanger Institute and Massachusetts General Hospital.
            It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
            GDSC2 is new and has 243,466 IC50 results from the latest screening at the Sanger Institute.
            Defaults to 1.
        verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
            Defaults to 5.
        copy: Determines whether a copy of the `adata` is returned. Defaults to False.

    Returns:
        Returns an AnnData object with drug response annotation.

    Raises:
        ValueError: If `query_id` is not a column of `adata.obs`.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.mcfarland_2020()
        >>> pt_metadata = pt.md.CellLine()
        >>> pt_metadata.annotate_from_gdsc(adata, query_id="cell_line")
    """
    if copy:
        adata = adata.copy()
    if query_id not in adata.obs.columns:
        raise ValueError(
            f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
            "Ensure that you are using one of the available query IDs present in 'adata.obs' for the annotation.\n"
            "If the desired query ID is not available, you can fetch the cell line metadata "
            "using the `annotate()` function before calling `annotate_from_gdsc()`. "
            "This ensures that the required query ID is included in your data."
        )
    # Lazily download the GDSC data
    if gdsc_dataset == 1:
        if self.drug_response_gdsc1 is None:
            self._download_gdsc(gdsc_dataset=1)
        gdsc_data = self.drug_response_gdsc1
    else:
        if self.drug_response_gdsc2 is None:
            self._download_gdsc(gdsc_dataset=2)
        gdsc_data = self.drug_response_gdsc2

    identifier_num_all = len(adata.obs[query_id].unique())
    not_matched_identifiers = list(set(adata.obs[query_id]) - set(gdsc_data[reference_id]))
    self._warn_unmatch(
        total_identifiers=identifier_num_all,
        unmatched_identifiers=not_matched_identifiers,
        query_id=query_id,
        reference_id=reference_id,
        metadata_type="drug response",
        verbosity=verbosity,
    )

    # `reset_index()` names an unnamed index "index"; remember the name to restore it afterwards.
    old_index_name = "index" if adata.obs.index.name is None else adata.obs.index.name
    # Take the IC50 values from the dataset selected above (not unconditionally from GDSC1);
    # `assign` aligns them on the (cell line, perturbation) index.
    ln_ic50 = gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50
    adata.obs = (
        adata.obs.reset_index()
        .set_index([query_id, query_perturbation])
        .assign(ln_ic50=ln_ic50)
        .reset_index()
        .set_index(old_index_name)
    )

    return adata
592
+
593
def lookup(self) -> LookUp:
    """Generate a LookUp object for the cell line metadata.

    The LookUp object provides an overview of the metadata to annotate.
    Each annotate_{metadata} function has a corresponding lookup function in the LookUp object,
    where users can search the reference_id in the metadata and
    compare with the query_id in their own data.

    Returns:
        A LookUp object specific for cell line annotation.

    Examples:
        >>> import pertpy as pt
        >>> pt_metadata = pt.md.CellLine()
        >>> lookup = pt_metadata.lookup()
    """
    # Attribute name paired with the call that fills it, in the order the tables are fetched.
    loaders = (
        ("depmap", lambda: self._download_cell_line(cell_line_source="DepMap")),
        ("cancerxgene", lambda: self._download_cell_line(cell_line_source="Cancerrxgene")),
        ("gene_annotation", lambda: self._download_gene_annotation()),
        ("bulk_rna_broad", lambda: self._download_bulk_rna(cell_line_source="broad")),
        ("bulk_rna_sanger", lambda: self._download_bulk_rna(cell_line_source="sanger")),
        ("proteomics", lambda: self._download_proteomics()),
        ("drug_response_gdsc1", lambda: self._download_gdsc(gdsc_dataset=1)),
        ("drug_response_gdsc2", lambda: self._download_gdsc(gdsc_dataset=2)),
    )
    # Fetch every table that has not been downloaded yet.
    for attribute, load in loaders:
        if getattr(self, attribute) is None:
            load()

    # Hand all tables over to the LookUp object.
    tables = [
        self.depmap,
        self.cancerxgene,
        self.gene_annotation,
        self.bulk_rna_sanger,
        self.bulk_rna_broad,
        self.proteomics,
        self.drug_response_gdsc1,
        self.drug_response_gdsc2,
    ]
    return LookUp(type="cell_line", transfer_metadata=tables)
641
+
642
def _pairwise_correlation(
    self, mat1: np.ndarray, mat2: np.ndarray, row_name: Iterable, col_name: Iterable
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Calculate the row-wise pearson correlation between two matrices.

    Entry ``[i, j]`` of the result is the correlation between row ``i`` of `mat1` and
    row ``j`` of `mat2`. Every pair is computed explicitly: the two inputs are different
    matrices, so the result is in general not symmetric and need not be square.

    Args:
        mat1: Input array; its rows become the rows of the output.
        mat2: Input array; its rows become the columns of the output.
        row_name: Row names of the output dataframes.
        col_name: Column names of the output dataframes.

    Returns:
        Returns DataFrames for both the Pearson correlation coefficients and their associated p-values.
    """
    n_rows, n_cols = mat1.shape[0], mat2.shape[0]
    corr = np.empty((n_rows, n_cols))
    pvals = np.empty((n_rows, n_cols))

    for i in range(n_rows):
        for j in range(n_cols):
            corr[i, j], pvals[i, j] = stats.pearsonr(mat1[i], mat2[j])

    corr = pd.DataFrame(corr, index=row_name, columns=col_name)
    pvals = pd.DataFrame(pvals, index=row_name, columns=col_name)

    return corr, pvals
670
+
671
def correlate(
    self,
    adata: AnnData,
    identifier: str = "DepMap_ID",
    metadata_key: str = "bulk_rna_broad",
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]:
    """Correlate cell lines with annotated metadata.

    Observations whose `metadata_key` row is entirely NaN are treated as cell lines
    without metadata ("unmatched"); all other observations are "matched".

    Args:
        adata: Input data object.
        identifier: Column in `.obs` containing cell line identifiers. Defaults to "DepMap_ID".
        metadata_key: Key of the AnnData obsm for comparison with the X matrix. Defaults to "bulk_rna_broad".

    Returns:
        A 4-tuple ``(corr, pvals, new_corr, new_pvals)``: Pearson correlation coefficients and
        p-values between the X rows and the metadata rows of the matched cell lines, followed by
        the same two tables for the X rows of the unmatched cell lines against the metadata of the
        matched ones. The last two entries are None when no cell line is unmatched.

    Raises:
        ValueError: If `metadata_key` is not in `adata.obsm`, `identifier` is not in `adata.obs`,
            or the number of columns of `adata.X` and of the metadata differ.
    """
    if metadata_key not in adata.obsm:
        raise ValueError("The metadata can not be found in adata.obsm")
    if identifier not in adata.obs:
        raise ValueError("The identifier can not be found in adata.obs")

    metadata = adata.obsm[metadata_key]
    if adata.X.shape[1] != metadata.shape[1]:
        raise ValueError(
            "Dimensions of adata.X do not match those of metadata. Ensure that they have the same gene list."
        )
    # Give warning if the genes are not the same
    if isinstance(metadata, pd.DataFrame) and np.any(metadata.columns != adata.var.index.values):
        print(
            "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
        )

    # Divide cell lines into those that are present and not present in the metadata.
    # The mask is computed once and reused for both subsets.
    is_missing = metadata.isna().all(axis=1)
    overlapped_cl = adata[~is_missing, :]
    missing_cl = adata[is_missing, :]

    reference = overlapped_cl.obsm[metadata_key].values
    corr, pvals = self._pairwise_correlation(
        overlapped_cl.X,
        reference,
        row_name=overlapped_cl.obs[identifier],
        col_name=overlapped_cl.obs[identifier],
    )

    # Slicing always yields an object (possibly with zero observations), never None,
    # so the emptiness of the subset has to be checked explicitly.
    if missing_cl.n_obs > 0:
        new_corr, new_pvals = self._pairwise_correlation(
            missing_cl.X,
            reference,
            row_name=missing_cl.obs[identifier],
            col_name=overlapped_cl.obs[identifier],
        )
    else:
        new_corr = new_pvals = None

    return corr, pvals, new_corr, new_pvals
723
+
724
def plot_correlation(
    self,
    adata: AnnData,
    corr: pd.DataFrame,
    pval: pd.DataFrame,
    identifier: str = "DepMap_ID",
    metadata_key: str = "bulk_rna_broad",
    category: str = "cell line",
    subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None,
) -> None:
    """Visualise the correlation of cell lines with annotated metadata.

    Draws a scatter plot of `adata.obsm[metadata_key]` against `adata.X`, annotates it with the
    mean of the diagonal of `corr` and `pval` for the plotted cell lines, and shows the figure.

    NOTE(review): `corr` and `pval` are indexed positionally with the same indices as `adata`,
    so they are presumably expected to hold one row/column per observation of `adata`, in the
    same order - confirm against the caller.

    Args:
        adata: Input data object.
        corr: Pearson correlation scores.
        pval: P-values for pearson correlation.
        identifier: Column in `.obs` containing the identifiers. Defaults to 'DepMap_ID'.
        metadata_key: Key of the AnnData obsm for comparison with the X matrix. Defaults to 'bulk_rna_broad'.
        category: The category for correlation comparison. Only "cell line" is implemented. Defaults to "cell line".
        subset_identifier: Selected identifiers for scatter plot visualization between the X matrix and `metadata_key`.
            If not None, only the chosen cell line will be plotted, either specified as a value in `identifier` (string) or as an index number.
            If None, all cell lines will be plotted.
            Defaults to None.

    Raises:
        ValueError: If `corr` or `pval` is None, if a string identifier is not found in
            `adata.obs[identifier]`, if an integer index is out of range, or if
            `subset_identifier` mixes strings and integers.
        NotImplementedError: If `category` is anything other than "cell line".
    """
    if corr is None or pval is None:
        raise ValueError(
            "Missing required input parameter: 'corr' or 'pval'. Please call the function `pt.md.CellLine.correlate()` to generate these outputs before proceeding."
        )
    if category != "cell line":
        raise NotImplementedError

    if subset_identifier is None:
        annotation = "\n".join(
            (
                f"Mean pearson correlation: {np.mean(np.diag(corr)):.4f}",
                f"Mean p-value: {np.mean(np.diag(pval)):.4f}",
            )
        )
        plt.scatter(x=adata.obsm[metadata_key], y=adata.X)
        plt.xlabel(metadata_key)
        plt.ylabel("Baseline")
    else:
        subset_identifier_list = (
            [subset_identifier] if isinstance(subset_identifier, str | int) else list(subset_identifier)
        )
        # Convert the valid identifiers to the index list
        if all(isinstance(item, str) for item in subset_identifier_list):
            if not set(subset_identifier_list).issubset(adata.obs[identifier].unique()):
                raise ValueError("`Subset_identifier` must be found in adata.obs.`identifier`.")
            subset_identifier_list = np.where(np.isin(adata.obs[identifier].values, subset_identifier_list))[0]
        elif all(isinstance(item, int) for item in subset_identifier_list):
            # A single out-of-range index is enough to reject the selection; report it as
            # an index error rather than as a type mix-up.
            if any(item < 0 or item >= adata.n_obs for item in subset_identifier_list):
                raise ValueError("`Subset_identifier` out of index.")
        else:
            raise ValueError("`Subset_identifier` must contain either all strings or all integers.")

        plt.scatter(
            x=adata.obsm[metadata_key].iloc[subset_identifier_list],
            y=adata[subset_identifier_list].X,
        )
        if len(subset_identifier_list) == 1:
            # Name the cell line on both axes when exactly one is plotted.
            chosen = adata.obs[identifier].values[subset_identifier_list[0]]
            plt.xlabel(f"{metadata_key}: {chosen}")
            plt.ylabel(f"Baseline: {chosen}")
        else:
            plt.xlabel(f"{metadata_key}")
            plt.ylabel("Baseline")

        # Annotate with the correlation coefficient and p-value of the chosen cell lines
        subset_cor = np.mean(np.diag(corr.iloc[subset_identifier_list, subset_identifier_list]))
        subset_pval = np.mean(np.diag(pval.iloc[subset_identifier_list, subset_identifier_list]))
        annotation = "\n".join(
            (
                f"Pearson correlation: {subset_cor:.4f}",
                f"P-value: {subset_pval:.4f}",
            )
        )

    plt.text(
        0.05,
        0.95,
        annotation,
        fontsize=10,
        transform=plt.gca().transAxes,
        verticalalignment="top",
        bbox={
            "boxstyle": "round",
            "alpha": 0.5,
            "facecolor": "white",
            "edgecolor": "black",
        },
    )
    plt.show()