pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- pertpy/__init__.py +3 -2
- pertpy/data/__init__.py +5 -1
- pertpy/data/_dataloader.py +2 -4
- pertpy/data/_datasets.py +203 -92
- pertpy/metadata/__init__.py +4 -0
- pertpy/metadata/_cell_line.py +826 -0
- pertpy/metadata/_compound.py +129 -0
- pertpy/metadata/_drug.py +242 -0
- pertpy/metadata/_look_up.py +582 -0
- pertpy/metadata/_metadata.py +73 -0
- pertpy/metadata/_moa.py +129 -0
- pertpy/plot/__init__.py +1 -9
- pertpy/plot/_augur.py +53 -116
- pertpy/plot/_coda.py +277 -677
- pertpy/plot/_guide_rna.py +17 -35
- pertpy/plot/_milopy.py +59 -134
- pertpy/plot/_mixscape.py +152 -391
- pertpy/preprocessing/_guide_rna.py +88 -4
- pertpy/tools/__init__.py +8 -13
- pertpy/tools/_augur.py +315 -17
- pertpy/tools/_cinemaot.py +143 -4
- pertpy/tools/_coda/_base_coda.py +1210 -65
- pertpy/tools/_coda/_sccoda.py +50 -21
- pertpy/tools/_coda/_tasccoda.py +27 -19
- pertpy/tools/_dialogue.py +164 -56
- pertpy/tools/_differential_gene_expression.py +240 -14
- pertpy/tools/_distances/_distance_tests.py +8 -8
- pertpy/tools/_distances/_distances.py +184 -34
- pertpy/tools/_enrichment.py +465 -0
- pertpy/tools/_milo.py +345 -11
- pertpy/tools/_mixscape.py +668 -50
- pertpy/tools/_perturbation_space/_clustering.py +5 -1
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
- pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
- pertpy/tools/_perturbation_space/_simple.py +51 -10
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_scgen.py +701 -0
- pertpy/tools/_scgen/_utils.py +1 -3
- pertpy/tools/decoupler_LICENSE +674 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
- pertpy-0.7.0.dist-info/RECORD +53 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
- pertpy/plot/_cinemaot.py +0 -81
- pertpy/plot/_dialogue.py +0 -91
- pertpy/plot/_scgen.py +0 -337
- pertpy/tools/_metadata/__init__.py +0 -0
- pertpy/tools/_metadata/_cell_line.py +0 -613
- pertpy/tools/_metadata/_look_up.py +0 -342
- pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
- pertpy/tools/_scgen/_jax_scgen.py +0 -370
- pertpy-0.6.0.dist-info/RECORD +0 -50
- /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,613 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from pathlib import Path
|
4
|
-
from typing import TYPE_CHECKING, Literal
|
5
|
-
|
6
|
-
import pandas as pd
|
7
|
-
from rich import print
|
8
|
-
from scanpy import settings
|
9
|
-
|
10
|
-
from pertpy.data._dataloader import _download
|
11
|
-
|
12
|
-
from ._look_up import LookUp
|
13
|
-
|
14
|
-
if TYPE_CHECKING:
|
15
|
-
from anndata import AnnData
|
16
|
-
|
17
|
-
|
18
|
-
class CellLineMetaData:
    """Utilities to fetch cell line metadata.

    Instantiating the class loads every reference table into memory and downloads
    any table that is not yet present in the `.pertpy_cache` directory.
    """

    def __init__(self):
        # NOTE: this overwrites the cache directory on the shared scanpy `settings` object.
        settings.cachedir = ".pertpy_cache"
        cache_dir = Path(str(settings.cachedir))

        # Download cell line metadata from DepMap
        # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
        cell_line_file_path = self._ensure_cached(
            url="https://ndownloader.figshare.com/files/35020903",
            file_name="sample_info.csv",
            missing_message="[bold yellow]No DepMap metadata file found. Starting download now.",
        )
        self.cell_line_meta = pd.read_csv(cell_line_file_path)

        # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project
        # Source: https://www.cancerrxgene.org/celllines
        # The raw export is only needed to build the transformed table once; afterwards
        # the transformed CSV is read straight from the cache.
        transformed_path = cache_dir / "cell_line_cancer_project_transformed.csv"
        if transformed_path.exists():
            self.cl_cancer_project_meta = pd.read_csv(transformed_path, index_col=0)
        else:
            raw_path = self._ensure_cached(
                url="https://www.cancerrxgene.org/api/celllines?list=all&sEcho=1&iColumns=7&sColumns=&"
                "iDisplayStart=0&iDisplayLength=25&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&"
                "mDataProp_4=4&mDataProp_5=5&mDataProp_6=6&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&"
                "bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&"
                "bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&"
                "bSearchable_4=true&sSearch_5=&bRegex_5=false&bSearchable_5=true&sSearch_6=&bRegex_6=false&"
                "bSearchable_6=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&"
                "bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true&bSortable_6=true&export=csv",
                file_name="cell_line_cancer_project.csv",
                missing_message=(
                    "[bold yellow]No cell line metadata file from The Genomics of Drug Sensitivity "
                    "in Cancer Project found. Starting download now."
                ),
            )
            self.cl_cancer_project_meta = self._tidy_cancer_project_meta(pd.read_csv(raw_path))
            self.cl_cancer_project_meta.to_csv(transformed_path)

        # Download metadata for driver genes from DepMap.Sanger
        # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Gene annotation)
        gene_annotation_file_path = self._ensure_cached(
            url="https://cog.sanger.ac.uk/cmp/download/gene_identifiers_20191101.csv",
            file_name="gene_identifiers_20191101.csv",
            missing_message="[bold yellow]No metadata file was found for gene annotation." " Starting download now.",
        )
        self.gene_annotation = pd.read_table(gene_annotation_file_path, delimiter=",")

        # Download bulk RNA-seq data collated by the Wellcome Sanger Institute and the Broad Institute
        # from DepMap.Sanger
        # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Expression data)
        # The downloaded file is a pre-processed copy: the upstream read counts contained stray
        # whitespace, which was removed and the values converted to int.
        bulk_rna_sanger_file_path = self._ensure_cached(
            url="https://figshare.com/ndownloader/files/42467103",
            file_name="rnaseq_read_count_20220624_processed.csv",
            missing_message=(
                "[bold yellow]No metadata file was found for bulk RNA-seq data of Sanger cell line."
                " Starting download now..."
            ),
        )
        self.bulk_rna_sanger = pd.read_csv(bulk_rna_sanger_file_path, index_col=0)

        # Download CCLE expression data from DepMap
        # Source: https://depmap.org/portal/download/all/ (DepMap Public 22Q2)
        bulk_rna_broad_file_path = self._ensure_cached(
            url="https://figshare.com/ndownloader/files/34989922",
            file_name="CCLE_expression_full.csv",
            missing_message="[bold yellow]No metadata file was found for CCLE expression data. Starting download now.",
        )
        self.bulk_rna_broad = pd.read_csv(bulk_rna_broad_file_path, index_col=0)

        # Download proteomics data processed by DepMap.Sanger
        # Source: https://cellmodelpassports.sanger.ac.uk/downloads (Proteomics)
        proteomics_file_path = self._ensure_cached(
            url="https://figshare.com/ndownloader/files/42468393",
            file_name="proteomics_all_20221214_processed.csv",
            missing_message=(
                "[bold yellow]No metadata file was found for proteomics data (DepMap.Sanger). Starting download now."
            ),
        )
        self.proteomics_data = pd.read_csv(proteomics_file_path, index_col=0)

        # Download GDSC drug response data
        # Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)
        drug_response_gdsc1_file_path = self._ensure_cached(
            url="https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx",
            file_name="ic50_gdsc1.xlsx",
            missing_message=(
                "[bold yellow]No metadata file was found for drug response data of GDSC1 dataset."
                " Starting download now."
            ),
        )
        self.drug_response_gdsc1 = self._load_gdsc_response(drug_response_gdsc1_file_path)

        drug_response_gdsc2_file_path = self._ensure_cached(
            url="https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC2_fitted_dose_response_24Jul22.xlsx",
            file_name="ic50_gdsc2.xlsx",
            missing_message=(
                "[bold yellow]No metadata file was found for drug response data of GDSC2 dataset."
                " Starting download now."
            ),
        )
        self.drug_response_gdsc2 = self._load_gdsc_response(drug_response_gdsc2_file_path)

    @staticmethod
    def _ensure_cached(url: str, file_name: str, missing_message: str) -> Path:
        """Return the cache path of `file_name`, downloading it from `url` if it is absent."""
        file_path = Path(str(settings.cachedir)) / file_name
        if not file_path.exists():
            print(missing_message)
            _download(
                url=url,
                output_file_name=file_name,
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        return file_path

    @staticmethod
    def _tidy_cancer_project_meta(meta: pd.DataFrame) -> pd.DataFrame:
        """Reshape the raw Cancerrxgene export so that each cell line has one row of metadata."""
        meta.columns = meta.columns.str.strip()
        stripped_names = meta["Cell line Name"].str.replace(r"\-|\.", "", regex=True)
        meta["stripped_cell_line_name"] = pd.Categorical(stripped_names.str.upper())
        # Keep the original column order (instead of an unordered set) so the cached
        # CSV has the same layout on every run.
        index_col = [column for column in meta.columns if column not in {"Datasets", "number of drugs"}]
        meta = meta.pivot(index=index_col, columns="Datasets", values="number of drugs")
        meta.columns.name = None
        meta = meta.reset_index()
        return meta.rename(columns={"Cell line Name": "cell_line_name"})

    @staticmethod
    def _load_gdsc_response(file_path: Path) -> pd.DataFrame:
        """Load a GDSC fitted dose response sheet and keep one row per cell line / drug pair."""
        response = pd.read_excel(file_path)
        # Positional column selection; the code below and `annotate_from_gdsc` rely on
        # `cell_line_name`, `drug_name`, `auc` and `ln_ic50` being among the kept columns.
        response = response.iloc[:, [3, 4, 5, 7, 8, 15, 16]]
        # Non-inplace rename: renaming a slice in place triggers pandas' chained-assignment warning.
        response = response.rename(columns=lambda col: col.lower())
        # When a pair was screened several times, keep the row with the highest AUC.
        best_rows = response.groupby(["cell_line_name", "drug_name"])["auc"].idxmax()
        return response.loc[best_rows].reset_index(drop=True)

    @staticmethod
    def _unmatched_identifiers(adata: AnnData, query_id: str, reference_values) -> tuple[int, list]:
        """Return the number of distinct query identifiers and those missing from `reference_values`."""
        query_values = adata.obs[query_id]
        return len(query_values.unique()), list(set(query_values) - set(reference_values))

    def annotate_cell_lines(
        self,
        adata: AnnData,
        query_id: str = "DepMap_ID",
        reference_id: str = "DepMap_ID",
        cell_line_information: list[str] | None = None,
        cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap",
        copy: bool = False,
    ) -> AnnData:
        """Fetch cell line annotation.

        For each cell, we fetch cell line annotation from Dependency Map (DepMap).

        Args:
            adata: The data object to annotate.
            query_id: The column of `.obs` with cell line information. Defaults to "DepMap_ID".
            reference_id: The type of cell line identifier in the meta data, e.g. DepMap_ID, cell_line_name or
                stripped_cell_line_name. If fetch cell line metadata from Cancerrxgene, it is recommended to choose
                "stripped_cell_line_name". Defaults to "DepMap_ID".
            cell_line_information: The metadata to fetch. All metadata will be fetched by default.
                The passed list is never modified. Defaults to None (=all).
            cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene. Defaults to "DepMap".
            copy: Determines whether a copy of the `adata` is returned. Defaults to False.

        Returns:
            Returns an AnnData object with cell line annotation.

        Raises:
            ValueError: If `query_id` is not a column of `adata.obs`, if `reference_id` or any entry of
                `cell_line_information` is not a column of the metadata, or if none of the query
                identifiers can be found in the metadata.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.dialogue_example()
            >>> adata.obs['cell_line_name'] = 'MCF7'
            >>> pt_metadata = pt.tl.CellLineMetaData()
            >>> adata_annotated = pt_metadata.annotate_cell_lines(adata=adata, reference_id='cell_line_name', query_id='cell_line_name', copy=True)
        """
        if copy:
            adata = adata.copy()

        if cell_line_source == "DepMap":
            cell_line_meta = self.cell_line_meta
        else:
            # Cancerrxgene metadata is always matched on the stripped cell line name.
            reference_id = "stripped_cell_line_name"
            if query_id == "DepMap_ID":
                query_id = "stripped_cell_line_name"
                print(
                    "[bold blue]`stripped_cell_line_name` is used as reference and query indentifier ",
                    " to annotate cell line metadata from Cancerrxgene. "
                    "So please make sure that `stripped_cell_line_name` is available in the adata.obs. ",
                    "or use the DepMap as `cell_line_source` to annotate the cell line first ",
                )
            cell_line_meta = self.cl_cancer_project_meta

        if query_id not in adata.obs.columns:
            raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`. \n" "Please check again. ")

        if reference_id not in cell_line_meta.columns:
            raise ValueError(
                f"The requested cell line type {reference_id} is currently unavailable in the database.\n"
                "To solve ths issue, please refer to the available reference identifier in the chosen database.\n"
                "DepMap_ID is compared by default.\n"
                "Alternatively, you can call the `CellLineMetaData.lookup()` function to create a LookUp object.\n"
                "By using the `LookUp.cell_line()` method, you can obtain the available reference identifiers in the metadata."
            )

        if cell_line_information is None:
            # If no cell_line_information is specified, all metadata is fetched by default.
            metadata_to_merge = cell_line_meta
        else:
            # Work on a copy so the caller's list is left untouched.
            requested_columns = list(cell_line_information)
            unknown_columns = [column for column in requested_columns if column not in cell_line_meta.columns]
            if unknown_columns:
                # Previously such a request returned the data silently un-annotated.
                raise ValueError(
                    f"The requested cell line information {unknown_columns} is not present in the metadata.\n"
                    "Please call the `CellLineMetaData.lookup()` function to create a LookUp object.\n"
                    "By using the `LookUp.cell_line()` method, you can obtain the available cell line information."
                )
            if reference_id not in requested_columns:
                # The reference column is required as the merge key.
                requested_columns.append(reference_id)
            metadata_to_merge = cell_line_meta[requested_columns]

        identifier_num_all, not_matched_identifiers = self._unmatched_identifiers(
            adata, query_id, cell_line_meta[reference_id]
        )
        if len(not_matched_identifiers) == identifier_num_all:
            raise ValueError(
                f"Attempting to match the query id {query_id} in the adata.obs to the reference id {reference_id} in the metadata.\n"
                "However, none of the query IDs could be found in the cell line annotation data.\n"
                "The annotation process has been halted.\n"
                "To resolve this issue, please call the `CellLineMetaData.lookup()` function to create a LookUp object.\n"
                "By using the `LookUp.cell_line()` method, "
                "you can obtain the count of matched identifiers in the adata for different types of reference IDs and query IDs."
            )

        if len(not_matched_identifiers) > 0:
            print(
                f"[bold blue]There are {identifier_num_all} identifiers in `adata.obs`. "
                f"However, {len(not_matched_identifiers)} identifiers can't be found in the cell line annotation, "
                "leading to the presence of NA values for their respective metadata.\n",
                "Please check again: ",
                *not_matched_identifiers,
                sep="\n- ",
            )

        # Remember whether the user already had a column named like the reference id:
        # in that case the metadata copy gets the `_fromMeta` suffix and is filtered out,
        # so the remaining column is the user's own and must not be deleted below.
        reference_column_preexisting = reference_id in adata.obs.columns

        # Sometimes there is already different cell line information in the AnnData object.
        # To avoid redundant information, metadata columns that clash with existing
        # `.obs` columns are suffixed and dropped after merging.
        adata.obs = (
            adata.obs.merge(
                metadata_to_merge,
                left_on=query_id,
                right_on=reference_id,
                how="left",
                suffixes=("", "_fromMeta"),
            )
            .filter(regex="^(?!.*_fromMeta)")
            .set_index(adata.obs.index)
        )
        # If query_id and reference_id have different names, there will be a column for
        # each of them after merging, which is redundant as they refer to the same
        # information. Remove the reference_id column that the merge added.
        if query_id != reference_id and not reference_column_preexisting:
            del adata.obs[reference_id]

        return adata

    def annotate_bulk_rna_expression(
        self,
        adata: AnnData,
        query_id: str = "cell_line_name",
        cell_line_source: Literal["broad", "sanger"] = "sanger",
        copy: bool = False,
    ) -> AnnData:
        """Fetch bulk rna expression.

        For each cell, we fetch bulk rna expression from either Broad or Sanger cell line.
        The result is stored in `adata.obsm["bulk_rna_expression_sanger"]` or
        `adata.obsm["bulk_rna_expression_broad"]`.

        Args:
            adata: The data object to annotate.
            query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
            cell_line_source: The bulk rna expression data from either broad or sanger cell line. Defaults to "sanger".
            copy: Determines whether a copy of the `adata` is returned. Defaults to False.

        Returns:
            Returns an AnnData object with bulk rna expression annotation.

        Raises:
            ValueError: If the (resolved) `query_id` is not a column of `adata.obs` or if none of the
                query identifiers can be found in the expression data.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.dialogue_example()
            >>> adata.obs['cell_line_name'] = 'MCF7'
            >>> pt_metadata = pt.tl.CellLineMetaData()
            >>> adata_annotated = pt_metadata.annotate_cell_lines(adata=adata, reference_id='cell_line_name', query_id='cell_line_name', copy=True)
            >>> pt_metadata.annotate_bulk_rna_expression(adata_annotated)
        """
        if copy:
            adata = adata.copy()

        # Resolve the effective query column first, so that validation, counting and
        # matching below all look at the same `.obs` column.
        if cell_line_source == "sanger":
            reference_id = "model_name"
            expression = self.bulk_rna_sanger
            obsm_key = "bulk_rna_expression_sanger"
        else:
            reference_id = "DepMap_ID"
            print(
                "To annotate bulk RNA expression data from Broad Institue, ",
                "`DepMap_ID` is used as default reference and query indentifier if no `reference_id` is given. ",
                "Please make sure that `DepMap_ID` is available in the adata.obs. ",
                "Alternatively, use the `annotate_cell_lines()` function to annotate the cell line first ",
            )
            if query_id == "cell_line_name":
                query_id = "DepMap_ID"
            expression = self.bulk_rna_broad
            obsm_key = "bulk_rna_expression_broad"

        if query_id not in adata.obs.columns:
            raise ValueError(
                f"The specified `query_id` {query_id} can't be found in the `adata.obs`.\n"
                "Please ensure that you are using one of the available query IDs present in the adata.obs for the annotation.\n"
                "If the desired query ID is not available, you can fetch the cell line metadata "
                "using the `annotate_cell_lines()` function before calling annotate_bulk_rna_expression(). "
                "This will help ensure that the required query ID is included in your data, e.g. stripped_cell_line_name, DepMap ID."
            )

        identifier_num_all, not_matched_identifiers = self._unmatched_identifiers(adata, query_id, expression.index)

        if len(not_matched_identifiers) == identifier_num_all:
            raise ValueError(
                f"You are attempting to match the query id {query_id} in the adata.obs to the reference id {reference_id} in the metadata.\n"
                "However, none of the query IDs could be found in the bulk RNA expression data.\n"
                "The annotation process has been halted.\n"
                "To resolve this issue, please call the `CellLineMetaData.lookup()` function to create a LookUp object.\n"
                "By using the `LookUp.bulk_rna_expression()` method, "
                "you can obtain the count of matched identifiers in the adata for different types of reference IDs and query IDs.\n"
                "Additionally, you can call the `CellLineMetaData.annotate_cell_lines()` function "
                "to acquire more possible query IDs that can be used for annotation purposes."
            )

        if len(not_matched_identifiers) > 0:
            print(f"There are {identifier_num_all} identifiers in `adata.obs`.")
            print(
                f"[bold yellow]Following {len(not_matched_identifiers)} identifiers can't be found in bulk RNA expression data, "
            )
            print(
                "leading to the presence of NA values for their respective metadata. Please check again: ",
                *not_matched_identifiers,
                sep="\n- ",
            )

        # One expression row per cell, in `.obs` order; unmatched cell lines become NA rows.
        per_cell_expression = expression.reindex(adata.obs[query_id])
        per_cell_expression.index = adata.obs.index
        adata.obsm[obsm_key] = per_cell_expression

        return adata

    def annotate_protein_expression(
        self,
        adata: AnnData,
        query_id: str = "cell_line_name",
        reference_id: Literal["model_name", "model_id"] = "model_name",
        protein_information: Literal["protein_intensity", "zscore"] = "protein_intensity",
        protein_id: Literal["uniprot_id", "symbol"] = "uniprot_id",
        copy: bool = False,
    ) -> AnnData:
        """Fetch protein expression.

        For each cell, we fetch protein intensity values acquired using data-independent acquisition mass spectrometry (DIA-MS).
        The result is stored in `adata.obsm["proteomics_" + protein_information]`.

        Args:
            adata: The data object to annotate.
            query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name".
            reference_id: The type of cell line identifier in the meta data, model_name or model_id. Defaults to "model_name".
            protein_information: The type of protein expression data to fetch, protein_intensity or zscore. Defaults to "protein_intensity".
            protein_id: The protein identifier saved in the fetched meta data, uniprot_id or symbol. Defaults to "uniprot_id".
            copy: Determines whether a copy of the `adata` is returned. Defaults to False.

        Returns:
            Returns an AnnData object with protein expression annotation.

        Raises:
            ValueError: If `query_id` is not a column of `adata.obs`, if `reference_id` is not a column of
                the proteomics data, or if none of the query identifiers can be found in it.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.dialogue_example()
            >>> adata.obs['cell_line_name'] = 'MCF7'
            >>> pt_metadata = pt.tl.CellLineMetaData()
            >>> adata_annotated = pt_metadata.annotate_cell_lines(adata=adata, reference_id='cell_line_name', query_id='cell_line_name', copy=True)
            >>> pt_metadata.annotate_protein_expression(adata_annotated)
        """
        if copy:
            adata = adata.copy()

        # Both identifier columns must exist before their values can be compared.
        if query_id not in adata.obs.columns:
            raise ValueError(
                f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
                "Please ensure that you are using one of the available query IDs present in the adata.obs for the annotation. \n"
                "If the desired query ID is not available, you can fetch the cell line metadata \n"
                "using the `annotate_cell_lines()` function before calling annotate_protein_expression(). \n"
                "This will help ensure that the required query ID is included in your data"
            )

        if reference_id not in self.proteomics_data.columns:
            raise ValueError(
                f"The specified `reference_id` {reference_id} can't be found in the protein expression data. \n"
                "To solve the issue, please use the reference identifier available in the metadata. \n"
                "Alternatively, you can call the `CellLineMetaData.lookup()` function to create a LookUp object. \n"
                "By using the `LookUp.protein_expression()` method, you can obtain the available reference identifiers in the metadata. "
            )

        identifier_num_all, not_matched_identifiers = self._unmatched_identifiers(
            adata, query_id, self.proteomics_data[reference_id]
        )

        if len(not_matched_identifiers) == identifier_num_all:
            raise ValueError(
                f"You are attempting to match the query id {query_id} in the adata.obs to the reference id {reference_id} in the metadata. \n"
                "However, none of the query IDs could be found in the proteomics data. \n"
                "The annotation process has been halted. \n"
                "To resolve this issue, please call the `CellLineMetaData.lookup()` function to create a LookUp object. \n"
                "By using the `LookUp.protein_expression()` method, "
                "you can obtain the count of matched identifiers in the adata for different types of reference IDs and query IDs. \n"
                "Additionally, you can call the `CellLineMetaData.annotate_cell_lines` function "
                "to acquire more possible query IDs that can be used for annotation purposes."
            )

        if len(not_matched_identifiers) > 0:
            print(
                f"[bold yellow]There are {identifier_num_all} identifiers in `adata.obs`. "
                f"However {len(not_matched_identifiers)} identifiers can't be found in the protein expression data, "
                "leading to the presence of NA values for their respective metadata. Please check again: ",
                *not_matched_identifiers,
                sep="\n- ",
            )

        # Convert the protein table from long format to wide format (one row per cell
        # line, one column per protein), then expand it to one row per cell.
        prot_exp = self.proteomics_data[[reference_id, protein_id, protein_information]]
        prot_exp = pd.pivot(prot_exp, index=reference_id, columns=protein_id, values=protein_information)
        prot_exp = prot_exp.reindex(adata.obs[query_id])
        prot_exp.index = adata.obs.index
        adata.obsm["proteomics_" + protein_information] = prot_exp

        return adata

    def annotate_from_gdsc(
        self,
        adata: AnnData,
        query_id: str = "cell_line_name",
        reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
        query_perturbation: str = "perturbation",
        reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
        gdsc_dataset: Literal[1, 2] = 1,
        copy: bool = False,
    ) -> AnnData:
        """Fetch drug response data.

        For each cell, we fetch drug response data as natural log of the fitted IC50 for its corresponding cell line and perturbation from GDSC fitted data results file.

        Args:
            adata: The data object to annotate.
            query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name".
            reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id. Defaults to "cell_line_name".
            query_perturbation: The column of `.obs` with perturbation information. Defaults to "perturbation".
            reference_perturbation: The type of perturbation in the meta data, drug_name or drug_id. Defaults to "drug_name".
            gdsc_dataset: The GDSC dataset, 1 or 2. Defaults to 1. The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital. It covers 970 Cell lines and 403 Compounds with 333292 IC50s. GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
            copy: Determines whether a copy of the `adata` is returned. Defaults to False.

        Returns:
            Returns an AnnData object with drug response annotation.

        Raises:
            ValueError: If `query_id` is not a column of `adata.obs` or if none of the query identifiers
                can be found in the drug response data.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.mcfarland_2020()
            >>> pt_metadata = pt.tl.CellLineMetaData()
            >>> pt_metadata.annotate_from_gdsc(adata, query_id='cell_line')
        """
        if copy:
            adata = adata.copy()

        if query_id not in adata.obs.columns:
            raise ValueError(
                f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
                "Please ensure that you are using one of the available query IDs present in the adata.obs for the annotation. \n"
                "If the desired query ID is not available, you can fetch the cell line metadata "
                "using the `annotate_cell_lines()` function before calling `annotate_from_gdsc()`. "
                "This will help ensure that the required query ID is included in your data."
            )

        gdsc_data = self.drug_response_gdsc1 if gdsc_dataset == 1 else self.drug_response_gdsc2

        identifier_num_all, not_matched_identifiers = self._unmatched_identifiers(
            adata, query_id, gdsc_data[reference_id]
        )

        # Fail before warning, like the other annotate_* methods: a total mismatch is an
        # error, a partial mismatch only a warning.
        if len(not_matched_identifiers) == identifier_num_all:
            raise ValueError(
                f"You are attempting to match the query id {query_id} in the adata.obs to the reference id {reference_id} in the metadata. \n"
                "However, none of the query IDs could be found in the drug response data. \n"
                "The annotation process has been halted. \n"
                "To resolve this issue, please call the `CellLineMetaData.lookup()` function to create a LookUp object. \n"
                "By using the `LookUp.drug_response_gdsc()` method, \n"
                "you can obtain the count of matched identifiers in the adata for different query IDs. \n"
                "Additionally, you can call the `CellLineMetaData.annotate_cell_lines()` function to \n"
                "acquire more cell line information that can be used for annotation purposes."
            )

        if len(not_matched_identifiers) > 0:
            print(
                f"[bold yellow]Following {len(not_matched_identifiers)} identifiers can not be found in the drug response data for GDSC{gdsc_dataset}, "
                "leading to the presence of NA values for their respective metadata. Please check it again:",
                *not_matched_identifiers,
                sep="\n- ",
            )

        # Match on (cell line, perturbation) pairs; cells without a matching pair get NA.
        adata.obs = (
            adata.obs.merge(
                gdsc_data[[reference_id, reference_perturbation, "ln_ic50"]],
                how="left",
                left_on=[query_id, query_perturbation],
                right_on=[reference_id, reference_perturbation],
                suffixes=("", "_fromMeta"),
            )
            .filter(regex="^(?!.*_fromMeta)")
            .set_index(adata.obs.index)
        )

        return adata

    def lookup(self) -> LookUp:
        """Generate LookUp object for CellLineMetaData.

        The LookUp object provides an overview of the metadata to annotate.
        Each annotate_{metadata} function has a corresponding lookup function in the LookUp object,
        where users can search the reference_id in the metadata and
        compare with the query_id in their own data.

        Returns:
            Returns a LookUp object specific for cell line annotation.

        Examples:
            >>> import pertpy as pt
            >>> pt_metadata = pt.tl.CellLineMetaData()
            >>> lookup = pt_metadata.lookup()
        """
        return LookUp(
            type="cell_line",
            transfer_metadata=[
                self.cell_line_meta,
                self.cl_cancer_project_meta,
                self.gene_annotation,
                self.bulk_rna_sanger,
                self.bulk_rna_broad,
                self.proteomics_data,
                self.drug_response_gdsc1,
                self.drug_response_gdsc2,
            ],
        )
|