pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. pertpy/__init__.py +4 -2
  2. pertpy/data/__init__.py +66 -1
  3. pertpy/data/_dataloader.py +28 -26
  4. pertpy/data/_datasets.py +261 -92
  5. pertpy/metadata/__init__.py +6 -0
  6. pertpy/metadata/_cell_line.py +795 -0
  7. pertpy/metadata/_compound.py +128 -0
  8. pertpy/metadata/_drug.py +238 -0
  9. pertpy/metadata/_look_up.py +569 -0
  10. pertpy/metadata/_metadata.py +70 -0
  11. pertpy/metadata/_moa.py +125 -0
  12. pertpy/plot/__init__.py +0 -13
  13. pertpy/preprocessing/__init__.py +2 -0
  14. pertpy/preprocessing/_guide_rna.py +89 -6
  15. pertpy/tools/__init__.py +48 -15
  16. pertpy/tools/_augur.py +329 -32
  17. pertpy/tools/_cinemaot.py +145 -6
  18. pertpy/tools/_coda/_base_coda.py +1237 -116
  19. pertpy/tools/_coda/_sccoda.py +66 -36
  20. pertpy/tools/_coda/_tasccoda.py +46 -39
  21. pertpy/tools/_dialogue.py +180 -77
  22. pertpy/tools/_differential_gene_expression/__init__.py +20 -0
  23. pertpy/tools/_differential_gene_expression/_base.py +657 -0
  24. pertpy/tools/_differential_gene_expression/_checks.py +41 -0
  25. pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
  26. pertpy/tools/_differential_gene_expression/_edger.py +125 -0
  27. pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
  28. pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
  29. pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
  30. pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
  31. pertpy/tools/_distances/_distance_tests.py +29 -24
  32. pertpy/tools/_distances/_distances.py +584 -98
  33. pertpy/tools/_enrichment.py +460 -0
  34. pertpy/tools/_kernel_pca.py +1 -1
  35. pertpy/tools/_milo.py +406 -49
  36. pertpy/tools/_mixscape.py +677 -55
  37. pertpy/tools/_perturbation_space/_clustering.py +10 -3
  38. pertpy/tools/_perturbation_space/_comparison.py +112 -0
  39. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
  40. pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
  41. pertpy/tools/_perturbation_space/_simple.py +52 -11
  42. pertpy/tools/_scgen/__init__.py +1 -1
  43. pertpy/tools/_scgen/_base_components.py +2 -3
  44. pertpy/tools/_scgen/_scgen.py +706 -0
  45. pertpy/tools/_scgen/_utils.py +3 -5
  46. pertpy/tools/decoupler_LICENSE +674 -0
  47. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
  48. pertpy-0.8.0.dist-info/RECORD +57 -0
  49. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
  50. pertpy/plot/_augur.py +0 -234
  51. pertpy/plot/_cinemaot.py +0 -81
  52. pertpy/plot/_coda.py +0 -1001
  53. pertpy/plot/_dialogue.py +0 -91
  54. pertpy/plot/_guide_rna.py +0 -82
  55. pertpy/plot/_milopy.py +0 -284
  56. pertpy/plot/_mixscape.py +0 -594
  57. pertpy/plot/_scgen.py +0 -337
  58. pertpy/tools/_differential_gene_expression.py +0 -99
  59. pertpy/tools/_metadata/__init__.py +0 -0
  60. pertpy/tools/_metadata/_cell_line.py +0 -613
  61. pertpy/tools/_metadata/_look_up.py +0 -342
  62. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  63. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  64. pertpy-0.6.0.dist-info/RECORD +0 -50
  65. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  66. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,569 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import namedtuple
4
+ from typing import TYPE_CHECKING, Literal
5
+
6
+ from lamin_utils import logger
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Sequence
10
+
11
+ if TYPE_CHECKING:
12
+ import pandas as pd
13
+
14
+ import pubchempy as pcp
15
+
16
+
17
class LookUp:
    """Generate LookUp object for different type of metadata.

    Depending on ``type`` the instance exposes summary namedtuples
    (``cell_lines``, ``bulk_rna``, ``proteomics``, ``drug_response``, ``moa``,
    ``compound`` or ``drugs``) and ``available_*`` helpers that log how many
    query identifiers are present in the stored metadata.
    """

    # Minimum number of DataFrames that each metadata type reads from `transfer_metadata`.
    _N_REQUIRED_FRAMES = {"cell_line": 8, "moa": 1, "compound": 0, "drug": 3}

    def __init__(
        self,
        type: Literal["cell_line", "moa", "compound", "drug"] = "cell_line",
        transfer_metadata: Sequence[pd.DataFrame] | None = None,
    ):
        """
        Args:
            type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug'.
            transfer_metadata: DataFrames used to generate Lookup object.
                This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.

        Raises:
            NotImplementedError: If `type` is not one of the supported metadata types.
            ValueError: If `transfer_metadata` holds fewer DataFrames than `type` requires.
        """
        self.type = type
        if type not in self._N_REQUIRED_FRAMES:
            raise NotImplementedError

        # Fail early with a clear message instead of a TypeError/IndexError on indexing.
        n_required = self._N_REQUIRED_FRAMES[type]
        n_given = 0 if transfer_metadata is None else len(transfer_metadata)
        if n_given < n_required:
            raise ValueError(
                f"A LookUp object of type '{type}' requires {n_required} DataFrames in `transfer_metadata`, "
                f"but {n_given} were provided."
            )

        if type == "cell_line":
            self._init_cell_line(transfer_metadata)
        elif type == "moa":
            self._init_moa(transfer_metadata)
        elif type == "compound":
            self._init_compound()
        else:
            self._init_drug(transfer_metadata)

    def _init_cell_line(self, transfer_metadata: Sequence[pd.DataFrame]) -> None:
        """Store the cell line related DataFrames and build their summary records."""
        self.cell_line_meta = transfer_metadata[0]
        self.cl_cancer_project_meta = transfer_metadata[1]
        self.gene_annotation = transfer_metadata[2]
        self.bulk_rna_sanger = transfer_metadata[3]
        self.bulk_rna_broad = transfer_metadata[4]
        self.proteomics_data = transfer_metadata[5]
        self.drug_response_gdsc1 = transfer_metadata[6]
        self.drug_response_gdsc2 = transfer_metadata[7]

        # --- cell line annotation (DepMap / Cancerrxgene) ---
        cell_line_annotation = namedtuple(
            "cell_line_annotation",
            "n_cell_line cell_line n_metadata metadata reference_id reference_id_example default_parameter",
        )
        cell_lines = namedtuple("cell_lines", ["depmap", "cancerrxgene"])

        depmap_record = cell_line_annotation(
            n_cell_line=len(self.cell_line_meta.index),
            n_metadata=len(self.cell_line_meta.columns),
            cell_line=self.cell_line_meta.ModelID.values,
            metadata=self.cell_line_meta.columns.values,
            reference_id=[
                "ModelID",
                "CellLineName",
                "StrippedCellLineName",
                "CCLE_Name",
            ],
            reference_id_example="ModelID: ACH-000001 | CellLineName: NIH:OVCAR-3 | StrippedCellLineName: NIHOVCAR3 | CCLEName: NIHOVCAR3_OVARY",
            default_parameter={
                "cell_line_source": "DepMap",
                "query_id": "DepMap_ID",
                "reference_id": "ModelID",
                "fetch": "None",
            },
        )
        cancerrxgene_record = cell_line_annotation(
            n_cell_line=len(self.cl_cancer_project_meta.index),
            n_metadata=len(self.cl_cancer_project_meta.columns),
            cell_line=self.cl_cancer_project_meta.stripped_cell_line_name.values,
            metadata=self.cl_cancer_project_meta.columns.values,
            reference_id=[
                "cell_line_name",
                "stripped_cell_line_name",
                "Model ID",
                "COSMIC ID",
            ],
            reference_id_example="cell_line_name: SNU-283 | stripped_cell_line_name: SNU283 | Model ID: SIDM00215 | COSMIC ID: 1659929",
            default_parameter={
                "query_id": "stripped_cell_line_name",
                "reference_id": "stripped_cell_line_name",
                "fetch": "None",
            },
        )
        self.cell_lines = cell_lines(depmap_record, cancerrxgene_record)

        # --- bulk RNA expression (Broad / Sanger) ---
        bulk_rna_annotation = namedtuple(
            "bulk_rna_annotation",
            "n_cell_line cell_line n_gene gene reference_id reference_id_example default_parameter",
        )
        bulk_rna_expression = namedtuple("bulk_rna_expression", ["broad", "sanger"])

        broad_record = bulk_rna_annotation(
            n_cell_line=len(self.bulk_rna_broad.index),
            n_gene=len(self.bulk_rna_broad.columns),
            cell_line=self.bulk_rna_broad.index.values,
            gene=self.bulk_rna_broad.columns.values,
            reference_id="DepMap_ID",
            reference_id_example="DepMap_ID: ACH-001113",
            default_parameter={
                "query_id": "DepMap_ID",
                "cell_line_source": "broad",
            },
        )
        sanger_record = bulk_rna_annotation(
            n_cell_line=len(self.bulk_rna_sanger.index),
            n_gene=len(self.bulk_rna_sanger.columns),
            cell_line=self.bulk_rna_sanger.index.values,
            gene=self.bulk_rna_sanger.columns.values,
            reference_id="model_name",
            reference_id_example="model_name: MEC-1",
            default_parameter={
                "query_id": "cell_line_name",
                "cell_line_source": "sanger",
            },
        )
        self.bulk_rna = bulk_rna_expression(broad_record, sanger_record)

        # --- proteomics ---
        proteomics = namedtuple(
            "proteomics",
            "n_cell_line cell_line n_protein protein metadata reference_id reference_id_example default_parameter",
        )
        model_names = self.proteomics_data["model_name"].unique()
        uniprot_ids = self.proteomics_data.uniprot_id.unique()
        self.proteomics = proteomics(
            n_cell_line=len(model_names),
            n_protein=len(uniprot_ids),
            cell_line=model_names,
            protein=uniprot_ids,
            metadata=self.proteomics_data.columns.values,
            reference_id=["model_id", "model_name"],
            reference_id_example="model_id: SIDM00483 | model_name: SK-GT-4",
            default_parameter={
                "query_id": "cell_line_name",
                "reference_id": "model_name",
                "bulk_rna_information": "read_count",
                "protein_information": "protein_intensity",
                "protein_id": "uniprot_id",
            },
        )

        # --- drug response (GDSC1 / GDSC2) ---
        drug_response_annotation = namedtuple(
            "drug_response_annotation",
            "n_cell_line cell_line n_drug drug_name metadata reference_id reference_id_example default_parameter",
        )
        drug_response = namedtuple("drug_response", ["gdsc1", "gdsc2"])

        def summarize_gdsc(frame, example, dataset_id):
            # One builder for both releases so the per-dataset defaults cannot drift apart.
            gdsc_cell_lines = frame["cell_line_name"].unique()
            gdsc_drugs = frame.drug_name.unique()
            return drug_response_annotation(
                n_cell_line=len(gdsc_cell_lines),
                n_drug=len(gdsc_drugs),
                cell_line=gdsc_cell_lines,
                drug_name=gdsc_drugs,
                metadata=frame.columns.values,
                reference_id=["cell_line_name", "sanger_model_id", "cosmic_id"],
                reference_id_example=example,
                default_parameter={
                    "gdsc_dataset": dataset_id,
                    "query_id": "cell_line_name",
                    "reference_id": "cell_line_name",
                    "query_perturbation": "perturbation",
                    "reference_perturbation": "drug_name",
                },
            )

        self.drug_response = drug_response(
            summarize_gdsc(
                self.drug_response_gdsc1,
                "cell_line_name: ES5 | sanger_model_id: SIDM00263 | cosmic_id: 684057",
                "1",
            ),
            summarize_gdsc(
                self.drug_response_gdsc2,
                "cell_line_name: PFSK-1 | sanger_model_id: SIDM01132 | cosmic_id: 683667",
                "2",
            ),
        )

    def _init_moa(self, transfer_metadata: Sequence[pd.DataFrame]) -> None:
        """Store the MoA DataFrame and build its summary record."""
        self.moa_meta = transfer_metadata[0]
        moa_annotation = namedtuple(
            "moa_annotation",
            "n_pert n_moa query_id query_id_example target_example default_parameter",
        )
        self.moa = moa_annotation(
            n_pert=len(self.moa_meta.pert_iname.unique()),
            n_moa=len(self.moa_meta.moa.unique()),
            query_id="pert_iname",
            query_id_example=[
                "(R)-(-)-apomorphine",
                "9-aminocamptothecin",
                "A-803467",
            ],
            target_example=[
                "ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|DRD5|HTR1A|HTR1B|HTR1D|HTR2A|HTR2B|HTR2C|HTR5A",
                "SCN10A",
                "TOP1",
            ],
            default_parameter={
                "query_id": "pert_iname",
                "target": None,
            },
        )

    def _init_compound(self) -> None:
        """Build the compound summary record (needs no DataFrames)."""
        compound_annotation = namedtuple("compound_annotation", "query_id query_id_example default_parameter")
        # The keyword must match the namedtuple field `query_id`; the former
        # `query_id_type` key made this constructor call raise a TypeError.
        self.compound = compound_annotation(
            query_id=["name", "cid"],
            query_id_example="name: ACH-000016 | cid: SLR 21",
            default_parameter={
                "query_id": "perturbation",
                "query_id_type": "name",
            },
        )

    def _init_drug(self, transfer_metadata: Sequence[pd.DataFrame]) -> None:
        """Store the drug annotation DataFrames and build their summary records."""
        self.chembl = transfer_metadata[0]
        self.dgidb = transfer_metadata[1]
        self.pharmgkb = transfer_metadata[2]

        drug_annotation = namedtuple(
            "drug_annotation",
            "n_compound compound_example n_target target_example n_disease disease_example",
        )
        drugs = namedtuple("drugs", ["chembl", "dgidb", "pharmgkb"])

        dgidb_record = drug_annotation(
            n_compound=len(self.dgidb.drug_claim_name.unique()),
            n_target=len(self.dgidb.gene_claim_name.unique()),
            compound_example=self.dgidb.drug_claim_name.values[0:5],
            target_example=self.dgidb.gene_claim_name.unique()[0:5],
            n_disease=0,
            disease_example="",
        )

        # Flatten the target column and remove duplicates.
        chembl_targets = list({t for target in self.chembl.targets.tolist() for t in target})
        chembl_record = drug_annotation(
            n_compound=len(self.chembl.compounds),
            n_target=len(chembl_targets),
            compound_example=self.chembl.compounds.values[0:5],
            target_example=chembl_targets[0:5],
            n_disease=0,
            disease_example="",
        )

        # The "Compound|Disease" column holds chemicals or diseases depending on "Type".
        pharmgkb_chemicals = self.pharmgkb[self.pharmgkb.Type == "Chemical"]["Compound|Disease"].unique()
        pharmgkb_diseases = self.pharmgkb[self.pharmgkb.Type == "Disease"]["Compound|Disease"].unique()
        pharmgkb_genes = self.pharmgkb.Gene.unique()
        pharmgkb_record = drug_annotation(
            n_compound=len(pharmgkb_chemicals),
            n_target=len(pharmgkb_genes),
            compound_example=pharmgkb_chemicals[0:5],
            target_example=pharmgkb_genes[0:5],
            n_disease=len(pharmgkb_diseases),
            disease_example=pharmgkb_diseases[0:5],
        )
        self.drugs = drugs(chembl_record, dgidb_record, pharmgkb_record)

    @staticmethod
    def _log_match_counts(n_total: int, n_unmatched: int, label: str) -> None:
        """Log how many of `n_total` queried `label` were missing from / present in the metadata."""
        logger.info(f"{n_unmatched} {label} are not found in the metadata.")
        logger.info(f"{n_total - n_unmatched} {label} are found! ")

    def available_cell_lines(
        self,
        cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap",
        reference_id: str = "ModelID",
        query_id_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of cell line metadata.

        Args:
            cell_line_source: the source of cell line annotation, DepMap or Cancerrxgene.
            reference_id: The type of cell line identifier in the meta data, e.g. ModelID, CellLineName or StrippedCellLineName.
                If fetch cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
            query_id_list: Unique cell line identifiers to test the number of matched ids present in the
                metadata. If set to None, the query of metadata identifiers will be disabled.

        Raises:
            ValueError: If this is not a cell line LookUp object or `reference_id` is not a column
                of the selected annotation.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specifically for CellLineMetaData!")
        if query_id_list is None:
            return

        if cell_line_source == "DepMap":
            if reference_id not in self.cell_line_meta.columns:
                raise ValueError(
                    f"The specified `reference_id` {reference_id} is not available in the DepMap cell line annotation data. "
                )
            reference_values = self.cell_line_meta[reference_id]
        else:
            # The DepMap default identifier does not exist in the Cancerrxgene annotation.
            if reference_id == "ModelID":
                reference_id = "stripped_cell_line_name"
            if reference_id not in self.cl_cancer_project_meta.columns:
                raise ValueError(
                    f"The specified `reference_id` {reference_id} is not available "
                    f"in the cell line annotation from the project Genomics of Drug Sensitivity in Cancer. "
                )
            reference_values = self.cl_cancer_project_meta[reference_id]

        not_matched_identifiers = set(query_id_list) - set(reference_values)
        self._log_match_counts(len(query_id_list), len(not_matched_identifiers), "cell lines")

    def available_bulk_rna(
        self,
        cell_line_source: Literal["broad", "sanger"] = "sanger",
        query_id_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of bulk RNA expression data.

        Args:
            cell_line_source: the source of RNA-seq data, broad or sanger.
            query_id_list: Unique cell line identifiers to test the number of matched ids present in the
                metadata. If set to None, the query of metadata identifiers will be disabled.

        Raises:
            ValueError: If this is not a cell line LookUp object.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specific for CellLineMetaData!")

        bulk_rna = self.bulk_rna_broad if cell_line_source == "broad" else self.bulk_rna_sanger

        if query_id_list is not None:
            # Cell lines are matched against the index of the expression table.
            not_matched_identifiers = set(query_id_list) - set(bulk_rna.index)
            self._log_match_counts(len(query_id_list), len(not_matched_identifiers), "cell lines")

    def available_protein_expression(
        self,
        reference_id: Literal["model_name", "model_id"] = "model_name",
        query_id_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of protein expression data.

        Args:
            reference_id: The type of cell line identifier in the meta data, model_name or model_id.
            query_id_list: Unique cell line identifiers to test the number of matched ids present in the
                metadata. If set to None, the query of metadata identifiers will be disabled.

        Raises:
            ValueError: If this is not a cell line LookUp object or `reference_id` is not a column
                of the proteomics data.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
        if query_id_list is None:
            return

        if reference_id not in self.proteomics_data.columns:
            raise ValueError(
                f"The specified `reference_id` {reference_id} is not available in the proteomics data. "
            )
        not_matched_identifiers = set(query_id_list) - set(self.proteomics_data[reference_id])
        self._log_match_counts(len(query_id_list), len(not_matched_identifiers), "cell lines")

    def available_drug_response(
        self,
        gdsc_dataset: Literal[1, 2] = 1,
        reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
        query_id_list: Sequence[str] | None = None,
        reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
        query_perturbation_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of drug response data.

        Args:
            gdsc_dataset: The GDSC dataset, 1 or 2.
                The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital.
                It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
                GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
            reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id.
            query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata.
                If set to None, the query of metadata identifiers will be disabled.
            reference_perturbation: The perturbation information in the meta data, drug_name or drug_id.
            query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata.
                If set to None, the query of perturbation types will be disabled.

        Raises:
            ValueError: If this is not a cell line LookUp object, or `reference_id` /
                `reference_perturbation` is not a column of the selected GDSC data.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specific for CellLineMetaData!")

        gdsc_data = self.drug_response_gdsc1 if gdsc_dataset == 1 else self.drug_response_gdsc2

        if query_id_list is not None:
            if reference_id not in gdsc_data.columns:
                raise ValueError(
                    f"The specified `reference_id` {reference_id} is not available in the GDSC drug response data. "
                )
            not_matched_identifiers = set(query_id_list) - set(gdsc_data[reference_id])
            self._log_match_counts(len(query_id_list), len(not_matched_identifiers), "cell lines")

        if query_perturbation_list is not None:
            if reference_perturbation not in gdsc_data.columns:
                raise ValueError(
                    f"The specified `reference_perturbation` {reference_perturbation} is not available in the GDSC drug response data. "
                )
            not_matched_identifiers = set(query_perturbation_list) - set(gdsc_data[reference_perturbation])
            self._log_match_counts(
                len(query_perturbation_list), len(not_matched_identifiers), "perturbation types"
            )

    def available_genes_annotation(
        self,
        reference_id: Literal["gene_id", "ensembl_gene_id", "hgnc_id", "hgnc_symbol"] = "ensembl_gene_id",
        query_id_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of gene annotation metadata

        Args:
            reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
                Currently unused: matching of query identifiers is not implemented yet.
            query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata.
                Currently unused: matching of query identifiers is not implemented yet.

        Raises:
            ValueError: If this is not a cell line LookUp object.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specific for CellLineMetaData!")

        column_names = [str(column) for column in self.gene_annotation.columns.values]

        logger.info("To summarize: in the DepMap_Sanger gene annotation file, you can find: ")
        logger.info(f"{len(self.gene_annotation.index)} driver genes")
        # A logger call takes a single message (no print-style `*args` / `sep`),
        # so the bullet list of columns is joined into one string first.
        logger.info("\n- ".join([f"{len(column_names)} meta data including: ", *column_names]))
        logger.info("Overview of gene annotation: ")
        logger.info(self.gene_annotation.head().to_string())
        # TODO: report how many entries of `query_id_list` match `reference_id` (not implemented yet).

    def available_moa(
        self,
        query_id_list: Sequence[str] | None = None,
        target_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of MoA annotation.

        Args:
            query_id_list: Unique perturbagens to test the number of matched ones present in the metadata.
                If set to None, the query of metadata perturbagens will be disabled.
            target_list: Unique molecular targets to test the number of matched ones present in the metadata.
                If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled.

        Raises:
            ValueError: If this is not a MoA LookUp object.
        """
        # Checked unconditionally: `target_list` also reads `self.moa_meta`,
        # which only exists on MoA lookups.
        if self.type != "moa":
            raise ValueError("This is not a LookUp object specific for MoaMetaData!")

        if query_id_list is not None:
            not_matched_identifiers = set(query_id_list) - set(self.moa_meta.pert_iname)
            self._log_match_counts(len(query_id_list), len(not_matched_identifiers), "perturbagens")

        if target_list is not None:
            # Each entry of the target column is a "|"-separated list of targets.
            all_targets = {
                target for entry in self.moa_meta.target.astype(str) for target in entry.split("|")
            }
            not_matched_identifiers = set(target_list) - all_targets
            self._log_match_counts(len(target_list), len(not_matched_identifiers), "molecular targets")

    def available_compounds(
        self,
        query_id_list: Sequence[str] | None = None,
        query_id_type: Literal["name", "cid"] = "name",
    ) -> None:
        """A brief summary of compound annotation.

        Args:
            query_id_list: Unique compounds to test the number of matched ones present in the metadata.
                If set to None, query of compound identifiers will be disabled.
            query_id_type: The type of compound identifiers, name or cid.

        Raises:
            ValueError: If this is not a compound LookUp object.
        """
        if self.type != "compound":
            raise ValueError("This is not a LookUp object specific for CompoundData!")
        if query_id_list is None:
            return

        not_matched_identifiers = []
        for compound in query_id_list:
            if query_id_type == "name":
                # An empty result means the name search did not find anything.
                if len(pcp.get_compounds(compound, "name")) == 0:
                    not_matched_identifiers.append(compound)
            else:
                try:
                    pcp.Compound.from_cid(compound)
                except pcp.BadRequestError:
                    not_matched_identifiers.append(compound)

        self._log_match_counts(len(query_id_list), len(not_matched_identifiers), "compounds")

    def available_drug_annotation(
        self,
        drug_annotation_source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
        query_id_list: Sequence[str] | None = None,
        query_id_type: Literal["target", "compound", "disease"] = "target",
    ) -> None:
        """A brief summary of drug annotation.

        Args:
            drug_annotation_source: the source of drug annotation data, chembl, dgidb or pharmgkb.
            query_id_list: Unique target or compound names to test the number of matched ones present in the metadata.
                If set to None, query of compound identifiers will be disabled.
            query_id_type: The type of identifiers, target, compound and disease(pharmgkb only).

        Raises:
            ValueError: If this is not a drug LookUp object, or a disease query is made against
                chembl or dgidb.
        """
        if self.type != "drug":
            raise ValueError("This is not a LookUp object specific for DrugMetaData!")
        if query_id_list is None:
            return

        if drug_annotation_source == "chembl":
            if query_id_type == "target":
                # Flatten the target column and remove duplicates.
                reference_values = {t for target in self.chembl.targets.tolist() for t in target}
            elif query_id_type == "compound":
                # Convert to a set first: `set - Series` is not a set difference.
                reference_values = set(self.chembl["compounds"])
            else:
                raise ValueError(
                    "Gene-disease association is not available in chembl dataset, please try with pharmgkb."
                )
        elif drug_annotation_source == "dgidb":
            if query_id_type == "target":
                reference_values = set(self.dgidb["gene_claim_name"])
            elif query_id_type == "compound":
                reference_values = set(self.dgidb["drug_claim_name"])
            else:
                raise ValueError(
                    "Gene-disease association is not available in dgidb dataset, please try with pharmgkb."
                )
        else:
            if query_id_type == "target":
                reference_values = set(self.pharmgkb["Gene"])
            elif query_id_type == "compound":
                compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
                reference_values = set(compounds["Compound|Disease"])
            else:
                diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
                reference_values = set(diseases["Compound|Disease"])

        not_matched_identifiers = set(query_id_list) - reference_values
        self._log_match_counts(len(query_id_list), len(not_matched_identifiers), f"{query_id_type}s")
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ from lamin_utils import logger
6
+
7
+ if TYPE_CHECKING:
8
+ from collections.abc import Sequence
9
+
10
+
11
class MetaData:
    """Superclass for pertpy's MetaData components."""

    def _warn_unmatch(
        self,
        total_identifiers: int,
        unmatched_identifiers: Sequence[str],
        query_id: str,
        reference_id: str,
        metadata_type: Literal[
            "cell line",
            "protein expression",
            "bulk RNA",
            "drug response",
            "moa",
            "compound",
        ] = "cell line",
        verbosity: int | str = 5,
    ) -> None:
        """Helper function to print out the unmatched identifiers.

        Args:
            total_identifiers: The total number of identifiers in the `adata` object.
            unmatched_identifiers: Unmatched identifiers in the `adata` object.
            query_id: The column of `.obs` with cell line information.
            reference_id: The type of cell line identifier in the metadata.
            metadata_type: The type of metadata where some identifiers are not matched during annotation such as
                cell line, protein expression, bulk RNA expression, drug response, moa or compound.
            verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.

        Raises:
            ValueError: If none of the identifiers could be matched, or if `verbosity` is neither
                'all' nor a non-negative integer.
        """
        if isinstance(verbosity, str):
            if verbosity != "all":
                raise ValueError("Only a non-negative value or 'all' is accepted.")
            # 'all' means: show every unmatched identifier.
            verbosity = len(unmatched_identifiers)

        if len(unmatched_identifiers) == total_identifiers:
            hint = ""
            if metadata_type in ["protein expression", "bulk RNA", "drug response"]:
                hint = "Additionally, call the `CellLineMetaData.annotate()` function to acquire more possible query IDs that can be used for cell line annotation purposes."
            raise ValueError(
                f"Attempting to match the query id {query_id} in 'adata.obs' to the reference id {reference_id} in the metadata.\n"
                f"However, none of the query IDs could be found in the {metadata_type} annotation data.\n"
                "To resolve this issue, call the `lookup()` function to create a LookUp object.\n"
                "This enables obtaining the count of matched identifiers in the AnnData object for different types of reference and query IDs.\n"
                f"{hint}"
            )
        if len(unmatched_identifiers) == 0:
            return
        if isinstance(verbosity, int) and verbosity >= 0:
            # Never try to show more identifiers than there are.
            verbosity = min(verbosity, len(unmatched_identifiers))
            if verbosity > 0:
                shown_identifiers = ", ".join(str(identifier) for identifier in list(unmatched_identifiers)[:verbosity])
                logger.info(
                    f"There are {total_identifiers} identifiers in `adata.obs`. "
                    f"However, {len(unmatched_identifiers)} identifiers can't be found in the {metadata_type} annotation, "
                    "leading to the presence of NA values for their respective metadata.\n"
                    f"Please check again: {shown_identifiers}..."
                )
        else:
            raise ValueError("Only 'all' or a non-negative value is accepted.")