pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. pertpy/__init__.py +4 -2
  2. pertpy/data/__init__.py +66 -1
  3. pertpy/data/_dataloader.py +28 -26
  4. pertpy/data/_datasets.py +261 -92
  5. pertpy/metadata/__init__.py +6 -0
  6. pertpy/metadata/_cell_line.py +795 -0
  7. pertpy/metadata/_compound.py +128 -0
  8. pertpy/metadata/_drug.py +238 -0
  9. pertpy/metadata/_look_up.py +569 -0
  10. pertpy/metadata/_metadata.py +70 -0
  11. pertpy/metadata/_moa.py +125 -0
  12. pertpy/plot/__init__.py +0 -13
  13. pertpy/preprocessing/__init__.py +2 -0
  14. pertpy/preprocessing/_guide_rna.py +89 -6
  15. pertpy/tools/__init__.py +48 -15
  16. pertpy/tools/_augur.py +329 -32
  17. pertpy/tools/_cinemaot.py +145 -6
  18. pertpy/tools/_coda/_base_coda.py +1237 -116
  19. pertpy/tools/_coda/_sccoda.py +66 -36
  20. pertpy/tools/_coda/_tasccoda.py +46 -39
  21. pertpy/tools/_dialogue.py +180 -77
  22. pertpy/tools/_differential_gene_expression/__init__.py +20 -0
  23. pertpy/tools/_differential_gene_expression/_base.py +657 -0
  24. pertpy/tools/_differential_gene_expression/_checks.py +41 -0
  25. pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
  26. pertpy/tools/_differential_gene_expression/_edger.py +125 -0
  27. pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
  28. pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
  29. pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
  30. pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
  31. pertpy/tools/_distances/_distance_tests.py +29 -24
  32. pertpy/tools/_distances/_distances.py +584 -98
  33. pertpy/tools/_enrichment.py +460 -0
  34. pertpy/tools/_kernel_pca.py +1 -1
  35. pertpy/tools/_milo.py +406 -49
  36. pertpy/tools/_mixscape.py +677 -55
  37. pertpy/tools/_perturbation_space/_clustering.py +10 -3
  38. pertpy/tools/_perturbation_space/_comparison.py +112 -0
  39. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
  40. pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
  41. pertpy/tools/_perturbation_space/_simple.py +52 -11
  42. pertpy/tools/_scgen/__init__.py +1 -1
  43. pertpy/tools/_scgen/_base_components.py +2 -3
  44. pertpy/tools/_scgen/_scgen.py +706 -0
  45. pertpy/tools/_scgen/_utils.py +3 -5
  46. pertpy/tools/decoupler_LICENSE +674 -0
  47. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
  48. pertpy-0.8.0.dist-info/RECORD +57 -0
  49. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
  50. pertpy/plot/_augur.py +0 -234
  51. pertpy/plot/_cinemaot.py +0 -81
  52. pertpy/plot/_coda.py +0 -1001
  53. pertpy/plot/_dialogue.py +0 -91
  54. pertpy/plot/_guide_rna.py +0 -82
  55. pertpy/plot/_milopy.py +0 -284
  56. pertpy/plot/_mixscape.py +0 -594
  57. pertpy/plot/_scgen.py +0 -337
  58. pertpy/tools/_differential_gene_expression.py +0 -99
  59. pertpy/tools/_metadata/__init__.py +0 -0
  60. pertpy/tools/_metadata/_cell_line.py +0 -613
  61. pertpy/tools/_metadata/_look_up.py +0 -342
  62. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  63. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  64. pertpy-0.6.0.dist-info/RECORD +0 -50
  65. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  66. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,569 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import namedtuple
4
+ from typing import TYPE_CHECKING, Literal
5
+
6
+ from lamin_utils import logger
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Sequence
10
+
11
+ if TYPE_CHECKING:
12
+ import pandas as pd
13
+
14
+ import pubchempy as pcp
15
+
16
+
17
class LookUp:
    """Generate LookUp object for different type of metadata.

    Depending on `type`, the instance stores the transferred DataFrames and exposes
    namedtuple summaries of them (e.g. `cell_lines`, `bulk_rna`, `proteomics`,
    `drug_response`, `moa`, `compound` or `drugs`). The `available_*` methods log how
    many query identifiers can be matched against the stored metadata.
    """

    def __init__(
        self,
        type: Literal["cell_line", "moa", "compound", "drug"] = "cell_line",
        transfer_metadata: Sequence[pd.DataFrame] | None = None,
    ):
        """
        Args:
            type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug'.
            transfer_metadata: DataFrames used to generate Lookup object.
                This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.

        Raises:
            NotImplementedError: If `type` is not one of the supported metadata types.
        """
        self.type = type
        if type == "cell_line":
            self._init_cell_line(transfer_metadata)
        elif type == "moa":
            self._init_moa(transfer_metadata)
        elif type == "compound":
            self._init_compound()
        elif type == "drug":
            self._init_drug(transfer_metadata)
        else:
            raise NotImplementedError

    def _init_cell_line(self, transfer_metadata: Sequence[pd.DataFrame]) -> None:
        """Store the eight cell line DataFrames (by position) and build their summaries."""
        self.cell_line_meta = transfer_metadata[0]
        self.cl_cancer_project_meta = transfer_metadata[1]
        self.gene_annotation = transfer_metadata[2]
        self.bulk_rna_sanger = transfer_metadata[3]
        self.bulk_rna_broad = transfer_metadata[4]
        self.proteomics_data = transfer_metadata[5]
        self.drug_response_gdsc1 = transfer_metadata[6]
        self.drug_response_gdsc2 = transfer_metadata[7]

        # --- cell line annotation (DepMap / Cancerrxgene) ---
        cell_line_annotation = namedtuple(
            "cell_line_annotation",
            "n_cell_line cell_line n_metadata metadata reference_id reference_id_example default_parameter",
        )
        cell_lines = namedtuple("cell_lines", ["depmap", "cancerrxgene"])

        depmap_record = cell_line_annotation(
            n_cell_line=len(self.cell_line_meta.index),
            n_metadata=len(self.cell_line_meta.columns),
            cell_line=self.cell_line_meta.ModelID.values,
            metadata=self.cell_line_meta.columns.values,
            reference_id=[
                "ModelID",
                "CellLineName",
                "StrippedCellLineName",
                "CCLE_Name",
            ],
            reference_id_example="ModelID: ACH-000001 | CellLineName: NIH:OVCAR-3 | StrippedCellLineName: NIHOVCAR3 | CCLEName: NIHOVCAR3_OVARY",
            default_parameter={
                "cell_line_source": "DepMap",
                "query_id": "DepMap_ID",
                "reference_id": "ModelID",
                "fetch": "None",
            },
        )
        cancerrxgene_record = cell_line_annotation(
            n_cell_line=len(self.cl_cancer_project_meta.index),
            n_metadata=len(self.cl_cancer_project_meta.columns),
            cell_line=self.cl_cancer_project_meta.stripped_cell_line_name.values,
            metadata=self.cl_cancer_project_meta.columns.values,
            reference_id=[
                "cell_line_name",
                "stripped_cell_line_name",
                "Model ID",
                "COSMIC ID",
            ],
            reference_id_example="cell_line_name: SNU-283 | stripped_cell_line_name: SNU283 | Model ID: SIDM00215 | COSMIC ID: 1659929",
            default_parameter={
                "query_id": "stripped_cell_line_name",
                "reference_id": "stripped_cell_line_name",
                "fetch": "None",
            },
        )
        self.cell_lines = cell_lines(depmap_record, cancerrxgene_record)

        # --- bulk RNA expression (broad / sanger) ---
        bulk_rna_annotation = namedtuple(
            "bulk_rna_annotation",
            "n_cell_line cell_line n_gene gene reference_id reference_id_example default_parameter",
        )
        bulk_rna_expression = namedtuple("bulk_rna_expression", ["broad", "sanger"])

        broad_record = bulk_rna_annotation(
            n_cell_line=len(self.bulk_rna_broad.index),
            n_gene=len(self.bulk_rna_broad.columns),
            cell_line=self.bulk_rna_broad.index.values,
            gene=self.bulk_rna_broad.columns.values,
            reference_id="DepMap_ID",
            reference_id_example="DepMap_ID: ACH-001113",
            default_parameter={
                "query_id": "DepMap_ID",
                "cell_line_source": "broad",
            },
        )
        sanger_record = bulk_rna_annotation(
            n_cell_line=len(self.bulk_rna_sanger.index),
            n_gene=len(self.bulk_rna_sanger.columns),
            cell_line=self.bulk_rna_sanger.index.values,
            gene=self.bulk_rna_sanger.columns.values,
            reference_id="model_name",
            reference_id_example="model_name: MEC-1",
            default_parameter={
                "query_id": "cell_line_name",
                "cell_line_source": "sanger",
            },
        )
        self.bulk_rna = bulk_rna_expression(broad_record, sanger_record)

        # --- proteomics ---
        proteomics = namedtuple(
            "proteomics",
            "n_cell_line cell_line n_protein protein metadata reference_id reference_id_example default_parameter",
        )
        model_names = self.proteomics_data["model_name"].unique()
        uniprot_ids = self.proteomics_data.uniprot_id.unique()
        self.proteomics = proteomics(
            n_cell_line=len(model_names),
            n_protein=len(uniprot_ids),
            cell_line=model_names,
            protein=uniprot_ids,
            metadata=self.proteomics_data.columns.values,
            reference_id=["model_id", "model_name"],
            reference_id_example="model_id: SIDM00483 | model_name: SK-GT-4",
            default_parameter={
                "query_id": "cell_line_name",
                "reference_id": "model_name",
                "bulk_rna_information": "read_count",
                "protein_information": "protein_intensity",
                "protein_id": "uniprot_id",
            },
        )

        # --- drug response (GDSC1 / GDSC2) ---
        drug_response_annotation = namedtuple(
            "drug_response_annotation",
            "n_cell_line cell_line n_drug drug_name metadata reference_id reference_id_example default_parameter",
        )
        drug_response = namedtuple("drug_response", ["gdsc1", "gdsc2"])

        def summarize_gdsc(frame, dataset: str, example: str):
            # Both GDSC releases share one layout; only the data, the dataset id and the example differ.
            cell_line_names = frame["cell_line_name"].unique()
            drug_names = frame.drug_name.unique()
            return drug_response_annotation(
                n_cell_line=len(cell_line_names),
                n_drug=len(drug_names),
                cell_line=cell_line_names,
                drug_name=drug_names,
                metadata=frame.columns.values,
                reference_id=["cell_line_name", "sanger_model_id", "cosmic_id"],
                reference_id_example=example,
                default_parameter={
                    "gdsc_dataset": dataset,
                    "query_id": "cell_line_name",
                    "reference_id": "cell_line_name",
                    "query_perturbation": "perturbation",
                    "reference_perturbation": "drug_name",
                },
            )

        gdsc1_record = summarize_gdsc(
            self.drug_response_gdsc1,
            "1",
            "cell_line_name: ES5 | sanger_model_id: SIDM00263 | cosmic_id: 684057",
        )
        # The GDSC2 summary must advertise dataset "2" (it previously repeated "1").
        gdsc2_record = summarize_gdsc(
            self.drug_response_gdsc2,
            "2",
            "cell_line_name: PFSK-1 | sanger_model_id: SIDM01132 | cosmic_id: 683667",
        )
        self.drug_response = drug_response(gdsc1_record, gdsc2_record)

    def _init_moa(self, transfer_metadata: Sequence[pd.DataFrame]) -> None:
        """Store the MoA DataFrame and build its summary."""
        self.moa_meta = transfer_metadata[0]
        moa_annotation = namedtuple(
            "moa_annotation",
            "n_pert n_moa query_id query_id_example target_example default_parameter",
        )
        self.moa = moa_annotation(
            n_pert=len(self.moa_meta.pert_iname.unique()),
            n_moa=len(self.moa_meta.moa.unique()),
            query_id="pert_iname",
            query_id_example=[
                "(R)-(-)-apomorphine",
                "9-aminocamptothecin",
                "A-803467",
            ],
            target_example=[
                "ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|DRD5|HTR1A|HTR1B|HTR1D|HTR2A|HTR2B|HTR2C|HTR5A",
                "SCN10A",
                "TOP1",
            ],
            default_parameter={
                "query_id": "pert_iname",
                "target": None,
            },
        )

    def _init_compound(self) -> None:
        """Build the compound summary; no DataFrames are needed for this type."""
        # The field name must match the keyword used below; the former field name
        # `query_id` made the construction fail with a TypeError.
        compound_annotation = namedtuple(
            "compound_annotation", "query_id_type query_id_example default_parameter"
        )
        self.compound = compound_annotation(
            query_id_type=["name", "cid"],
            query_id_example="name: ACH-000016 | cid: SLR 21",
            default_parameter={
                "query_id": "perturbation",
                "query_id_type": "name",
            },
        )

    def _init_drug(self, transfer_metadata: Sequence[pd.DataFrame]) -> None:
        """Store the chembl, dgidb and pharmgkb DataFrames (by position) and build their summaries."""
        self.chembl = transfer_metadata[0]
        self.dgidb = transfer_metadata[1]
        self.pharmgkb = transfer_metadata[2]

        drug_annotation = namedtuple(
            "drug_annotation",
            "n_compound compound_example n_target target_example n_disease disease_example",
        )
        drugs = namedtuple("drugs", ["chembl", "dgidb", "pharmgkb"])

        dgidb_record = drug_annotation(
            n_compound=len(self.dgidb.drug_claim_name.unique()),
            n_target=len(self.dgidb.gene_claim_name.unique()),
            compound_example=self.dgidb.drug_claim_name.values[0:5],
            target_example=self.dgidb.gene_claim_name.unique()[0:5],
            n_disease=0,
            disease_example="",
        )

        # flatten the target column and remove duplicates
        chembl_targets = list({t for target in self.chembl.targets.tolist() for t in target})
        chembl_record = drug_annotation(
            n_compound=len(self.chembl.compounds),
            n_target=len(chembl_targets),
            compound_example=self.chembl.compounds.values[0:5],
            target_example=chembl_targets[0:5],
            n_disease=0,
            disease_example="",
        )

        # In pharmgkb the `Type` column tells whether `Compound|Disease` holds a chemical or a disease.
        chemicals = self.pharmgkb[self.pharmgkb.Type == "Chemical"]["Compound|Disease"].unique()
        diseases = self.pharmgkb[self.pharmgkb.Type == "Disease"]["Compound|Disease"].unique()
        genes = self.pharmgkb.Gene.unique()
        pharmgkb_record = drug_annotation(
            n_compound=len(chemicals),
            n_target=len(genes),
            compound_example=chemicals[0:5],
            target_example=genes[0:5],
            n_disease=len(diseases),
            disease_example=diseases[0:5],
        )
        self.drugs = drugs(chembl_record, dgidb_record, pharmgkb_record)

    @staticmethod
    def _log_match_counts(n_queried: int, n_unmatched: int, noun: str) -> None:
        """Log how many queried identifiers are missing from, and present in, the metadata."""
        logger.info(f"{n_unmatched} {noun} are not found in the metadata.")
        logger.info(f"{n_queried - n_unmatched} {noun} are found! ")

    def available_cell_lines(
        self,
        cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap",
        reference_id: str = "ModelID",
        query_id_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of cell line metadata.

        Args:
            cell_line_source: the source of cell line annotation, DepMap or Cancerrxgene.
            reference_id: The type of cell line identifier in the meta data, e.g. ModelID, CellLineName or StrippedCellLineName.
                If fetch cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
            query_id_list: Unique cell line identifiers to test the number of matched ids present in the
                metadata. If set to None, the query of metadata identifiers will be disabled.

        Raises:
            ValueError: If this is not a cell line LookUp object, or `reference_id` is not a column
                of the selected annotation.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specifically for CellLineMetaData!")
        if query_id_list is None:
            return

        if cell_line_source == "DepMap":
            reference = self.cell_line_meta
            if reference_id not in reference.columns:
                raise ValueError(
                    f"The specified `reference_id` {reference_id} is not available in the DepMap cell line annotation data. "
                )
        else:
            reference = self.cl_cancer_project_meta
            # The DepMap default identifier does not exist here; fall back to the Cancerrxgene default.
            if reference_id == "ModelID":
                reference_id = "stripped_cell_line_name"
            if reference_id not in reference.columns:
                raise ValueError(
                    f"The specified `reference_id` {reference_id} is not available "
                    f"in the cell line annotation from the project Genomics of Drug Sensitivity in Cancer. "
                )

        unmatched = set(query_id_list) - set(reference[reference_id])
        self._log_match_counts(len(query_id_list), len(unmatched), "cell lines")

    def available_bulk_rna(
        self,
        cell_line_source: Literal["broad", "sanger"] = "sanger",
        query_id_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of bulk RNA expression data.

        Args:
            cell_line_source: the source of RNA-seq data, broad or sanger.
            query_id_list: Unique cell line identifiers to test the number of matched ids present in the
                metadata. If set to None, the query of metadata identifiers will be disabled.

        Raises:
            ValueError: If this is not a cell line LookUp object.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specific for CellLineMetaData!")

        bulk_rna = self.bulk_rna_broad if cell_line_source == "broad" else self.bulk_rna_sanger

        if query_id_list is not None:
            # Cell line identifiers are matched against the index of the expression table.
            unmatched = set(query_id_list) - set(bulk_rna.index)
            self._log_match_counts(len(query_id_list), len(unmatched), "cell lines")

    def available_protein_expression(
        self,
        reference_id: Literal["model_name", "model_id"] = "model_name",
        query_id_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of protein expression data.

        Args:
            reference_id: The type of cell line identifier in the meta data, model_name or model_id.
            query_id_list: Unique cell line identifiers to test the number of matched ids present in the
                metadata. If set to None, the query of metadata identifiers will be disabled.

        Raises:
            ValueError: If this is not a cell line LookUp object, or `reference_id` is not a column
                of the proteomics data.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specific for CellLineMetaData!")

        if query_id_list is not None:
            if reference_id not in self.proteomics_data.columns:
                raise ValueError(
                    f"The specified `reference_id` {reference_id} is not available in the proteomics data. "
                )
            unmatched = set(query_id_list) - set(self.proteomics_data[reference_id])
            self._log_match_counts(len(query_id_list), len(unmatched), "cell lines")

    def available_drug_response(
        self,
        gdsc_dataset: Literal[1, 2] = 1,
        reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
        query_id_list: Sequence[str] | None = None,
        reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
        query_perturbation_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of drug response data.

        Args:
            gdsc_dataset: The GDSC dataset, 1 or 2.
                The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital.
                It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
                GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
            reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id.
            query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata.
                If set to None, the query of metadata identifiers will be disabled.
            reference_perturbation: The perturbation information in the meta data, drug_name or drug_id.
            query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata.
                If set to None, the query of perturbation types will be disabled.

        Raises:
            ValueError: If this is not a cell line LookUp object, or `reference_id` /
                `reference_perturbation` is not a column of the selected GDSC data.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specific for CellLineMetaData!")

        gdsc_data = self.drug_response_gdsc1 if gdsc_dataset == 1 else self.drug_response_gdsc2

        if query_id_list is not None:
            if reference_id not in gdsc_data.columns:
                raise ValueError(
                    f"The specified `reference_id` {reference_id} is not available in the GDSC drug response data. "
                )
            unmatched = set(query_id_list) - set(gdsc_data[reference_id])
            self._log_match_counts(len(query_id_list), len(unmatched), "cell lines")

        if query_perturbation_list is not None:
            if reference_perturbation not in gdsc_data.columns:
                raise ValueError(
                    f"The specified `reference_perturbation` {reference_perturbation} is not available in the GDSC drug response data. "
                )
            unmatched = set(query_perturbation_list) - set(gdsc_data[reference_perturbation])
            self._log_match_counts(len(query_perturbation_list), len(unmatched), "perturbation types")

    def available_genes_annotation(
        self,
        reference_id: Literal["gene_id", "ensembl_gene_id", "hgnc_id", "hgnc_symbol"] = "ensembl_gene_id",
        query_id_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of gene annotation metadata

        Args:
            reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
                Currently unused: matching of query identifiers is not implemented yet.
            query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata.
                Currently unused: matching of query identifiers is not implemented yet.

        Raises:
            ValueError: If this is not a cell line LookUp object.
        """
        if self.type != "cell_line":
            raise ValueError("This is not a LookUp object specific for CellLineMetaData!")

        logger.info("To summarize: in the DepMap_Sanger gene annotation file, you can find: ")
        logger.info(f"{len(self.gene_annotation.index)} driver genes")
        # The logger takes a single message (no print-style `*args` / `sep`), so the
        # bulleted column list is assembled into one string first.
        columns = [str(column) for column in self.gene_annotation.columns.values]
        logger.info("\n- ".join([f"{len(columns)} meta data including: ", *columns]))
        logger.info("Overview of gene annotation: ")
        logger.info(self.gene_annotation.head().to_string())
        # NOTE: reporting how many `query_id_list` entries match `reference_id` is not implemented yet.

    def available_moa(
        self,
        query_id_list: Sequence[str] | None = None,
        target_list: Sequence[str] | None = None,
    ) -> None:
        """A brief summary of MoA annotation.

        Args:
            query_id_list: Unique perturbagens to test the number of matched ones present in the metadata.
                If set to None, the query of metadata perturbagens will be disabled.
            target_list: Unique molecular targets to test the number of matched ones present in the metadata.
                If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled.

        Raises:
            ValueError: If this is not a MoA LookUp object.
        """
        # Validate up front so that a `target_list`-only query on a non-MoA object fails with
        # the same clear error instead of an AttributeError on the missing `moa_meta`.
        if self.type != "moa":
            raise ValueError("This is not a LookUp object specific for MoaMetaData!")

        if query_id_list is not None:
            unmatched = set(query_id_list) - set(self.moa_meta.pert_iname)
            self._log_match_counts(len(query_id_list), len(unmatched), "perturbagens")

        if target_list is not None:
            # Each entry of the target column holds one or more targets separated by "|".
            known_targets = {
                target for entry in self.moa_meta.target.astype(str) for target in entry.split("|")
            }
            unmatched = set(target_list) - known_targets
            self._log_match_counts(len(target_list), len(unmatched), "molecular targets")

    def available_compounds(
        self,
        query_id_list: Sequence[str] | None = None,
        query_id_type: Literal["name", "cid"] = "name",
    ) -> None:
        """A brief summary of compound annotation.

        Args:
            query_id_list: Unique compounds to test the number of matched ones present in the metadata.
                If set to None, query of compound identifiers will be disabled.
            query_id_type: The type of compound identifiers, name or cid.

        Raises:
            ValueError: If this is not a compound LookUp object.
        """
        if self.type != "compound":
            raise ValueError("This is not a LookUp object specific for CompoundData!")
        if query_id_list is None:
            return

        not_matched_identifiers = []
        for compound in query_id_list:
            if query_id_type == "name":
                # An empty result means the name search found nothing.
                if len(pcp.get_compounds(compound, "name")) == 0:
                    not_matched_identifiers.append(compound)
            else:
                try:
                    pcp.Compound.from_cid(compound)
                except pcp.BadRequestError:
                    not_matched_identifiers.append(compound)

        self._log_match_counts(len(query_id_list), len(not_matched_identifiers), "compounds")

    def available_drug_annotation(
        self,
        drug_annotation_source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
        query_id_list: Sequence[str] | None = None,
        query_id_type: Literal["target", "compound", "disease"] = "target",
    ) -> None:
        """A brief summary of drug annotation.

        Args:
            drug_annotation_source: the source of drug annotation data, chembl, dgidb or pharmgkb.
            query_id_list: Unique target or compound names to test the number of matched ones present in the metadata.
                If set to None, query of compound identifiers will be disabled.
            query_id_type: The type of identifiers, target, compound and disease(pharmgkb only).

        Raises:
            ValueError: If this is not a drug LookUp object, or a disease query is made against
                chembl or dgidb.
        """
        if self.type != "drug":
            raise ValueError("This is not a LookUp object specific for DrugMetaData!")
        if query_id_list is None:
            return

        if drug_annotation_source == "chembl":
            if query_id_type == "target":
                # flatten the target column and remove duplicates
                reference = {t for target in self.chembl.targets.tolist() for t in target}
            elif query_id_type == "compound":
                # Must be a set of the column values: `set - Series` is not a set difference.
                reference = set(self.chembl["compounds"])
            else:
                raise ValueError(
                    "Gene-disease association is not available in chembl dataset, please try with pharmgkb."
                )
        elif drug_annotation_source == "dgidb":
            if query_id_type == "target":
                reference = set(self.dgidb["gene_claim_name"])
            elif query_id_type == "compound":
                reference = set(self.dgidb["drug_claim_name"])
            else:
                raise ValueError(
                    "Gene-disease association is not available in dgidb dataset, please try with pharmgkb."
                )
        else:
            if query_id_type == "target":
                reference = set(self.pharmgkb["Gene"])
            elif query_id_type == "compound":
                compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
                reference = set(compounds["Compound|Disease"])
            else:
                diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
                reference = set(diseases["Compound|Disease"])

        unmatched = set(query_id_list) - reference
        self._log_match_counts(len(query_id_list), len(unmatched), f"{query_id_type}s")
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ from lamin_utils import logger
6
+
7
+ if TYPE_CHECKING:
8
+ from collections.abc import Sequence
9
+
10
+
11
class MetaData:
    """Superclass for pertpy's MetaData components."""

    def _warn_unmatch(
        self,
        total_identifiers: int,
        unmatched_identifiers: Sequence[str],
        query_id: str,
        reference_id: str,
        metadata_type: Literal[
            "cell line",
            "protein expression",
            "bulk RNA",
            "drug response",
            "moa",
            "compound",
        ] = "cell line",
        verbosity: int | str = 5,
    ) -> None:
        """Helper function to print out the unmatched identifiers.

        Args:
            total_identifiers: The total number of identifiers in the `adata` object.
            unmatched_identifiers: Unmatched identifiers in the `adata` object.
            query_id: The column of `.obs` with cell line information.
            reference_id: The type of cell line identifier in the metadata.
            metadata_type: The type of metadata where some identifiers are not matched during annotation such as
                cell line, protein expression, bulk RNA expression, drug response, moa or compound.
            verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.

        Raises:
            ValueError: If `verbosity` is neither 'all' nor a non-negative integer, or if none of
                the identifiers could be matched.
        """
        if isinstance(verbosity, str):
            if verbosity != "all":
                raise ValueError("Only a non-negative value or 'all' is accepted.")
            # 'all' means: report every unmatched identifier.
            verbosity = len(unmatched_identifiers)

        if len(unmatched_identifiers) == total_identifiers:
            hint = ""
            if metadata_type in ["protein expression", "bulk RNA", "drug response"]:
                hint = "Additionally, call the `CellLineMetaData.annotate()` function to acquire more possible query IDs that can be used for cell line annotation purposes."
            raise ValueError(
                f"Attempting to match the query id {query_id} in 'adata.obs' to the reference id {reference_id} in the metadata.\n"
                # This line needs the f-prefix too, otherwise `{metadata_type}` is printed literally.
                f"However, none of the query IDs could be found in the {metadata_type} annotation data.\n"
                "To resolve this issue, call the `lookup()` function to create a LookUp object.\n"
                "This enables obtaining the count of matched identifiers in the AnnData object for different types of reference and query IDs.\n"
                f"{hint}"
            )
        if len(unmatched_identifiers) == 0:
            return
        if isinstance(verbosity, int) and verbosity >= 0:
            # Never try to show more identifiers than there are.
            verbosity = min(verbosity, len(unmatched_identifiers))
            if verbosity > 0:
                shown = ", ".join(str(identifier) for identifier in list(unmatched_identifiers)[:verbosity])
                logger.info(
                    f"There are {total_identifiers} identifiers in `adata.obs`. "
                    f"However, {len(unmatched_identifiers)} identifiers can't be found in the {metadata_type} annotation, "
                    "leading to the presence of NA values for their respective metadata.\n"
                    f"Please check again: {shown}..."
                )
        else:
            raise ValueError("Only 'all' or a non-negative value is accepted.")