pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. pertpy/__init__.py +4 -2
  2. pertpy/data/__init__.py +66 -1
  3. pertpy/data/_dataloader.py +28 -26
  4. pertpy/data/_datasets.py +261 -92
  5. pertpy/metadata/__init__.py +6 -0
  6. pertpy/metadata/_cell_line.py +795 -0
  7. pertpy/metadata/_compound.py +128 -0
  8. pertpy/metadata/_drug.py +238 -0
  9. pertpy/metadata/_look_up.py +569 -0
  10. pertpy/metadata/_metadata.py +70 -0
  11. pertpy/metadata/_moa.py +125 -0
  12. pertpy/plot/__init__.py +0 -13
  13. pertpy/preprocessing/__init__.py +2 -0
  14. pertpy/preprocessing/_guide_rna.py +89 -6
  15. pertpy/tools/__init__.py +48 -15
  16. pertpy/tools/_augur.py +329 -32
  17. pertpy/tools/_cinemaot.py +145 -6
  18. pertpy/tools/_coda/_base_coda.py +1237 -116
  19. pertpy/tools/_coda/_sccoda.py +66 -36
  20. pertpy/tools/_coda/_tasccoda.py +46 -39
  21. pertpy/tools/_dialogue.py +180 -77
  22. pertpy/tools/_differential_gene_expression/__init__.py +20 -0
  23. pertpy/tools/_differential_gene_expression/_base.py +657 -0
  24. pertpy/tools/_differential_gene_expression/_checks.py +41 -0
  25. pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
  26. pertpy/tools/_differential_gene_expression/_edger.py +125 -0
  27. pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
  28. pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
  29. pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
  30. pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
  31. pertpy/tools/_distances/_distance_tests.py +29 -24
  32. pertpy/tools/_distances/_distances.py +584 -98
  33. pertpy/tools/_enrichment.py +460 -0
  34. pertpy/tools/_kernel_pca.py +1 -1
  35. pertpy/tools/_milo.py +406 -49
  36. pertpy/tools/_mixscape.py +677 -55
  37. pertpy/tools/_perturbation_space/_clustering.py +10 -3
  38. pertpy/tools/_perturbation_space/_comparison.py +112 -0
  39. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
  40. pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
  41. pertpy/tools/_perturbation_space/_simple.py +52 -11
  42. pertpy/tools/_scgen/__init__.py +1 -1
  43. pertpy/tools/_scgen/_base_components.py +2 -3
  44. pertpy/tools/_scgen/_scgen.py +706 -0
  45. pertpy/tools/_scgen/_utils.py +3 -5
  46. pertpy/tools/decoupler_LICENSE +674 -0
  47. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
  48. pertpy-0.8.0.dist-info/RECORD +57 -0
  49. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
  50. pertpy/plot/_augur.py +0 -234
  51. pertpy/plot/_cinemaot.py +0 -81
  52. pertpy/plot/_coda.py +0 -1001
  53. pertpy/plot/_dialogue.py +0 -91
  54. pertpy/plot/_guide_rna.py +0 -82
  55. pertpy/plot/_milopy.py +0 -284
  56. pertpy/plot/_mixscape.py +0 -594
  57. pertpy/plot/_scgen.py +0 -337
  58. pertpy/tools/_differential_gene_expression.py +0 -99
  59. pertpy/tools/_metadata/__init__.py +0 -0
  60. pertpy/tools/_metadata/_cell_line.py +0 -613
  61. pertpy/tools/_metadata/_look_up.py +0 -342
  62. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  63. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  64. pertpy-0.6.0.dist-info/RECORD +0 -50
  65. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  66. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,342 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from collections import namedtuple
4
- from typing import TYPE_CHECKING, Literal
5
-
6
- from rich import print
7
-
8
- if TYPE_CHECKING:
9
- import pandas as pd
10
-
11
-
12
- class LookUp:
13
- """Generate LookUp object for different type of metadata."""
14
-
15
- def __init__(self, type: str = "cell_line", transfer_metadata: list[pd.DataFrame] | None = None):
16
- """
17
- Args:
18
- type: metadata type. Default: cell_line. Currently, LookUp object is only implemented for CellLineMetaData.
19
- transfer_metadata: a list of dataframes used to generate Lookup object. To ensure efficient transfer of
20
- metadata during initialization, LookUp object should always be generated by the corresponding MetaData
21
- class. Also, different MetaData classes have different required metadata to transfer.
22
- """
23
- if type == "cell_line":
24
- self.type = type
25
- self.cell_line_meta = transfer_metadata[0]
26
- self.cl_cancer_project_meta = transfer_metadata[1]
27
- self.gene_annotation = transfer_metadata[2]
28
- self.bulk_rna_sanger = transfer_metadata[3]
29
- self.bulk_rna_broad = transfer_metadata[4]
30
- self.proteomics_data = transfer_metadata[5]
31
- self.drug_response_gdsc1 = transfer_metadata[6]
32
- self.drug_response_gdsc2 = transfer_metadata[7]
33
-
34
- cell_line_annotation = namedtuple(
35
- "cell_line_annotation",
36
- "n_cell_line cell_line n_metadata metadata reference_id reference_id_example default_parameter",
37
- )
38
- cell_lines = namedtuple("cell_lines", ["depmap", "cancerrxgene"])
39
-
40
- depmap_data = {
41
- "n_cell_line": len(self.cell_line_meta.index),
42
- "n_metadata": len(self.cell_line_meta.columns),
43
- "cell_line": self.cell_line_meta.DepMap_ID.values,
44
- "metadata": self.cell_line_meta.columns.values,
45
- "reference_id": ["DepMap_ID", "cell_line_name", "stripped_cell_line_name", "CCLE_Name"],
46
- "reference_id_example": "DepMap_ID: ACH-000016 | cell_line_name: SLR 21 | stripped_cell_line_name: SLR21 | CCLE_Name: SLR21_KIDNEY",
47
- "default_parameter": {
48
- "cell_line_source": "DepMap",
49
- "query_id": "DepMap_ID",
50
- "reference_id": "DepMap_ID",
51
- "cell_line_information": "None",
52
- },
53
- }
54
- depmap_record = cell_line_annotation(**depmap_data)
55
-
56
- cancerrxgene_data = {
57
- "n_cell_line": len(self.cl_cancer_project_meta.index),
58
- "n_metadata": len(self.cl_cancer_project_meta.columns),
59
- "cell_line": self.cl_cancer_project_meta.stripped_cell_line_name.values,
60
- "metadata": self.cl_cancer_project_meta.columns.values,
61
- "reference_id": ["cell_line_name", "stripped_cell_line_name", "Model ID", "COSMIC ID"],
62
- "reference_id_example": "cell_line_name: SNU-283 | stripped_cell_line_name: SNU283 | Model ID: SIDM00215 | COSMIC ID: 1659929",
63
- "default_parameter": {
64
- "query_id": "stripped_cell_line_name",
65
- "reference_id": "stripped_cell_line_name",
66
- "cell_line_information": "None",
67
- },
68
- }
69
- ccancerrxgene_record = cell_line_annotation(**cancerrxgene_data)
70
- self.cell_lines = cell_lines(depmap_record, ccancerrxgene_record)
71
-
72
- bulk_rna_annotation = namedtuple(
73
- "bulk_rna_annotation",
74
- "n_cell_line cell_line n_gene gene reference_id reference_id_example default_parameter",
75
- )
76
- bulk_rna_expression = namedtuple("bulk_rna_expression", ["broad", "sanger"])
77
-
78
- broad_data = {
79
- "n_cell_line": len(self.bulk_rna_broad.index),
80
- "n_gene": len(self.bulk_rna_broad.columns),
81
- "cell_line": self.bulk_rna_broad.index.values,
82
- "gene": self.bulk_rna_broad.columns.values,
83
- "reference_id": "DepMap_ID",
84
- "reference_id_example": "DepMap_ID: ACH-001113",
85
- "default_parameter": {"query_id": "DepMap_ID", "cell_line_source": "broad"},
86
- }
87
- broad_record = bulk_rna_annotation(**broad_data)
88
-
89
- sanger_data = {
90
- "n_cell_line": len(self.bulk_rna_sanger.index),
91
- "n_gene": len(self.bulk_rna_sanger.columns),
92
- "cell_line": self.bulk_rna_sanger.index.values,
93
- "gene": self.bulk_rna_sanger.columns.values,
94
- "reference_id": "model_name",
95
- "reference_id_example": "model_name: MEC-1",
96
- "default_parameter": {"query_id": "cell_line_name", "cell_line_source": "sanger"},
97
- }
98
- sanger_record = bulk_rna_annotation(**sanger_data)
99
- self.bulk_rna = bulk_rna_expression(broad_record, sanger_record)
100
-
101
- proteomics = namedtuple(
102
- "proteomics",
103
- "n_cell_line cell_line n_protein protein metadata reference_id reference_id_example default_parameter",
104
- )
105
- proteomics_data = {
106
- "n_cell_line": len(self.proteomics_data["model_name"].unique()),
107
- "n_protein": len(self.proteomics_data.uniprot_id.unique()),
108
- "cell_line": self.proteomics_data["model_name"].unique(),
109
- "protein": self.proteomics_data.uniprot_id.unique(),
110
- "metadata": self.proteomics_data.columns.values,
111
- "reference_id": ["model_id", "model_name"],
112
- "reference_id_example": "model_id: SIDM00483 | model_name: SK-GT-4",
113
- "default_parameter": {
114
- "query_id": "cell_line_name",
115
- "reference_id": "model_name",
116
- "bulk_rna_information": "read_count",
117
- "protein_information": "protein_intensity",
118
- "protein_id": "uniprot_id",
119
- },
120
- }
121
- self.proteomics = proteomics(**proteomics_data)
122
-
123
- drug_response_annotation = namedtuple(
124
- "drug_response_annotation",
125
- "n_cell_line cell_line n_drug drug_name metadata reference_id reference_id_example default_parameter",
126
- )
127
- drug_response = namedtuple("drug_response", ["gdsc1", "gdsc2"])
128
-
129
- gdsc1_data = {
130
- "n_cell_line": len(self.drug_response_gdsc1["cell_line_name"].unique()),
131
- "n_drug": len(self.drug_response_gdsc1.drug_name.unique()),
132
- "cell_line": self.drug_response_gdsc1.cell_line_name.unique(),
133
- "drug_name": self.drug_response_gdsc1.drug_name.unique(),
134
- "metadata": self.drug_response_gdsc1.columns.values,
135
- "reference_id": ["cell_line_name", "sanger_model_id", "cosmic_id"],
136
- "reference_id_example": "cell_line_name: ES5 | sanger_model_id: SIDM00263 | cosmic_id: 684057",
137
- "default_parameter": {
138
- "gdsc_dataset": "1",
139
- "query_id": "cell_line_name",
140
- "reference_id": "cell_line_name",
141
- "query_perturbation": "perturbation",
142
- "reference_perturbation": "drug_name",
143
- },
144
- }
145
- gdsc1_dict = drug_response_annotation(**gdsc1_data)
146
-
147
- gdsc2_data = {
148
- "n_cell_line": len(self.drug_response_gdsc2["cell_line_name"].unique()),
149
- "n_drug": len(self.drug_response_gdsc2.drug_name.unique()),
150
- "cell_line": self.drug_response_gdsc2.cell_line_name.unique(),
151
- "drug_name": self.drug_response_gdsc2.drug_name.unique(),
152
- "metadata": self.drug_response_gdsc2.columns.values,
153
- "reference_id": ["cell_line_name", "sanger_model_id", "cosmic_id"],
154
- "reference_id_example": "cell_line_name: PFSK-1 | sanger_model_id: SIDM01132 | cosmic_id: 683667",
155
- "default_parameter": {
156
- "gdsc_dataset": "1",
157
- "query_id": "cell_line_name",
158
- "reference_id": "cell_line_name",
159
- "query_perturbation": "perturbation",
160
- "reference_perturbation": "drug_name",
161
- },
162
- }
163
- gdsc2_dict = drug_response_annotation(**gdsc2_data)
164
-
165
- self.drug_response = drug_response(gdsc1_dict, gdsc2_dict)
166
- else:
167
- raise NotImplementedError
168
-
169
- def available_cell_lines(
170
- self,
171
- cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap",
172
- reference_id: str = "DepMap_ID",
173
- query_id_list: list[str] | None = None,
174
- ) -> None:
175
- """A brief summary of cell line metadata.
176
-
177
- Args:
178
- cell_line_source: the source of cell line annotation, DepMap or Cancerrxgene. Defaults to "DepMap".
179
- reference_id: The type of cell line identifier in the meta data, e.g. DepMap_ID, cell_line_name or
180
- stripped_cell_line_name. If fetching cell line metadata from Cancerrxgene, it is recommended to choose
181
- "stripped_cell_line_name". Defaults to "DepMap_ID".
182
- query_id_list: A list of unique cell line identifiers to test the number of matched ids present in the
183
- metadata. Defaults to None.
184
-
185
- """
186
- if self.type != "cell_line":
187
- raise ValueError("This is not a LookUp object specifically for CellLineMetaData!")
188
-
189
- if query_id_list is not None:
190
- identifier_num_all = len(query_id_list)
191
- if cell_line_source == "DepMap":
192
- if reference_id not in self.cell_line_meta.columns:
193
- raise ValueError(
194
- f"The specified `reference_id` {reference_id} is not available in the DepMap cell line annotation data. "
195
- )
196
- not_matched_identifiers = list(set(query_id_list) - set(self.cell_line_meta[reference_id]))
197
- else:
198
- if reference_id == "DepMap_ID":
199
- reference_id = "stripped_cell_line_name"
200
- if reference_id not in self.cl_cancer_project_meta.columns:
201
- raise ValueError(
202
- f"The specified `reference_id` {reference_id} is not available "
203
- f"in the cell line annotation from the project Genomics of Drug Sensitivity in Cancer. "
204
- )
205
- not_matched_identifiers = list(set(query_id_list) - set(self.cl_cancer_project_meta[reference_id]))
206
-
207
- print(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
208
- print(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
209
-
210
- def available_bulk_rna_expression(
211
- self,
212
- cell_line_source: Literal["broad", "sanger"] = "sanger",
213
- query_id_list: list[str] | None = None,
214
- ) -> None:
215
- """A brief summary of bulk RNA expression data.
216
-
217
- Args:
218
- cell_line_source: the source of RNA-seq data, broad or sanger. Defaults to "sanger".
219
- query_id_list: A list of unique cell line identifiers to test the number of matched ids present in the
220
- metadata. Defaults to None.
221
- """
222
- if self.type != "cell_line":
223
- raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
224
-
225
- if cell_line_source == "broad":
226
- bulk_rna = self.bulk_rna_broad
227
- else:
228
- bulk_rna = self.bulk_rna_sanger
229
-
230
- if query_id_list is not None:
231
- identifier_num_all = len(query_id_list)
232
- not_matched_identifiers = list(set(query_id_list) - set(bulk_rna.index))
233
-
234
- print(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
235
- print(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
236
-
237
- def available_protein_expression(
238
- self, reference_id: Literal["model_name", "model_id"] = "model_name", query_id_list: list[str] | None = None
239
- ) -> None:
240
- """A brief summary of protein expression data.
241
-
242
- Args:
243
- reference_id: The type of cell line identifier in the meta data, model_name or model_id.
244
- Defaults to "model_name".
245
- query_id_list: A list of unique cell line identifiers to test the number of matched ids present in the
246
- metadata. Defaults to None.
247
- """
248
- if self.type != "cell_line":
249
- raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
250
-
251
- if query_id_list is not None:
252
- identifier_num_all = len(query_id_list)
253
-
254
- if reference_id not in self.proteomics_data.columns:
255
- raise ValueError(
256
- f"The specified `reference_id` {reference_id} is not available in the proteomics data. "
257
- )
258
- not_matched_identifiers = list(set(query_id_list) - set(self.proteomics_data[reference_id]))
259
- print(f"[bold blue]{len(not_matched_identifiers)} cell lines are not found in the metadata.")
260
- print(f"[bold yellow]{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
261
-
262
- def available_drug_response(
263
- self,
264
- gdsc_dataset: Literal[1, 2] = 1,
265
- reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
266
- query_id_list: list[str] | None = None,
267
- reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
268
- query_perturbation_list: list[str] | None = None,
269
- ) -> None:
270
- """A brief summary of drug response data.
271
-
272
- Args:
273
- gdsc_dataset: The GDSC dataset, 1 or 2. Defaults to 1. The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital. It covers 970 Cell lines and 403 Compounds with 333292 IC50s. GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
274
- reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id. Defaults to "cell_line_name".
275
- query_id_list: A list of unique cell line identifiers to test the number of matched ids present in the metadata. Defaults to None.
276
- reference_perturbation: The perturbation information in the meta data, drug_name or drug_id. Defaults to "drug_name".
277
- query_perturbation_list: A list of unique perturbation types to test the number of matched ones present in the metadata. Defaults to None.
278
-
279
- """
280
- if self.type != "cell_line":
281
- raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
282
- if gdsc_dataset == 1:
283
- gdsc_data = self.drug_response_gdsc1
284
- else:
285
- gdsc_data = self.drug_response_gdsc2
286
-
287
- if query_id_list is not None:
288
- if reference_id not in gdsc_data.columns:
289
- raise ValueError(
290
- f"The specified `reference_id` {reference_id} is not available in the GDSC drug response data. "
291
- )
292
- identifier_num_all = len(query_id_list)
293
- not_matched_identifiers = list(set(query_id_list) - set(gdsc_data[reference_id]))
294
- print(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
295
- print(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
296
-
297
- if query_perturbation_list is not None:
298
- if reference_perturbation not in gdsc_data.columns:
299
- raise ValueError(
300
- f"The specified `reference_perturbation` {reference_perturbation} is not available in the GDSC drug response data. "
301
- )
302
- identifier_num_all = len(query_perturbation_list)
303
- not_matched_identifiers = list(set(query_perturbation_list) - set(gdsc_data[reference_perturbation]))
304
- print(f"{len(not_matched_identifiers)} perturbation types are not found in the metadata.")
305
- print(f"{identifier_num_all - len(not_matched_identifiers)} perturbation types are found! ")
306
-
307
- def available_genes_annotation(
308
- self,
309
- reference_id: Literal["gene_id", "ensembl_gene_id", "hgnc_id", "hgnc_symbol"] = "ensembl_gene_id",
310
- query_id_list: list[str] | None = None,
311
- ) -> None:
312
- """A brief summary of gene annotation metadata
313
-
314
- Args:
315
- reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol. Defaults to "ensembl_gene_id".
316
- query_id_list: A list of unique gene identifiers to test the number of matched ids present in the metadata. Defaults to None.
317
- """
318
- if self.type != "cell_line":
319
- raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
320
-
321
- print("To summarize: in the DepMap_Sanger gene annotation file, you can find: ")
322
- print(f"{len(self.gene_annotation.index)} driver genes")
323
- print(
324
- f"{len(self.gene_annotation.columns)} meta data including: ",
325
- *list(self.gene_annotation.columns.values),
326
- sep="\n- ",
327
- )
328
- print("Overview of gene annotation: ")
329
- print(self.gene_annotation.head().to_string())
330
- """
331
- #not implemented yet
332
- print("Default parameters to annotate gene annotation: ")
333
- default_param = {
334
- "query_id": "ensembl_gene_id",
335
- }
336
- print("\n".join(f"- {k}: {v}" for k, v in default_param.items()))
337
- if query_id_list is not None:
338
- identifier_num_all = len(query_id_list)
339
- not_matched_identifiers = list(set(query_id_list) - set(self.gene_annotation[reference_id]))
340
- print(f"{len(not_matched_identifiers)} genes are not found in the metadata.")
341
- print(f"{identifier_num_all - len(not_matched_identifiers)} genes are found! ")
342
- """