pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66):
  1. pertpy/__init__.py +4 -2
  2. pertpy/data/__init__.py +66 -1
  3. pertpy/data/_dataloader.py +28 -26
  4. pertpy/data/_datasets.py +261 -92
  5. pertpy/metadata/__init__.py +6 -0
  6. pertpy/metadata/_cell_line.py +795 -0
  7. pertpy/metadata/_compound.py +128 -0
  8. pertpy/metadata/_drug.py +238 -0
  9. pertpy/metadata/_look_up.py +569 -0
  10. pertpy/metadata/_metadata.py +70 -0
  11. pertpy/metadata/_moa.py +125 -0
  12. pertpy/plot/__init__.py +0 -13
  13. pertpy/preprocessing/__init__.py +2 -0
  14. pertpy/preprocessing/_guide_rna.py +89 -6
  15. pertpy/tools/__init__.py +48 -15
  16. pertpy/tools/_augur.py +329 -32
  17. pertpy/tools/_cinemaot.py +145 -6
  18. pertpy/tools/_coda/_base_coda.py +1237 -116
  19. pertpy/tools/_coda/_sccoda.py +66 -36
  20. pertpy/tools/_coda/_tasccoda.py +46 -39
  21. pertpy/tools/_dialogue.py +180 -77
  22. pertpy/tools/_differential_gene_expression/__init__.py +20 -0
  23. pertpy/tools/_differential_gene_expression/_base.py +657 -0
  24. pertpy/tools/_differential_gene_expression/_checks.py +41 -0
  25. pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
  26. pertpy/tools/_differential_gene_expression/_edger.py +125 -0
  27. pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
  28. pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
  29. pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
  30. pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
  31. pertpy/tools/_distances/_distance_tests.py +29 -24
  32. pertpy/tools/_distances/_distances.py +584 -98
  33. pertpy/tools/_enrichment.py +460 -0
  34. pertpy/tools/_kernel_pca.py +1 -1
  35. pertpy/tools/_milo.py +406 -49
  36. pertpy/tools/_mixscape.py +677 -55
  37. pertpy/tools/_perturbation_space/_clustering.py +10 -3
  38. pertpy/tools/_perturbation_space/_comparison.py +112 -0
  39. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
  40. pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
  41. pertpy/tools/_perturbation_space/_simple.py +52 -11
  42. pertpy/tools/_scgen/__init__.py +1 -1
  43. pertpy/tools/_scgen/_base_components.py +2 -3
  44. pertpy/tools/_scgen/_scgen.py +706 -0
  45. pertpy/tools/_scgen/_utils.py +3 -5
  46. pertpy/tools/decoupler_LICENSE +674 -0
  47. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
  48. pertpy-0.8.0.dist-info/RECORD +57 -0
  49. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
  50. pertpy/plot/_augur.py +0 -234
  51. pertpy/plot/_cinemaot.py +0 -81
  52. pertpy/plot/_coda.py +0 -1001
  53. pertpy/plot/_dialogue.py +0 -91
  54. pertpy/plot/_guide_rna.py +0 -82
  55. pertpy/plot/_milopy.py +0 -284
  56. pertpy/plot/_mixscape.py +0 -594
  57. pertpy/plot/_scgen.py +0 -337
  58. pertpy/tools/_differential_gene_expression.py +0 -99
  59. pertpy/tools/_metadata/__init__.py +0 -0
  60. pertpy/tools/_metadata/_cell_line.py +0 -613
  61. pertpy/tools/_metadata/_look_up.py +0 -342
  62. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  63. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  64. pertpy-0.6.0.dist-info/RECORD +0 -50
  65. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  66. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,128 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ import pandas as pd
6
+ import pubchempy as pcp
7
+
8
+ from ._look_up import LookUp
9
+ from ._metadata import MetaData
10
+
11
+ if TYPE_CHECKING:
12
+ from anndata import AnnData
13
+
14
+
15
class Compound(MetaData):
    """Utilities to fetch metadata for compounds."""

    def __init__(self):
        super().__init__()

    def annotate_compounds(
        self,
        adata: AnnData,
        query_id: str = "perturbation",
        query_id_type: Literal["name", "cid"] = "name",
        verbosity: int | str = 5,
        copy: bool = False,
    ) -> AnnData:
        """Fetch compound annotation from pubchempy.

        Every distinct, non-missing value of ``adata.obs[query_id]`` is looked up
        through pubchempy. The results are merged into ``adata.obs`` as the columns
        ``pubchem_name``, ``pubchem_ID`` and ``smiles``.

        Args:
            adata: The data object to annotate.
            query_id: The column of `.obs` with compound identifiers.
            query_id_type: The type of compound identifiers, 'name' or 'cid'.
            verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
            copy: Determines whether a copy of the `adata` is returned.

        Returns:
            Returns an AnnData object with compound annotation.

        Raises:
            ValueError: If `query_id` is not a column of `adata.obs`.
        """
        if copy:
            adata = adata.copy()

        if query_id not in adata.obs.columns:
            raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\nPlease check again. ")

        query_dict = {}
        not_matched_identifiers = []
        for compound in adata.obs[query_id].dropna().astype(str).unique():
            if query_id_type == "name":
                hits = pcp.get_compounds(compound, "name")
                if not hits:  # search did not work
                    not_matched_identifiers.append(compound)
                    continue
                best_hit = hits[0]
                synonyms = best_hit.synonyms
                # If the name matches the first synonym offered by PubChem (outside of capitalization),
                # it is not changed (outside of capitalization). Otherwise, it is replaced with the first synonym.
                # A record without any synonym keeps the queried name instead of raising IndexError.
                query_dict[compound] = [
                    synonyms[0] if synonyms else compound,
                    best_hit.cid,
                    best_hit.canonical_smiles,
                ]
            else:
                try:
                    record = pcp.Compound.from_cid(compound)
                    synonyms = record.synonyms
                    query_dict[compound] = [
                        # No synonym available: leave the name missing rather than raising IndexError.
                        synonyms[0] if synonyms else None,
                        compound,
                        record.canonical_smiles,
                    ]
                except pcp.BadRequestError:
                    # pubchempy throws badrequest if a cid is not found
                    not_matched_identifiers.append(compound)

        # NOTE(review): this count includes a missing value (NaN) as one identifier, while the
        # loop above skips missing values - confirm against `_warn_unmatch` whether that is intended.
        identifier_num_all = len(adata.obs[query_id].unique())
        self._warn_unmatch(
            total_identifiers=identifier_num_all,
            unmatched_identifiers=not_matched_identifiers,
            query_id=query_id,
            reference_id=query_id_type,
            metadata_type="compound",
            verbosity=verbosity,
        )

        query_df = pd.DataFrame.from_dict(query_dict, orient="index", columns=["pubchem_name", "pubchem_ID", "smiles"])
        # Merge and remove duplicate columns.
        # Columns that already exist in `.obs` win; the incoming duplicates get the
        # `_fromMeta` suffix and are filtered out again by the regex below.
        if query_id_type == "cid":
            # The CIDs were collected as strings; cast them so they can be joined on.
            # NOTE(review): presumably the `.obs` column holds integer CIDs - confirm.
            query_df.pubchem_ID = query_df.pubchem_ID.astype("Int64")
            merged = adata.obs.merge(
                query_df,
                left_on=query_id,
                right_on="pubchem_ID",
                how="left",
                suffixes=("", "_fromMeta"),
            )
            adata.obs = merged.filter(regex="^(?!.*_fromMeta)").set_index(adata.obs.index)
        else:
            merged = adata.obs.merge(
                query_df,
                left_on=query_id,
                right_index=True,
                how="left",
                suffixes=("", "_fromMeta"),
            )
            adata.obs = merged.filter(regex="^(?!.*_fromMeta)").set_index(adata.obs.index)
            # Column is converted to float after merging due to unmatches.
            # Convert back to integers.
            adata.obs.pubchem_ID = adata.obs.pubchem_ID.astype("Int64")

        return adata

    def lookup(self) -> LookUp:
        """Generate LookUp object for CompoundMetaData.

        The LookUp object provides an overview of the metadata to annotate.
        Each annotate_{metadata} function has a corresponding lookup function in the LookUp object,
        where users can search the reference_id in the metadata and
        compare with the query_id in their own data.

        Returns:
            Returns a LookUp object specific for compound annotation.
        """
        return LookUp(type="compound")
@@ -0,0 +1,238 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from collections import ChainMap
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Literal
7
+
8
+ import pandas as pd
9
+ from scanpy import settings
10
+
11
+ from pertpy.data._dataloader import _download
12
+
13
+ from ._look_up import LookUp
14
+ from ._metadata import MetaData
15
+
16
+ if TYPE_CHECKING:
17
+ from anndata import AnnData
18
+
19
+
20
def _download_drug_annotation(
    source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
) -> pd.DataFrame | dict[str, dict[str, list[str]]]:
    """Load a drug annotation resource, downloading it into the scanpy cache if absent.

    The file is looked up in ``settings.cachedir`` and only downloaded when it
    does not exist there yet.

    Args:
        source: Which resource to load. Any value other than "chembl" or "dgidb"
            is handled as "pharmgkb".

    Returns:
        The parsed JSON content for "chembl", the table read from the TSV file for
        "dgidb", and a filtered, renamed relationship table with the columns
        "Gene", "Compound|Disease" and "Type" (plus any remaining columns of the
        source file) for "pharmgkb".
    """
    cache_dir = Path(settings.cachedir)

    if source == "chembl":
        # Prepared in https://github.com/theislab/pertpy-datasets/blob/main/chembl_data.ipynb
        chembl_path = cache_dir / "chembl.json"
        if not chembl_path.exists():
            _download(
                url="https://figshare.com/ndownloader/files/43871718",
                output_file_name="chembl.json",
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        with chembl_path.open() as file:
            return json.load(file)

    if source == "dgidb":
        dgidb_path = cache_dir / "dgidb.tsv"
        if not dgidb_path.exists():
            _download(
                url="https://www.dgidb.org/data/latest/interactions.tsv",
                output_file_name="dgidb.tsv",
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        return pd.read_table(dgidb_path)

    pharmgkb_path = cache_dir / "pharmgkb.tsv"
    if not pharmgkb_path.exists():
        _download(
            url="https://api.pharmgkb.org/v1/download/file/data/relationships.zip",
            output_file_name="pharmgkb.zip",
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=True,
        )
        # Give the `relationships.tsv` file from the archive the cache name checked above.
        (cache_dir / "relationships.tsv").rename(pharmgkb_path)

    relationships = pd.read_table(pharmgkb_path)
    # Keep only associated gene -> chemical/disease relationships.
    keep = (
        (relationships["Association"] != "not associated")
        & (relationships["Entity1_type"] == "Gene")
        & relationships["Entity2_type"].isin(["Chemical", "Disease"])
    )
    # Build a new frame instead of mutating the filtered selection in place, which
    # avoids pandas' chained-assignment (SettingWithCopy) warning.
    return (
        relationships[keep]
        .rename(
            columns={
                "Entity2_name": "Compound|Disease",
                "Entity1_name": "Gene",
                "Entity2_type": "Type",
            }
        )
        .drop(columns=["Entity1_type", "Entity1_id", "Entity2_id"])
    )
80
+
81
+
82
class Drug(MetaData):
    """Utilities to fetch metadata for drug studies."""

    def __init__(self):
        super().__init__()
        # The databases start unloaded; their data is fetched on first use.
        self.chembl = self.DrugDataBase(database="chembl")
        self.dgidb = self.DrugDataBase(database="dgidb")
        self.pharmgkb = self.DrugDataBase(database="pharmgkb")

    @staticmethod
    def _join_by_gene(frame: pd.DataFrame, gene_column: str, value_column: str) -> dict[str, str]:
        """Map each gene to the sorted, "|"-joined set of values annotated to it."""
        exploded = frame.explode(gene_column)
        return (
            exploded.groupby(gene_column)[value_column]
            .apply(lambda values: "|".join(sorted(set(values))))
            .to_dict()
        )

    def annotate(
        self,
        adata: AnnData,
        source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
        copy: bool = False,
    ) -> AnnData:
        """Annotates genes by their involvement in applied drugs.

        Genes need to be in HGNC format.

        Args:
            adata: AnnData object containing log-normalised data.
            source: Source of the metadata, chembl, dgidb or pharmgkb.
            copy: Determines whether a copy of the `adata` is returned.

        Returns:
            An AnnData object with a new column `compounds` in the var slot.
            For the pharmgkb source a column `diseases` is added as well.
            Genes without annotation get an empty string.
        """
        if copy:
            adata = adata.copy()

        if source == "chembl":
            interaction = self.chembl.df()
        elif source == "dgidb":
            interaction = self.dgidb.df()
        else:
            if not self.pharmgkb.loaded:
                self.pharmgkb.set()
            # The pharmgkb branch below works on the raw table (it needs its "Type" column).
            interaction = self.pharmgkb.data

        if source != "pharmgkb":
            gene_compound_dict = self._join_by_gene(interaction, "targets", "compounds")
            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))
        else:
            compounds = interaction[interaction["Type"] == "Chemical"]
            gene_compound_dict = self._join_by_gene(compounds, "Gene", "Compound|Disease")
            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))

            diseases = interaction[interaction["Type"] == "Disease"]
            gene_disease_dict = self._join_by_gene(diseases, "Gene", "Compound|Disease")
            adata.var["diseases"] = adata.var_names.map(lambda gene: gene_disease_dict.get(gene, ""))

        return adata

    def lookup(self) -> LookUp:
        """Generate LookUp object for Drug.

        The LookUp object provides an overview of the metadata to annotate.
        annotate function has a corresponding lookup function in the LookUp object,
        where users can search the compound and targets in the metadata.

        Returns:
            Returns a LookUp object specific for drug annotation.
        """
        for database in (self.chembl, self.dgidb, self.pharmgkb):
            if not database.loaded:
                database.set()

        # NOTE(review): chembl is passed as the compound/target table while dgidb and
        # pharmgkb are passed as downloaded - confirm this is what LookUp expects.
        return LookUp(
            type="drug",
            transfer_metadata=[
                self.chembl.dataframe,
                self.dgidb.data,
                self.pharmgkb.data,
            ],
        )

    class DrugDataBase:
        """Lazily loaded drug annotation resource.

        After `set` has run, `data` holds the downloaded resource, `dataframe` a
        table pairing each compound (or compound/disease) with its list of targets,
        and `dictionary` the dictionary form of the resource.
        """

        def __init__(self, database: Literal["chembl", "dgidb", "pharmgkb"] = "chembl"):
            self.database = database
            self.loaded = False

        def set(self) -> None:
            """Download the resource and derive `data`, `dataframe` and `dictionary`.

            Raises:
                ValueError: If the downloaded data does not have the expected type.
            """
            data = _download_drug_annotation(source=self.database)
            if self.database == "chembl":
                if not isinstance(data, dict):
                    raise ValueError(
                        "The chembl data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                dictionary = data
                # Flatten the per-category mappings into one compound -> targets mapping.
                targets = dict(ChainMap(*[data[cat] for cat in data]))
                # Explicit columns keep the frame usable even when there are no entries.
                dataframe = pd.DataFrame(
                    [{"compounds": compound, "targets": genes} for compound, genes in targets.items()],
                    columns=["compounds", "targets"],
                )
            elif self.database == "dgidb":
                if not isinstance(data, pd.DataFrame):
                    raise ValueError(
                        "The dgidb data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                dataframe = (
                    data.groupby("drug_claim_name")["gene_claim_name"]
                    .apply(list)
                    .reset_index()
                    .rename(
                        columns={
                            "gene_claim_name": "targets",
                            "drug_claim_name": "compounds",
                        }
                    )
                )
                dictionary = dataframe.set_index("compounds")["targets"].to_dict()
            else:
                if not isinstance(data, pd.DataFrame):
                    raise ValueError(
                        "The pharmGKB data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                dataframe = (
                    data.groupby("Compound|Disease")["Gene"]
                    .apply(list)
                    .reset_index()
                    .rename(
                        columns={
                            "Gene": "targets",
                            "Compound|Disease": "compounds|diseases",
                        }
                    )
                )
                dictionary = dataframe.set_index("compounds|diseases")["targets"].to_dict()

            self.data = data
            self.dataframe = dataframe
            self.dictionary = dictionary
            # Mark as loaded only once everything above succeeded, so that a failed
            # download or format check is retried instead of leaving a half-built object.
            self.loaded = True

        def df(self) -> pd.DataFrame:
            """Return the compound/target table, loading the resource first if needed."""
            if not self.loaded:
                self.set()
            return self.dataframe

        def dict(self) -> dict[str, list[str]] | dict[str, dict[str, list[str]]]:
            """Return the dictionary form of the resource, loading it first if needed."""
            if not self.loaded:
                self.set()
            return self.dictionary