pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. pertpy/__init__.py +3 -2
  2. pertpy/data/__init__.py +5 -1
  3. pertpy/data/_dataloader.py +2 -4
  4. pertpy/data/_datasets.py +203 -92
  5. pertpy/metadata/__init__.py +4 -0
  6. pertpy/metadata/_cell_line.py +826 -0
  7. pertpy/metadata/_compound.py +129 -0
  8. pertpy/metadata/_drug.py +242 -0
  9. pertpy/metadata/_look_up.py +582 -0
  10. pertpy/metadata/_metadata.py +73 -0
  11. pertpy/metadata/_moa.py +129 -0
  12. pertpy/plot/__init__.py +1 -9
  13. pertpy/plot/_augur.py +53 -116
  14. pertpy/plot/_coda.py +277 -677
  15. pertpy/plot/_guide_rna.py +17 -35
  16. pertpy/plot/_milopy.py +59 -134
  17. pertpy/plot/_mixscape.py +152 -391
  18. pertpy/preprocessing/_guide_rna.py +88 -4
  19. pertpy/tools/__init__.py +8 -13
  20. pertpy/tools/_augur.py +315 -17
  21. pertpy/tools/_cinemaot.py +143 -4
  22. pertpy/tools/_coda/_base_coda.py +1210 -65
  23. pertpy/tools/_coda/_sccoda.py +50 -21
  24. pertpy/tools/_coda/_tasccoda.py +27 -19
  25. pertpy/tools/_dialogue.py +164 -56
  26. pertpy/tools/_differential_gene_expression.py +240 -14
  27. pertpy/tools/_distances/_distance_tests.py +8 -8
  28. pertpy/tools/_distances/_distances.py +184 -34
  29. pertpy/tools/_enrichment.py +465 -0
  30. pertpy/tools/_milo.py +345 -11
  31. pertpy/tools/_mixscape.py +668 -50
  32. pertpy/tools/_perturbation_space/_clustering.py +5 -1
  33. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
  34. pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
  35. pertpy/tools/_perturbation_space/_simple.py +51 -10
  36. pertpy/tools/_scgen/__init__.py +1 -1
  37. pertpy/tools/_scgen/_scgen.py +701 -0
  38. pertpy/tools/_scgen/_utils.py +1 -3
  39. pertpy/tools/decoupler_LICENSE +674 -0
  40. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
  41. pertpy-0.7.0.dist-info/RECORD +53 -0
  42. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
  43. pertpy/plot/_cinemaot.py +0 -81
  44. pertpy/plot/_dialogue.py +0 -91
  45. pertpy/plot/_scgen.py +0 -337
  46. pertpy/tools/_metadata/__init__.py +0 -0
  47. pertpy/tools/_metadata/_cell_line.py +0 -613
  48. pertpy/tools/_metadata/_look_up.py +0 -342
  49. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  50. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  51. pertpy-0.6.0.dist-info/RECORD +0 -50
  52. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  53. {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ import pandas as pd
6
+ import pubchempy as pcp
7
+
8
+ from ._look_up import LookUp
9
+ from ._metadata import MetaData
10
+
11
+ if TYPE_CHECKING:
12
+ from anndata import AnnData
13
+
14
+
15
class Compound(MetaData):
    """Utilities to fetch metadata for compounds."""

    def __init__(self):
        super().__init__()

    @staticmethod
    def _first_synonym(record):
        """Return the first synonym of a pubchempy compound record, or None if it has none."""
        synonyms = record.synonyms
        return synonyms[0] if synonyms else None

    def annotate_compounds(
        self,
        adata: AnnData,
        query_id: str = "perturbation",
        query_id_type: Literal["name", "cid"] = "name",
        verbosity: int | str = 5,
        copy: bool = False,
    ) -> AnnData:
        """Fetch compound annotation from pubchempy.

        Every distinct, non-missing value of `adata.obs[query_id]` is looked up once.
        Matched compounds contribute the columns `pubchem_name`, `pubchem_ID` and `smiles`
        to `adata.obs`; unmatched identifiers are reported through `_warn_unmatch`.

        Args:
            adata: The data object to annotate.
            query_id: The column of `.obs` with compound identifiers. Defaults to 'perturbation'.
            query_id_type: The type of compound identifiers, 'name' or 'cid'. Defaults to 'name'.
            verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
                Defaults to 5.
            copy: Determines whether a copy of the `adata` is returned. Defaults to False.

        Returns:
            Returns an AnnData object with compound annotation.

        Raises:
            ValueError: If `query_id` is not a column of `adata.obs`.
        """
        if copy:
            adata = adata.copy()

        if query_id not in adata.obs.columns:
            raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\n" f"Please check again. ")

        # Missing values are not identifiers, so they are neither queried nor counted.
        identifiers = adata.obs[query_id].dropna().astype(str).unique()

        query_dict = {}
        not_matched_identifiers = []
        for compound in identifiers:
            if query_id_type == "name":
                matches = pcp.get_compounds(compound, "name")
                if not matches:  # search did not work
                    not_matched_identifiers.append(compound)
                    continue
                # The first hit is used; the queried name is replaced by the first synonym
                # offered by PubChem for that hit.
                record = matches[0]
            else:
                try:
                    record = pcp.Compound.from_cid(compound)
                except (pcp.BadRequestError, pcp.NotFoundError):
                    # A malformed CID is rejected as a bad request, an unknown CID as not found.
                    not_matched_identifiers.append(compound)
                    continue

            query_dict[compound] = [
                self._first_synonym(record),
                record.cid,
                record.canonical_smiles,
            ]

        self._warn_unmatch(
            total_identifiers=len(identifiers),
            unmatched_identifiers=not_matched_identifiers,
            query_id=query_id,
            reference_id=query_id_type,
            metadata_type="compound",
            verbosity=verbosity,
        )

        query_df = pd.DataFrame.from_dict(query_dict, orient="index", columns=["pubchem_name", "pubchem_ID", "smiles"])

        if query_id_type == "cid":
            # Join on the numeric CID itself; the frame index holds the stringified query values.
            query_df["pubchem_ID"] = query_df["pubchem_ID"].astype("Int64")
            join_on = {"right_on": "pubchem_ID"}
        else:
            join_on = {"right_index": True}

        # Merge and remove duplicate columns; the original obs index is restored afterwards
        # because `merge` does not keep it.
        adata.obs = (
            adata.obs.merge(
                query_df,
                left_on=query_id,
                how="left",
                suffixes=("", "_fromMeta"),
                **join_on,
            )
            .filter(regex="^(?!.*_fromMeta)")
            .set_index(adata.obs.index)
        )

        if query_id_type != "cid":
            # Column is converted to float after merging due to unmatches.
            # Convert back to integers.
            adata.obs["pubchem_ID"] = adata.obs["pubchem_ID"].astype("Int64")

        return adata

    def lookup(self) -> LookUp:
        """Generate LookUp object for CompoundMetaData.

        The LookUp object provides an overview of the metadata to annotate.
        Each annotate_{metadata} function has a corresponding lookup function in the LookUp object,
        where users can search the reference_id in the metadata and
        compare with the query_id in their own data.

        Returns:
            Returns a LookUp object specific for compound annotation.
        """
        return LookUp(type="compound")
@@ -0,0 +1,242 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from collections import ChainMap
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Literal
7
+
8
+ import pandas as pd
9
+ from rich import print
10
+ from scanpy import settings
11
+
12
+ from pertpy.data._dataloader import _download
13
+
14
+ from ._look_up import LookUp
15
+ from ._metadata import MetaData
16
+
17
+ if TYPE_CHECKING:
18
+ from anndata import AnnData
19
+
20
+
21
def _download_drug_annotation(
    source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
) -> pd.DataFrame | dict[str, dict[str, list[str]]]:
    """Load a drug annotation resource, downloading it into the scanpy cache directory if absent.

    Args:
        source: Which resource to load: 'chembl', 'dgidb' or 'pharmgkb'. Defaults to 'chembl'.

    Returns:
        For 'chembl' the parsed JSON content of the cached file.
        For 'dgidb' the cached interaction table as a DataFrame.
        For 'pharmgkb' the cached relationship table reduced to gene-chemical and gene-disease
        rows that are not marked "not associated", with the columns `Gene`, `Compound|Disease`
        and `Type` renamed from the raw entity columns.

    Raises:
        ValueError: If `source` is not one of the supported resources.
    """
    if source not in ("chembl", "dgidb", "pharmgkb"):
        # Previously any unknown value silently fell through to the pharmGKB branch.
        raise ValueError(
            f"Unknown drug annotation source {source!r}. Expected one of 'chembl', 'dgidb' or 'pharmgkb'."
        )

    cache_dir = Path(settings.cachedir)

    if source == "chembl":
        # Prepared in https://github.com/theislab/pertpy-datasets/blob/main/chembl_data.ipynb
        chembl_path = cache_dir / "chembl.json"
        if not chembl_path.exists():
            print("[bold yellow]No metadata file was found for chembl. Starting download now.")
            _download(
                url="https://figshare.com/ndownloader/files/43871718",
                output_file_name="chembl.json",
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        with chembl_path.open() as file:
            return json.load(file)

    if source == "dgidb":
        dgidb_path = cache_dir / "dgidb.tsv"
        if not dgidb_path.exists():
            print("[bold yellow]No metadata file was found for dgidb. Starting download now.")
            _download(
                url="https://www.dgidb.org/data/latest/interactions.tsv",
                output_file_name="dgidb.tsv",
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        return pd.read_table(dgidb_path)

    pharmgkb_path = cache_dir / "pharmgkb.tsv"
    if not pharmgkb_path.exists():
        print("[bold yellow]No metadata file was found for pharmGKB. Starting download now.")
        _download(
            url="https://api.pharmgkb.org/v1/download/file/data/relationships.zip",
            output_file_name="pharmgkb.zip",
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=True,
        )
        # The table is expected in the cache directory as `relationships.tsv` after the zip
        # download; it is stored under the name the existence check above looks for.
        (cache_dir / "relationships.tsv").rename(pharmgkb_path)

    raw = pd.read_table(pharmgkb_path)
    keep = (
        (raw["Association"] != "not associated")
        & (raw["Entity1_type"] == "Gene")
        & ((raw["Entity2_type"] == "Chemical") | (raw["Entity2_type"] == "Disease"))
    )
    # Build a new frame instead of mutating the filtered selection in place.
    pharmgkb_df = (
        raw[keep]
        .rename(
            columns={
                "Entity2_name": "Compound|Disease",
                "Entity1_name": "Gene",
                "Entity2_type": "Type",
            }
        )
        .drop(["Entity1_type", "Entity1_id", "Entity2_id"], axis=1)
    )

    return pharmgkb_df
84
+
85
+
86
class Drug(MetaData):
    """Utilities to fetch metadata for drug studies."""

    def __init__(self):
        super().__init__()
        # The databases are only placeholders here; each one is loaded on first use.
        self.chembl = self.DrugDataBase(database="chembl")
        self.dgidb = self.DrugDataBase(database="dgidb")
        self.pharmgkb = self.DrugDataBase(database="pharmgkb")

    @staticmethod
    def _join_values_per_gene(frame: pd.DataFrame, gene_column: str, value_column: str) -> dict[str, str]:
        """Map each gene to the sorted, de-duplicated, '|'-joined values associated with it."""
        exploded = frame.explode(gene_column)
        return (
            exploded.groupby(gene_column)[value_column]
            .apply(lambda values: "|".join(sorted(set(values))))
            .to_dict()
        )

    def annotate(
        self,
        adata: AnnData,
        source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
        copy: bool = False,
    ) -> AnnData:
        """Annotates genes by their involvement in applied drugs.

        Genes need to be in HGNC format.

        Args:
            adata: AnnData object containing log-normalised data.
            source: Source of the metadata, chembl, dgidb or pharmgkb. Defaults to chembl.
            copy: Determines whether a copy of the `adata` is returned. Defaults to False.

        Returns:
            An AnnData object with a new column `compounds` in the var slot.
            For the pharmgkb source a second new column `diseases` is added as well.
            Genes without any entry receive an empty string.

        Raises:
            ValueError: If `source` is not one of the supported databases.
        """
        databases = {"chembl": self.chembl, "dgidb": self.dgidb, "pharmgkb": self.pharmgkb}
        if source not in databases:
            # Fail before any download instead of with a KeyError deep inside pandas.
            raise ValueError(
                f"Unknown drug annotation source {source!r}. Expected one of 'chembl', 'dgidb' or 'pharmgkb'."
            )

        if copy:
            adata = adata.copy()

        database = databases[source]
        if not database.loaded:
            database.set()

        if source != "pharmgkb":
            gene_compound_dict = self._join_values_per_gene(database.dataframe, "targets", "compounds")
            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))
        else:
            # The raw pharmGKB table is used here because it still carries the `Type` column
            # that separates chemical from disease relationships.
            interaction = database.data

            compounds = interaction[interaction["Type"] == "Chemical"]
            gene_compound_dict = self._join_values_per_gene(compounds, "Gene", "Compound|Disease")
            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))

            diseases = interaction[interaction["Type"] == "Disease"]
            gene_disease_dict = self._join_values_per_gene(diseases, "Gene", "Compound|Disease")
            adata.var["diseases"] = adata.var_names.map(lambda gene: gene_disease_dict.get(gene, ""))

        return adata

    def lookup(self) -> LookUp:
        """Generate LookUp object for Drug.

        The LookUp object provides an overview of the metadata to annotate.
        annotate function has a corresponding lookup function in the LookUp object,
        where users can search the compound and targets in the metadata.

        All three databases are loaded if they have not been loaded yet.

        Returns:
            Returns a LookUp object specific for drug annotation.
        """
        for database in (self.chembl, self.dgidb, self.pharmgkb):
            if not database.loaded:
                database.set()

        return LookUp(
            type="drug",
            transfer_metadata=[
                self.chembl.dataframe,
                self.dgidb.data,
                self.pharmgkb.data,
            ],
        )

    class DrugDataBase:
        """Lazily loaded view of one drug annotation database.

        After `set` has succeeded the instance exposes `data` (the resource as returned by
        `_download_drug_annotation`), `dataframe` (one row per compound with its list of
        targets) and `dictionary`.
        """

        def __init__(self, database: Literal["chembl", "dgidb", "pharmgkb"] = "chembl"):
            self.database = database
            self.loaded = False

        def set(self) -> None:
            """Load the database and derive its `dataframe` and `dictionary` representations.

            Raises:
                ValueError: If the loaded resource does not have the expected type.
            """
            data = _download_drug_annotation(source=self.database)

            if self.database == "chembl":
                if not isinstance(data, dict):
                    raise ValueError(
                        "The chembl data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                dictionary = data
                # Flatten the per-category mappings into a single compound -> targets mapping.
                targets = dict(ChainMap(*[data[cat] for cat in data]))
                dataframe = pd.DataFrame([{"compounds": k, "targets": v} for k, v in targets.items()])
            elif self.database == "dgidb":
                if not isinstance(data, pd.DataFrame):
                    raise ValueError(
                        "The dgidb data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                dataframe = (
                    data.groupby("drug_claim_name")["gene_claim_name"]
                    .apply(list)
                    .reset_index()
                    .rename(
                        columns={
                            "gene_claim_name": "targets",
                            "drug_claim_name": "compounds",
                        }
                    )
                )
                dictionary = dataframe.set_index("compounds")["targets"].to_dict()
            else:
                if not isinstance(data, pd.DataFrame):
                    raise ValueError(
                        "The pharmGKB data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                dataframe = (
                    data.groupby("Compound|Disease")["Gene"]
                    .apply(list)
                    .reset_index()
                    .rename(
                        columns={
                            "Gene": "targets",
                            "Compound|Disease": "compounds|diseases",
                        }
                    )
                )
                dictionary = dataframe.set_index("compounds|diseases")["targets"].to_dict()

            self.data = data
            self.dataframe = dataframe
            self.dictionary = dictionary
            # Only mark as loaded once everything above succeeded, so that a failed download
            # or validation can be retried instead of leaving a half-initialised object.
            self.loaded = True

        def df(self) -> pd.DataFrame:
            """Return the compound/target table, loading the database first if necessary."""
            if not self.loaded:
                self.set()
            return self.dataframe

        def dict(self) -> dict[str, list[str]] | dict[str, dict[str, list[str]]]:
            """Return the dictionary form of the database, loading it first if necessary."""
            if not self.loaded:
                self.set()
            return self.dictionary