pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +3 -2
- pertpy/data/__init__.py +5 -1
- pertpy/data/_dataloader.py +2 -4
- pertpy/data/_datasets.py +203 -92
- pertpy/metadata/__init__.py +4 -0
- pertpy/metadata/_cell_line.py +826 -0
- pertpy/metadata/_compound.py +129 -0
- pertpy/metadata/_drug.py +242 -0
- pertpy/metadata/_look_up.py +582 -0
- pertpy/metadata/_metadata.py +73 -0
- pertpy/metadata/_moa.py +129 -0
- pertpy/plot/__init__.py +1 -9
- pertpy/plot/_augur.py +53 -116
- pertpy/plot/_coda.py +277 -677
- pertpy/plot/_guide_rna.py +17 -35
- pertpy/plot/_milopy.py +59 -134
- pertpy/plot/_mixscape.py +152 -391
- pertpy/preprocessing/_guide_rna.py +88 -4
- pertpy/tools/__init__.py +8 -13
- pertpy/tools/_augur.py +315 -17
- pertpy/tools/_cinemaot.py +143 -4
- pertpy/tools/_coda/_base_coda.py +1210 -65
- pertpy/tools/_coda/_sccoda.py +50 -21
- pertpy/tools/_coda/_tasccoda.py +27 -19
- pertpy/tools/_dialogue.py +164 -56
- pertpy/tools/_differential_gene_expression.py +240 -14
- pertpy/tools/_distances/_distance_tests.py +8 -8
- pertpy/tools/_distances/_distances.py +184 -34
- pertpy/tools/_enrichment.py +465 -0
- pertpy/tools/_milo.py +345 -11
- pertpy/tools/_mixscape.py +668 -50
- pertpy/tools/_perturbation_space/_clustering.py +5 -1
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
- pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
- pertpy/tools/_perturbation_space/_simple.py +51 -10
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_scgen.py +701 -0
- pertpy/tools/_scgen/_utils.py +1 -3
- pertpy/tools/decoupler_LICENSE +674 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
- pertpy-0.7.0.dist-info/RECORD +53 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
- pertpy/plot/_cinemaot.py +0 -81
- pertpy/plot/_dialogue.py +0 -91
- pertpy/plot/_scgen.py +0 -337
- pertpy/tools/_metadata/__init__.py +0 -0
- pertpy/tools/_metadata/_cell_line.py +0 -613
- pertpy/tools/_metadata/_look_up.py +0 -342
- pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
- pertpy/tools/_scgen/_jax_scgen.py +0 -370
- pertpy-0.6.0.dist-info/RECORD +0 -50
- /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,129 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Literal
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
import pubchempy as pcp
|
7
|
+
|
8
|
+
from ._look_up import LookUp
|
9
|
+
from ._metadata import MetaData
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
from anndata import AnnData
|
13
|
+
|
14
|
+
|
15
|
+
class Compound(MetaData):
    """Utilities to fetch metadata for compounds."""

    def __init__(self):
        super().__init__()

    def annotate_compounds(
        self,
        adata: AnnData,
        query_id: str = "perturbation",
        query_id_type: Literal["name", "cid"] = "name",
        verbosity: int | str = 5,
        copy: bool = False,
    ) -> AnnData:
        """Fetch compound annotation from pubchempy.

        Every distinct, non-missing value of `adata.obs[query_id]` is looked up in PubChem.
        The columns `pubchem_name`, `pubchem_ID` and `smiles` are then merged into `adata.obs`.

        Args:
            adata: The data object to annotate.
            query_id: The column of `.obs` with compound identifiers. Defaults to 'perturbation'.
            query_id_type: The type of compound identifiers, 'name' or 'cid'. Defaults to 'name'.
            verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
                       Defaults to 5.
            copy: Determines whether a copy of the `adata` is returned. Defaults to False.

        Returns:
            Returns an AnnData object with compound annotation.
            This is a copy if `copy` is True, otherwise the passed object, whose `.obs` is replaced.

        Raises:
            ValueError: If `query_id` is not a column of `adata.obs`.
        """
        if copy:
            adata = adata.copy()

        if query_id not in adata.obs.columns:
            raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\n" "Please check again. ")

        query_dict = {}
        not_matched_identifiers = []
        for compound in adata.obs[query_id].dropna().astype(str).unique():
            if query_id_type == "name":
                cids = pcp.get_compounds(compound, "name")
                if not cids:  # search did not work
                    not_matched_identifiers.append(compound)
                    continue
                # Only the first hit is used; its first synonym becomes the reported compound name.
                best_hit = cids[0]
                query_dict[compound] = [
                    best_hit.synonyms[0],
                    best_hit.cid,
                    best_hit.canonical_smiles,
                ]
            else:
                try:
                    cid = pcp.Compound.from_cid(compound)
                    query_dict[compound] = [
                        cid.synonyms[0],
                        compound,
                        cid.canonical_smiles,
                    ]
                except (pcp.BadRequestError, pcp.NotFoundError):
                    # pubchempy reports a malformed CID as BadRequestError and an unknown CID as
                    # NotFoundError; both mean that the identifier could not be matched.
                    not_matched_identifiers.append(compound)

        identifier_num_all = len(adata.obs[query_id].unique())
        self._warn_unmatch(
            total_identifiers=identifier_num_all,
            unmatched_identifiers=not_matched_identifiers,
            query_id=query_id,
            reference_id=query_id_type,
            metadata_type="compound",
            verbosity=verbosity,
        )

        query_df = pd.DataFrame.from_dict(query_dict, orient="index", columns=["pubchem_name", "pubchem_ID", "smiles"])
        # Merge and remove duplicate columns: columns that already exist in `.obs` win, the
        # clashing ones coming from PubChem get the `_fromMeta` suffix and are filtered out.
        # The merge resets the index, so the original `.obs` index is restored afterwards.
        if query_id_type == "cid":
            # The identifiers were collected as strings; convert them so that they can be matched
            # against the query column.
            query_df.pubchem_ID = query_df.pubchem_ID.astype("Int64")
            adata.obs = (
                adata.obs.merge(
                    query_df,
                    left_on=query_id,
                    right_on="pubchem_ID",
                    how="left",
                    suffixes=("", "_fromMeta"),
                )
                .filter(regex="^(?!.*_fromMeta)")
                .set_index(adata.obs.index)
            )
        else:
            adata.obs = (
                adata.obs.merge(
                    query_df,
                    left_on=query_id,
                    right_index=True,
                    how="left",
                    suffixes=("", "_fromMeta"),
                )
                .filter(regex="^(?!.*_fromMeta)")
                .set_index(adata.obs.index)
            )
            # Column is converted to float after merging due to unmatches
            # Convert back to integers
            adata.obs.pubchem_ID = adata.obs.pubchem_ID.astype("Int64")

        return adata

    def lookup(self) -> LookUp:
        """Generate LookUp object for CompoundMetaData.

        The LookUp object provides an overview of the metadata to annotate.
        Each annotate_{metadata} function has a corresponding lookup function in the LookUp object,
        where users can search the reference_id in the metadata and
        compare with the query_id in their own data.

        Returns:
            Returns a LookUp object specific for compound annotation.
        """
        return LookUp(type="compound")
|
pertpy/metadata/_drug.py
ADDED
@@ -0,0 +1,242 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import json
|
4
|
+
from collections import ChainMap
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import TYPE_CHECKING, Literal
|
7
|
+
|
8
|
+
import pandas as pd
|
9
|
+
from rich import print
|
10
|
+
from scanpy import settings
|
11
|
+
|
12
|
+
from pertpy.data._dataloader import _download
|
13
|
+
|
14
|
+
from ._look_up import LookUp
|
15
|
+
from ._metadata import MetaData
|
16
|
+
|
17
|
+
if TYPE_CHECKING:
|
18
|
+
from anndata import AnnData
|
19
|
+
|
20
|
+
|
21
|
+
def _download_drug_annotation(
    source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
) -> pd.DataFrame | dict[str, dict[str, list[str]]]:
    """Load the drug annotation of one source, downloading it into the scanpy cache if it is missing.

    Args:
        source: The database to load. Any value other than 'chembl' or 'dgidb' is handled as 'pharmgkb'.

    Returns:
        The parsed JSON content for 'chembl', the unmodified interaction table for 'dgidb',
        and for 'pharmgkb' the gene relationships to chemicals and diseases with the columns
        renamed to `Gene`, `Compound|Disease` and `Type`.
    """
    cache_dir = Path(settings.cachedir)

    if source == "chembl":
        # Prepared in https://github.com/theislab/pertpy-datasets/blob/main/chembl_data.ipynb
        chembl_path = cache_dir / "chembl.json"
        if not chembl_path.exists():
            print("[bold yellow]No metadata file was found for chembl. Starting download now.")
            _download(
                url="https://figshare.com/ndownloader/files/43871718",
                output_file_name="chembl.json",
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        with chembl_path.open() as file:
            return json.load(file)

    if source == "dgidb":
        dgidb_path = cache_dir / "dgidb.tsv"
        if not dgidb_path.exists():
            print("[bold yellow]No metadata file was found for dgidb. Starting download now.")
            _download(
                url="https://www.dgidb.org/data/latest/interactions.tsv",
                output_file_name="dgidb.tsv",
                output_path=settings.cachedir,
                block_size=4096,
                is_zip=False,
            )
        return pd.read_table(dgidb_path)

    pharmgkb_path = cache_dir / "pharmgkb.tsv"
    if not pharmgkb_path.exists():
        print("[bold yellow]No metadata file was found for pharmGKB. Starting download now.")
        _download(
            url="https://api.pharmgkb.org/v1/download/file/data/relationships.zip",
            output_file_name="pharmgkb.zip",
            output_path=settings.cachedir,
            block_size=4096,
            is_zip=True,
        )
        # The archive is expected to provide `relationships.tsv`; store it under the cache name
        # that is checked above so that the download is not repeated.
        (cache_dir / "relationships.tsv").rename(pharmgkb_path)

    relationships = pd.read_table(pharmgkb_path)
    is_associated = relationships["Association"] != "not associated"
    is_gene = relationships["Entity1_type"] == "Gene"
    is_chemical_or_disease = (relationships["Entity2_type"] == "Chemical") | (
        relationships["Entity2_type"] == "Disease"
    )

    # Build the result as a new frame instead of modifying the filtered selection in place,
    # which can trigger chained-assignment warnings in pandas.
    return (
        relationships[is_associated & is_gene & is_chemical_or_disease]
        .rename(
            columns={
                "Entity2_name": "Compound|Disease",
                "Entity1_name": "Gene",
                "Entity2_type": "Type",
            }
        )
        .drop(columns=["Entity1_type", "Entity1_id", "Entity2_id"])
    )
|
84
|
+
|
85
|
+
|
86
|
+
class Drug(MetaData):
    """Utilities to fetch metadata for drug studies."""

    def __init__(self):
        super().__init__()
        # The databases are only downloaded and parsed on first use, see `DrugDataBase.set`.
        self.chembl = self.DrugDataBase(database="chembl")
        self.dgidb = self.DrugDataBase(database="dgidb")
        self.pharmgkb = self.DrugDataBase(database="pharmgkb")

    @staticmethod
    def _join_labels_per_gene(table: pd.DataFrame, gene_column: str, label_column: str) -> dict[str, str]:
        """Map every gene to its sorted, unique labels joined by `|`."""
        return (
            table.explode(gene_column)
            .groupby(gene_column)[label_column]
            .apply(lambda labels: "|".join(sorted(set(labels))))
            .to_dict()
        )

    def annotate(
        self,
        adata: AnnData,
        source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
        copy: bool = False,
    ) -> AnnData:
        """Annotates genes by their involvement in applied drugs.

        Genes need to be in HGNC format.

        Args:
            adata: AnnData object whose `var_names` are the genes to annotate.
            source: Source of the metadata, chembl, dgidb or pharmgkb. Defaults to chembl.
            copy: Determines whether a copy of the `adata` is returned. Defaults to False.

        Returns:
            An AnnData object with a new column `compounds` in the var slot and, for pharmgkb,
            an additional column `diseases`. Each entry lists the matching names joined by `|`;
            genes without a match get an empty string.
        """
        if copy:
            adata = adata.copy()

        if source == "chembl":
            database = self.chembl
        elif source == "dgidb":
            database = self.dgidb
        else:
            database = self.pharmgkb
        if not database.loaded:
            database.set()

        if source != "pharmgkb":
            gene_compound_dict = self._join_labels_per_gene(database.dataframe, "targets", "compounds")
            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))
        else:
            # The unaggregated table is required here because it still has the `Type` column
            # that separates chemicals from diseases.
            interaction = database.data

            compounds = interaction[interaction["Type"] == "Chemical"]
            gene_compound_dict = self._join_labels_per_gene(compounds, "Gene", "Compound|Disease")
            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))

            diseases = interaction[interaction["Type"] == "Disease"]
            gene_disease_dict = self._join_labels_per_gene(diseases, "Gene", "Compound|Disease")
            adata.var["diseases"] = adata.var_names.map(lambda gene: gene_disease_dict.get(gene, ""))

        return adata

    def lookup(self) -> LookUp:
        """Generate LookUp object for Drug.

        The LookUp object provides an overview of the metadata to annotate.
        annotate function has a corresponding lookup function in the LookUp object,
        where users can search the compound and targets in the metadata.
        All three databases are loaded if they have not been loaded yet.

        Returns:
            Returns a LookUp object specific for drug annotation.
        """
        if not self.chembl.loaded:
            self.chembl.set()
        if not self.dgidb.loaded:
            self.dgidb.set()
        if not self.pharmgkb.loaded:
            self.pharmgkb.set()

        return LookUp(
            type="drug",
            transfer_metadata=[
                self.chembl.dataframe,
                self.dgidb.data,
                self.pharmgkb.data,
            ],
        )

    class DrugDataBase:
        """Lazily loaded view on one drug database.

        After `set` has run, `data` holds the content as returned by the download helper,
        `dataframe` holds one row per compound with the list of its targets and `dictionary`
        holds the dictionary form of the database.
        """

        def __init__(self, database: Literal["chembl", "dgidb", "pharmgkb"] = "chembl"):
            self.database = database
            self.loaded = False

        def set(self) -> None:
            """Load the database and derive its dataframe and dictionary representations.

            Raises:
                ValueError: If the loaded data does not have the type expected for the database.
            """
            data = _download_drug_annotation(source=self.database)
            self.data = data
            if self.database == "chembl":
                if not isinstance(data, dict):
                    raise ValueError(
                        "The chembl data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                self.dictionary = data
                # Flatten the categories into a single compound -> targets mapping. For a compound
                # that is listed in several categories the first category takes precedence.
                targets = dict(ChainMap(*[data[cat] for cat in data]))
                self.dataframe = pd.DataFrame(
                    [{"compounds": compound, "targets": genes} for compound, genes in targets.items()]
                )
            elif self.database == "dgidb":
                if not isinstance(data, pd.DataFrame):
                    raise ValueError(
                        "The dgidb data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                self.dataframe = (
                    data.groupby("drug_claim_name")["gene_claim_name"]
                    .apply(list)
                    .reset_index()
                    .rename(
                        columns={
                            "gene_claim_name": "targets",
                            "drug_claim_name": "compounds",
                        }
                    )
                )
                self.dictionary = self.dataframe.set_index("compounds")["targets"].to_dict()
            else:
                if not isinstance(data, pd.DataFrame):
                    raise ValueError(
                        "The pharmGKB data is in a wrong format. Please clear the cache and reinitialize the object."
                    )
                self.dataframe = (
                    data.groupby("Compound|Disease")["Gene"]
                    .apply(list)
                    .reset_index()
                    .rename(
                        columns={
                            "Gene": "targets",
                            "Compound|Disease": "compounds|diseases",
                        }
                    )
                )
                self.dictionary = self.dataframe.set_index("compounds|diseases")["targets"].to_dict()

            # Only mark the database as loaded once everything above succeeded, so that a failed
            # download or an invalid file is retried instead of leaving a half-initialized object.
            self.loaded = True

        def df(self) -> pd.DataFrame:
            """Return the dataframe representation, loading the database on first use."""
            if not self.loaded:
                self.set()
            return self.dataframe

        def dict(self) -> dict[str, list[str]] | dict[str, dict[str, list[str]]]:
            """Return the dictionary representation, loading the database on first use."""
            if not self.loaded:
                self.set()
            return self.dictionary
|