pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +3 -2
- pertpy/data/__init__.py +5 -1
- pertpy/data/_dataloader.py +2 -4
- pertpy/data/_datasets.py +203 -92
- pertpy/metadata/__init__.py +4 -0
- pertpy/metadata/_cell_line.py +826 -0
- pertpy/metadata/_compound.py +129 -0
- pertpy/metadata/_drug.py +242 -0
- pertpy/metadata/_look_up.py +582 -0
- pertpy/metadata/_metadata.py +73 -0
- pertpy/metadata/_moa.py +129 -0
- pertpy/plot/__init__.py +1 -9
- pertpy/plot/_augur.py +53 -116
- pertpy/plot/_coda.py +277 -677
- pertpy/plot/_guide_rna.py +17 -35
- pertpy/plot/_milopy.py +59 -134
- pertpy/plot/_mixscape.py +152 -391
- pertpy/preprocessing/_guide_rna.py +88 -4
- pertpy/tools/__init__.py +8 -13
- pertpy/tools/_augur.py +315 -17
- pertpy/tools/_cinemaot.py +143 -4
- pertpy/tools/_coda/_base_coda.py +1210 -65
- pertpy/tools/_coda/_sccoda.py +50 -21
- pertpy/tools/_coda/_tasccoda.py +27 -19
- pertpy/tools/_dialogue.py +164 -56
- pertpy/tools/_differential_gene_expression.py +240 -14
- pertpy/tools/_distances/_distance_tests.py +8 -8
- pertpy/tools/_distances/_distances.py +184 -34
- pertpy/tools/_enrichment.py +465 -0
- pertpy/tools/_milo.py +345 -11
- pertpy/tools/_mixscape.py +668 -50
- pertpy/tools/_perturbation_space/_clustering.py +5 -1
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
- pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
- pertpy/tools/_perturbation_space/_simple.py +51 -10
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_scgen.py +701 -0
- pertpy/tools/_scgen/_utils.py +1 -3
- pertpy/tools/decoupler_LICENSE +674 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
- pertpy-0.7.0.dist-info/RECORD +53 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
- pertpy/plot/_cinemaot.py +0 -81
- pertpy/plot/_dialogue.py +0 -91
- pertpy/plot/_scgen.py +0 -337
- pertpy/tools/_metadata/__init__.py +0 -0
- pertpy/tools/_metadata/_cell_line.py +0 -613
- pertpy/tools/_metadata/_look_up.py +0 -342
- pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
- pertpy/tools/_scgen/_jax_scgen.py +0 -370
- pertpy-0.6.0.dist-info/RECORD +0 -50
- /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -5,16 +5,18 @@ from typing import TYPE_CHECKING, Literal
|
|
5
5
|
import decoupler as dc
|
6
6
|
import numpy as np
|
7
7
|
import numpy.typing as npt
|
8
|
+
import pandas as pd
|
9
|
+
from scipy.stats import kendalltau, pearsonr, spearmanr
|
10
|
+
from statsmodels.stats.multitest import fdrcorrection
|
8
11
|
|
9
12
|
if TYPE_CHECKING:
|
10
|
-
import pandas as pd
|
11
13
|
from anndata import AnnData
|
12
14
|
|
13
15
|
|
14
16
|
class DifferentialGeneExpression:
|
15
17
|
"""Support for differential gene expression for scverse."""
|
16
18
|
|
17
|
-
def
|
19
|
+
def get_pseudobulk(
|
18
20
|
self,
|
19
21
|
adata: AnnData,
|
20
22
|
sample_col: str,
|
@@ -22,28 +24,44 @@ class DifferentialGeneExpression:
|
|
22
24
|
obs: pd.DataFrame = None,
|
23
25
|
layer: str = None,
|
24
26
|
use_raw: bool = False,
|
25
|
-
|
27
|
+
mode: str = "sum",
|
28
|
+
min_cells=10,
|
26
29
|
min_counts: int = 1000,
|
27
|
-
min_samples: int = 2,
|
28
30
|
dtype: npt.DTypeLike = np.float32,
|
31
|
+
skip_checks: bool = False,
|
29
32
|
) -> AnnData:
|
30
|
-
"""
|
33
|
+
"""Summarizes expression profiles across cells per sample and group.
|
31
34
|
|
32
|
-
|
35
|
+
Generates summarized expression profiles across cells per sample (e.g. sample id) and group (e.g. cell type) based on the metadata found in .obs.
|
36
|
+
To ensure a minimum quality control, this function removes genes that are not expressed enough across cells (min_prop) or samples (min_smpls),
|
37
|
+
and samples with not enough cells (min_cells) or gene counts (min_counts).
|
38
|
+
|
39
|
+
By default this function expects raw integer counts as input and sums them per sample and group (mode='sum'), but other modes are available.
|
40
|
+
|
41
|
+
This function produces some quality control metrics to assess whether it is necessary to filter some samples.
|
42
|
+
The number of cells that belong to each sample is stored in `.obs['psbulk_n_cells']`,
|
43
|
+
the total sum of counts per sample in `.obs['psbulk_counts']`, and the proportion of cells that express a given gene in `.layers['psbulk_props']`.
|
44
|
+
|
45
|
+
Wraps decoupler's `get_pseudobulk` function.
|
33
46
|
See: https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.get_pseudobulk.html#decoupler.get_pseudobulk
|
34
|
-
for more details
|
47
|
+
for more details.
|
35
48
|
|
36
49
|
Args:
|
37
50
|
adata: Input AnnData object.
|
38
51
|
sample_col: Column of `.obs` from which to extract the sample names.
|
39
52
|
groups_col: Column of `.obs` from which to extract the group names.
|
40
|
-
obs: If provided, metadata
|
53
|
+
obs: If provided, metadata DataFrame.
|
41
54
|
layer: If provided, which layer to use.
|
42
|
-
use_raw: Use raw attribute of
|
43
|
-
|
44
|
-
|
45
|
-
|
55
|
+
use_raw: Use raw attribute of the AnnData object if present.
|
56
|
+
mode: How to perform the pseudobulk.
|
57
|
+
Available options are 'sum', 'mean' or 'median'. Also accepts callback functions to perform custom aggregations.
|
58
|
+
Additionally, it is also possible to provide a dictionary of different callback functions, each one stored in a different resulting `.layer`.
|
59
|
+
In this case, the result of the first callback function of the dictionary is stored in .X by default.
|
60
|
+
min_cells: Filter to remove samples by a minimum number of cells in a sample-group pair.
|
61
|
+
min_counts: Filter to remove samples by a minimum number of summed counts in a sample-group pair.
|
46
62
|
dtype: Type of float used.
|
63
|
+
skip_checks: Whether to skip input checks.
|
64
|
+
Set to True when working with positive and negative data, or when counts are not integers.
|
47
65
|
|
48
66
|
Returns:
|
49
67
|
Returns a new AnnData object with unnormalized pseudobulk profiles per sample and group.
|
@@ -55,14 +73,222 @@ class DifferentialGeneExpression:
|
|
55
73
|
obs=obs,
|
56
74
|
layer=layer,
|
57
75
|
use_raw=use_raw,
|
58
|
-
|
76
|
+
mode=mode,
|
59
77
|
min_counts=min_counts,
|
60
|
-
min_smpls=min_samples,
|
61
78
|
dtype=dtype,
|
79
|
+
min_cells=min_cells,
|
80
|
+
skip_checks=skip_checks,
|
62
81
|
)
|
63
82
|
|
64
83
|
return pseudobulk_adata
|
65
84
|
|
85
|
+
def filter_by_expr(
|
86
|
+
self,
|
87
|
+
adata: AnnData,
|
88
|
+
obs: pd.DataFrame = None,
|
89
|
+
group: str | None = None,
|
90
|
+
lib_size: int | float | None = None,
|
91
|
+
min_count: int = 10,
|
92
|
+
min_total_count: int = 15,
|
93
|
+
large_n: int = 10,
|
94
|
+
min_prop: float = 0.7,
|
95
|
+
) -> AnnData:
|
96
|
+
"""Filter AnnData by which genes have sufficiently large counts to be retained in a statistical analysis.
|
97
|
+
|
98
|
+
Wraps decoupler's `filter_by_expr` function.
|
99
|
+
See https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.filter_by_expr.html#decoupler.filter_by_expr
|
100
|
+
for more details.
|
101
|
+
|
102
|
+
Args:
|
103
|
+
adata: AnnData obtained after running `get_pseudobulk`.
|
104
|
+
obs: Metadata dataframe, only needed if `adata` is not an `AnnData`.
|
105
|
+
group: Name of the `.obs` column to group by. If None, assumes all samples belong to one group.
|
106
|
+
lib_size: Library size. Defaults to the sum of reads per sample if None.
|
107
|
+
min_count: Minimum count required per gene for at least some samples.
|
108
|
+
min_total_count: Minimum total count required per gene across all samples.
|
109
|
+
large_n: Number of samples per group considered to be "large".
|
110
|
+
min_prop: Minimum proportion of samples in the smallest group that express the gene.
|
111
|
+
|
112
|
+
Returns:
|
113
|
+
AnnData with only the genes that are to be kept.
|
114
|
+
"""
|
115
|
+
genes = dc.filter_by_expr(
|
116
|
+
adata=adata,
|
117
|
+
obs=obs,
|
118
|
+
group=group,
|
119
|
+
lib_size=lib_size,
|
120
|
+
min_count=min_count,
|
121
|
+
min_total_count=min_total_count,
|
122
|
+
large_n=large_n,
|
123
|
+
min_prop=min_prop,
|
124
|
+
)
|
125
|
+
filtered_adata = adata[:, genes].copy()
|
126
|
+
|
127
|
+
return filtered_adata
|
128
|
+
|
129
|
+
def filter_by_prop(self, adata: AnnData, min_prop: float = 0.2, min_samples: int = 2) -> AnnData:
|
130
|
+
"""Determine which genes are expressed in a sufficient proportion of cells across samples.
|
131
|
+
|
132
|
+
This function selects genes that are sufficiently expressed across cells in each sample and that this condition
|
133
|
+
is met across a minimum number of samples.
|
134
|
+
|
135
|
+
Args:
|
136
|
+
adata: AnnData obtained after running `get_pseudobulk`. It requires `.layers['psbulk_props']`.
|
137
|
+
min_prop: Minimum proportion of cells that express a gene in a sample.
|
138
|
+
min_samples: Minimum number of samples in which the proportion of cells expressing the gene is greater than or equal to `min_prop`.
|
139
|
+
|
140
|
+
Returns:
|
141
|
+
AnnData with only the genes that are to be kept.
|
142
|
+
"""
|
143
|
+
genes = dc.filter_by_prop(adata=adata, min_prop=min_prop, min_smpls=min_samples)
|
144
|
+
filtered_adata = adata[:, genes].copy()
|
145
|
+
|
146
|
+
return filtered_adata
|
147
|
+
|
148
|
+
def calculate_correlation(
|
149
|
+
self,
|
150
|
+
de_res_1: pd.DataFrame,
|
151
|
+
de_res_2: pd.DataFrame,
|
152
|
+
method: Literal["spearman", "pearson", "kendall-tau"] = "spearman",
|
153
|
+
) -> pd.DataFrame:
|
154
|
+
"""Calculate the correlation coefficient (Spearman by default) for the 'pvals_adj' and 'logfoldchanges' columns.
|
155
|
+
|
156
|
+
Args:
|
157
|
+
de_res_1: A DataFrame with DE result columns.
|
158
|
+
de_res_2: Another DataFrame with the same DE result columns.
|
159
|
+
method: The correlation method to apply. One of `spearman`, `pearson`, `kendall-tau`.
|
160
|
+
Defaults to `spearman`.
|
161
|
+
|
162
|
+
Returns:
|
163
|
+
A DataFrame with the correlation coefficients (computed with the selected `method`) for 'pvals_adj' and 'logfoldchanges'.
|
164
|
+
"""
|
165
|
+
columns_of_interest = ["pvals_adj", "logfoldchanges"]
|
166
|
+
correlation_data = {}
|
167
|
+
for col in columns_of_interest:
|
168
|
+
match method:
|
169
|
+
case "spearman":
|
170
|
+
correlation, _ = spearmanr(de_res_1[col], de_res_2[col])
|
171
|
+
case "pearson":
|
172
|
+
correlation, _ = pearsonr(de_res_1[col], de_res_2[col])
|
173
|
+
case "kendall-tau":
|
174
|
+
correlation, _ = kendalltau(de_res_1[col], de_res_2[col])
|
175
|
+
case _:
|
176
|
+
raise ValueError("Unknown correlation method.")
|
177
|
+
correlation_data[col] = correlation
|
178
|
+
|
179
|
+
return pd.DataFrame([correlation_data], columns=columns_of_interest)
|
180
|
+
|
181
|
+
def calculate_jaccard_index(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame, threshold: float = 0.05) -> float:
|
182
|
+
"""Calculate the Jaccard index for sets of significantly expressed genes/features based on a p-value threshold.
|
183
|
+
|
184
|
+
Args:
|
185
|
+
de_res_1: A DataFrame with DE result columns, including 'pvals'.
|
186
|
+
de_res_2: Another DataFrame with the same DE result columns.
|
187
|
+
threshold: A threshold for determining significant expression (default is 0.05).
|
188
|
+
|
189
|
+
Returns:
|
190
|
+
The Jaccard index.
|
191
|
+
"""
|
192
|
+
significant_set_1 = set(de_res_1[de_res_1["pvals"] <= threshold].index)
|
193
|
+
significant_set_2 = set(de_res_2[de_res_2["pvals"] <= threshold].index)
|
194
|
+
|
195
|
+
intersection = significant_set_1.intersection(significant_set_2)
|
196
|
+
union = significant_set_1.union(significant_set_2)
|
197
|
+
|
198
|
+
return len(intersection) / len(union) if union else 0
|
199
|
+
|
200
|
+
def calculate_cohens_d(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame) -> pd.Series:
|
201
|
+
"""Calculate Cohen's D for the logfoldchanges.
|
202
|
+
|
203
|
+
Args:
|
204
|
+
de_res_1: A DataFrame with DE result columns, including 'logfoldchanges'.
|
205
|
+
de_res_2: Another DataFrame with the same DE result columns.
|
206
|
+
|
207
|
+
Returns:
|
208
|
+
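Cohen's D between the 'logfoldchanges' columns of the two results, as a single scalar value (the mean and standard deviation are taken over the whole column, not per gene/feature).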
|
209
|
+
"""
|
210
|
+
means_1 = de_res_1["logfoldchanges"].mean()
|
211
|
+
means_2 = de_res_2["logfoldchanges"].mean()
|
212
|
+
sd_1 = de_res_1["logfoldchanges"].std()
|
213
|
+
sd_2 = de_res_2["logfoldchanges"].std()
|
214
|
+
|
215
|
+
pooled_sd = np.sqrt((sd_1**2 + sd_2**2) / 2)
|
216
|
+
cohens_d = (means_1 - means_2) / pooled_sd
|
217
|
+
|
218
|
+
return cohens_d
|
219
|
+
|
220
|
+
def de_res_to_anndata(
|
221
|
+
self,
|
222
|
+
adata: AnnData,
|
223
|
+
de_res: pd.DataFrame,
|
224
|
+
*,
|
225
|
+
groupby: str,
|
226
|
+
gene_id_col: str = "gene_symbols",
|
227
|
+
score_col: str = "scores",
|
228
|
+
pval_col: str = "pvals",
|
229
|
+
pval_adj_col: str | None = "pvals_adj",
|
230
|
+
lfc_col: str = "logfoldchanges",
|
231
|
+
key_added: str = "rank_genes_groups",
|
232
|
+
) -> None:
|
233
|
+
"""Add tabular differential expression result to AnnData as if it was produced by `scanpy.tl.rank_genes_groups`.
|
234
|
+
|
235
|
+
Args:
|
236
|
+
adata:
|
237
|
+
Annotated data matrix
|
238
|
+
de_res:
|
239
|
+
Tabular DE result
|
240
|
+
groupby:
|
241
|
+
Column in `de_res` that indicates the group. This column must also exist in `adata.obs`.
|
242
|
+
gene_id_col:
|
243
|
+
Column in `de_res` that holds the gene identifiers
|
244
|
+
score_col:
|
245
|
+
Column in `de_res` that holds the score (results will be ordered by descending score within each group).
|
246
|
+
pval_col:
|
247
|
+
Column in `de_res` that holds the unadjusted pvalue
|
248
|
+
pval_adj_col:
|
249
|
+
Column in `de_res` that holds the adjusted pvalue.
|
250
|
+
If `None`, the unadjusted p-values will be FDR-adjusted within each group.
|
251
|
+
lfc_col:
|
252
|
+
Column in `de_res` that holds the log fold change
|
253
|
+
key_added:
|
254
|
+
Key under which the results will be stored in `adata.uns`
|
255
|
+
"""
|
256
|
+
if groupby not in adata.obs.columns or groupby not in de_res.columns:
|
257
|
+
raise ValueError("groupby column must exist in both adata and de_res.")
|
258
|
+
res_dict = {
|
259
|
+
"params": {
|
260
|
+
"groupby": groupby,
|
261
|
+
"reference": "rest",
|
262
|
+
"method": "other",
|
263
|
+
"use_raw": True,
|
264
|
+
"layer": None,
|
265
|
+
"corr_method": "other",
|
266
|
+
},
|
267
|
+
"names": [],
|
268
|
+
"scores": [],
|
269
|
+
"pvals": [],
|
270
|
+
"pvals_adj": [],
|
271
|
+
"logfoldchanges": [],
|
272
|
+
}
|
273
|
+
df_groupby = de_res.groupby(groupby)
|
274
|
+
for _, tmp_df in df_groupby:
|
275
|
+
tmp_df = tmp_df.sort_values(score_col, ascending=False)
|
276
|
+
res_dict["names"].append(tmp_df[gene_id_col].values) # type: ignore
|
277
|
+
res_dict["scores"].append(tmp_df[score_col].values) # type: ignore
|
278
|
+
res_dict["pvals"].append(tmp_df[pval_col].values) # type: ignore
|
279
|
+
if pval_adj_col is not None:
|
280
|
+
res_dict["pvals_adj"].append(tmp_df[pval_adj_col].values) # type: ignore
|
281
|
+
else:
|
282
|
+
res_dict["pvals_adj"].append(fdrcorrection(tmp_df[pval_col].values)[1]) # type: ignore
|
283
|
+
res_dict["logfoldchanges"].append(tmp_df[lfc_col].values) # type: ignore
|
284
|
+
|
285
|
+
for key in ["names", "scores", "pvals", "pvals_adj", "logfoldchanges"]:
|
286
|
+
res_dict[key] = pd.DataFrame(
|
287
|
+
np.vstack(res_dict[key]).T,
|
288
|
+
columns=list(df_groupby.groups.keys()),
|
289
|
+
).to_records(index=False, column_dtypes="O")
|
290
|
+
adata.uns[key_added] = res_dict
|
291
|
+
|
66
292
|
def de_analysis(
|
67
293
|
self,
|
68
294
|
adata: AnnData,
|
@@ -37,8 +37,8 @@ class DistanceTest:
|
|
37
37
|
Examples:
|
38
38
|
>>> import pertpy as pt
|
39
39
|
>>> adata = pt.dt.distance_example_data()
|
40
|
-
>>> distance_test = pt.tl.DistanceTest(
|
41
|
-
>>> tab = distance_test(adata, groupby=
|
40
|
+
>>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
|
41
|
+
>>> tab = distance_test(adata, groupby="perturbation", contrast="control")
|
42
42
|
"""
|
43
43
|
|
44
44
|
def __init__(
|
@@ -100,8 +100,8 @@ class DistanceTest:
|
|
100
100
|
Examples:
|
101
101
|
>>> import pertpy as pt
|
102
102
|
>>> adata = pt.dt.distance_example_data()
|
103
|
-
>>> distance_test = pt.tl.DistanceTest(
|
104
|
-
>>> tab = distance_test(adata, groupby=
|
103
|
+
>>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
|
104
|
+
>>> tab = distance_test(adata, groupby="perturbation", contrast="control")
|
105
105
|
"""
|
106
106
|
if self.distance.metric_fct.accepts_precomputed:
|
107
107
|
# Much faster if the metric can be called on the precomputed
|
@@ -134,8 +134,8 @@ class DistanceTest:
|
|
134
134
|
Examples:
|
135
135
|
>>> import pertpy as pt
|
136
136
|
>>> adata = pt.dt.distance_example_data()
|
137
|
-
>>> distance_test = pt.tl.DistanceTest(
|
138
|
-
>>> test_results = distance_test.test_xy(adata, groupby=
|
137
|
+
>>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
|
138
|
+
>>> test_results = distance_test.test_xy(adata, groupby="perturbation", contrast="control")
|
139
139
|
"""
|
140
140
|
groups = adata.obs[groupby].unique()
|
141
141
|
if contrast not in groups:
|
@@ -226,8 +226,8 @@ class DistanceTest:
|
|
226
226
|
Examples:
|
227
227
|
>>> import pertpy as pt
|
228
228
|
>>> adata = pt.dt.distance_example_data()
|
229
|
-
>>> distance_test = pt.tl.DistanceTest(
|
230
|
-
>>> test_results = distance_test.test_precomputed(adata, groupby=
|
229
|
+
>>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
|
230
|
+
>>> test_results = distance_test.test_precomputed(adata, groupby="perturbation", contrast="control")
|
231
231
|
"""
|
232
232
|
if not self.distance.metric_fct.accepts_precomputed:
|
233
233
|
raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")
|