pertpy 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- pertpy/__init__.py +3 -2
- pertpy/data/__init__.py +5 -1
- pertpy/data/_dataloader.py +2 -4
- pertpy/data/_datasets.py +203 -92
- pertpy/metadata/__init__.py +4 -0
- pertpy/metadata/_cell_line.py +826 -0
- pertpy/metadata/_compound.py +129 -0
- pertpy/metadata/_drug.py +242 -0
- pertpy/metadata/_look_up.py +582 -0
- pertpy/metadata/_metadata.py +73 -0
- pertpy/metadata/_moa.py +129 -0
- pertpy/plot/__init__.py +1 -9
- pertpy/plot/_augur.py +53 -116
- pertpy/plot/_coda.py +277 -677
- pertpy/plot/_guide_rna.py +17 -35
- pertpy/plot/_milopy.py +59 -134
- pertpy/plot/_mixscape.py +152 -391
- pertpy/preprocessing/_guide_rna.py +88 -4
- pertpy/tools/__init__.py +8 -13
- pertpy/tools/_augur.py +315 -17
- pertpy/tools/_cinemaot.py +143 -4
- pertpy/tools/_coda/_base_coda.py +1210 -65
- pertpy/tools/_coda/_sccoda.py +50 -21
- pertpy/tools/_coda/_tasccoda.py +27 -19
- pertpy/tools/_dialogue.py +164 -56
- pertpy/tools/_differential_gene_expression.py +240 -14
- pertpy/tools/_distances/_distance_tests.py +8 -8
- pertpy/tools/_distances/_distances.py +184 -34
- pertpy/tools/_enrichment.py +465 -0
- pertpy/tools/_milo.py +345 -11
- pertpy/tools/_mixscape.py +668 -50
- pertpy/tools/_perturbation_space/_clustering.py +5 -1
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +526 -0
- pertpy/tools/_perturbation_space/_perturbation_space.py +135 -43
- pertpy/tools/_perturbation_space/_simple.py +51 -10
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_scgen.py +701 -0
- pertpy/tools/_scgen/_utils.py +1 -3
- pertpy/tools/decoupler_LICENSE +674 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/METADATA +31 -12
- pertpy-0.7.0.dist-info/RECORD +53 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/WHEEL +1 -1
- pertpy/plot/_cinemaot.py +0 -81
- pertpy/plot/_dialogue.py +0 -91
- pertpy/plot/_scgen.py +0 -337
- pertpy/tools/_metadata/__init__.py +0 -0
- pertpy/tools/_metadata/_cell_line.py +0 -613
- pertpy/tools/_metadata/_look_up.py +0 -342
- pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
- pertpy/tools/_scgen/_jax_scgen.py +0 -370
- pertpy-0.6.0.dist-info/RECORD +0 -50
- /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
- {pertpy-0.6.0.dist-info → pertpy-0.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -5,16 +5,18 @@ from typing import TYPE_CHECKING, Literal
|
|
5
5
|
import decoupler as dc
|
6
6
|
import numpy as np
|
7
7
|
import numpy.typing as npt
|
8
|
+
import pandas as pd
|
9
|
+
from scipy.stats import kendalltau, pearsonr, spearmanr
|
10
|
+
from statsmodels.stats.multitest import fdrcorrection
|
8
11
|
|
9
12
|
if TYPE_CHECKING:
|
10
|
-
import pandas as pd
|
11
13
|
from anndata import AnnData
|
12
14
|
|
13
15
|
|
14
16
|
class DifferentialGeneExpression:
|
15
17
|
"""Support for differential gene expression for scverse."""
|
16
18
|
|
17
|
-
def
|
19
|
+
def get_pseudobulk(
|
18
20
|
self,
|
19
21
|
adata: AnnData,
|
20
22
|
sample_col: str,
|
@@ -22,28 +24,44 @@ class DifferentialGeneExpression:
|
|
22
24
|
obs: pd.DataFrame = None,
|
23
25
|
layer: str = None,
|
24
26
|
use_raw: bool = False,
|
25
|
-
|
27
|
+
mode: str = "sum",
|
28
|
+
min_cells=10,
|
26
29
|
min_counts: int = 1000,
|
27
|
-
min_samples: int = 2,
|
28
30
|
dtype: npt.DTypeLike = np.float32,
|
31
|
+
skip_checks: bool = False,
|
29
32
|
) -> AnnData:
|
30
|
-
"""
|
33
|
+
"""Summarizes expression profiles across cells per sample and group.
|
31
34
|
|
32
|
-
|
35
|
+
Generates summarized expression profiles across cells per sample (e.g. sample id) and group (e.g. cell type) based on the metadata found in .obs.
|
36
|
+
To ensure a minimum quality control, this function removes samples with not enough cells (min_cells)
|
37
|
+
or gene counts (min_counts). Gene-level filtering is available separately through `filter_by_expr` and `filter_by_prop`.
|
38
|
+
|
39
|
+
By default this function expects raw integer counts as input and sums them per sample and group (mode='sum'), but other modes are available.
|
40
|
+
|
41
|
+
This function produces some quality control metrics to assess whether it is necessary to filter some samples.
|
42
|
+
The number of cells that belong to each sample is stored in `.obs['psbulk_n_cells']`,
|
43
|
+
the total sum of counts per sample in `.obs['psbulk_counts']`, and the proportion of cells that express a given gene in `.layers['psbulk_props']`.
|
44
|
+
|
45
|
+
Wraps decoupler's `get_pseudobulk` function.
|
33
46
|
See: https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.get_pseudobulk.html#decoupler.get_pseudobulk
|
34
|
-
for more details
|
47
|
+
for more details.
|
35
48
|
|
36
49
|
Args:
|
37
50
|
adata: Input AnnData object.
|
38
51
|
sample_col: Column of `.obs` from which to extract the sample names.
|
39
52
|
groups_col: Column of `.obs` from which to extract the group names.
|
40
|
-
obs: If provided, metadata
|
53
|
+
obs: If provided, metadata DataFrame.
|
41
54
|
layer: If provided, which layer to use.
|
42
|
-
use_raw: Use raw attribute of
|
43
|
-
|
44
|
-
|
45
|
-
|
55
|
+
use_raw: Use raw attribute of the AnnData object if present.
|
56
|
+
mode: How to perform the pseudobulk.
|
57
|
+
Available options are 'sum', 'mean' or 'median'. Also accepts callback functions to perform custom aggregations.
|
58
|
+
Additionally, it is also possible to provide a dictionary of different callback functions, each one stored in a different resulting `.layer`.
|
59
|
+
In this case, the result of the first callback function of the dictionary is stored in .X by default.
|
60
|
+
min_cells: Filter to remove samples by a minimum number of cells in a sample-group pair.
|
61
|
+
min_counts: Filter to remove samples by a minimum number of summed counts in a sample-group pair.
|
46
62
|
dtype: Type of float used.
|
63
|
+
skip_checks: Whether to skip input checks.
|
64
|
+
Set to True when working with positive and negative data, or when counts are not integers.
|
47
65
|
|
48
66
|
Returns:
|
49
67
|
Returns a new AnnData object with unnormalized pseudobulk profiles per sample and group.
|
@@ -55,14 +73,222 @@ class DifferentialGeneExpression:
|
|
55
73
|
obs=obs,
|
56
74
|
layer=layer,
|
57
75
|
use_raw=use_raw,
|
58
|
-
|
76
|
+
mode=mode,
|
59
77
|
min_counts=min_counts,
|
60
|
-
min_smpls=min_samples,
|
61
78
|
dtype=dtype,
|
79
|
+
min_cells=min_cells,
|
80
|
+
skip_checks=skip_checks,
|
62
81
|
)
|
63
82
|
|
64
83
|
return pseudobulk_adata
|
65
84
|
|
85
|
+
def filter_by_expr(
|
86
|
+
self,
|
87
|
+
adata: AnnData,
|
88
|
+
obs: pd.DataFrame = None,
|
89
|
+
group: str | None = None,
|
90
|
+
lib_size: int | float | None = None,
|
91
|
+
min_count: int = 10,
|
92
|
+
min_total_count: int = 15,
|
93
|
+
large_n: int = 10,
|
94
|
+
min_prop: float = 0.7,
|
95
|
+
) -> AnnData:
|
96
|
+
"""Filter AnnData by which genes have sufficiently large counts to be retained in a statistical analysis.
|
97
|
+
|
98
|
+
Wraps decoupler's `filter_by_expr` function.
|
99
|
+
See https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.filter_by_expr.html#decoupler.filter_by_expr
|
100
|
+
for more details.
|
101
|
+
|
102
|
+
Args:
|
103
|
+
adata: AnnData obtained after running `get_pseudobulk`.
|
104
|
+
obs: Metadata dataframe, only needed if `adata` is not an `AnnData`.
|
105
|
+
group: Name of the `.obs` column to group by. If None, assumes all samples belong to one group.
|
106
|
+
lib_size: Library size. Defaults to the sum of reads per sample if None.
|
107
|
+
min_count: Minimum count required per gene for at least some samples.
|
108
|
+
min_total_count: Minimum total count required per gene across all samples.
|
109
|
+
large_n: Number of samples per group considered to be "large".
|
110
|
+
min_prop: Minimum proportion of samples in the smallest group that express the gene.
|
111
|
+
|
112
|
+
Returns:
|
113
|
+
AnnData with only the genes that are to be kept.
|
114
|
+
"""
|
115
|
+
genes = dc.filter_by_expr(
|
116
|
+
adata=adata,
|
117
|
+
obs=obs,
|
118
|
+
group=group,
|
119
|
+
lib_size=lib_size,
|
120
|
+
min_count=min_count,
|
121
|
+
min_total_count=min_total_count,
|
122
|
+
large_n=large_n,
|
123
|
+
min_prop=min_prop,
|
124
|
+
)
|
125
|
+
filtered_adata = adata[:, genes].copy()
|
126
|
+
|
127
|
+
return filtered_adata
|
128
|
+
|
129
|
+
def filter_by_prop(self, adata: AnnData, min_prop: float = 0.2, min_samples: int = 2) -> AnnData:
|
130
|
+
"""Determine which genes are expressed in a sufficient proportion of cells across samples.
|
131
|
+
|
132
|
+
This function selects genes that are sufficiently expressed across cells in each sample and that this condition
|
133
|
+
is met across a minimum number of samples.
|
134
|
+
|
135
|
+
Args:
|
136
|
+
adata: AnnData obtained after running `get_pseudobulk`. It requires `.layers['psbulk_props']`.
|
137
|
+
min_prop: Minimum proportion of cells that express a gene in a sample.
|
138
|
+
min_samples: Minimum number of samples in which the proportion of cells expressing the gene is greater than or equal to `min_prop`.
|
139
|
+
|
140
|
+
Returns:
|
141
|
+
AnnData with only the genes that are to be kept.
|
142
|
+
"""
|
143
|
+
genes = dc.filter_by_prop(adata=adata, min_prop=min_prop, min_smpls=min_samples)
|
144
|
+
filtered_adata = adata[:, genes].copy()
|
145
|
+
|
146
|
+
return filtered_adata
|
147
|
+
|
148
|
+
def calculate_correlation(
|
149
|
+
self,
|
150
|
+
de_res_1: pd.DataFrame,
|
151
|
+
de_res_2: pd.DataFrame,
|
152
|
+
method: Literal["spearman", "pearson", "kendall-tau"] = "spearman",
|
153
|
+
) -> pd.DataFrame:
|
154
|
+
"""Calculate the correlation coefficient (Spearman by default) for the 'pvals_adj' and 'logfoldchanges' columns.
|
155
|
+
|
156
|
+
Args:
|
157
|
+
de_res_1: A DataFrame with DE result columns.
|
158
|
+
de_res_2: Another DataFrame with the same DE result columns.
|
159
|
+
method: The correlation method to apply. One of `spearman`, `pearson`, `kendall-tau`.
|
160
|
+
Defaults to `spearman`.
|
161
|
+
|
162
|
+
Returns:
|
163
|
+
A DataFrame with the correlation coefficients (computed with `method`) for 'pvals_adj' and 'logfoldchanges'.
|
164
|
+
"""
|
165
|
+
columns_of_interest = ["pvals_adj", "logfoldchanges"]
|
166
|
+
correlation_data = {}
|
167
|
+
for col in columns_of_interest:
|
168
|
+
match method:
|
169
|
+
case "spearman":
|
170
|
+
correlation, _ = spearmanr(de_res_1[col], de_res_2[col])
|
171
|
+
case "pearson":
|
172
|
+
correlation, _ = pearsonr(de_res_1[col], de_res_2[col])
|
173
|
+
case "kendall-tau":
|
174
|
+
correlation, _ = kendalltau(de_res_1[col], de_res_2[col])
|
175
|
+
case _:
|
176
|
+
raise ValueError("Unknown correlation method.")
|
177
|
+
correlation_data[col] = correlation
|
178
|
+
|
179
|
+
return pd.DataFrame([correlation_data], columns=columns_of_interest)
|
180
|
+
|
181
|
+
def calculate_jaccard_index(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame, threshold: float = 0.05) -> float:
|
182
|
+
"""Calculate the Jaccard index for sets of significantly expressed genes/features based on a p-value threshold.
|
183
|
+
|
184
|
+
Args:
|
185
|
+
de_res_1: A DataFrame with DE result columns, including 'pvals'.
|
186
|
+
de_res_2: Another DataFrame with the same DE result columns.
|
187
|
+
threshold: A threshold for determining significant expression (default is 0.05).
|
188
|
+
|
189
|
+
Returns:
|
190
|
+
The Jaccard index.
|
191
|
+
"""
|
192
|
+
significant_set_1 = set(de_res_1[de_res_1["pvals"] <= threshold].index)
|
193
|
+
significant_set_2 = set(de_res_2[de_res_2["pvals"] <= threshold].index)
|
194
|
+
|
195
|
+
intersection = significant_set_1.intersection(significant_set_2)
|
196
|
+
union = significant_set_1.union(significant_set_2)
|
197
|
+
|
198
|
+
return len(intersection) / len(union) if union else 0
|
199
|
+
|
200
|
+
def calculate_cohens_d(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame) -> pd.Series:
|
201
|
+
"""Calculate Cohen's D for the logfoldchanges.
|
202
|
+
|
203
|
+
Args:
|
204
|
+
de_res_1: A DataFrame with DE result columns, including 'logfoldchanges'.
|
205
|
+
de_res_2: Another DataFrame with the same DE result columns.
|
206
|
+
|
207
|
+
Returns:
|
208
|
+
Cohen's D as a single value, computed from the means and standard deviations of the 'logfoldchanges' of both results.
|
209
|
+
"""
|
210
|
+
means_1 = de_res_1["logfoldchanges"].mean()
|
211
|
+
means_2 = de_res_2["logfoldchanges"].mean()
|
212
|
+
sd_1 = de_res_1["logfoldchanges"].std()
|
213
|
+
sd_2 = de_res_2["logfoldchanges"].std()
|
214
|
+
|
215
|
+
pooled_sd = np.sqrt((sd_1**2 + sd_2**2) / 2)
|
216
|
+
cohens_d = (means_1 - means_2) / pooled_sd
|
217
|
+
|
218
|
+
return cohens_d
|
219
|
+
|
220
|
+
def de_res_to_anndata(
|
221
|
+
self,
|
222
|
+
adata: AnnData,
|
223
|
+
de_res: pd.DataFrame,
|
224
|
+
*,
|
225
|
+
groupby: str,
|
226
|
+
gene_id_col: str = "gene_symbols",
|
227
|
+
score_col: str = "scores",
|
228
|
+
pval_col: str = "pvals",
|
229
|
+
pval_adj_col: str | None = "pvals_adj",
|
230
|
+
lfc_col: str = "logfoldchanges",
|
231
|
+
key_added: str = "rank_genes_groups",
|
232
|
+
) -> None:
|
233
|
+
"""Add tabular differential expression result to AnnData as if it was produced by `scanpy.tl.rank_genes_groups`.
|
234
|
+
|
235
|
+
Args:
|
236
|
+
adata:
|
237
|
+
Annotated data matrix
|
238
|
+
de_res:
|
239
|
+
Tabular DE result
|
240
|
+
groupby:
|
241
|
+
Column in `de_res` that indicates the group. This column must also exist in `adata.obs`.
|
242
|
+
gene_id_col:
|
243
|
+
Column in `de_res` that holds the gene identifiers
|
244
|
+
score_col:
|
245
|
+
Column in `de_res` that holds the score (results will be ordered by score).
|
246
|
+
pval_col:
|
247
|
+
Column in `de_res` that holds the unadjusted pvalue
|
248
|
+
pval_adj_col:
|
249
|
+
Column in `de_res` that holds the adjusted pvalue.
|
250
|
+
If `None`, the unadjusted pvalues will be FDR-adjusted.
|
251
|
+
lfc_col:
|
252
|
+
Column in `de_res` that holds the log fold change
|
253
|
+
key_added:
|
254
|
+
Key under which the results will be stored in `adata.uns`
|
255
|
+
"""
|
256
|
+
if groupby not in adata.obs.columns or groupby not in de_res.columns:
|
257
|
+
raise ValueError("groupby column must exist in both adata and de_res.")
|
258
|
+
res_dict = {
|
259
|
+
"params": {
|
260
|
+
"groupby": groupby,
|
261
|
+
"reference": "rest",
|
262
|
+
"method": "other",
|
263
|
+
"use_raw": True,
|
264
|
+
"layer": None,
|
265
|
+
"corr_method": "other",
|
266
|
+
},
|
267
|
+
"names": [],
|
268
|
+
"scores": [],
|
269
|
+
"pvals": [],
|
270
|
+
"pvals_adj": [],
|
271
|
+
"logfoldchanges": [],
|
272
|
+
}
|
273
|
+
df_groupby = de_res.groupby(groupby)
|
274
|
+
for _, tmp_df in df_groupby:
|
275
|
+
tmp_df = tmp_df.sort_values(score_col, ascending=False)
|
276
|
+
res_dict["names"].append(tmp_df[gene_id_col].values) # type: ignore
|
277
|
+
res_dict["scores"].append(tmp_df[score_col].values) # type: ignore
|
278
|
+
res_dict["pvals"].append(tmp_df[pval_col].values) # type: ignore
|
279
|
+
if pval_adj_col is not None:
|
280
|
+
res_dict["pvals_adj"].append(tmp_df[pval_adj_col].values) # type: ignore
|
281
|
+
else:
|
282
|
+
res_dict["pvals_adj"].append(fdrcorrection(tmp_df[pval_col].values)[1]) # type: ignore
|
283
|
+
res_dict["logfoldchanges"].append(tmp_df[lfc_col].values) # type: ignore
|
284
|
+
|
285
|
+
for key in ["names", "scores", "pvals", "pvals_adj", "logfoldchanges"]:
|
286
|
+
res_dict[key] = pd.DataFrame(
|
287
|
+
np.vstack(res_dict[key]).T,
|
288
|
+
columns=list(df_groupby.groups.keys()),
|
289
|
+
).to_records(index=False, column_dtypes="O")
|
290
|
+
adata.uns[key_added] = res_dict
|
291
|
+
|
66
292
|
def de_analysis(
|
67
293
|
self,
|
68
294
|
adata: AnnData,
|
@@ -37,8 +37,8 @@ class DistanceTest:
|
|
37
37
|
Examples:
|
38
38
|
>>> import pertpy as pt
|
39
39
|
>>> adata = pt.dt.distance_example_data()
|
40
|
-
>>> distance_test = pt.tl.DistanceTest(
|
41
|
-
>>> tab = distance_test(adata, groupby=
|
40
|
+
>>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
|
41
|
+
>>> tab = distance_test(adata, groupby="perturbation", contrast="control")
|
42
42
|
"""
|
43
43
|
|
44
44
|
def __init__(
|
@@ -100,8 +100,8 @@ class DistanceTest:
|
|
100
100
|
Examples:
|
101
101
|
>>> import pertpy as pt
|
102
102
|
>>> adata = pt.dt.distance_example_data()
|
103
|
-
>>> distance_test = pt.tl.DistanceTest(
|
104
|
-
>>> tab = distance_test(adata, groupby=
|
103
|
+
>>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
|
104
|
+
>>> tab = distance_test(adata, groupby="perturbation", contrast="control")
|
105
105
|
"""
|
106
106
|
if self.distance.metric_fct.accepts_precomputed:
|
107
107
|
# Much faster if the metric can be called on the precomputed
|
@@ -134,8 +134,8 @@ class DistanceTest:
|
|
134
134
|
Examples:
|
135
135
|
>>> import pertpy as pt
|
136
136
|
>>> adata = pt.dt.distance_example_data()
|
137
|
-
>>> distance_test = pt.tl.DistanceTest(
|
138
|
-
>>> test_results = distance_test.test_xy(adata, groupby=
|
137
|
+
>>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
|
138
|
+
>>> test_results = distance_test.test_xy(adata, groupby="perturbation", contrast="control")
|
139
139
|
"""
|
140
140
|
groups = adata.obs[groupby].unique()
|
141
141
|
if contrast not in groups:
|
@@ -226,8 +226,8 @@ class DistanceTest:
|
|
226
226
|
Examples:
|
227
227
|
>>> import pertpy as pt
|
228
228
|
>>> adata = pt.dt.distance_example_data()
|
229
|
-
>>> distance_test = pt.tl.DistanceTest(
|
230
|
-
>>> test_results = distance_test.test_precomputed(adata, groupby=
|
229
|
+
>>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
|
230
|
+
>>> test_results = distance_test.test_precomputed(adata, groupby="perturbation", contrast="control")
|
231
231
|
"""
|
232
232
|
if not self.distance.metric_fct.accepts_precomputed:
|
233
233
|
raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")
|