pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- pertpy/__init__.py +4 -2
- pertpy/data/__init__.py +66 -1
- pertpy/data/_dataloader.py +28 -26
- pertpy/data/_datasets.py +261 -92
- pertpy/metadata/__init__.py +6 -0
- pertpy/metadata/_cell_line.py +795 -0
- pertpy/metadata/_compound.py +128 -0
- pertpy/metadata/_drug.py +238 -0
- pertpy/metadata/_look_up.py +569 -0
- pertpy/metadata/_metadata.py +70 -0
- pertpy/metadata/_moa.py +125 -0
- pertpy/plot/__init__.py +0 -13
- pertpy/preprocessing/__init__.py +2 -0
- pertpy/preprocessing/_guide_rna.py +89 -6
- pertpy/tools/__init__.py +48 -15
- pertpy/tools/_augur.py +329 -32
- pertpy/tools/_cinemaot.py +145 -6
- pertpy/tools/_coda/_base_coda.py +1237 -116
- pertpy/tools/_coda/_sccoda.py +66 -36
- pertpy/tools/_coda/_tasccoda.py +46 -39
- pertpy/tools/_dialogue.py +180 -77
- pertpy/tools/_differential_gene_expression/__init__.py +20 -0
- pertpy/tools/_differential_gene_expression/_base.py +657 -0
- pertpy/tools/_differential_gene_expression/_checks.py +41 -0
- pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
- pertpy/tools/_differential_gene_expression/_edger.py +125 -0
- pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
- pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
- pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
- pertpy/tools/_distances/_distance_tests.py +29 -24
- pertpy/tools/_distances/_distances.py +584 -98
- pertpy/tools/_enrichment.py +460 -0
- pertpy/tools/_kernel_pca.py +1 -1
- pertpy/tools/_milo.py +406 -49
- pertpy/tools/_mixscape.py +677 -55
- pertpy/tools/_perturbation_space/_clustering.py +10 -3
- pertpy/tools/_perturbation_space/_comparison.py +112 -0
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
- pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
- pertpy/tools/_perturbation_space/_simple.py +52 -11
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_base_components.py +2 -3
- pertpy/tools/_scgen/_scgen.py +706 -0
- pertpy/tools/_scgen/_utils.py +3 -5
- pertpy/tools/decoupler_LICENSE +674 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
- pertpy-0.8.0.dist-info/RECORD +57 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
- pertpy/plot/_augur.py +0 -234
- pertpy/plot/_cinemaot.py +0 -81
- pertpy/plot/_coda.py +0 -1001
- pertpy/plot/_dialogue.py +0 -91
- pertpy/plot/_guide_rna.py +0 -82
- pertpy/plot/_milopy.py +0 -284
- pertpy/plot/_mixscape.py +0 -594
- pertpy/plot/_scgen.py +0 -337
- pertpy/tools/_differential_gene_expression.py +0 -99
- pertpy/tools/_metadata/__init__.py +0 -0
- pertpy/tools/_metadata/_cell_line.py +0 -613
- pertpy/tools/_metadata/_look_up.py +0 -342
- pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
- pertpy/tools/_scgen/_jax_scgen.py +0 -370
- pertpy-0.6.0.dist-info/RECORD +0 -50
- /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from anndata import AnnData
|
4
|
+
|
5
|
+
|
6
|
+
class DGEEVAL:
|
7
|
+
def compare(
|
8
|
+
self,
|
9
|
+
adata: AnnData | None = None,
|
10
|
+
de_key1: str = None,
|
11
|
+
de_key2: str = None,
|
12
|
+
de_df1: pd.DataFrame | None = None,
|
13
|
+
de_df2: pd.DataFrame | None = None,
|
14
|
+
shared_top: int = 100,
|
15
|
+
) -> dict[str, float]:
|
16
|
+
"""Compare two differential expression analyses.
|
17
|
+
|
18
|
+
Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and
|
19
|
+
the correlation of their scores and adjusted p-values.
|
20
|
+
|
21
|
+
Args:
|
22
|
+
adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used.
|
23
|
+
de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
|
24
|
+
de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
|
25
|
+
de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
|
26
|
+
de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
|
27
|
+
shared_top: The number of top DEG to compute the proportion of their intersection.
|
28
|
+
|
29
|
+
"""
|
30
|
+
if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None):
|
31
|
+
raise ValueError(
|
32
|
+
"Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
|
33
|
+
)
|
34
|
+
|
35
|
+
if de_df1 is None and de_df2 is None: # use keys
|
36
|
+
if not de_key1 or not de_key2:
|
37
|
+
raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")
|
38
|
+
|
39
|
+
else: # use dfs
|
40
|
+
if de_df1 is None or de_df2 is None:
|
41
|
+
raise ValueError("Both `de_df1` and `de_df2` must be provided together if using DataFrames.")
|
42
|
+
|
43
|
+
if de_key1:
|
44
|
+
if not adata:
|
45
|
+
raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ")
|
46
|
+
assert all(
|
47
|
+
k in adata.uns for k in [de_key1, de_key2]
|
48
|
+
), "Provided `de_key1` and `de_key2` must exist in `adata.uns`."
|
49
|
+
vars = adata.var_names
|
50
|
+
|
51
|
+
if de_df1 is not None:
|
52
|
+
for df in (de_df1, de_df2):
|
53
|
+
if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns):
|
54
|
+
raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.")
|
55
|
+
|
56
|
+
assert set(de_df1["variable"]) == set(de_df2["variable"]), "Variables in both dataframes must match."
|
57
|
+
vars = de_df1["variable"].sort_values()
|
58
|
+
|
59
|
+
shared_top = min(shared_top, len(vars))
|
60
|
+
vars_ranks = np.arange(1, len(vars) + 1)
|
61
|
+
results = pd.DataFrame(index=vars)
|
62
|
+
top_names = []
|
63
|
+
|
64
|
+
if de_key1 and de_key2:
|
65
|
+
for i, k in enumerate([de_key1, de_key2]):
|
66
|
+
label = adata.uns[k]["names"].dtype.names[0]
|
67
|
+
srt_idx = np.argsort(adata.uns[k]["names"][label])
|
68
|
+
results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx]
|
69
|
+
results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx]
|
70
|
+
results[f"ranks_{i}"] = vars_ranks[srt_idx]
|
71
|
+
top_names.append(adata.uns[k]["names"][label][:shared_top])
|
72
|
+
else:
|
73
|
+
for i, df in enumerate([de_df1, de_df2]):
|
74
|
+
srt_idx = np.argsort(df["variable"])
|
75
|
+
results[f"scores_{i}"] = df["log_fc"].values[srt_idx]
|
76
|
+
results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx]
|
77
|
+
results[f"ranks_{i}"] = vars_ranks[srt_idx]
|
78
|
+
top_names.append(df["variable"][:shared_top])
|
79
|
+
|
80
|
+
metrics = {}
|
81
|
+
metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top
|
82
|
+
metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson")
|
83
|
+
metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson")
|
84
|
+
metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman")
|
85
|
+
|
86
|
+
return metrics
|
@@ -0,0 +1,125 @@
|
|
1
|
+
from collections.abc import Sequence
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
import pandas as pd
|
5
|
+
from scanpy import logging
|
6
|
+
from scipy.sparse import issparse
|
7
|
+
|
8
|
+
from ._base import LinearModelBase
|
9
|
+
from ._checks import check_is_integer_matrix
|
10
|
+
|
11
|
+
|
12
|
+
class EdgeR(LinearModelBase):
    """Differential expression test using EdgeR"""

    def _check_counts(self):
        """Run `check_is_integer_matrix` on the expression data."""
        check_is_integer_matrix(self.data)

    def fit(self, **kwargs):
        """Fit model using edgeR.

        Builds a `DGEList` from the expression matrix (transposed to variables x observations),
        then runs `calcNormFactors`, `estimateDisp` and `glmQLFit`. The fitted object is stored
        in the R global environment under the name `fit` and on this instance as `self.fit`.

        Args:
            **kwargs: Keyword arguments specific to glmQLFit()

        Raises:
            ImportError: If rpy2 or the R package edgeR cannot be imported.
        """
        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

            # Global converter activation; the conversions further down rely on it.
            pandas2ri.activate()
            rpy2.robjects.numpy2ri.activate()

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            edger = importr("edgeR")
        except ImportError as e:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages:\n"
                "edgeR, BiocParallel, RhpcBLASctl"
            ) from e

        # Pull the expression matrix and orient it as variables x observations (dense).
        with localconverter(ro.default_converter + numpy2ri.converter):
            counts = self.adata.X if self.layer is None else self.adata.layers[self.layer]
            counts = counts.T.toarray() if issparse(counts) else counts.T

        counts_df = pd.DataFrame(counts, index=self.adata.var_names, columns=self.adata.obs_names)
        expr_r = ro.conversion.py2rpy(counts_df)

        dge = edger.DGEList(counts=expr_r, samples=self.adata.obs)

        logging.info("Calculating NormFactors")
        dge = edger.calcNormFactors(dge)

        logging.info("Estimating Dispersions")
        dge = edger.estimateDisp(dge, design=self.design)

        logging.info("Fitting linear model")
        glm_fit = edger.glmQLFit(dge, design=self.design, **kwargs)

        # `_test_single_contrast` reads the fit back from the R global environment.
        ro.globalenv["fit"] = glm_fit
        # NOTE: this instance attribute shadows the `fit` method on this object.
        self.fit = glm_fit

    def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> pd.DataFrame:
        """Conduct test for each contrast and return a data frame

        The test runs in R against the object named `fit` in the R global environment,
        i.e. `fit()` has to be called first.

        Args:
            contrast: numpy array of integers indicating contrast i.e. [-1, 0, 1, 0, 0]
            **kwargs: Currently not forwarded to R (see the ToDo below).

        Returns:
            The `topTags` table with the variable names in a `variable` column and the columns
            `PValue`, `logFC` and `FDR` renamed to `p_value`, `log_fc` and `adj_p_value`.

        Raises:
            ImportError: If rpy2 or the R package edgeR cannot be imported.
        """
        # ToDo:
        #  parse **kwargs to R function
        #  Fix mask for .fit()

        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            importr("edgeR")
        except ImportError:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages: edgeR, BiocParallel, RhpcBLASctl"
            ) from None

        # Hand the contrast vector over to R under the name `contrast_vec`.
        ro.globalenv["contrast_vec"] = ro.conversion.py2rpy(np.asarray(contrast))

        # Run the quasi-likelihood F-test in R and collect the full, BH-adjusted result table.
        ro.r(
            """
            test = edgeR::glmQLFTest(fit, contrast=contrast_vec)
            de_res = edgeR::topTags(test, n=Inf, adjust.method="BH", sort.by="PValue")$table
            """
        )

        # Bring the result table back to pandas and harmonize the column names.
        table = ro.conversion.rpy2py(ro.globalenv["de_res"])
        table.index.name = "variable"
        table = table.reset_index()

        return table.rename(columns={"PValue": "p_value", "logFC": "log_fc", "FDR": "adj_p_value"})
|
@@ -0,0 +1,189 @@
|
|
1
|
+
"""Helpers to interact with Formulaic Formulas
|
2
|
+
|
3
|
+
Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
|
4
|
+
* A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
|
5
|
+
* A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
|
6
|
+
* A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
|
7
|
+
"""
|
8
|
+
|
9
|
+
from collections import defaultdict
|
10
|
+
from collections.abc import Mapping, Sequence
|
11
|
+
from dataclasses import dataclass
|
12
|
+
from typing import Any
|
13
|
+
|
14
|
+
from formulaic import FactorValues, ModelSpec
|
15
|
+
from formulaic.materializers import PandasMaterializer
|
16
|
+
from formulaic.materializers.types import EvaluatedFactor
|
17
|
+
from formulaic.parser.types import Factor
|
18
|
+
from interface_meta import override
|
19
|
+
|
20
|
+
|
21
|
+
@dataclass
class FactorMetadata:
    """Container for the metadata of a single formula factor that is relevant downstream."""

    name: str
    """Unambiguous name of the factor exactly as written in the formula, e.g. `donor` or `C(donor, contr.treatment(base="A"))`"""

    reduced_rank: bool
    """True if one column is dropped because it would be redundant"""

    custom_encoder: bool
    """True if a custom encoder (e.g. `C(...)`) was used for this factor."""

    categories: Sequence[str]
    """Unique categories of this factor (after `drop_rows` has been applied)"""

    kind: Factor.Kind
    """The factor type"""

    drop_field: str = None
    """Category that gets dropped.

    Note that
      * this can be set even if `reduced_rank = False`
      * this is only set when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`.
    """

    column_names: Sequence[str] = None
    """Names of the design matrix columns that belong to this factor.

    With the default encoder these may equal `categories`; with a custom encoder
    (e.g. `C(...)`) they are the categories without the base level.
    """

    colname_format: str = None
    """Format string from which a design matrix column name can be generated, e.g. `{name}[T.{field}]`"""

    @property
    def base(self) -> str | None:
        """
        Base category of this categorical factor, or None if the factor is not reduced rank.

        With default encoding this is `drop_field`. With custom encoding (e.g. `C(...)`) it is
        the single category that does not show up among the design matrix column names.
        """
        if not self.reduced_rank:
            return None

        if not self.custom_encoder:
            assert self.drop_field is not None
            return self.drop_field

        # Custom encoder: the base level is the one category without a design matrix column.
        missing_categories = set(self.categories) - set(self.column_names)
        assert len(missing_categories) == 1
        return missing_categories.pop()
|
76
|
+
|
77
|
+
|
78
|
+
def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
    """Create a custom materializer class that tracks the factors used in a model specification.

    The materializer reports metadata back into the returned dictionaries while the model matrix
    is being materialized.

    Returns:
        - `factor_storage`: dictionary holding the metadata of every factor processed by the custom materializer.
        - `variable_to_factors`: dictionary mapping variables to factor names. Works similarly to
          model_spec.variable_terms, but maps to factors rather than terms.
        - A materializer class bound to this particular instance of `factor_storage`.
    """
    # A factor name can have several FactorMetadata entries, e.g. for an interaction term formulaic
    # generates the factor both with full rank and with reduced rank.
    factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
    variable_to_factors: dict[str, set[str]] = defaultdict(set)

    class CustomPandasMaterializer(PandasMaterializer):
        """PandasMaterializer extension that records all categorical variables and their (base) categories."""

        REGISTER_NAME = "custom_pandas"
        REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
        REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")

        def __init__(
            self,
            data: Any,
            context: Mapping[str, Any] | None = None,
            record_factor_metadata: bool = False,
            **params: Any,
        ):
            """Initialize the Materializer.

            Args:
                data: Forwarded to PandasMaterializer.
                context: Forwarded to PandasMaterializer
                record_factor_metadata: Whether this instance of the custom materializer class records
                    factor metadata. Only the instance that builds the design matrix should do so.
                    Other instances (e.g. those generating contrast vectors) should not record,
                    so that the specifications from the design matrix are not overwritten.
                **params: Forwarded to PandasMaterializer
            """
            if record_factor_metadata:
                self.factor_metadata_storage = factor_storage
                self.variable_to_factors = variable_to_factors
            else:
                self.factor_metadata_storage = None
                self.variable_to_factors = None
            # Metadata of the factor currently being evaluated; filled in `_encode_evaled_factor`
            # and cleared again in `_flatten_encoded_evaled_factor`.
            self._current_factor: FactorMetadata = None
            super().__init__(data, context, **params)

        @override
        def _encode_evaled_factor(
            self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
        ) -> dict[str, Any]:
            """Hook that runs right before the factor gets encoded.

            Metadata is captured here before delegating to the original implementation.
            """
            assert (
                self._current_factor is None
            ), "_current_factor should always be None when we start recording metadata"

            recording = self.factor_metadata_storage is not None
            if recording:
                # A cached factor has been seen before, so its metadata must already be stored.
                is_cached = factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache
                if is_cached:
                    assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached"
                else:
                    for variable in factor.variables:
                        self.variable_to_factors[variable].add(factor.expr)

                    retained_values = factor.values.drop(index=factor.values.index[drop_rows])
                    self._current_factor = FactorMetadata(
                        name=factor.expr,
                        reduced_rank=reduced_rank,
                        categories=tuple(sorted(retained_values.unique())),
                        custom_encoder=factor.metadata.encoder is not None,
                        kind=factor.metadata.kind,
                    )

            return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)

        @override
        def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
            """
            Hook that runs at the end, before the design matrix is materialized.

            Additional metadata such as `drop_field` is available at this point.
            """
            pending = self._current_factor
            if pending is not None:
                assert pending.name == name
                encoded_meta = values.__formulaic_metadata__
                pending.drop_field = encoded_meta.drop_field
                pending.column_names = encoded_meta.column_names
                pending.colname_format = encoded_meta.format
                self.factor_metadata_storage[name].append(pending)
                self._current_factor = None

            return super()._flatten_encoded_evaled_factor(name, values)

    return factor_storage, variable_to_factors, CustomPandasMaterializer
|
170
|
+
|
171
|
+
|
172
|
+
class AmbiguousAttributeError(ValueError):
    """Raised by `resolve_ambiguous` when the requested attribute differs between the objects."""


def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
    """Return the value of `attr` that all objects in `objs` agree on.

    Args:
        objs: Non-empty sequence of objects to inspect.
        attr: Name of the attribute to read from every object.

    Returns:
        The attribute value of the first object, provided no other object differs from it.

    Raises:
        ValueError: If `objs` is empty.
        AmbiguousAttributeError: If any object has a different value for `attr`.
    """
    if not objs:
        raise ValueError("Collection is empty")

    expected = getattr(objs[0], attr)

    # Stop at the first object that disagrees with the first one.
    if any(getattr(other, attr) != expected for other in objs[1:]):
        raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")

    return expected
|
@@ -0,0 +1,95 @@
|
|
1
|
+
import os
|
2
|
+
import re
|
3
|
+
import warnings
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
from anndata import AnnData
|
7
|
+
from numpy import ndarray
|
8
|
+
from pydeseq2.dds import DeseqDataSet
|
9
|
+
from pydeseq2.default_inference import DefaultInference
|
10
|
+
from pydeseq2.ds import DeseqStats
|
11
|
+
from scipy.sparse import issparse
|
12
|
+
|
13
|
+
from ._base import LinearModelBase
|
14
|
+
from ._checks import check_is_integer_matrix
|
15
|
+
|
16
|
+
|
17
|
+
class PyDESeq2(LinearModelBase):
    """Differential expression test using a PyDESeq2"""

    def __init__(
        self, adata: AnnData, design: str | ndarray, *, mask: str | None = None, layer: str | None = None, **kwargs
    ):
        """Initialize the model and densify the expression matrix if it is sparse.

        Args:
            adata: Forwarded to the base class.
            design: Forwarded to the base class.
            mask: Forwarded to the base class.
            layer: Forwarded to the base class; also selects which matrix is densified.
            **kwargs: Forwarded to the base class.
        """
        super().__init__(adata, design, mask=mask, layer=layer, **kwargs)
        # work around pydeseq2 issue with sparse matrices
        # see also https://github.com/owkin/PyDESeq2/issues/25
        if issparse(self.data):
            if self.layer is None:
                self.adata.X = self.adata.X.toarray()
            else:
                self.adata.layers[self.layer] = self.adata.layers[self.layer].toarray()

    def _check_counts(self):
        """Run `check_is_integer_matrix` on the expression data."""
        check_is_integer_matrix(self.data)

    def fit(self, **kwargs) -> None:
        """Fit dds model using pydeseq2.

        Note: this creates its own AnnData object for downstream processing.

        The fitted `DeseqDataSet` is stored as `self.dds`; nothing is returned.

        Args:
            **kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all
                available CPUs minus one (but at least one) if the argument is not passed.
        """
        if "n_cpus" in kwargs:
            n_cpus = kwargs.pop("n_cpus")
        else:
            # `os.cpu_count()` may return None, and on a single-CPU machine "count - 1" would be 0,
            # which is not a usable worker count. Fall back to one worker in both cases.
            n_cpus = max(1, (os.cpu_count() or 1) - 1)
        inference = DefaultInference(n_cpus=n_cpus)

        covars = self.design.columns.tolist()
        if "Intercept" not in covars:
            warnings.warn(
                "Warning: Pydeseq is hard-coded to use Intercept, please include intercept into the model", stacklevel=2
            )
        # Strip the `[T.<level>]` suffix from the design matrix column names to recover the factor names.
        processed_covars = list({re.sub(r"\[T\.(.*)\]", "", col) for col in covars if col != "Intercept"})
        dds = DeseqDataSet(
            adata=self.adata, design_factors=processed_covars, refit_cooks=True, inference=inference, **kwargs
        )
        # workaround code to insert design array: replace the design matrix built by PyDESeq2 with ours,
        # but keep PyDESeq2's column names whenever the number of columns matches.
        des_mtx_cols = dds.obsm["design_matrix"].columns
        dds.obsm["design_matrix"] = self.design
        if dds.obsm["design_matrix"].shape[1] == len(des_mtx_cols):
            dds.obsm["design_matrix"].columns = des_mtx_cols.copy()

        dds.deseq2()
        self.dds = dds

    # TODO: PyDeseq2 doesn't support arbitrary designs and contrasts yet
    # see https://github.com/owkin/PyDESeq2/issues/213

    # Therefore these functions are overridden in a way to make it work with PyDESeq2,
    # ignoring the inconsistency of function signatures. Once arbitrary design
    # matrices and contrasts are supported by PyDEseq2, we can fully support the
    # Linear model interface.
    def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame:  # type: ignore
        """Conduct a specific test and returns a Pandas DataFrame.

        Args:
            contrast: list of three strings of the form `["variable", "tested level", "reference level"]`.
            alpha: p value threshold used for controlling fdr with independent hypothesis weighting
            **kwargs: extra arguments to pass to DeseqStats()

        Returns:
            The results table sorted by `p_value`, with the variable names in a `variable` column and
            `pvalue`, `padj` and `log2FoldChange` renamed to `p_value`, `adj_p_value` and `log_fc`.
        """
        stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs)
        # Calling `.summary()` is required to fill the `results_df` data frame
        stat_res.summary()
        res_df = (
            pd.DataFrame(stat_res.results_df)
            .rename(columns={"pvalue": "p_value", "padj": "adj_p_value", "log2FoldChange": "log_fc"})
            .sort_values("p_value")
        )
        res_df.index.name = "variable"
        res_df = res_df.reset_index()
        return res_df

    def cond(self, **kwargs) -> ndarray:
        """Not supported for PyDESeq2; always raises NotImplementedError."""
        raise NotImplementedError(
            "PyDESeq2 currently doesn't support arbitrary contrasts, see https://github.com/owkin/PyDESeq2/issues/213"
        )

    def contrast(self, column: str, baseline: str, group_to_compare: str) -> tuple[str, str, str]:  # type: ignore
        """Return the contrast as the tuple `(column, group_to_compare, baseline)`."""
        return (column, group_to_compare, baseline)
|
@@ -0,0 +1,162 @@
|
|
1
|
+
"""Simple tests such as t-test, wilcoxon"""
|
2
|
+
|
3
|
+
import warnings
|
4
|
+
from abc import abstractmethod
|
5
|
+
from collections.abc import Mapping, Sequence
|
6
|
+
from types import MappingProxyType
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
import scipy.stats
|
11
|
+
import statsmodels
|
12
|
+
from anndata import AnnData
|
13
|
+
from pandas.core.api import DataFrame as DataFrame
|
14
|
+
from scipy.sparse import diags, issparse
|
15
|
+
from tqdm.auto import tqdm
|
16
|
+
|
17
|
+
from ._base import MethodBase
|
18
|
+
|
19
|
+
|
20
|
+
def fdr_correction(
    df: pd.DataFrame, pvalue_col: str = "p_value", *, key_added: str = "adj_p_value", inplace: bool = False
) -> pd.DataFrame | None:
    """Adjust p-values in a DataFrame with test results using FDR correction.

    Args:
        df: DataFrame with test results.
        pvalue_col: Name of the column holding the unadjusted p-values.
        key_added: Name of the column the adjusted p-values are written to.
        inplace: If True, `df` is modified and nothing is returned. Otherwise a copy is modified and returned.

    Returns:
        The corrected copy of `df` if `inplace` is False, otherwise None.
    """
    # Import the submodule explicitly: a bare `import statsmodels` does not guarantee that
    # `statsmodels.stats.multitest` has been loaded, so attribute access on the top-level
    # package only works if some other module happened to import it before.
    from statsmodels.stats.multitest import fdrcorrection

    if not inplace:
        df = df.copy()

    # `fdrcorrection` returns (rejected, corrected p-values); only the latter is stored.
    df[key_added] = fdrcorrection(df[pvalue_col].values)[1]

    if not inplace:
        return df
|
31
|
+
|
32
|
+
|
33
|
+
class SimpleComparisonBase(MethodBase):
    """Base class for simple comparisons that test every variable independently between two groups."""

    @staticmethod
    @abstractmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Perform a statistical test between values in x0 and x1.

        If `paired` is True, x0 and x1 must be of the same length and ordered such that
        paired elements have the same position.

        Args:
            x0: Array with baseline values.
            x1: Array with values to compare.
            paired: Indicates whether to perform a paired test
            **kwargs: kwargs passed to the test function
        """
        ...

    def _compare_single_group(
        self, baseline_idx: np.ndarray, comparison_idx: np.ndarray, *, paired: bool, **kwargs
    ) -> DataFrame:
        """Perform a single comparison between two groups.

        Args:
            baseline_idx: Numeric indices indicating which observations are in the baseline group.
            comparison_idx: Numeric indices indicating which observations are in the comparison/treatment group
            paired: Whether to perform a paired test. Note that in the case of a paired test,
                the indices must be ordered such that paired observations appear at the same position.
            **kwargs: kwargs passed to the test function

        Returns:
            DataFrame with one row per variable and the columns `variable`, `p_value` and `log_fc`,
            sorted by `p_value`.
        """
        if paired:
            assert len(baseline_idx) == len(comparison_idx), "For a paired test, indices must be of the same length"

        x0 = self.data[baseline_idx, :]
        x1 = self.data[comparison_idx, :]

        # In the following loop, we are doing a lot of column slicing -- which is significantly
        # more efficient in csc format.
        if issparse(self.data):
            x0 = x0.tocsc()
            x1 = x1.tocsc()

        res = []
        # Select each column by position. Building a boolean mask over all variable names for every
        # variable is quadratic in the number of variables and pools several columns into one test
        # whenever variable names are not unique.
        for col, var in enumerate(tqdm(self.adata.var_names)):
            tmp_x0 = x0[:, col]
            tmp_x0 = np.asarray(tmp_x0.todense()).flatten() if issparse(tmp_x0) else tmp_x0.flatten()
            tmp_x1 = x1[:, col]
            tmp_x1 = np.asarray(tmp_x1.todense()).flatten() if issparse(tmp_x1) else tmp_x1.flatten()
            pval = self._test(tmp_x0, tmp_x1, paired, **kwargs)
            mean_x0 = np.mean(tmp_x0)
            mean_x1 = np.mean(tmp_x1)
            # log2 ratio of the group means (comparison over baseline); not finite if a mean is zero.
            res.append({"variable": var, "p_value": pval, "log_fc": np.log2(mean_x1) - np.log2(mean_x0)})
        return pd.DataFrame(res).sort_values("p_value")

    @classmethod
    def compare_groups(
        cls,
        adata: AnnData,
        column: str,
        baseline: str,
        groups_to_compare: str | Sequence[str],
        *,
        paired_by: str | None = None,
        mask: str | None = None,
        layer: str | None = None,
        fit_kwargs: Mapping = MappingProxyType({}),
        test_kwargs: Mapping = MappingProxyType({}),
    ) -> DataFrame:
        """Compare one or several groups against a baseline group.

        Args:
            adata: Passed to the class constructor.
            column: Column in `obs` holding the group labels.
            baseline: Label of the baseline group in `column`.
            groups_to_compare: Label(s) of the group(s) compared against `baseline`. If None, all other
                values of `column` are compared.
            paired_by: Column in `obs` identifying the pairs. If given, a paired test is performed.
            mask: Passed to the class constructor.
            layer: Passed to the class constructor.
            fit_kwargs: Not used for simple tests; a warning is emitted if it is non-empty.
            test_kwargs: Passed on to the test function.

        Returns:
            The concatenated per-group results with an additional `comparison` column and
            FDR-adjusted p-values computed across all comparisons.

        Raises:
            ValueError: If a paired test is requested but the pairing is not valid.
        """
        if len(fit_kwargs):
            warnings.warn("fit_kwargs not used for simple tests.", UserWarning, stacklevel=2)
        paired = paired_by is not None
        model = cls(adata, mask=mask, layer=layer)
        if groups_to_compare is None:
            # compare against all other
            groups_to_compare = sorted(set(model.adata.obs[column]) - {baseline})
        if isinstance(groups_to_compare, str):
            groups_to_compare = [groups_to_compare]

        def _get_idx(column, value):
            """Return the numeric indices of the observations where `column == value`."""
            is_group = model.adata.obs[column] == value
            if paired:
                dummies = pd.get_dummies(model.adata.obs[paired_by], sparse=True).sparse.to_coo().tocsr()
                if not np.all(np.sum(dummies, axis=0) == 2):
                    raise ValueError("Pairing is only possible with exactly two values per group")
                # Use matrix multiplication to only retrieve those dummy entries that are associated with the current `value`.
                # Convert to COO matrix to get rows/cols
                # row indices refers to the indices of rows that have `column == value` (equivalent to np.where(mask)[0])
                # col indices refers to the numeric index of each "pair" in obs_names
                ind_mat = diags(is_group.values, dtype=bool) @ dummies
                if not np.all(np.sum(ind_mat, axis=0) == 1):
                    raise ValueError("Pairing is only possible with exactly two values per group")
                ind_mat = ind_mat.tocoo()
                # Order by pair so that paired observations end up at the same position in both groups.
                return ind_mat.row[np.argsort(ind_mat.col)]
            else:
                return np.where(is_group)[0]

        res_dfs = []
        baseline_idx = _get_idx(column, baseline)
        for group_to_compare in groups_to_compare:
            comparison_idx = _get_idx(column, group_to_compare)
            res_dfs.append(
                model._compare_single_group(baseline_idx, comparison_idx, paired=paired, **test_kwargs).assign(
                    comparison=f"{group_to_compare}_vs_{baseline if baseline is not None else 'rest'}"
                )
            )
        return fdr_correction(pd.concat(res_dfs))
|
138
|
+
|
139
|
+
|
140
|
+
class WilcoxonTest(SimpleComparisonBase):
    """Perform a unpaired or paired Wilcoxon test.

    (the former is also known as "Mann-Whitney U test", the latter as "wilcoxon signed rank test")
    """

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of `scipy.stats.wilcoxon` (paired) or `scipy.stats.mannwhitneyu` (unpaired)."""
        test_func = scipy.stats.wilcoxon if paired else scipy.stats.mannwhitneyu
        return test_func(x0, x1, **kwargs).pvalue
|
152
|
+
|
153
|
+
|
154
|
+
class TTest(SimpleComparisonBase):
    """Perform a unpaired or paired T-test"""

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of `scipy.stats.ttest_rel` (paired) or `scipy.stats.ttest_ind` (unpaired)."""
        test_func = scipy.stats.ttest_rel if paired else scipy.stats.ttest_ind
        return test_func(x0, x1, **kwargs).pvalue
|