pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +4 -2
- pertpy/data/__init__.py +66 -1
- pertpy/data/_dataloader.py +28 -26
- pertpy/data/_datasets.py +261 -92
- pertpy/metadata/__init__.py +6 -0
- pertpy/metadata/_cell_line.py +795 -0
- pertpy/metadata/_compound.py +128 -0
- pertpy/metadata/_drug.py +238 -0
- pertpy/metadata/_look_up.py +569 -0
- pertpy/metadata/_metadata.py +70 -0
- pertpy/metadata/_moa.py +125 -0
- pertpy/plot/__init__.py +0 -13
- pertpy/preprocessing/__init__.py +2 -0
- pertpy/preprocessing/_guide_rna.py +89 -6
- pertpy/tools/__init__.py +48 -15
- pertpy/tools/_augur.py +329 -32
- pertpy/tools/_cinemaot.py +145 -6
- pertpy/tools/_coda/_base_coda.py +1237 -116
- pertpy/tools/_coda/_sccoda.py +66 -36
- pertpy/tools/_coda/_tasccoda.py +46 -39
- pertpy/tools/_dialogue.py +180 -77
- pertpy/tools/_differential_gene_expression/__init__.py +20 -0
- pertpy/tools/_differential_gene_expression/_base.py +657 -0
- pertpy/tools/_differential_gene_expression/_checks.py +41 -0
- pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
- pertpy/tools/_differential_gene_expression/_edger.py +125 -0
- pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
- pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
- pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
- pertpy/tools/_distances/_distance_tests.py +29 -24
- pertpy/tools/_distances/_distances.py +584 -98
- pertpy/tools/_enrichment.py +460 -0
- pertpy/tools/_kernel_pca.py +1 -1
- pertpy/tools/_milo.py +406 -49
- pertpy/tools/_mixscape.py +677 -55
- pertpy/tools/_perturbation_space/_clustering.py +10 -3
- pertpy/tools/_perturbation_space/_comparison.py +112 -0
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
- pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
- pertpy/tools/_perturbation_space/_simple.py +52 -11
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_base_components.py +2 -3
- pertpy/tools/_scgen/_scgen.py +706 -0
- pertpy/tools/_scgen/_utils.py +3 -5
- pertpy/tools/decoupler_LICENSE +674 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
- pertpy-0.8.0.dist-info/RECORD +57 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
- pertpy/plot/_augur.py +0 -234
- pertpy/plot/_cinemaot.py +0 -81
- pertpy/plot/_coda.py +0 -1001
- pertpy/plot/_dialogue.py +0 -91
- pertpy/plot/_guide_rna.py +0 -82
- pertpy/plot/_milopy.py +0 -284
- pertpy/plot/_mixscape.py +0 -594
- pertpy/plot/_scgen.py +0 -337
- pertpy/tools/_differential_gene_expression.py +0 -99
- pertpy/tools/_metadata/__init__.py +0 -0
- pertpy/tools/_metadata/_cell_line.py +0 -613
- pertpy/tools/_metadata/_look_up.py +0 -342
- pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
- pertpy/tools/_scgen/_jax_scgen.py +0 -370
- pertpy-0.6.0.dist-info/RECORD +0 -50
- /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
- {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from anndata import AnnData
|
4
|
+
|
5
|
+
|
6
|
+
class DGEEVAL:
|
7
|
+
def compare(
|
8
|
+
self,
|
9
|
+
adata: AnnData | None = None,
|
10
|
+
de_key1: str = None,
|
11
|
+
de_key2: str = None,
|
12
|
+
de_df1: pd.DataFrame | None = None,
|
13
|
+
de_df2: pd.DataFrame | None = None,
|
14
|
+
shared_top: int = 100,
|
15
|
+
) -> dict[str, float]:
|
16
|
+
"""Compare two differential expression analyses.
|
17
|
+
|
18
|
+
Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and
|
19
|
+
the correlation of their scores and adjusted p-values.
|
20
|
+
|
21
|
+
Args:
|
22
|
+
adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used.
|
23
|
+
de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
|
24
|
+
de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
|
25
|
+
de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
|
26
|
+
de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
|
27
|
+
shared_top: The number of top DEG to compute the proportion of their intersection.
|
28
|
+
|
29
|
+
"""
|
30
|
+
if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None):
|
31
|
+
raise ValueError(
|
32
|
+
"Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
|
33
|
+
)
|
34
|
+
|
35
|
+
if de_df1 is None and de_df2 is None: # use keys
|
36
|
+
if not de_key1 or not de_key2:
|
37
|
+
raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")
|
38
|
+
|
39
|
+
else: # use dfs
|
40
|
+
if de_df1 is None or de_df2 is None:
|
41
|
+
raise ValueError("Both `de_df1` and `de_df2` must be provided together if using DataFrames.")
|
42
|
+
|
43
|
+
if de_key1:
|
44
|
+
if not adata:
|
45
|
+
raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ")
|
46
|
+
assert all(
|
47
|
+
k in adata.uns for k in [de_key1, de_key2]
|
48
|
+
), "Provided `de_key1` and `de_key2` must exist in `adata.uns`."
|
49
|
+
vars = adata.var_names
|
50
|
+
|
51
|
+
if de_df1 is not None:
|
52
|
+
for df in (de_df1, de_df2):
|
53
|
+
if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns):
|
54
|
+
raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.")
|
55
|
+
|
56
|
+
assert set(de_df1["variable"]) == set(de_df2["variable"]), "Variables in both dataframes must match."
|
57
|
+
vars = de_df1["variable"].sort_values()
|
58
|
+
|
59
|
+
shared_top = min(shared_top, len(vars))
|
60
|
+
vars_ranks = np.arange(1, len(vars) + 1)
|
61
|
+
results = pd.DataFrame(index=vars)
|
62
|
+
top_names = []
|
63
|
+
|
64
|
+
if de_key1 and de_key2:
|
65
|
+
for i, k in enumerate([de_key1, de_key2]):
|
66
|
+
label = adata.uns[k]["names"].dtype.names[0]
|
67
|
+
srt_idx = np.argsort(adata.uns[k]["names"][label])
|
68
|
+
results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx]
|
69
|
+
results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx]
|
70
|
+
results[f"ranks_{i}"] = vars_ranks[srt_idx]
|
71
|
+
top_names.append(adata.uns[k]["names"][label][:shared_top])
|
72
|
+
else:
|
73
|
+
for i, df in enumerate([de_df1, de_df2]):
|
74
|
+
srt_idx = np.argsort(df["variable"])
|
75
|
+
results[f"scores_{i}"] = df["log_fc"].values[srt_idx]
|
76
|
+
results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx]
|
77
|
+
results[f"ranks_{i}"] = vars_ranks[srt_idx]
|
78
|
+
top_names.append(df["variable"][:shared_top])
|
79
|
+
|
80
|
+
metrics = {}
|
81
|
+
metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top
|
82
|
+
metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson")
|
83
|
+
metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson")
|
84
|
+
metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman")
|
85
|
+
|
86
|
+
return metrics
|
@@ -0,0 +1,125 @@
|
|
1
|
+
from collections.abc import Sequence
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
import pandas as pd
|
5
|
+
from scanpy import logging
|
6
|
+
from scipy.sparse import issparse
|
7
|
+
|
8
|
+
from ._base import LinearModelBase
|
9
|
+
from ._checks import check_is_integer_matrix
|
10
|
+
|
11
|
+
|
12
|
+
class EdgeR(LinearModelBase):
    """Differential expression test using EdgeR"""

    def _check_counts(self):
        """Validate the expression data with `check_is_integer_matrix`."""
        check_is_integer_matrix(self.data)

    def fit(self, **kwargs):  # adata, design, mask, layer
        """Fit model using edgeR.

        Builds an edgeR `DGEList` from the expression matrix (transposed to variables x observations),
        then runs `calcNormFactors`, `estimateDisp` and `glmQLFit` with `self.design`.

        Note: the resulting R object is stored both in the R global environment (as `fit`) and on
        `self.fit`, which replaces the bound `fit` method on this instance.

        Args:
            **kwargs: Keyword arguments specific to glmQLFit()

        Raises:
            ImportError: If rpy2 or the R package edgeR cannot be imported.
        """
        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

            # Global activation; the conversions below and in `_test_single_contrast`
            # presumably rely on it -- confirm before replacing it with local converters.
            pandas2ri.activate()
            rpy2.robjects.numpy2ri.activate()

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            edger = importr("edgeR")
        except ImportError as e:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages:\n"
                "edgeR, BiocParallel, RhpcBLASctl"
            ) from e

        # Convert dataframe
        with localconverter(ro.default_converter + numpy2ri.converter):
            expr = self.adata.X if self.layer is None else self.adata.layers[self.layer]
            if issparse(expr):
                expr = expr.T.toarray()
            else:
                expr = expr.T

        expr_r = ro.conversion.py2rpy(pd.DataFrame(expr, index=self.adata.var_names, columns=self.adata.obs_names))

        dge = edger.DGEList(counts=expr_r, samples=self.adata.obs)

        logging.info("Calculating NormFactors")
        dge = edger.calcNormFactors(dge)

        logging.info("Estimating Dispersions")
        dge = edger.estimateDisp(dge, design=self.design)

        logging.info("Fitting linear model")
        fit = edger.glmQLFit(dge, design=self.design, **kwargs)

        ro.globalenv["fit"] = fit
        self.fit = fit

    def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> pd.DataFrame:
        """Conduct test for each contrast and return a data frame

        Args:
            contrast: numpy array of integers indicating contrast i.e. [-1, 0, 1, 0, 0]
            **kwargs: Currently ignored; they are not forwarded to R (see ToDo below).

        Returns:
            The `topTags` table with the index moved to a `variable` column and `PValue`, `logFC`
            and `FDR` renamed to `p_value`, `log_fc` and `adj_p_value`.

        Raises:
            ImportError: If rpy2 or the R package edgeR cannot be imported.
            RuntimeError: If `fit()` has not been called on this instance.
        """
        # ToDo:
        #  parse **kwargs to R function
        #  Fix mask for .fit()

        import inspect

        try:
            # Several of these imports are unused below; they only verify that rpy2 is complete.
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            importr("edgeR")
        except ImportError:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages: edgeR, BiocParallel, RhpcBLASctl"
            ) from None

        # `fit()` replaces the bound method with the fitted R object; still seeing a method means no fit exists.
        if inspect.ismethod(self.fit):
            raise RuntimeError("The model has not been fitted yet. Call `fit()` before testing contrasts.")

        # The R code below reads the global `fit`. Re-bind it to this instance's result so that another
        # EdgeR model fitted in the meantime cannot be tested by accident.
        ro.globalenv["fit"] = self.fit

        # Convert vector to R, which drops a category like `self.design_matrix` to use the intercept for the left out.
        contrast_vec_r = ro.conversion.py2rpy(np.asarray(contrast))
        ro.globalenv["contrast_vec"] = contrast_vec_r

        # Test contrast with R
        ro.r(
            """
            test = edgeR::glmQLFTest(fit, contrast=contrast_vec)
            de_res = edgeR::topTags(test, n=Inf, adjust.method="BH", sort.by="PValue")$table
            """
        )

        # Convert results to pandas
        de_res = ro.conversion.rpy2py(ro.globalenv["de_res"])
        de_res.index.name = "variable"
        de_res = de_res.reset_index()

        return de_res.rename(columns={"PValue": "p_value", "logFC": "log_fc", "FDR": "adj_p_value"})
|
@@ -0,0 +1,189 @@
|
|
1
|
+
"""Helpers to interact with Formulaic Formulas
|
2
|
+
|
3
|
+
Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
|
4
|
+
* A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
|
5
|
+
* A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
|
6
|
+
* A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
|
7
|
+
"""
|
8
|
+
|
9
|
+
from collections import defaultdict
|
10
|
+
from collections.abc import Mapping, Sequence
|
11
|
+
from dataclasses import dataclass
|
12
|
+
from typing import Any
|
13
|
+
|
14
|
+
from formulaic import FactorValues, ModelSpec
|
15
|
+
from formulaic.materializers import PandasMaterializer
|
16
|
+
from formulaic.materializers.types import EvaluatedFactor
|
17
|
+
from formulaic.parser.types import Factor
|
18
|
+
from interface_meta import override
|
19
|
+
|
20
|
+
|
21
|
+
@dataclass
class FactorMetadata:
    """Container for the metadata recorded about a single factor of a formula."""

    name: str
    """Unambiguous name of the factor as written in the formula, e.g. `donor` or `C(donor, contr.treatment(base="A"))`."""

    reduced_rank: bool
    """True if one column is dropped because it is redundant."""

    custom_encoder: bool
    """True if a custom encoder (e.g. `C(...)`) was used for this factor."""

    categories: Sequence[str]
    """Unique categories of this factor (after applying `drop_rows`)."""

    kind: Factor.Kind
    """The kind (type) of the factor."""

    drop_field: str = None
    """The dropped category.

    Note that
     * it may be populated even if `reduced_rank = False`
     * it is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`).
    """

    column_names: Sequence[str] = None
    """Names of the design-matrix columns that belong to this factor.

    Equal to `categories` with the default encoder, or to the categories without the
    base level if a custom encoder (e.g. `C(...)`) is used.
    """

    colname_format: str = None
    """Format string from which a design-matrix column name can be built, e.g. `{name}[T.{field}]`."""

    @property
    def base(self) -> str | None:
        """
        The base category of this categorical factor, or None if the factor is not reduced rank.

        With default encoding this is `drop_field`; with custom encoding (e.g. `C(...)`) it is the
        single category that has no column in the design matrix.
        """
        if not self.reduced_rank:
            return None

        if not self.custom_encoder:
            assert self.drop_field is not None
            return self.drop_field

        # Custom encoder: the base level is the one category without a column.
        missing_levels = set(self.categories) - set(self.column_names)
        assert len(missing_levels) == 1
        return missing_levels.pop()
|
76
|
+
|
77
|
+
|
78
|
+
def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
    """Keeps track of categorical factors used in a model specification by generating a custom materializer.

    This materializer reports back metadata upon materialization of the model matrix.
    Every call creates fresh storage dictionaries and a new materializer class that closes over them.

    Returns:
        - A dictionary storing metadata for each factor processed by the custom materializer, named `factor_storage`.
        - A dictionary mapping variables to factor names, which works similarly to model_spec.variable_terms
          but maps to factors rather than terms, named `variable_to_factors`.
        - A materializer class tied to the specific instance of `factor_storage`.
    """
    # There can be multiple FactorMetadata entries per factor name, for instance when formulaic generates an
    # interaction term, it generates the factor with both full rank and reduced rank.
    factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
    variable_to_factors: dict[str, set[str]] = defaultdict(set)

    class CustomPandasMaterializer(PandasMaterializer):
        """An extension of the PandasMaterializer that records all categorical variables and their (base) categories."""

        REGISTER_NAME = "custom_pandas"
        REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
        REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")

        def __init__(
            self,
            data: Any,
            context: Mapping[str, Any] | None = None,
            record_factor_metadata: bool = False,
            **params: Any,
        ):
            """Initialize the Materializer.

            Args:
                data: Passed to PandasMaterializer.
                context: Passed to PandasMaterializer
                record_factor_metadata: Flag that tells whether this particular instance of the custom materializer class
                    is supposed to record factor metadata. Only the instance that is used for building the design
                    matrix should record the metadata. All other instances (e.g. used to generate contrast vectors)
                    should not record metadata to not overwrite the specifications from the design matrix.
                **params: Passed to PandasMaterializer
            """
            # Non-recording instances hold None instead of the shared dictionaries.
            self.factor_metadata_storage = factor_storage if record_factor_metadata else None
            self.variable_to_factors = variable_to_factors if record_factor_metadata else None
            # temporary pointer to metadata of factor that is currently evaluated
            self._current_factor: FactorMetadata | None = None
            super().__init__(data, context, **params)

        @override
        def _encode_evaled_factor(
            self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
        ) -> dict[str, Any]:
            """Function is called just before the factor is evaluated.

            We can record some metadata, before we call the original function.
            The metadata is kept in `self._current_factor` until `_flatten_encoded_evaled_factor` completes it.
            """
            assert (
                self._current_factor is None
            ), "_current_factor should always be None when we start recording metadata"
            if self.factor_metadata_storage is not None:
                # Don't store if the factor is cached (then we should already have recorded it)
                if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache:
                    assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached"
                else:
                    for var in factor.variables:
                        self.variable_to_factors[var].add(factor.expr)
                    self._current_factor = FactorMetadata(
                        name=factor.expr,
                        reduced_rank=reduced_rank,
                        # unique values that remain after removing the rows at the `drop_rows` positions
                        categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())),
                        custom_encoder=factor.metadata.encoder is not None,
                        kind=factor.metadata.kind,
                    )
            return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)

        @override
        def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
            """
            Function is called at the end, before the design matrix gets materialized.

            Here we have access to additional metadata, such as `drop_field`.
            The pending `_current_factor` (if any) is completed, appended to the storage and reset.
            """
            if self._current_factor is not None:
                assert self._current_factor.name == name
                self._current_factor.drop_field = values.__formulaic_metadata__.drop_field
                self._current_factor.column_names = values.__formulaic_metadata__.column_names
                self._current_factor.colname_format = values.__formulaic_metadata__.format
                self.factor_metadata_storage[name].append(self._current_factor)
                # reset so that the assertion in `_encode_evaled_factor` holds for the next factor
                self._current_factor = None

            return super()._flatten_encoded_evaled_factor(name, values)

    return factor_storage, variable_to_factors, CustomPandasMaterializer
|
170
|
+
|
171
|
+
|
172
|
+
class AmbiguousAttributeError(ValueError):
    """Raised when an attribute does not have the same value on every object of a collection."""


def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
    """Return the value of `attr` shared by all objects in `objs`.

    Raises a ValueError if `objs` is empty and an AmbiguousAttributeError if any object
    has a value for `attr` that differs from the one of the first object.
    """
    if not objs:
        raise ValueError("Collection is empty")

    # The first object provides the reference value; the remaining ones must agree with it.
    expected = getattr(objs[0], attr)
    if any(getattr(other, attr) != expected for other in objs[1:]):
        raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")

    return expected
|
@@ -0,0 +1,95 @@
|
|
1
|
+
import os
|
2
|
+
import re
|
3
|
+
import warnings
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
from anndata import AnnData
|
7
|
+
from numpy import ndarray
|
8
|
+
from pydeseq2.dds import DeseqDataSet
|
9
|
+
from pydeseq2.default_inference import DefaultInference
|
10
|
+
from pydeseq2.ds import DeseqStats
|
11
|
+
from scipy.sparse import issparse
|
12
|
+
|
13
|
+
from ._base import LinearModelBase
|
14
|
+
from ._checks import check_is_integer_matrix
|
15
|
+
|
16
|
+
|
17
|
+
class PyDESeq2(LinearModelBase):
    """Differential expression test using a PyDESeq2"""

    def __init__(
        self, adata: AnnData, design: str | ndarray, *, mask: str | None = None, layer: str | None = None, **kwargs
    ):
        """Initialize the model and densify the expression matrix if it is sparse.

        All arguments are forwarded to the base class initializer.
        """
        super().__init__(adata, design, mask=mask, layer=layer, **kwargs)
        # work around pydeseq2 issue with sparse matrices
        # see also https://github.com/owkin/PyDESeq2/issues/25
        if issparse(self.data):
            if self.layer is None:
                self.adata.X = self.adata.X.toarray()
            else:
                self.adata.layers[self.layer] = self.adata.layers[self.layer].toarray()

    def _check_counts(self):
        """Validate the expression data with `check_is_integer_matrix`."""
        check_is_integer_matrix(self.data)

    def fit(self, **kwargs) -> None:
        """Fit dds model using pydeseq2.

        The fitted `DeseqDataSet` is stored in `self.dds`; nothing is returned.

        Args:
            **kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all available
                CPUs minus one (but at least one) if the argument is not passed.
        """
        if "n_cpus" in kwargs:
            n_cpus = kwargs.pop("n_cpus")
        else:
            # `os.cpu_count()` may return None, and "minus one" must not drop to zero on single-core hosts.
            n_cpus = max(1, (os.cpu_count() or 1) - 1)
        inference = DefaultInference(n_cpus=n_cpus)

        covars = self.design.columns.tolist()
        if "Intercept" not in covars:
            warnings.warn(
                "Warning: Pydeseq is hard-coded to use Intercept, please include intercept into the model", stacklevel=2
            )
        # Strip the `[T.<level>]` suffix of the design matrix columns to obtain the unique covariate names.
        processed_covars = list({re.sub(r"\[T\.(.*)\]", "", col) for col in covars if col != "Intercept"})
        dds = DeseqDataSet(
            adata=self.adata, design_factors=processed_covars, refit_cooks=True, inference=inference, **kwargs
        )
        # workaround code to insert design array
        des_mtx_cols = dds.obsm["design_matrix"].columns
        dds.obsm["design_matrix"] = self.design
        # Keep the column names generated by pydeseq2 when the number of columns matches.
        if dds.obsm["design_matrix"].shape[1] == len(des_mtx_cols):
            dds.obsm["design_matrix"].columns = des_mtx_cols.copy()

        dds.deseq2()
        self.dds = dds

    # TODO: PyDeseq2 doesn't support arbitrary designs and contrasts yet
    # see https://github.com/owkin/PyDESeq2/issues/213

    # Therefore these functions are overridden in a way to make it work with PyDESeq2,
    # ignoring the inconsistency of function signatures. Once arbitrary design
    # matrices and contrasts are supported by PyDEseq2, we can fully support the
    # Linear model interface.
    def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame:  # type: ignore
        """Conduct a specific test and returns a Pandas DataFrame.

        Args:
            contrast: list of three strings of the form `["variable", "tested level", "reference level"]`.
            alpha: p value threshold used for controlling fdr with independent hypothesis weighting
            **kwargs: extra arguments to pass to DeseqStats()

        Returns:
            The results sorted by `p_value`, with the index moved to a `variable` column and `pvalue`,
            `padj` and `log2FoldChange` renamed to `p_value`, `adj_p_value` and `log_fc`.
        """
        stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs)
        # Calling `.summary()` is required to fill the `results_df` data frame
        stat_res.summary()
        res_df = (
            pd.DataFrame(stat_res.results_df)
            .rename(columns={"pvalue": "p_value", "padj": "adj_p_value", "log2FoldChange": "log_fc"})
            .sort_values("p_value")
        )
        res_df.index.name = "variable"
        res_df = res_df.reset_index()
        return res_df

    def cond(self, **kwargs) -> ndarray:
        """Not supported for PyDESeq2; always raises NotImplementedError."""
        raise NotImplementedError(
            "PyDESeq2 currently doesn't support arbitrary contrasts, see https://github.com/owkin/PyDESeq2/issues/213"
        )

    def contrast(self, column: str, baseline: str, group_to_compare: str) -> tuple[str, str, str]:  # type: ignore
        """Return the contrast as the tuple `(column, group_to_compare, baseline)`."""
        return (column, group_to_compare, baseline)
|
@@ -0,0 +1,162 @@
|
|
1
|
+
"""Simple tests such as t-test, wilcoxon"""
|
2
|
+
|
3
|
+
import warnings
|
4
|
+
from abc import abstractmethod
|
5
|
+
from collections.abc import Mapping, Sequence
|
6
|
+
from types import MappingProxyType
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
import scipy.stats
|
11
|
+
import statsmodels
|
12
|
+
from anndata import AnnData
|
13
|
+
from pandas.core.api import DataFrame as DataFrame
|
14
|
+
from scipy.sparse import diags, issparse
|
15
|
+
from tqdm.auto import tqdm
|
16
|
+
|
17
|
+
from ._base import MethodBase
|
18
|
+
|
19
|
+
|
20
|
+
def fdr_correction(
    df: pd.DataFrame, pvalue_col: str = "p_value", *, key_added: str = "adj_p_value", inplace: bool = False
) -> pd.DataFrame | None:
    """Adjust p-values in a DataFrame with test results using FDR correction.

    Args:
        df: DataFrame with test results.
        pvalue_col: Name of the column holding the unadjusted p-values.
        key_added: Name of the column the adjusted p-values are written to.
        inplace: If True, modify `df` and return None. Otherwise work on a copy and return it.

    Returns:
        The corrected copy of `df`, or None if `inplace` is True.
    """
    # A bare `import statsmodels` does not load the `stats.multitest` submodule, so the attribute chain
    # `statsmodels.stats.multitest` only works if something else imported it before. Import it explicitly.
    from statsmodels.stats.multitest import fdrcorrection

    if not inplace:
        df = df.copy()

    # `fdrcorrection` returns (rejected, corrected p-values); only the corrected p-values are kept.
    df[key_added] = fdrcorrection(df[pvalue_col].values)[1]

    return None if inplace else df
|
31
|
+
|
32
|
+
|
33
|
+
class SimpleComparisonBase(MethodBase):
    """Base class for tests that compare two groups of observations variable by variable."""

    @staticmethod
    @abstractmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Perform a statistical test between values in x0 and x1.

        If `paired` is True, x0 and x1 must be of the same length and ordered such that
        paired elements have the same position.

        Args:
            x0: Array with baseline values.
            x1: Array with values to compare.
            paired: Indicates whether to perform a paired test
            **kwargs: kwargs passed to the test function
        """
        ...

    def _compare_single_group(
        self, baseline_idx: np.ndarray, comparison_idx: np.ndarray, *, paired: bool, **kwargs
    ) -> DataFrame:
        """Perform a single comparison between two groups.

        Args:
            baseline_idx: Numeric indices indicating which observations are in the baseline group.
            comparison_idx: Numeric indices indicating which observations are in the comparison/treatment group
            paired: Whether to perform a paired test. Note that in the case of a paired test,
                the indices must be ordered such that paired observations appear at the same position.
            **kwargs: kwargs passed to the test function

        Returns:
            A DataFrame with the columns `variable`, `p_value` and `log_fc` (log2 of the comparison mean
            minus log2 of the baseline mean), sorted by `p_value`.
        """
        if paired:
            assert len(baseline_idx) == len(comparison_idx), "For a paired test, indices must be of the same length"

        x0 = self.data[baseline_idx, :]
        x1 = self.data[comparison_idx, :]

        # In the following loop, we are doing a lot of column slicing -- which is significantly
        # more efficient in csc format.
        if issparse(self.data):
            x0 = x0.tocsc()
            x1 = x1.tocsc()

        def _column(mat, j: int) -> np.ndarray:
            # Select by position; `[j]` keeps the result 2-dimensional for dense and sparse input alike.
            col = mat[:, [j]]
            return np.asarray(col.todense()).flatten() if issparse(col) else col.flatten()

        res = []
        # Columns are addressed by position instead of building a boolean mask over all variable names
        # for every variable, which was quadratic in the number of variables.
        for j, var in enumerate(tqdm(self.adata.var_names)):
            tmp_x0 = _column(x0, j)
            tmp_x1 = _column(x1, j)
            pval = self._test(tmp_x0, tmp_x1, paired, **kwargs)
            mean_x0 = np.mean(tmp_x0)
            mean_x1 = np.mean(tmp_x1)
            res.append({"variable": var, "p_value": pval, "log_fc": np.log2(mean_x1) - np.log2(mean_x0)})
        return pd.DataFrame(res).sort_values("p_value")

    @classmethod
    def compare_groups(
        cls,
        adata: AnnData,
        column: str,
        baseline: str,
        groups_to_compare: str | Sequence[str] | None,
        *,
        paired_by: str | None = None,
        mask: str | None = None,
        layer: str | None = None,
        fit_kwargs: Mapping = MappingProxyType({}),
        test_kwargs: Mapping = MappingProxyType({}),
    ) -> DataFrame:
        """Compare one or more groups against a baseline group.

        Args:
            adata: Passed to the class constructor together with `mask` and `layer`.
            column: Column in `obs` that defines the groups.
            baseline: Value of `column` that defines the baseline group.
            groups_to_compare: Value(s) of `column` to compare against the baseline. If None, every
                other value found in `column` is compared.
            paired_by: Column in `obs` that defines the pairs. If given, a paired test is performed.
            mask: Passed to the class constructor.
            layer: Passed to the class constructor.
            fit_kwargs: Not used for simple tests; a warning is emitted if it is not empty.
            test_kwargs: Passed to the test function.

        Returns:
            The concatenated results of all comparisons with a `comparison` column and
            FDR-adjusted p-values as added by `fdr_correction`.

        Raises:
            ValueError: If pairing is requested but the pairs are not made of exactly one observation
                per compared group.
        """
        if len(fit_kwargs):
            warnings.warn("fit_kwargs not used for simple tests.", UserWarning, stacklevel=2)
        paired = paired_by is not None
        model = cls(adata, mask=mask, layer=layer)
        if groups_to_compare is None:
            # compare against all other
            groups_to_compare = sorted(set(model.adata.obs[column]) - {baseline})
        if isinstance(groups_to_compare, str):
            groups_to_compare = [groups_to_compare]

        # The observation x pair indicator matrix does not depend on the group, so build and check it once.
        pair_dummies = None
        if paired:
            pair_dummies = pd.get_dummies(model.adata.obs[paired_by], sparse=True).sparse.to_coo().tocsr()
            if not np.all(np.sum(pair_dummies, axis=0) == 2):
                raise ValueError("Pairing is only possible with exactly two values per group")

        def _get_idx(column, value):
            in_group = model.adata.obs[column] == value
            if not paired:
                return np.where(in_group)[0]
            # Use matrix multiplication to only retrieve those dummy entries that are associated with the current `value`.
            # Convert to COO matrix to get rows/cols
            # row indices refers to the indices of rows that have `column == value` (equivalent to np.where(mask)[0])
            # col indices refers to the numeric index of each "pair" in obs_names
            ind_mat = diags(in_group.values, dtype=bool) @ pair_dummies
            if not np.all(np.sum(ind_mat, axis=0) == 1):
                raise ValueError("Pairing is only possible with exactly two values per group")
            ind_mat = ind_mat.tocoo()
            # Ordering by pair index puts paired observations at the same position in both groups.
            return ind_mat.row[np.argsort(ind_mat.col)]

        res_dfs = []
        baseline_idx = _get_idx(column, baseline)
        for group_to_compare in groups_to_compare:
            comparison_idx = _get_idx(column, group_to_compare)
            res_dfs.append(
                model._compare_single_group(baseline_idx, comparison_idx, paired=paired, **test_kwargs).assign(
                    comparison=f"{group_to_compare}_vs_{baseline if baseline is not None else 'rest'}"
                )
            )
        return fdr_correction(pd.concat(res_dfs))
|
138
|
+
|
139
|
+
|
140
|
+
class WilcoxonTest(SimpleComparisonBase):
    """Wilcoxon test in its unpaired or paired form.

    (unpaired: "Mann-Whitney U test"; paired: "wilcoxon signed rank test")
    """

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of the signed rank test if `paired`, else of the Mann-Whitney U test."""
        stat_fn = scipy.stats.wilcoxon if paired else scipy.stats.mannwhitneyu
        return stat_fn(x0, x1, **kwargs).pvalue
|
152
|
+
|
153
|
+
|
154
|
+
class TTest(SimpleComparisonBase):
    """T-test in its unpaired or paired form"""

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of `ttest_rel` if `paired`, else of `ttest_ind`."""
        stat_fn = scipy.stats.ttest_rel if paired else scipy.stats.ttest_ind
        return stat_fn(x0, x1, **kwargs).pvalue
|