pertpy 0.7.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. pertpy/__init__.py +2 -1
  2. pertpy/data/__init__.py +61 -0
  3. pertpy/data/_dataloader.py +27 -23
  4. pertpy/data/_datasets.py +58 -0
  5. pertpy/metadata/__init__.py +2 -0
  6. pertpy/metadata/_cell_line.py +39 -70
  7. pertpy/metadata/_compound.py +3 -4
  8. pertpy/metadata/_drug.py +2 -6
  9. pertpy/metadata/_look_up.py +38 -51
  10. pertpy/metadata/_metadata.py +7 -10
  11. pertpy/metadata/_moa.py +2 -6
  12. pertpy/plot/__init__.py +0 -5
  13. pertpy/preprocessing/__init__.py +2 -0
  14. pertpy/preprocessing/_guide_rna.py +6 -7
  15. pertpy/tools/__init__.py +67 -6
  16. pertpy/tools/_augur.py +14 -15
  17. pertpy/tools/_cinemaot.py +2 -2
  18. pertpy/tools/_coda/_base_coda.py +118 -142
  19. pertpy/tools/_coda/_sccoda.py +16 -15
  20. pertpy/tools/_coda/_tasccoda.py +21 -22
  21. pertpy/tools/_dialogue.py +18 -23
  22. pertpy/tools/_differential_gene_expression/__init__.py +20 -0
  23. pertpy/tools/_differential_gene_expression/_base.py +657 -0
  24. pertpy/tools/_differential_gene_expression/_checks.py +41 -0
  25. pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
  26. pertpy/tools/_differential_gene_expression/_edger.py +125 -0
  27. pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
  28. pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
  29. pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
  30. pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
  31. pertpy/tools/_distances/_distance_tests.py +21 -16
  32. pertpy/tools/_distances/_distances.py +406 -70
  33. pertpy/tools/_enrichment.py +10 -15
  34. pertpy/tools/_kernel_pca.py +1 -1
  35. pertpy/tools/_milo.py +77 -54
  36. pertpy/tools/_mixscape.py +15 -11
  37. pertpy/tools/_perturbation_space/_clustering.py +5 -2
  38. pertpy/tools/_perturbation_space/_comparison.py +112 -0
  39. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +21 -23
  40. pertpy/tools/_perturbation_space/_perturbation_space.py +23 -21
  41. pertpy/tools/_perturbation_space/_simple.py +3 -3
  42. pertpy/tools/_scgen/__init__.py +1 -1
  43. pertpy/tools/_scgen/_base_components.py +2 -3
  44. pertpy/tools/_scgen/_scgen.py +33 -28
  45. pertpy/tools/_scgen/_utils.py +2 -2
  46. {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/METADATA +32 -14
  47. pertpy-0.9.1.dist-info/RECORD +57 -0
  48. {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/WHEEL +1 -1
  49. pertpy/plot/_augur.py +0 -171
  50. pertpy/plot/_coda.py +0 -601
  51. pertpy/plot/_guide_rna.py +0 -64
  52. pertpy/plot/_milopy.py +0 -209
  53. pertpy/plot/_mixscape.py +0 -355
  54. pertpy/tools/_differential_gene_expression.py +0 -325
  55. pertpy-0.7.0.dist-info/RECORD +0 -53
  56. {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,86 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from anndata import AnnData
4
+
5
+
6
class DGEEVAL:
    """Evaluate how similar two differential gene expression (DGE) results are."""

    def compare(
        self,
        adata: AnnData | None = None,
        de_key1: str | None = None,
        de_key2: str | None = None,
        de_df1: pd.DataFrame | None = None,
        de_df2: pd.DataFrame | None = None,
        shared_top: int = 100,
    ) -> dict[str, float]:
        """Compare two differential expression analyses.

        Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and
        the correlation of their scores and adjusted p-values.

        Args:
            adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used.
            de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
            de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
            de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
            de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
            shared_top: The number of top DEG to compute the proportion of their intersection.

        Returns:
            A dictionary with the keys `shared_top_genes` (fraction of shared genes among the first
            `shared_top` entries of each result), `scores_corr` and `pvals_adj_corr` (Pearson correlation
            of scores / adjusted p-values) and `scores_ranks_corr` (Spearman correlation of the ranks).

        Raises:
            ValueError: If keys and DataFrames are mixed, if only one element of a pair is given, if `adata`
                is missing or lacks the keys, or if the DataFrames lack required columns or do not cover
                the same variables.
        """
        use_dfs = de_df1 is not None or de_df2 is not None

        if (de_key1 or de_key2) and use_dfs:
            raise ValueError(
                "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
            )

        if not use_dfs:
            if not de_key1 or not de_key2:
                raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")
            # Compare against None explicitly: the truthiness of an AnnData object depends on its length.
            if adata is None:
                raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ")
            if not all(k in adata.uns for k in (de_key1, de_key2)):
                raise ValueError("Provided `de_key1` and `de_key2` must exist in `adata.uns`.")
            var_names = adata.var_names
        else:
            if de_df1 is None or de_df2 is None:
                raise ValueError("Both `de_df1` and `de_df2` must be provided together if using DataFrames.")
            for df in (de_df1, de_df2):
                if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns):
                    raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.")
            if set(de_df1["variable"]) != set(de_df2["variable"]):
                raise ValueError("Variables in both dataframes must match.")
            var_names = de_df1["variable"].sort_values()

        shared_top = min(shared_top, len(var_names))
        # Rank i+1 belongs to the i-th entry of a result in its stored order.
        var_ranks = np.arange(1, len(var_names) + 1)
        results = pd.DataFrame(index=var_names)
        top_names = []

        if not use_dfs:
            for i, key in enumerate((de_key1, de_key2)):
                de_res = adata.uns[key]
                # Only the first group stored under the key is compared.
                label = de_res["names"].dtype.names[0]
                names = de_res["names"][label]
                # Reorder every column alphabetically by variable name so both results are aligned.
                order = np.argsort(names)
                results[f"scores_{i}"] = de_res["scores"][label][order]
                results[f"pvals_adj_{i}"] = de_res["pvals_adj"][label][order]
                results[f"ranks_{i}"] = var_ranks[order]
                top_names.append(names[:shared_top])
        else:
            for i, df in enumerate((de_df1, de_df2)):
                # Plain positional indices; avoids indexing numpy arrays with a pandas Series.
                order = np.argsort(df["variable"].to_numpy())
                results[f"scores_{i}"] = df["log_fc"].to_numpy()[order]
                results[f"pvals_adj_{i}"] = df["adj_p_value"].to_numpy()[order]
                results[f"ranks_{i}"] = var_ranks[order]
                top_names.append(df["variable"][:shared_top])

        metrics = {}
        metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top
        metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson")
        metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson")
        metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman")

        return metrics
@@ -0,0 +1,125 @@
1
+ from collections.abc import Sequence
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from scanpy import logging
6
+ from scipy.sparse import issparse
7
+
8
+ from ._base import LinearModelBase
9
+ from ._checks import check_is_integer_matrix
10
+
11
+
12
class EdgeR(LinearModelBase):
    """Differential expression test using EdgeR.

    The R package edgeR is driven through rpy2. Both are imported lazily inside the methods, and an
    `ImportError` is raised when either of them is not available.
    """

    def _check_counts(self):
        # Delegates the validation of `self.data` to the shared integer-matrix check.
        check_is_integer_matrix(self.data)

    def fit(self, **kwargs):  # adata, design, mask, layer
        """Fit model using edgeR.

        Builds an edgeR `DGEList` from the expression matrix (`adata.X` or the selected layer, transposed)
        and `adata.obs`, then runs `calcNormFactors`, `estimateDisp` and `glmQLFit` with `self.design`.

        The resulting R fit object is stored in the R global environment under the name `fit`
        (where `_test_single_contrast` reads it) and is also assigned to `self.fit`. Note that this
        assignment replaces the bound `fit` method on this instance.

        Args:
            **kwargs: Keyword arguments specific to glmQLFit()

        Raises:
            ImportError: If rpy2 or the R package edgeR cannot be imported.
        """
        # The rpy2 converters are activated globally below (also needed when running in a notebook).
        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

            pandas2ri.activate()
            rpy2.robjects.numpy2ri.activate()

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            edger = importr("edgeR")
        except ImportError as e:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages:\n"
                "edgeR, BiocParallel, RhpcBLASctl"
            ) from e

        # Extract the expression matrix as a dense (variables x observations) array.
        # NOTE(review): the extent of this `with` block was reconstructed from a rendering without
        # indentation; it is assumed to end before the DataFrame conversion — confirm against the source.
        with localconverter(ro.default_converter + numpy2ri.converter):
            expr = self.adata.X if self.layer is None else self.adata.layers[self.layer]
            if issparse(expr):
                expr = expr.T.toarray()
            else:
                expr = expr.T

        # Convert to an R object, with variables as rows and observations as columns.
        expr_r = ro.conversion.py2rpy(pd.DataFrame(expr, index=self.adata.var_names, columns=self.adata.obs_names))

        dge = edger.DGEList(counts=expr_r, samples=self.adata.obs)

        logging.info("Calculating NormFactors")
        dge = edger.calcNormFactors(dge)

        logging.info("Estimating Dispersions")
        dge = edger.estimateDisp(dge, design=self.design)

        logging.info("Fitting linear model")
        fit = edger.glmQLFit(dge, design=self.design, **kwargs)

        # Expose the fit to R code evaluated later via `ro.r(...)`.
        ro.globalenv["fit"] = fit
        # This shadows the `fit` method on the instance with the R fit object.
        self.fit = fit

    def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> pd.DataFrame:
        """Conduct test for each contrast and return a data frame

        Runs `edgeR::glmQLFTest` on the object named `fit` in the R global environment (set by `fit()`),
        followed by `edgeR::topTags` with BH adjustment, sorted by p-value.

        Args:
            contrast: numpy array of integers indicating contrast i.e. [-1, 0, 1, 0, 0]
            **kwargs: Currently not forwarded to R (see ToDo below).

        Returns:
            The `topTags` table as a DataFrame with a `variable` column and the columns `PValue`, `logFC`
            and `FDR` renamed to `p_value`, `log_fc` and `adj_p_value`.

        Raises:
            ImportError: If rpy2 or the R package edgeR cannot be imported.
        """
        ## -- Check installations
        # Unlike in `fit()`, the converters are not (re-)activated here.

        # ToDo:
        #  parse **kwargs to R function
        #  Fix mask for .fit()

        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            importr("edgeR")
        except ImportError:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages: " "edgeR, BiocParallel, RhpcBLASctl"
            ) from None

        # Convert vector to R, which drops a category like `self.design_matrix` to use the intercept for the left out.
        contrast_vec_r = ro.conversion.py2rpy(np.asarray(contrast))
        ro.globalenv["contrast_vec"] = contrast_vec_r

        # Test contrast with R; `fit` and `contrast_vec` are looked up in the R global environment.
        ro.r(
            """
            test = edgeR::glmQLFTest(fit, contrast=contrast_vec)
            de_res = edgeR::topTags(test, n=Inf, adjust.method="BH", sort.by="PValue")$table
            """
        )

        # Convert results to pandas
        de_res = ro.conversion.rpy2py(ro.globalenv["de_res"])
        de_res.index.name = "variable"
        de_res = de_res.reset_index()

        return de_res.rename(columns={"PValue": "p_value", "logFC": "log_fc", "FDR": "adj_p_value"})
@@ -0,0 +1,189 @@
1
+ """Helpers to interact with Formulaic Formulas
2
+
3
+ Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
4
+ * A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
5
+ * A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
6
+ * A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
7
+ """
8
+
9
+ from collections import defaultdict
10
+ from collections.abc import Mapping, Sequence
11
+ from dataclasses import dataclass
12
+ from typing import Any
13
+
14
+ from formulaic import FactorValues, ModelSpec
15
+ from formulaic.materializers import PandasMaterializer
16
+ from formulaic.materializers.types import EvaluatedFactor
17
+ from formulaic.parser.types import Factor
18
+ from interface_meta import override
19
+
20
+
21
@dataclass
class FactorMetadata:
    """Store (relevant) metadata for a factor of a formula."""

    name: str
    """The unambiguous factor name as specified in the formula. E.g. `donor`, or `C(donor, contr.treatment(base="A"))`"""

    reduced_rank: bool
    """Whether a column will be dropped because it is redundant"""

    custom_encoder: bool
    """Whether or not a custom encoder (e.g. `C(...)`) was used."""

    categories: Sequence[str]
    """The unique categories in this factor (after applying `drop_rows`)"""

    kind: Factor.Kind
    """Type of the factor"""

    drop_field: str | None = None
    """The category that is dropped.

    Note that
      * this may also be populated if `reduced_rank = False`
      * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`.
    """

    column_names: Sequence[str] | None = None
    """The column names for this factor included in the design matrix.

    This may be the same as `categories` if the default encoder is used, or
    categories without the base level if a custom encoder (e.g. `C(...)`) is used.
    """

    colname_format: str | None = None
    """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`"""

    @property
    def base(self) -> str | None:
        """
        The base category for this categorical factor.

        This is derived from `drop_field` (for default encoding) or by comparing the column names in
        the design matrix with all categories (for custom encoding, e.g. `C(...)`).

        Returns:
            The base category, or None if the factor is not of reduced rank.
        """
        if not self.reduced_rank:
            return None

        if self.custom_encoder:
            # The base level is the one category that has no column in the design matrix.
            missing = set(self.categories) - set(self.column_names)
            assert len(missing) == 1, f"Expected exactly one base category for factor {self.name!r}, got {missing}"
            return missing.pop()

        assert self.drop_field is not None, f"`drop_field` is not set for reduced-rank factor {self.name!r}"
        return self.drop_field
76
+
77
+
78
def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
    """Keeps track of categorical factors used in a model specification by generating a custom materializer.

    This materializer reports back metadata upon materialization of the model matrix.

    Each call creates fresh storage dictionaries and a new materializer class that closes over them.

    Returns:
        - A dictionary storing metadata for each factor processed by the custom materializer, named `factor_storage`.
        - A dictionary mapping variables to factor names, which works similarly to model_spec.variable_terms
          but maps to factors rather than terms, named `variable_to_factors`.
        - A materializer class tied to the specific instance of `factor_storage`.
    """
    # There can be multiple FactorMetadata entries per factor name (hence the list), for instance when
    # formulaic generates an interaction term, it generates the factor with both full rank and reduced rank.
    factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
    variable_to_factors: dict[str, set[str]] = defaultdict(set)

    class CustomPandasMaterializer(PandasMaterializer):
        """An extension of the PandasMaterializer that records all categorical variables and their (base) categories."""

        REGISTER_NAME = "custom_pandas"
        REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
        REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")

        def __init__(
            self,
            data: Any,
            context: Mapping[str, Any] | None = None,
            record_factor_metadata: bool = False,
            **params: Any,
        ):
            """Initialize the Materializer.

            Args:
                data: Passed to PandasMaterializer.
                context: Passed to PandasMaterializer
                record_factor_metadata: Flag that tells whether this particular instance of the custom materializer class
                    is supposed to record factor metadata. Only the instance that is used for building the design
                    matrix should record the metadata. All other instances (e.g. used to generate contrast vectors)
                    should not record metadata to not overwrite the specifications from the design matrix.
                **params: Passed to PandasMaterializer
            """
            # Both attributes are None when recording is disabled; the hooks below check for that.
            self.factor_metadata_storage = factor_storage if record_factor_metadata else None
            self.variable_to_factors = variable_to_factors if record_factor_metadata else None
            # temporary pointer to metadata of factor that is currently evaluated
            self._current_factor: FactorMetadata | None = None
            super().__init__(data, context, **params)

        @override
        def _encode_evaled_factor(
            self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
        ) -> dict[str, Any]:
            """Function is called just before the factor is evaluated.

            We can record some metadata, before we call the original function.

            When recording is enabled and the factor is not yet in `self.encoded_cache`, a new
            `FactorMetadata` is created and kept in `self._current_factor` until
            `_flatten_encoded_evaled_factor` completes and stores it.
            """
            assert (
                self._current_factor is None
            ), "_current_factor should always be None when we start recording metadata"
            if self.factor_metadata_storage is not None:
                # Don't store if the factor is cached (then we should already have recorded it)
                if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache:
                    assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached"
                else:
                    # Remember which factors each input variable takes part in.
                    for var in factor.variables:
                        self.variable_to_factors[var].add(factor.expr)
                    self._current_factor = FactorMetadata(
                        name=factor.expr,
                        reduced_rank=reduced_rank,
                        # Unique values of the factor, excluding the rows listed in `drop_rows`.
                        categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())),
                        custom_encoder=factor.metadata.encoder is not None,
                        kind=factor.metadata.kind,
                    )
            return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)

        @override
        def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
            """
            Function is called at the end, before the design matrix gets materialized.

            Here we have access to additional metadata, such as `drop_field`.

            If a factor is currently being recorded, its metadata is completed from
            `values.__formulaic_metadata__`, appended to the storage and the pointer is reset.
            """
            if self._current_factor is not None:
                assert self._current_factor.name == name
                self._current_factor.drop_field = values.__formulaic_metadata__.drop_field
                self._current_factor.column_names = values.__formulaic_metadata__.column_names
                self._current_factor.colname_format = values.__formulaic_metadata__.format
                self.factor_metadata_storage[name].append(self._current_factor)
                # Reset so that the assertion in `_encode_evaled_factor` holds for the next factor.
                self._current_factor = None

            return super()._flatten_encoded_evaled_factor(name, values)

    return factor_storage, variable_to_factors, CustomPandasMaterializer
170
+
171
+
172
class AmbiguousAttributeError(ValueError):
    """Signals that an attribute does not have one single value across a collection of objects."""
174
+
175
+
176
def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
    """Given a collection of objects, return an attribute if it is the same between all objects.

    Any iterable is accepted (lists, tuples, generators, ...); it is consumed exactly once.

    Args:
        objs: The objects to inspect.
        attr: Name of the attribute that must be identical for all objects.

    Returns:
        The common value of the attribute.

    Raises:
        ValueError: If `objs` is empty (or None).
        AmbiguousAttributeError: If the attribute differs between the objects.
    """
    # Materialize once: this makes the emptiness check well-defined for generators and
    # removes the need for the input to support slicing.
    items = [] if objs is None else list(objs)
    if not items:
        raise ValueError("Collection is empty")

    first, *others = items
    reference = getattr(first, attr)

    # Check if the attribute is the same for all objects
    for obj in others:
        if getattr(obj, attr) != reference:
            raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")

    # If attribute is the same for all objects, return it
    return reference
@@ -0,0 +1,95 @@
1
+ import os
2
+ import re
3
+ import warnings
4
+
5
+ import pandas as pd
6
+ from anndata import AnnData
7
+ from numpy import ndarray
8
+ from pydeseq2.dds import DeseqDataSet
9
+ from pydeseq2.default_inference import DefaultInference
10
+ from pydeseq2.ds import DeseqStats
11
+ from scipy.sparse import issparse
12
+
13
+ from ._base import LinearModelBase
14
+ from ._checks import check_is_integer_matrix
15
+
16
+
17
class PyDESeq2(LinearModelBase):
    """Differential expression test using a PyDESeq2"""

    def __init__(
        self, adata: AnnData, design: str | ndarray, *, mask: str | None = None, layer: str | None = None, **kwargs
    ):
        """Initialize the model and densify the count matrix if it is sparse.

        Args:
            adata: Passed to the base class.
            design: Passed to the base class.
            mask: Passed to the base class.
            layer: Passed to the base class; also selects which matrix is densified.
            **kwargs: Passed to the base class.
        """
        super().__init__(adata, design, mask=mask, layer=layer, **kwargs)
        # work around pydeseq2 issue with sparse matrices
        # see also https://github.com/owkin/PyDESeq2/issues/25
        if issparse(self.data):
            if self.layer is None:
                self.adata.X = self.adata.X.toarray()
            else:
                self.adata.layers[self.layer] = self.adata.layers[self.layer].toarray()

    def _check_counts(self):
        check_is_integer_matrix(self.data)

    def fit(self, **kwargs) -> None:
        """Fit dds model using pydeseq2.

        The fitted `DeseqDataSet` is stored in `self.dds`; nothing is returned.

        Note: this creates its own AnnData object for downstream processing.

        Args:
            **kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all available CPUs minus one if the argument is not passed.
        """
        if "n_cpus" in kwargs:
            n_cpus = kwargs.pop("n_cpus")
        else:
            # `os.cpu_count()` may return None, and on a single-core machine "minus one" would be 0,
            # which is not a valid number of workers. Always use at least one CPU.
            n_cpus = max(1, (os.cpu_count() or 1) - 1)
        inference = DefaultInference(n_cpus=n_cpus)

        covars = self.design.columns.tolist()
        if "Intercept" not in covars:
            warnings.warn(
                "Warning: Pydeseq is hard-coded to use Intercept, please include intercept into the model", stacklevel=2
            )
        # Strip the `[T.<level>]` suffix of the design columns to obtain the unique factor names.
        processed_covars = list({re.sub(r"\[T\.(.*)\]", "", col) for col in covars if col != "Intercept"})
        dds = DeseqDataSet(
            adata=self.adata, design_factors=processed_covars, refit_cooks=True, inference=inference, **kwargs
        )
        # workaround code to insert design array
        des_mtx_cols = dds.obsm["design_matrix"].columns
        dds.obsm["design_matrix"] = self.design
        # Keep the column names generated by pydeseq2 when the number of columns matches.
        if dds.obsm["design_matrix"].shape[1] == len(des_mtx_cols):
            dds.obsm["design_matrix"].columns = des_mtx_cols.copy()

        dds.deseq2()
        self.dds = dds

    # TODO: PyDeseq2 doesn't support arbitrary designs and contrasts yet
    # see https://github.com/owkin/PyDESeq2/issues/213

    # Therefore these functions are overridden in a way to make it work with PyDESeq2,
    # ignoring the inconsistency of function signatures. Once arbitrary design
    # matrices and contrasts are supported by PyDEseq2, we can fully support the
    # Linear model interface.
    def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame:  # type: ignore
        """Conduct a specific test and returns a Pandas DataFrame.

        Args:
            contrast: list of three strings of the form `["variable", "tested level", "reference level"]`.
            alpha: p value threshold used for controlling fdr with independent hypothesis weighting
            **kwargs: extra arguments to pass to DeseqStats()

        Returns:
            The results sorted by `p_value`, with a `variable` column and the pydeseq2 columns `pvalue`,
            `padj` and `log2FoldChange` renamed to `p_value`, `adj_p_value` and `log_fc`.
        """
        stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs)
        # Calling `.summary()` is required to fill the `results_df` data frame
        stat_res.summary()
        res_df = (
            pd.DataFrame(stat_res.results_df)
            .rename(columns={"pvalue": "p_value", "padj": "adj_p_value", "log2FoldChange": "log_fc"})
            .sort_values("p_value")
        )
        res_df.index.name = "variable"
        res_df = res_df.reset_index()
        return res_df

    def cond(self, **kwargs) -> ndarray:
        """Not supported for PyDESeq2; always raises `NotImplementedError`."""
        raise NotImplementedError(
            "PyDESeq2 currently doesn't support arbitrary contrasts, see https://github.com/owkin/PyDESeq2/issues/213"
        )

    def contrast(self, column: str, baseline: str, group_to_compare: str) -> tuple[str, str, str]:  # type: ignore
        """Return the contrast as `(column, group_to_compare, baseline)`, the order `_test_single_contrast` expects."""
        return (column, group_to_compare, baseline)
@@ -0,0 +1,162 @@
1
+ """Simple tests such as t-test, wilcoxon"""
2
+
3
+ import warnings
4
+ from abc import abstractmethod
5
+ from collections.abc import Mapping, Sequence
6
+ from types import MappingProxyType
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import scipy.stats
11
+ import statsmodels
12
+ from anndata import AnnData
13
+ from pandas.core.api import DataFrame as DataFrame
14
+ from scipy.sparse import diags, issparse
15
+ from tqdm.auto import tqdm
16
+
17
+ from ._base import MethodBase
18
+
19
+
20
def fdr_correction(
    df: pd.DataFrame, pvalue_col: str = "p_value", *, key_added: str = "adj_p_value", inplace: bool = False
) -> pd.DataFrame | None:
    """Adjust p-values in a DataFrame with test results using FDR correction.

    Args:
        df: DataFrame with test results.
        pvalue_col: Name of the column holding the unadjusted p-values.
        key_added: Name of the column the adjusted p-values are written to.
        inplace: If True, modify `df` and return None; otherwise work on a copy and return it.

    Returns:
        A copy of `df` with the additional column `key_added`, or None if `inplace` is True.
    """
    # Import the submodule explicitly: a bare `import statsmodels` does not guarantee that
    # `statsmodels.stats.multitest` has been loaded and is reachable as an attribute.
    from statsmodels.stats.multitest import fdrcorrection

    if not inplace:
        df = df.copy()

    # `fdrcorrection` returns a tuple; the second element is used as the adjusted p-values.
    df[key_added] = fdrcorrection(df[pvalue_col].values)[1]

    if not inplace:
        return df
31
+
32
+
33
class SimpleComparisonBase(MethodBase):
    """Base class for simple two-group tests that are applied to each variable separately."""

    @staticmethod
    @abstractmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Perform a statistical test between values in x0 and x1.

        If `paired` is True, x0 and x1 must be of the same length and ordered such that
        paired elements have the same position.

        Args:
            x0: Array with baseline values.
            x1: Array with values to compare.
            paired: Indicates whether to perform a paired test
            **kwargs: kwargs passed to the test function
        """
        ...

    def _compare_single_group(
        self, baseline_idx: np.ndarray, comparison_idx: np.ndarray, *, paired: bool, **kwargs
    ) -> DataFrame:
        """Perform a single comparison between two groups.

        Args:
            baseline_idx: Numeric indices indicating which observations are in the baseline group.
            comparison_idx: Numeric indices indicating which observations are in the comparison/treatment group
            paired: Whether to perform a paired test. Note that in the case of a paired test,
                the indices must be ordered such that paired observations appear at the same position.
            **kwargs: kwargs passed to the test function

        Returns:
            One row per variable with the columns `variable`, `p_value` and `log_fc`, sorted by `p_value`.
        """
        if paired:
            assert len(baseline_idx) == len(comparison_idx), "For a paired test, indices must be of the same length"

        x0 = self.data[baseline_idx, :]
        x1 = self.data[comparison_idx, :]

        # In the following loop, we are doing a lot of column slicing -- which is significantly
        # more efficient in csc format.
        if issparse(self.data):
            x0 = x0.tocsc()
            x1 = x1.tocsc()

        res = []
        # Select each column by position instead of comparing all variable names against `var` in
        # every iteration (quadratic in the number of variables, and ambiguous for duplicated names).
        for col, var in enumerate(tqdm(self.adata.var_names)):
            tmp_x0 = x0[:, col : col + 1]
            tmp_x0 = np.asarray(tmp_x0.todense()).flatten() if issparse(tmp_x0) else tmp_x0.flatten()
            tmp_x1 = x1[:, col : col + 1]
            tmp_x1 = np.asarray(tmp_x1.todense()).flatten() if issparse(tmp_x1) else tmp_x1.flatten()
            pval = self._test(tmp_x0, tmp_x1, paired, **kwargs)
            mean_x0 = np.mean(tmp_x0)
            mean_x1 = np.mean(tmp_x1)
            res.append({"variable": var, "p_value": pval, "log_fc": np.log2(mean_x1) - np.log2(mean_x0)})
        return pd.DataFrame(res).sort_values("p_value")

    @classmethod
    def compare_groups(
        cls,
        adata: AnnData,
        column: str,
        baseline: str,
        groups_to_compare: str | Sequence[str] | None,
        *,
        paired_by: str | None = None,
        mask: str | None = None,
        layer: str | None = None,
        fit_kwargs: Mapping = MappingProxyType({}),
        test_kwargs: Mapping = MappingProxyType({}),
    ) -> DataFrame:
        """Compare one or several groups against a baseline group, one variable at a time.

        Args:
            adata: AnnData object passed to the model constructor.
            column: Column in `obs` that holds the group labels.
            baseline: Label of the baseline group.
            groups_to_compare: Label(s) of the group(s) to compare against the baseline. If None, all
                other labels found in `column` are compared against the baseline.
            paired_by: Column in `obs` identifying the pairs. If given, a paired test is performed.
            mask: Passed to the model constructor.
            layer: Passed to the model constructor.
            fit_kwargs: Not used for simple tests; a warning is emitted if non-empty.
            test_kwargs: Keyword arguments passed to the test function.

        Returns:
            The concatenated results of all comparisons with a `comparison` column and FDR-adjusted p-values.

        Raises:
            ValueError: If a paired test is requested and the pairing is not one-to-one.
        """
        if len(fit_kwargs):
            warnings.warn("fit_kwargs not used for simple tests.", UserWarning, stacklevel=2)
        paired = paired_by is not None
        model = cls(adata, mask=mask, layer=layer)
        if groups_to_compare is None:
            # compare against all other
            groups_to_compare = sorted(set(model.adata.obs[column]) - {baseline})
        if isinstance(groups_to_compare, str):
            groups_to_compare = [groups_to_compare]

        def _get_idx(column, value):
            in_group = model.adata.obs[column] == value
            if paired:
                dummies = pd.get_dummies(model.adata.obs[paired_by], sparse=True).sparse.to_coo().tocsr()
                if not np.all(np.sum(dummies, axis=0) == 2):
                    raise ValueError("Pairing is only possible with exactly two values per group")
                # Use matrix multiplication to only retrieve those dummy entries that are associated with the current `value`.
                # Convert to COO matrix to get rows/cols
                # row indices refers to the indices of rows that have `column == value` (equivalent to np.where(mask)[0])
                # col indices refers to the numeric index of each "pair" in obs_names
                ind_mat = diags(in_group.values, dtype=bool) @ dummies
                if not np.all(np.sum(ind_mat, axis=0) == 1):
                    raise ValueError("Pairing is only possible with exactly two values per group")
                ind_mat = ind_mat.tocoo()
                # Order the observations by pair so that both groups are aligned position by position.
                return ind_mat.row[np.argsort(ind_mat.col)]
            else:
                return np.where(in_group)[0]

        res_dfs = []
        baseline_idx = _get_idx(column, baseline)
        for group_to_compare in groups_to_compare:
            comparison_idx = _get_idx(column, group_to_compare)
            res_dfs.append(
                model._compare_single_group(baseline_idx, comparison_idx, paired=paired, **test_kwargs).assign(
                    comparison=f"{group_to_compare}_vs_{baseline if baseline is not None else 'rest'}"
                )
            )
        return fdr_correction(pd.concat(res_dfs))
138
+
139
+
140
class WilcoxonTest(SimpleComparisonBase):
    """Wilcoxon test in its unpaired or paired flavour.

    (unpaired: also known as "Mann-Whitney U test"; paired: "wilcoxon signed rank test")
    """

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of the signed rank test (paired) or the Mann-Whitney U test (unpaired)."""
        run_test = scipy.stats.wilcoxon if paired else scipy.stats.mannwhitneyu
        return run_test(x0, x1, **kwargs).pvalue
152
+
153
+
154
class TTest(SimpleComparisonBase):
    """T-test in its unpaired or paired flavour."""

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of the related-samples (paired) or independent-samples (unpaired) t-test."""
        run_test = scipy.stats.ttest_rel if paired else scipy.stats.ttest_ind
        return run_test(x0, x1, **kwargs).pvalue