pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. pertpy/__init__.py +4 -2
  2. pertpy/data/__init__.py +66 -1
  3. pertpy/data/_dataloader.py +28 -26
  4. pertpy/data/_datasets.py +261 -92
  5. pertpy/metadata/__init__.py +6 -0
  6. pertpy/metadata/_cell_line.py +795 -0
  7. pertpy/metadata/_compound.py +128 -0
  8. pertpy/metadata/_drug.py +238 -0
  9. pertpy/metadata/_look_up.py +569 -0
  10. pertpy/metadata/_metadata.py +70 -0
  11. pertpy/metadata/_moa.py +125 -0
  12. pertpy/plot/__init__.py +0 -13
  13. pertpy/preprocessing/__init__.py +2 -0
  14. pertpy/preprocessing/_guide_rna.py +89 -6
  15. pertpy/tools/__init__.py +48 -15
  16. pertpy/tools/_augur.py +329 -32
  17. pertpy/tools/_cinemaot.py +145 -6
  18. pertpy/tools/_coda/_base_coda.py +1237 -116
  19. pertpy/tools/_coda/_sccoda.py +66 -36
  20. pertpy/tools/_coda/_tasccoda.py +46 -39
  21. pertpy/tools/_dialogue.py +180 -77
  22. pertpy/tools/_differential_gene_expression/__init__.py +20 -0
  23. pertpy/tools/_differential_gene_expression/_base.py +657 -0
  24. pertpy/tools/_differential_gene_expression/_checks.py +41 -0
  25. pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
  26. pertpy/tools/_differential_gene_expression/_edger.py +125 -0
  27. pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
  28. pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
  29. pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
  30. pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
  31. pertpy/tools/_distances/_distance_tests.py +29 -24
  32. pertpy/tools/_distances/_distances.py +584 -98
  33. pertpy/tools/_enrichment.py +460 -0
  34. pertpy/tools/_kernel_pca.py +1 -1
  35. pertpy/tools/_milo.py +406 -49
  36. pertpy/tools/_mixscape.py +677 -55
  37. pertpy/tools/_perturbation_space/_clustering.py +10 -3
  38. pertpy/tools/_perturbation_space/_comparison.py +112 -0
  39. pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
  40. pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
  41. pertpy/tools/_perturbation_space/_simple.py +52 -11
  42. pertpy/tools/_scgen/__init__.py +1 -1
  43. pertpy/tools/_scgen/_base_components.py +2 -3
  44. pertpy/tools/_scgen/_scgen.py +706 -0
  45. pertpy/tools/_scgen/_utils.py +3 -5
  46. pertpy/tools/decoupler_LICENSE +674 -0
  47. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
  48. pertpy-0.8.0.dist-info/RECORD +57 -0
  49. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
  50. pertpy/plot/_augur.py +0 -234
  51. pertpy/plot/_cinemaot.py +0 -81
  52. pertpy/plot/_coda.py +0 -1001
  53. pertpy/plot/_dialogue.py +0 -91
  54. pertpy/plot/_guide_rna.py +0 -82
  55. pertpy/plot/_milopy.py +0 -284
  56. pertpy/plot/_mixscape.py +0 -594
  57. pertpy/plot/_scgen.py +0 -337
  58. pertpy/tools/_differential_gene_expression.py +0 -99
  59. pertpy/tools/_metadata/__init__.py +0 -0
  60. pertpy/tools/_metadata/_cell_line.py +0 -613
  61. pertpy/tools/_metadata/_look_up.py +0 -342
  62. pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
  63. pertpy/tools/_scgen/_jax_scgen.py +0 -370
  64. pertpy-0.6.0.dist-info/RECORD +0 -50
  65. /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
  66. {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,86 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from anndata import AnnData
4
+
5
+
6
class DGEEVAL:
    """Evaluate how similar two differential gene expression (DGE) results are."""

    def compare(
        self,
        adata: AnnData | None = None,
        de_key1: str | None = None,
        de_key2: str | None = None,
        de_df1: pd.DataFrame | None = None,
        de_df2: pd.DataFrame | None = None,
        shared_top: int = 100,
    ) -> dict[str, float]:
        """Compare two differential expression analyses.

        Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and
        the correlation of their scores and adjusted p-values.

        The results are supplied either as two keys into `adata.uns` or as two DataFrames,
        never as a mix of both.

        Args:
            adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used.
            de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
            de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
            de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
                Must contain the columns 'variable', 'log_fc' and 'adj_p_value'.
            de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
                Must contain the columns 'variable', 'log_fc' and 'adj_p_value'.
            shared_top: The number of top DEG to compute the proportion of their intersection.
                Must be at least 1; it is capped at the number of variables.

        Returns:
            A dictionary with the entries

            * `shared_top_genes`: fraction of the first `shared_top` variables shared by both results,
            * `scores_corr`: Pearson correlation of the scores (`scores` / `log_fc`),
            * `pvals_adj_corr`: Pearson correlation of the adjusted p-values,
            * `scores_ranks_corr`: Spearman correlation of the positions of the variables in both results.

        Raises:
            ValueError: If keys and DataFrames are mixed, if only one item of a pair is given, if `adata`
                is missing, if a DataFrame lacks a required column, if `shared_top` is smaller than 1,
                or if there are no variables to compare.
            AssertionError: If a key is missing from `adata.uns` or the two DataFrames do not describe
                the same set of variables.
        """
        using_dfs = de_df1 is not None or de_df2 is not None

        if (de_key1 or de_key2) and using_dfs:
            raise ValueError(
                "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
            )

        if not using_dfs:
            if not de_key1 or not de_key2:
                raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")
        elif de_df1 is None or de_df2 is None:
            raise ValueError("Both `de_df1` and `de_df2` must be provided together if using DataFrames.")

        if shared_top < 1:
            raise ValueError("`shared_top` must be a positive integer.")

        # Collect one `(names, scores, adjusted p-values)` triple per result, in the order in which
        # the variables were reported.
        tables = []
        if not using_dfs:
            # Identity check: the truth value of an AnnData object is not a test for "was provided".
            if adata is None:
                raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ")
            # Raised explicitly (instead of `assert`) so that the check survives `python -O`.
            if not all(key in adata.uns for key in (de_key1, de_key2)):
                raise AssertionError("Provided `de_key1` and `de_key2` must exist in `adata.uns`.")
            n_vars = len(adata.var_names)
            for key in (de_key1, de_key2):
                record = adata.uns[key]
                # Only the first group stored under the key is evaluated.
                label = record["names"].dtype.names[0]
                tables.append((record["names"][label], record["scores"][label], record["pvals_adj"][label]))
        else:
            required_columns = {"variable", "log_fc", "adj_p_value"}
            for df in (de_df1, de_df2):
                if not required_columns.issubset(df.columns):
                    raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.")
            if set(de_df1["variable"]) != set(de_df2["variable"]):
                raise AssertionError("Variables in both dataframes must match.")
            n_vars = len(de_df1)
            tables.extend(
                (df["variable"].to_numpy(), df["log_fc"].to_numpy(), df["adj_p_value"].to_numpy())
                for df in (de_df1, de_df2)
            )

        if n_vars == 0:
            raise ValueError("The DE results do not contain any variables to compare.")

        n_top = min(shared_top, n_vars)
        positions = np.arange(1, n_vars + 1)
        # A fixed-length index makes pandas reject results whose length differs from `n_vars`.
        results = pd.DataFrame(index=np.arange(n_vars))
        top_names = []

        for i, (names, scores, pvals_adj) in enumerate(tables):
            # Sorting both results by variable name aligns them row by row.
            order = np.argsort(names)
            results[f"scores_{i}"] = np.asarray(scores)[order]
            results[f"pvals_adj_{i}"] = np.asarray(pvals_adj)[order]
            # Position (1-based) each variable had in the original result.
            results[f"ranks_{i}"] = positions[order]
            top_names.append(set(names[:n_top]))

        return {
            "shared_top_genes": len(top_names[0] & top_names[1]) / n_top,
            "scores_corr": results["scores_0"].corr(results["scores_1"], method="pearson"),
            "pvals_adj_corr": results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson"),
            "scores_ranks_corr": results["ranks_0"].corr(results["ranks_1"], method="spearman"),
        }
@@ -0,0 +1,125 @@
1
+ from collections.abc import Sequence
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from scanpy import logging
6
+ from scipy.sparse import issparse
7
+
8
+ from ._base import LinearModelBase
9
+ from ._checks import check_is_integer_matrix
10
+
11
+
12
class EdgeR(LinearModelBase):
    """Differential expression test using EdgeR"""

    def _check_counts(self):
        """Validate the data matrix with `check_is_integer_matrix`."""
        check_is_integer_matrix(self.data)

    def fit(self, **kwargs):  # adata, design, mask, layer
        """Fit model using edgeR.

        Runs `DGEList`, `calcNormFactors`, `estimateDisp` and `glmQLFit` through rpy2.
        The fitted R object is stored in the R global environment under the name `fit`
        and on this instance as `self.fit`.

        NOTE(review): the assignment to `self.fit` replaces this bound method on the instance.

        Args:
            **kwargs: Keyword arguments specific to glmQLFit()

        Raises:
            ImportError: If rpy2 or the R package edgeR cannot be imported.
        """
        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

            # Global converter activation; the conversions below and in
            # `_test_single_contrast` are performed outside of a local converter.
            pandas2ri.activate()
            rpy2.robjects.numpy2ri.activate()

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            edger = importr("edgeR")
        except ImportError as e:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages:\n"
                "edgeR, BiocParallel, RhpcBLASctl"
            ) from e

        # Build a dense variables x observations matrix
        with localconverter(ro.default_converter + numpy2ri.converter):
            counts = self.adata.layers[self.layer] if self.layer is not None else self.adata.X
            counts = counts.T.toarray() if issparse(counts) else counts.T

        counts_frame = pd.DataFrame(counts, index=self.adata.var_names, columns=self.adata.obs_names)
        counts_r = ro.conversion.py2rpy(counts_frame)

        dge_list = edger.DGEList(counts=counts_r, samples=self.adata.obs)

        logging.info("Calculating NormFactors")
        dge_list = edger.calcNormFactors(dge_list)

        logging.info("Estimating Dispersions")
        dge_list = edger.estimateDisp(dge_list, design=self.design)

        logging.info("Fitting linear model")
        r_fit = edger.glmQLFit(dge_list, design=self.design, **kwargs)

        # The R code in `_test_single_contrast` looks the fit up by name in the R global environment.
        ro.globalenv["fit"] = r_fit
        self.fit = r_fit

    def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> pd.DataFrame:
        """Conduct test for each contrast and return a data frame

        Uses the object named `fit` in the R global environment, which `fit()` puts there.

        Args:
            contrast: numpy array of integers indicating contrast i.e. [-1, 0, 1, 0, 0]
            **kwargs: Currently not forwarded to R.

        Returns:
            The `topTags` table with the index turned into a `variable` column and the columns
            `PValue`, `logFC` and `FDR` renamed to `p_value`, `log_fc` and `adj_p_value`.

        Raises:
            ImportError: If rpy2 or the R package edgeR cannot be imported.
        """
        # ToDo:
        #  parse **kwargs to R function
        #  Fix mask for .fit()

        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            importr("edgeR")
        except ImportError:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages: " "edgeR, BiocParallel, RhpcBLASctl"
            ) from None

        # Hand the contrast vector over to R under the name used by the R snippet below
        ro.globalenv["contrast_vec"] = ro.conversion.py2rpy(np.asarray(contrast))

        # Test contrast with R
        ro.r(
            """
            test = edgeR::glmQLFTest(fit, contrast=contrast_vec)
            de_res = edgeR::topTags(test, n=Inf, adjust.method="BH", sort.by="PValue")$table
            """
        )

        # Convert results to pandas
        result = ro.conversion.rpy2py(ro.globalenv["de_res"])
        result.index.name = "variable"
        result = result.reset_index()

        column_map = {"PValue": "p_value", "logFC": "log_fc", "FDR": "adj_p_value"}
        return result.rename(columns=column_map)
@@ -0,0 +1,189 @@
1
+ """Helpers to interact with Formulaic Formulas
2
+
3
+ Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
4
+ * A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
5
+ * A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
6
+ * A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
7
+ """
8
+
9
+ from collections import defaultdict
10
+ from collections.abc import Mapping, Sequence
11
+ from dataclasses import dataclass
12
+ from typing import Any
13
+
14
+ from formulaic import FactorValues, ModelSpec
15
+ from formulaic.materializers import PandasMaterializer
16
+ from formulaic.materializers.types import EvaluatedFactor
17
+ from formulaic.parser.types import Factor
18
+ from interface_meta import override
19
+
20
+
21
@dataclass
class FactorMetadata:
    """Store (relevant) metadata for a factor of a formula."""

    # The unambiguous factor name as specified in the formula,
    # e.g. `donor`, or `C(donor, contr.treatment(base="A"))`.
    name: str

    # Whether a column will be dropped because it is redundant.
    reduced_rank: bool

    # Whether or not a custom encoder (e.g. `C(...)`) was used.
    custom_encoder: bool

    # The unique categories in this factor (after applying `drop_rows`).
    categories: Sequence[str]

    # Type of the factor.
    kind: Factor.Kind

    # The category that is dropped. Note that
    #  * this may also be populated if `reduced_rank = False`
    #  * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`).
    drop_field: str = None

    # The column names for this factor included in the design matrix. This may be the same as
    # `categories` if the default encoder is used, or categories without the base level if a
    # custom encoder (e.g. `C(...)`) is used.
    column_names: Sequence[str] = None

    # A formattable string that can be used to generate the column name in the design matrix,
    # e.g. `{name}[T.{field}]`.
    colname_format: str = None

    @property
    def base(self) -> str | None:
        """
        The base category for this categorical factor.

        Returns `None` for a factor that is not reduced rank. Otherwise the base is `drop_field`
        (default encoding) or the single category that has no column in the design matrix
        (custom encoding, e.g. `C(...)`).
        """
        if not self.reduced_rank:
            return None

        if not self.custom_encoder:
            assert self.drop_field is not None
            return self.drop_field

        # Custom encoder: the base level is the one category that did not get its own column.
        without_column = set(self.categories).difference(self.column_names)
        assert len(without_column) == 1
        return without_column.pop()
76
+
77
+
78
+ def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
79
+ """Keeps track of categorical factors used in a model specification by generating a custom materializer.
80
+
81
+ This materializer reports back metadata upon materialization of the model matrix.
82
+
83
+ Returns:
84
+ - A dictionary storing metadata for each factor processed by the custom materializer, named `factor_storage`.
85
+ - A dictionary mapping variables to factor names, which works similarly to model_spec.variable_terms
86
+ but maps to factors rather than terms, named `variable_to_factors`.
87
+ - A materializer class tied to the specific instance of `factor_storage`.
88
+ """
89
+ # There can be multiple FactorMetadata entries per sample, for instance when formulaic generates an interaction
90
+ # term, it generates the factor with both full rank and reduced rank.
91
+ factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
92
+ variable_to_factors: dict[str, set[str]] = defaultdict(set)
93
+
94
+ class CustomPandasMaterializer(PandasMaterializer):
95
+ """An extension of the PandasMaterializer that records all categorical variables and their (base) categories."""
96
+
97
+ REGISTER_NAME = "custom_pandas"
98
+ REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
99
+ REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")
100
+
101
+ def __init__(
102
+ self,
103
+ data: Any,
104
+ context: Mapping[str, Any] | None = None,
105
+ record_factor_metadata: bool = False,
106
+ **params: Any,
107
+ ):
108
+ """Initialize the Materializer.
109
+
110
+ Args:
111
+ data: Passed to PandasMaterializer.
112
+ context: Passed to PandasMaterializer
113
+ record_factor_metadata: Flag that tells whether this particular instance of the custom materializer class
114
+ is supposed to record factor metadata. Only the instance that is used for building the design
115
+ matrix should record the metadata. All other instances (e.g. used to generate contrast vectors)
116
+ should not record metadata to not overwrite the specifications from the design matrix.
117
+ **params: Passed to PandasMaterializer
118
+ """
119
+ self.factor_metadata_storage = factor_storage if record_factor_metadata else None
120
+ self.variable_to_factors = variable_to_factors if record_factor_metadata else None
121
+ # temporary pointer to metadata of factor that is currently evaluated
122
+ self._current_factor: FactorMetadata = None
123
+ super().__init__(data, context, **params)
124
+
125
+ @override
126
+ def _encode_evaled_factor(
127
+ self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
128
+ ) -> dict[str, Any]:
129
+ """Function is called just before the factor is evaluated.
130
+
131
+ We can record some metadata, before we call the original function.
132
+ """
133
+ assert (
134
+ self._current_factor is None
135
+ ), "_current_factor should always be None when we start recording metadata"
136
+ if self.factor_metadata_storage is not None:
137
+ # Don't store if the factor is cached (then we should already have recorded it)
138
+ if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache:
139
+ assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached"
140
+ else:
141
+ for var in factor.variables:
142
+ self.variable_to_factors[var].add(factor.expr)
143
+ self._current_factor = FactorMetadata(
144
+ name=factor.expr,
145
+ reduced_rank=reduced_rank,
146
+ categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())),
147
+ custom_encoder=factor.metadata.encoder is not None,
148
+ kind=factor.metadata.kind,
149
+ )
150
+ return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)
151
+
152
+ @override
153
+ def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
154
+ """
155
+ Function is called at the end, before the design matrix gets materialized.
156
+
157
+ Here we have access to additional metadata, such as `drop_field`.
158
+ """
159
+ if self._current_factor is not None:
160
+ assert self._current_factor.name == name
161
+ self._current_factor.drop_field = values.__formulaic_metadata__.drop_field
162
+ self._current_factor.column_names = values.__formulaic_metadata__.column_names
163
+ self._current_factor.colname_format = values.__formulaic_metadata__.format
164
+ self.factor_metadata_storage[name].append(self._current_factor)
165
+ self._current_factor = None
166
+
167
+ return super()._flatten_encoded_evaled_factor(name, values)
168
+
169
+ return factor_storage, variable_to_factors, CustomPandasMaterializer
170
+
171
+
172
+ class AmbiguousAttributeError(ValueError):
173
+ pass
174
+
175
+
176
+ def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
177
+ """Given a list of objects, return an attribute if it is the same between all object. Otherwise, raise an error."""
178
+ if not objs:
179
+ raise ValueError("Collection is empty")
180
+
181
+ first_obj_attr = getattr(objs[0], attr)
182
+
183
+ # Check if the attribute is the same for all objects
184
+ for obj in objs[1:]:
185
+ if getattr(obj, attr) != first_obj_attr:
186
+ raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")
187
+
188
+ # If attribute is the same for all objects, return it
189
+ return first_obj_attr
@@ -0,0 +1,95 @@
1
+ import os
2
+ import re
3
+ import warnings
4
+
5
+ import pandas as pd
6
+ from anndata import AnnData
7
+ from numpy import ndarray
8
+ from pydeseq2.dds import DeseqDataSet
9
+ from pydeseq2.default_inference import DefaultInference
10
+ from pydeseq2.ds import DeseqStats
11
+ from scipy.sparse import issparse
12
+
13
+ from ._base import LinearModelBase
14
+ from ._checks import check_is_integer_matrix
15
+
16
+
17
class PyDESeq2(LinearModelBase):
    """Differential expression test using a PyDESeq2"""

    def __init__(
        self, adata: AnnData, design: str | ndarray, *, mask: str | None = None, layer: str | None = None, **kwargs
    ):
        """Initialize the model and densify the count matrix if it is sparse.

        Args:
            adata: Passed to the base class.
            design: Passed to the base class.
            mask: Passed to the base class.
            layer: Passed to the base class; selects the layer that is densified instead of `X`.
            **kwargs: Passed to the base class.
        """
        super().__init__(adata, design, mask=mask, layer=layer, **kwargs)
        # work around pydeseq2 issue with sparse matrices
        # see also https://github.com/owkin/PyDESeq2/issues/25
        if issparse(self.data):
            if self.layer is None:
                self.adata.X = self.adata.X.toarray()
            else:
                self.adata.layers[self.layer] = self.adata.layers[self.layer].toarray()

    def _check_counts(self):
        """Validate the data matrix with `check_is_integer_matrix`."""
        check_is_integer_matrix(self.data)

    def fit(self, **kwargs) -> None:
        """Fit dds model using pydeseq2.

        The fitted `DeseqDataSet` is stored in `self.dds`; nothing is returned.

        Args:
            **kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all
                available CPUs minus one (but at least one) if the argument is not passed.
        """
        if "n_cpus" in kwargs:
            n_cpus = kwargs.pop("n_cpus")
        else:
            # `os.cpu_count()` may return None, and "minus one" must not drop to zero workers
            # on a single-core machine.
            n_cpus = max(1, (os.cpu_count() or 1) - 1)
        inference = DefaultInference(n_cpus=n_cpus)

        covars = self.design.columns.tolist()
        if "Intercept" not in covars:
            warnings.warn(
                "Warning: Pydeseq is hard-coded to use Intercept, please include intercept into the model", stacklevel=2
            )
        # Strip the `[T.<level>]` suffix of design matrix columns to recover the unique factor names.
        processed_covars = list({re.sub(r"\[T\.(.*)\]", "", col) for col in covars if col != "Intercept"})
        dds = DeseqDataSet(
            adata=self.adata, design_factors=processed_covars, refit_cooks=True, inference=inference, **kwargs
        )
        # workaround code to insert design array
        des_mtx_cols = dds.obsm["design_matrix"].columns
        dds.obsm["design_matrix"] = self.design
        # Keep the column names generated by pydeseq2 when the number of columns matches.
        if dds.obsm["design_matrix"].shape[1] == len(des_mtx_cols):
            dds.obsm["design_matrix"].columns = des_mtx_cols.copy()

        dds.deseq2()
        self.dds = dds

    # TODO: PyDeseq2 doesn't support arbitrary designs and contrasts yet
    # see https://github.com/owkin/PyDESeq2/issues/213

    # Therefore these functions are overridden in a way to make it work with PyDESeq2,
    # ignoring the inconsistency of function signatures. Once arbitrary design
    # matrices and contrasts are supported by PyDEseq2, we can fully support the
    # Linear model interface.
    def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame:  # type: ignore
        """Conduct a specific test and returns a Pandas DataFrame.

        Args:
            contrast: list of three strings of the form `["variable", "tested level", "reference level"]`.
            alpha: p value threshold used for controlling fdr with independent hypothesis weighting
            **kwargs: extra arguments to pass to DeseqStats()

        Returns:
            The results sorted by `p_value`, with the index turned into a `variable` column and the columns
            `pvalue`, `padj` and `log2FoldChange` renamed to `p_value`, `adj_p_value` and `log_fc`.
        """
        stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs)
        # Calling `.summary()` is required to fill the `results_df` data frame
        stat_res.summary()
        res_df = (
            pd.DataFrame(stat_res.results_df)
            .rename(columns={"pvalue": "p_value", "padj": "adj_p_value", "log2FoldChange": "log_fc"})
            .sort_values("p_value")
        )
        res_df.index.name = "variable"
        res_df = res_df.reset_index()
        return res_df

    def cond(self, **kwargs) -> ndarray:
        """Not supported for PyDESeq2; always raises `NotImplementedError`."""
        raise NotImplementedError(
            "PyDESeq2 currently doesn't support arbitrary contrasts, see https://github.com/owkin/PyDESeq2/issues/213"
        )

    def contrast(self, column: str, baseline: str, group_to_compare: str) -> tuple[str, str, str]:  # type: ignore
        """Return the contrast as `(column, group_to_compare, baseline)`, the order `_test_single_contrast` expects."""
        return (column, group_to_compare, baseline)
@@ -0,0 +1,162 @@
1
+ """Simple tests such as t-test, wilcoxon"""
2
+
3
+ import warnings
4
+ from abc import abstractmethod
5
+ from collections.abc import Mapping, Sequence
6
+ from types import MappingProxyType
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import scipy.stats
11
+ import statsmodels
12
+ from anndata import AnnData
13
+ from pandas.core.api import DataFrame as DataFrame
14
+ from scipy.sparse import diags, issparse
15
+ from tqdm.auto import tqdm
16
+
17
+ from ._base import MethodBase
18
+
19
+
20
def fdr_correction(
    df: pd.DataFrame, pvalue_col: str = "p_value", *, key_added: str = "adj_p_value", inplace: bool = False
) -> pd.DataFrame | None:
    """Adjust p-values in a DataFrame with test results using FDR correction.

    Args:
        df: DataFrame with test results.
        pvalue_col: Name of the column holding the unadjusted p-values.
        key_added: Name of the column the adjusted p-values are written to.
        inplace: If True, add the column to `df` itself; otherwise work on a copy.

    Returns:
        A copy of `df` with the additional column `key_added`, or `None` if `inplace` is True.
    """
    # Import the submodule explicitly: a bare `import statsmodels` does not load
    # `statsmodels.stats.multitest`, so attribute access on the package only works if some
    # other module happened to import it first.
    from statsmodels.stats.multitest import fdrcorrection

    if not inplace:
        df = df.copy()

    # `fdrcorrection` returns (rejected, corrected p-values); only the latter is stored.
    df[key_added] = fdrcorrection(df[pvalue_col].values)[1]

    if inplace:
        return None
    return df
31
+
32
+
33
class SimpleComparisonBase(MethodBase):
    """Base class for tests that compare two groups of observations variable by variable.

    Subclasses provide the actual statistical test by implementing `_test`.
    """

    @staticmethod
    @abstractmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Perform a statistical test between values in x0 and x1.

        If `paired` is True, x0 and x1 must be of the same length and ordered such that
        paired elements have the same position.

        Args:
            x0: Array with baseline values.
            x1: Array with values to compare.
            paired: Indicates whether to perform a paired test
            **kwargs: kwargs passed to the test function

        Returns:
            The p-value of the test.
        """
        ...

    def _compare_single_group(
        self, baseline_idx: np.ndarray, comparison_idx: np.ndarray, *, paired: bool, **kwargs
    ) -> DataFrame:
        """Perform a single comparison between two groups.

        Every column of the data matrix is tested on its own (also if `var_names` contains duplicates).

        Args:
            baseline_idx: Numeric indices indicating which observations are in the baseline group.
            comparison_idx: Numeric indices indicating which observations are in the comparison/treatment group
            paired: Whether to perform a paired test. Note that in the case of a paired test,
                the indices must be ordered such that paired observations appear at the same position.
            **kwargs: kwargs passed to the test function

        Returns:
            DataFrame with the columns `variable`, `p_value` and `log_fc`
            (log2 of the comparison mean minus log2 of the baseline mean), sorted by `p_value`.
        """
        if paired:
            assert len(baseline_idx) == len(comparison_idx), "For a paired test, indices must be of the same length"

        x0 = self.data[baseline_idx, :]
        x1 = self.data[comparison_idx, :]

        # In the following loop, we are doing a lot of column slicing -- which is significantly
        # more efficient in csc format.
        if issparse(self.data):
            x0 = x0.tocsc()
            x1 = x1.tocsc()

        res = []
        # Select columns by position: building a boolean mask `var_names == var` per variable
        # would cost O(n_vars) per iteration, i.e. O(n_vars^2) in total.
        for col_idx, var in enumerate(tqdm(self.adata.var_names)):
            tmp_x0 = x0[:, [col_idx]]
            tmp_x0 = np.asarray(tmp_x0.todense()).flatten() if issparse(tmp_x0) else tmp_x0.flatten()
            tmp_x1 = x1[:, [col_idx]]
            tmp_x1 = np.asarray(tmp_x1.todense()).flatten() if issparse(tmp_x1) else tmp_x1.flatten()
            pval = self._test(tmp_x0, tmp_x1, paired, **kwargs)
            mean_x0 = np.mean(tmp_x0)
            mean_x1 = np.mean(tmp_x1)
            res.append({"variable": var, "p_value": pval, "log_fc": np.log2(mean_x1) - np.log2(mean_x0)})
        return pd.DataFrame(res).sort_values("p_value")

    @classmethod
    def compare_groups(
        cls,
        adata: AnnData,
        column: str,
        baseline: str,
        groups_to_compare: str | Sequence[str],
        *,
        paired_by: str | None = None,
        mask: str | None = None,
        layer: str | None = None,
        fit_kwargs: Mapping = MappingProxyType({}),
        test_kwargs: Mapping = MappingProxyType({}),
    ) -> DataFrame:
        """Compare one or more groups against a baseline group and FDR-correct the p-values.

        Args:
            adata: Passed to the constructor of the class.
            column: Column in `obs` that holds the group labels.
            baseline: Label of the baseline group.
            groups_to_compare: Label(s) of the groups to compare against the baseline. If `None`,
                every other label found in `column` is compared against the baseline.
            paired_by: Column in `obs` that identifies the pairs. A paired test is performed if it is set.
            mask: Passed to the constructor of the class.
            layer: Passed to the constructor of the class.
            fit_kwargs: Not used for simple tests; a warning is emitted if it is not empty.
            test_kwargs: kwargs passed to the test function.

        Returns:
            The concatenated results of all comparisons with an additional `comparison` column and
            adjusted p-values as added by `fdr_correction`.

        Raises:
            ValueError: If a paired test is requested and a pair does not consist of exactly two
                observations, one of them in the baseline and one in the compared group.
        """
        if len(fit_kwargs):
            warnings.warn("fit_kwargs not used for simple tests.", UserWarning, stacklevel=2)
        paired = paired_by is not None
        model = cls(adata, mask=mask, layer=layer)
        if groups_to_compare is None:
            # compare against all other
            groups_to_compare = sorted(set(model.adata.obs[column]) - {baseline})
        if isinstance(groups_to_compare, str):
            groups_to_compare = [groups_to_compare]

        # The observations x pairs indicator matrix does not depend on the group,
        # so it is built and validated once instead of once per group.
        dummies = None
        if paired:
            dummies = pd.get_dummies(model.adata.obs[paired_by], sparse=True).sparse.to_coo().tocsr()
            if not np.all(np.sum(dummies, axis=0) == 2):
                raise ValueError("Pairing is only possible with exactly two values per group")

        def _get_idx(column, value):
            in_group = model.adata.obs[column] == value
            if not paired:
                return np.where(in_group)[0]
            # Use matrix multiplication to only retrieve those dummy entries that are associated with the current `value`.
            # Convert to COO matrix to get rows/cols
            # row indices refers to the indices of rows that have `column == value` (equivalent to np.where(in_group)[0])
            # col indices refers to the numeric index of each "pair" in obs_names
            ind_mat = diags(in_group.values, dtype=bool) @ dummies
            if not np.all(np.sum(ind_mat, axis=0) == 1):
                raise ValueError("Pairing is only possible with exactly two values per group")
            ind_mat = ind_mat.tocoo()
            # Ordering by pair index puts paired observations at the same position in both groups.
            return ind_mat.row[np.argsort(ind_mat.col)]

        res_dfs = []
        baseline_idx = _get_idx(column, baseline)
        for group_to_compare in groups_to_compare:
            comparison_idx = _get_idx(column, group_to_compare)
            res_dfs.append(
                model._compare_single_group(baseline_idx, comparison_idx, paired=paired, **test_kwargs).assign(
                    comparison=f"{group_to_compare}_vs_{baseline if baseline is not None else 'rest'}"
                )
            )
        return fdr_correction(pd.concat(res_dfs))
138
+
139
+
140
class WilcoxonTest(SimpleComparisonBase):
    """Perform a unpaired or paired Wilcoxon test.

    (the former is also known as "Mann-Whitney U test", the latter as "wilcoxon signed rank test")
    """

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of `scipy.stats.wilcoxon` (paired) or `scipy.stats.mannwhitneyu` (unpaired)."""
        test_fn = scipy.stats.wilcoxon if paired else scipy.stats.mannwhitneyu
        return test_fn(x0, x1, **kwargs).pvalue
152
+
153
+
154
class TTest(SimpleComparisonBase):
    """Perform a unpaired or paired T-test"""

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of `scipy.stats.ttest_rel` (paired) or `scipy.stats.ttest_ind` (unpaired)."""
        test_fn = scipy.stats.ttest_rel if paired else scipy.stats.ttest_ind
        return test_fn(x0, x1, **kwargs).pvalue