DeConveil 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/PKG-INFO +4 -1
  2. {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/SOURCES.txt +2 -0
  3. {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/requires.txt +4 -0
  4. {deconveil-0.1.3 → deconveil-0.2.0}/PKG-INFO +4 -1
  5. {deconveil-0.1.3 → deconveil-0.2.0}/README.md +60 -7
  6. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/__init__.py +1 -0
  7. deconveil-0.2.0/deconveil/__version__.py +1 -0
  8. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/dds.py +170 -63
  9. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/default_inference.py +3 -3
  10. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/ds.py +82 -170
  11. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/grid_search.py +1 -0
  12. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/inference.py +4 -4
  13. deconveil-0.2.0/deconveil/nb_regression_fit.py +313 -0
  14. deconveil-0.2.0/deconveil/simulate_gene_dosage.py +589 -0
  15. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/utils_fit.py +173 -129
  16. {deconveil-0.1.3 → deconveil-0.2.0}/setup.py +4 -1
  17. deconveil-0.1.3/deconveil/__version__.py +0 -1
  18. {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/dependency_links.txt +0 -0
  19. {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/top_level.txt +0 -0
  20. {deconveil-0.1.3 → deconveil-0.2.0}/LICENSE +0 -0
  21. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/utils_clustering.py +0 -0
  22. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/utils_plot.py +0 -0
  23. {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/utils_processing.py +0 -0
  24. {deconveil-0.1.3 → deconveil-0.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: DeConveil
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
5
5
  Home-page: https://github.com/caravagnalab/DeConveil
6
6
  Author: Katsiaryna Davydzenka
@@ -18,6 +18,9 @@ Requires-Dist: formulaic-contrasts>=0.2.0
18
18
  Requires-Dist: matplotlib>=3.6.2
19
19
  Requires-Dist: seaborn>=0.12.2
20
20
  Requires-Dist: pydeseq2>=0.4.12
21
+ Requires-Dist: rpy2>=3.5.0
22
+ Provides-Extra: stan
23
+ Requires-Dist: cmdstanpy>=1.2.0; extra == "stan"
21
24
  Provides-Extra: dev
22
25
  Requires-Dist: pytest>=6.2.4; extra == "dev"
23
26
  Requires-Dist: pre-commit>=2.13.0; extra == "dev"
@@ -13,6 +13,8 @@ deconveil/default_inference.py
13
13
  deconveil/ds.py
14
14
  deconveil/grid_search.py
15
15
  deconveil/inference.py
16
+ deconveil/nb_regression_fit.py
17
+ deconveil/simulate_gene_dosage.py
16
18
  deconveil/utils_clustering.py
17
19
  deconveil/utils_fit.py
18
20
  deconveil/utils_plot.py
@@ -8,6 +8,7 @@ formulaic-contrasts>=0.2.0
8
8
  matplotlib>=3.6.2
9
9
  seaborn>=0.12.2
10
10
  pydeseq2>=0.4.12
11
+ rpy2>=3.5.0
11
12
 
12
13
  [dev]
13
14
  pytest>=6.2.4
@@ -16,3 +17,6 @@ numpydoc
16
17
  coverage
17
18
  mypy
18
19
  pandas-stubs
20
+
21
+ [stan]
22
+ cmdstanpy>=1.2.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: DeConveil
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
5
5
  Home-page: https://github.com/caravagnalab/DeConveil
6
6
  Author: Katsiaryna Davydzenka
@@ -18,6 +18,9 @@ Requires-Dist: formulaic-contrasts>=0.2.0
18
18
  Requires-Dist: matplotlib>=3.6.2
19
19
  Requires-Dist: seaborn>=0.12.2
20
20
  Requires-Dist: pydeseq2>=0.4.12
21
+ Requires-Dist: rpy2>=3.5.0
22
+ Provides-Extra: stan
23
+ Requires-Dist: cmdstanpy>=1.2.0; extra == "stan"
21
24
  Provides-Extra: dev
22
25
  Requires-Dist: pytest>=6.2.4; extra == "dev"
23
26
  Requires-Dist: pre-commit>=2.13.0; extra == "dev"
@@ -1,30 +1,83 @@
1
1
  # DeConveil
2
2
 
3
- <img src="docs/deconveil_logo.png" align="right" width="300">
3
+ <img src="docs/logo.png" align="right" width="300">
4
4
 
5
5
  #
6
6
  [![pypi version](https://img.shields.io/pypi/v/DeConveil)](https://pypi.org/project/DeConveil)
7
7
 
8
- The goal of *DeConveil* is the extension of Differential Gene Expression testing by accounting for genome aneuploidy.
8
+ ## Introduction
9
+
10
+ The goal of *DeConveil* is the extension of Differential Gene Expression (DGE) testing by accounting for genome aneuploidy.
9
11
  This computational framework extends traditional DGE analysis by integrating DNA Copy Number Variation (CNV) data.
10
12
  This approach adjusts for dosage effects and categorizes genes as *dosage-sensitive (DSG)*, *dosage-insensitive (DIG)*, and *dosage-compensated (DCG)*, separating the expression changes caused by CNVs from other alterations in transcriptional regulation.
11
13
  To perform this gene separation we need to carry out DGE testing using both *PyDESeq2 (CN-naive)* and *DeConveil (CN-aware)* methods.
12
14
 
15
+ In addition to the core *DeConveil* framework, the package also provides a complementary *Negative Binomial (NB) regression model*, which can be used independently as an alternative inference and analysis strategy.
16
+
13
17
  You can download the results of our analysis from [deconveilCaseStudies](https://github.com/kdavydzenka/deconveilCaseStudies)
14
18
 
19
+ ## Inference methods
20
+
21
+ *DeConveil* provides two complementary approaches for modeling gene expression in the presence of genome aneuploidy.
22
+
23
+ ### 1) Core DeConveil framework (default)
24
+
25
+ The main *DeConveil* framework extends *DESeq2/PyDESeq2* by incorporating copy-number information.
26
+ This approach is designed for standard DGE analysis while accounting for dosage-dependent effects and is the default and recommended workflow.
27
+
28
+ ### 2) Complementary Negative Binomial regression (Stan-based)
15
29
 
16
- ### Installation
30
+ *DeConveil* also implements a complementary *NB regression model*, implemented in Stan and accessed via `cmdstanpy`.
31
+ This model is applied only to tumor samples and is designed to test dosage sensitivity and dosage compensation by directly modeling the relationship between gene expression and CNV.
32
+
33
+ The Stan-based NB regression can be used independently of the core *DeConveil* pipeline and is intended for users who want:
34
+ - a focused analysis of dosage-dependent expression in tumor samples;
35
+ - Bayesian inference;
36
+ - explicit uncertainty quantification.
37
+
38
+ The Stan-based NB regression is optional and does not affect the core *DeConveil* workflow.
39
+
40
+ ## Installation
17
41
 
18
42
  **Pre-required installations before running DeConveil**
19
43
 
20
- Python libraries are required to be installed: *pydeseq2*
44
+ ### Python dependencies
45
+
46
+ Python libraries required for the core *DeConveil* framework include `pydeseq2`:
21
47
 
22
48
  `pip install pydeseq2`
23
49
 
50
+ `DeConveil` can be installed from PyPI using `pip`:
51
+
24
52
  `pip install DeConveil`
25
53
 
26
- or `git clone https://github.com/caravagnalab/DeConveil.git`
54
+ `DeConveil` can also be installed from Bioconda with `conda`:
55
+
56
+ `conda install -c bioconda deconveil`
57
+
58
+ ### R dependencies (required)
59
+
60
+ *DeConveil* relies on the R package `stageR` (via `rpy2`) for stage-wise multiple testing and FDR control.
61
+ A working `R` installation and the `stageR` package are required.
62
+ The package can be installed from Bioconductor:
63
+
64
+ `BiocManager::install("stageR")`
65
+
66
+ ### Optional Stan support
67
+
68
+ The complementary NB regression requires the Python package `cmdstanpy` and a working installation of `CmdStan`.
69
+ To enable Stan support, install DeConveil with the stan extra:
70
+
71
+ `pip install DeConveil[stan]`
72
+
73
+ Then install CmdStan:
74
+
75
+ `python -m cmdstanpy.install_cmdstan`
76
+
77
+ If Stan support is not installed, the core DeConveil framework remains fully functional.
78
+
27
79
 
80
+ ## Data
28
81
 
29
82
  **Input data**
30
83
 
@@ -49,7 +102,7 @@ These data frames are further processed to separate gene groups using `define_ge
49
102
  A tutorial of the analysis workflow is available in `test_deconveil.ipynb`
50
103
 
51
104
 
52
- #### Citation
105
+ ### Citation
53
106
 
54
107
  [![](http://img.shields.io/badge/doi-10.1101/2025.03.29.646108-red.svg)](https://doi.org/10.1101/2025.03.29.646108)
55
108
 
@@ -58,7 +111,7 @@ If you use `DeConveil`, cite:
58
111
  K. Davydzenka, G. Caravagna, G. Sanguinetti. Extending differential gene expression testing to handle genome aneuploidy in cancer. [bioRxiv preprint](https://doi.org/10.1101/2025.03.29.646108), 2025.
59
112
 
60
113
 
61
- #### Copyright and contacts
114
+ ### Copyright and contacts
62
115
 
63
116
  Katsiaryna Davydzenka, Cancer Data Science (CDS) Laboratory.
64
117
 
@@ -3,5 +3,6 @@ from .inference import Inference
3
3
  from .default_inference import DefInference
4
4
  from .ds import deconveil_stats
5
5
  from .grid_search import grid_fit_shrink_beta
6
+ from .nb_regression_fit import *
6
7
 
7
8
 
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"
@@ -5,6 +5,7 @@ from typing import List, Literal, Optional, Union, cast
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
+ from formulaic_contrasts import FormulaicContrasts # type: ignore[import-untyped]
8
9
  from scipy.optimize import minimize
9
10
  from scipy.special import polygamma # type: ignore
10
11
  from scipy.stats import f # type: ignore
@@ -17,7 +18,7 @@ from deconveil.utils_fit import fit_rough_dispersions
17
18
  from deconveil.utils_fit import fit_moments_dispersions2
18
19
  from deconveil.utils_fit import grid_fit_beta
19
20
  from deconveil.utils_fit import irls_glm
20
- from deconveil.utils_fit import build_design_matrix
21
+ from deconveil.utils_processing import replace_underscores
21
22
 
22
23
  from pydeseq2.preprocessing import deseq2_norm_fit
23
24
  from pydeseq2.preprocessing import deseq2_norm_transform
@@ -25,7 +26,6 @@ from pydeseq2.utils import dispersion_trend
25
26
  from pydeseq2.utils import mean_absolute_deviation
26
27
  from pydeseq2.utils import n_or_more_replicates
27
28
  from pydeseq2.utils import nb_nll
28
- from pydeseq2.utils import replace_underscores
29
29
  from pydeseq2.utils import robust_method_of_moments_disp
30
30
  from pydeseq2.utils import test_valid_counts
31
31
  from pydeseq2.utils import trimmed_mean
@@ -43,23 +43,20 @@ class deconveil_fit:
43
43
  cnv : pandas.DataFrame
44
44
  Discrete numbers. One column per gene, rows are indexed by sample barcodes.
45
45
 
46
-
47
46
  metadata : pandas.DataFrame
48
47
  DataFrame containing sample metadata.
49
48
  Must be indexed by sample barcodes.
50
49
 
51
- design_factors : str or list
52
- Name of the columns of metadata to be used as design variables.
53
- (default: ``'condition'``).
54
-
55
- continuous_factors : list or None
56
- An optional list of continuous (as opposed to categorical) factors. Any factor
57
- not in ``continuous_factors`` will be considered categorical (default: ``None``).
50
+ design : str or pandas.DataFrame
51
+ Model design. Can be either a pandas DataFrame representing a design matrix, or
52
+ a formulaic formula in the format ``'x + z'`` or ``'~x+z'``.
53
+ If a design matrix is provided, deconveil_stats built from this deconveil_fit will
54
+ only support contrasts in the form of numeric vectors.
55
+ (default: ``'~condition'``).
58
56
 
59
- ref_level : list or None
60
- An optional list of two strings of the form ``["factor", "test_level"]``
61
- specifying the factor of interest and the reference (control) level against which
62
- we're testing, e.g. ``["condition", "A"]``. (default: ``None``).
57
+ design_factors : str or list, optional
58
+ Deprecated. An optional list of factors to include in the design matrix.
59
+ (default: ``None``)
63
60
 
64
61
  fit_type: str
65
62
  Either ``"parametric"`` or ``"mean"`` for the type of fitting of dispersions to
@@ -67,6 +64,20 @@ class deconveil_fit:
67
64
  robust gamma-family GLM. ``"mean"``: use the mean of gene-wise dispersion
68
65
  estimates. Will set the fit type for the DEA and the vst transformation. If
69
66
  needed, it can be set separately for each method.(default: ``"parametric"``).
67
+
68
+ size_factors_fit_type : str
69
+ The normalization method to use: ``"ratio"``, ``"poscounts"`` or ``"iterative"``.
70
+ ``"ratio"``: fit size factors using the median-of-ratios method. ``"poscounts"``:
71
+ fit size factors using the method implemented in DESeq2 for the case where there
72
+ may be few or no genes which have no zero values.
73
+ ``"iterative"``: fit size factors iteratively. (default: ``"ratio"``).
74
+
75
+ control_genes : ndarray, list, or pandas.Index, optional
76
+ Genes to use as control genes for size factor fitting. If provided, size factors
77
+ will be fit using only these genes. This is useful when certain genes are known
78
+ to be invariant across conditions (e.g., housekeeping genes). Any valid AnnData
79
+ indexer (bool array, integer positions, or gene name strings) can be used.
80
+ (default: ``None``).
70
81
 
71
82
  min_mu : float
72
83
  Threshold for mean estimates. (default: ``0.5``).
@@ -119,27 +130,36 @@ class deconveil_fit:
119
130
  filtered_genes: numpy.ndarray
120
131
  Genes whose log means are different from -∞, computed in
121
132
  preprocessing.deseq2_norm_fit().
133
+
134
+ factor_storage : dict
135
+ A dictionary storing metadata for each factor processed by the custom
136
+ materializer (only if ``design`` is input as a formula).
137
+
138
+ variable_to_factors : dict
139
+ A dictionary mapping variable names to factor names (only if ``design`` is input
140
+ as a formula).
122
141
 
123
142
  """
124
143
 
125
144
  def __init__(
126
145
  self,
127
146
  *,
128
- counts: Optional[pd.DataFrame] = None,
129
- cnv: Optional[pd.DataFrame] = None,
130
- metadata: Optional[pd.DataFrame] = None,
131
- design_factors: Union[str, List[str]] = "condition",
132
- continuous_factors: Optional[List[str]] = None,
133
- ref_level: Optional[List[str]] = None,
147
+ counts: pd.DataFrame | None = None,
148
+ cnv: pd.DataFrame | None = None,
149
+ metadata: pd.DataFrame | None = None,
150
+ design: str | pd.DataFrame = "~condition",
151
+ design_factors: str | list[str] | None = None,
134
152
  fit_type: Literal["parametric", "mean"] = "parametric",
153
+ size_factors_fit_type: Literal["ratio", "poscounts", "iterative"] = "ratio",
154
+ control_genes: np.ndarray | list[str] | list[int] | pd.Index | None = None,
135
155
  min_mu: float = 0.5,
136
156
  min_disp: float = 1e-8,
137
157
  max_disp: float = 10.0,
138
158
  refit_cooks: bool = True,
139
159
  min_replicates: int = 7,
140
160
  beta_tol: float = 1e-8,
141
- n_cpus: Optional[int] = None,
142
- inference: Optional[Inference] = None,
161
+ n_cpus: int | None = None,
162
+ inference: Inference | None = None,
143
163
  quiet: bool = False,
144
164
  ) -> None:
145
165
 
@@ -159,27 +179,43 @@ class deconveil_fit:
159
179
 
160
180
  self.metadata = metadata
161
181
  self.fit_type = fit_type
182
+ self.design = design
183
+ self.obsm={}
184
+
162
185
 
163
- # Convert design_factors to list if a single string was provided.
164
- self.design_factors = (
165
- [design_factors] if isinstance(design_factors, str) else design_factors
166
- )
186
+ if design_factors is not None:
187
+ warnings.warn(
188
+ "design_factors are deprecated and will soon be removed"
189
+ "Please consider providing a formulaic formula using the design argument instead",
190
+ DeprecationWarning,
191
+ stacklevel=2,
192
+ )
193
+ design_factors = (
194
+ design_factors if isinstance(design_factors, list) else [design_factors]
195
+ )
196
+ self.design = "~" + " + ".join(design_factors)
197
+
198
+ if not (
199
+ isinstance(self.design, (str | pd.DataFrame)) or isinstance(self.design, str)
200
+ ):
201
+ raise ValueError(
202
+ "design must be a string representing a formulaic formula, or a pandas DataFrame."
203
+ )
167
204
 
168
- self.continuous_factors = continuous_factors
205
+ if isinstance(self.design, str):
206
+ # Keep track of the categorical factors used in the model specification,
207
+ # including variable and factor names, by generating a custom materializer.
208
+ self.formulaic_contrasts = FormulaicContrasts(self.metadata, self.design)
209
+ self.obsm["design_matrix"] = self.formulaic_contrasts.design_matrix
210
+ else:
211
+ self.obsm["design_matrix"] = self.design
169
212
 
213
+ if self.obsm["design_matrix"].isna().any().any():
214
+ raise ValueError("NaNs are not allowed in the design.")
215
+
216
+ # Check that the design matrix has full rank
217
+ self._check_full_rank_design()
170
218
 
171
- # Build the design matrix
172
- self.design_matrix = build_design_matrix(
173
- metadata=self.metadata,
174
- design_factors=self.design_factors,
175
- continuous_factors=self.continuous_factors,
176
- ref_level=ref_level,
177
- expanded=False,
178
- intercept=True,
179
- )
180
-
181
- self.obsm={}
182
- self.obsm["design_matrix"] = self.design_matrix
183
219
  self.min_mu = min_mu
184
220
  self.min_disp = min_disp
185
221
  self.n_obs=self.data["counts"].shape[0]
@@ -187,15 +223,17 @@ class deconveil_fit:
187
223
  self.var_names=self.data["counts"].columns
188
224
  self.max_disp = np.maximum(max_disp, self.n_obs)
189
225
  self.refit_cooks = refit_cooks
190
- self.ref_level = ref_level
191
226
  self.min_replicates = min_replicates
192
227
  self.beta_tol = beta_tol
193
228
  self.quiet = quiet
194
- self.logmeans = None
229
+ self.size_factors_fit_type = size_factors_fit_type
230
+ self.control_genes = control_genes
231
+ self.logmeans: np.ndarray | None = None
195
232
  self.filtered_genes = None
196
233
  self.uns={}
197
234
  self.varm={}
198
235
  self.layers={}
236
+ self.filtered_genes: np.ndarray | None = None
199
237
 
200
238
 
201
239
  if inference:
@@ -211,16 +249,42 @@ class deconveil_fit:
211
249
  )
212
250
  # Initialize the inference object.
213
251
  self.inference = inference or DefInference(n_cpus=n_cpus)
252
+
253
+ @property
254
+ def variables(self):
255
+ """Get the names of the variables used in the model definition."""
256
+ try:
257
+ return self.formulaic_contrasts.variables
258
+ except AttributeError:
259
+ raise ValueError(
260
+ """Retrieving variables is only possible if the model was initialized
261
+ using a formula."""
262
+ ) from None
214
263
 
215
264
 
216
265
  def vst(
217
266
  self,
218
267
  use_design: bool = False,
219
- fit_type: Optional[Literal["parametric", "mean"]] = None,
268
+ fit_type: Literal["parametric", "mean"] | None = None,
220
269
  ) -> None:
221
270
 
222
271
  """Fit a variance stabilizing transformation, and apply it to normalized counts.
223
272
  Results are stored in ``vst_counts``.
273
+
274
+ Parameters
275
+ ----------
276
+ use_design : bool
277
+ Whether to use the full design matrix to fit dispersions and the trend curve.
278
+ If False, only an intercept is used. (default: ``False``).
279
+
280
+ fit_type: str
281
+ * ``None``: use the fit_type provided at initialization to fit
282
+ the dispersion trend curve.
283
+ * ``"parametric"``: fit a dispersion-mean relation via a robust
284
+ gamma-family GLM.
285
+ * ``"mean"``: use the mean of gene-wise dispersion estimates.
286
+
287
+ (default: ``None``).
224
288
  """
225
289
 
226
290
  if fit_type is not None:
@@ -254,7 +318,9 @@ class deconveil_fit:
254
318
  # Start by fitting median-of-ratio size factors if not already present,
255
319
  # or if they were computed iteratively
256
320
  if "size_factors" not in self.obsm or self.logmeans is None:
257
- self.fit_size_factors() # by default, fit_type != "iterative"
321
+ self.fit_size_factors(
322
+ fit_type=self.size_factors_fit_type
323
+ )
258
324
 
259
325
  if not hasattr(self, "vst_fit_type"):
260
326
  self.vst_fit_type = self.fit_type
@@ -286,7 +352,7 @@ class deconveil_fit:
286
352
  del self.obsm["design_matrix_buffer"]
287
353
 
288
354
 
289
- def vst_transform(self, counts: Optional[np.ndarray] = None) -> np.ndarray:
355
+ def vst_transform(self, counts: np.ndarray | None = None) -> np.ndarray:
290
356
 
291
357
  """Apply the variance stabilizing transformation.
292
358
  Uses the results from the ``vst_fit`` method.
@@ -351,7 +417,7 @@ class deconveil_fit:
351
417
  )
352
418
 
353
419
 
354
- def deseq2(self, fit_type: Optional[Literal["parametric", "mean"]] = None) -> None:
420
+ def deseq2(self, fit_type: Literal["parametric", "mean"] | None = None) -> None:
355
421
 
356
422
  """Perform dispersion and log fold-change (LFC) estimation.
357
423
 
@@ -372,8 +438,11 @@ class deconveil_fit:
372
438
  if fit_type is not None:
373
439
  self.fit_type = fit_type
374
440
  print(f"Using {self.fit_type} fit type.")
441
+
375
442
  # Compute DESeq2 normalization factors using the Median-of-ratios method
376
- self.fit_size_factors()
443
+ self.fit_size_factors(
444
+ fit_type=self.size_factors_fit_type, control_genes=self.control_genes
445
+ )
377
446
  # Fit an independent negative binomial model per gene
378
447
  self.fit_genewise_dispersions()
379
448
  # Fit a parameterized trend curve for dispersions, of the form
@@ -393,12 +462,30 @@ class deconveil_fit:
393
462
  # for genes that had outliers replaced
394
463
  self.refit()
395
464
 
465
+ def cond(self, **kwargs):
466
+ """
467
+ Get a contrast vector representing a specific condition.
468
+
469
+ Parameters
470
+ ----------
471
+ **kwargs
472
+ Column/value pairs.
473
+
474
+ Returns
475
+ -------
476
+ ndarray
477
+ A contrast vector that aligns to the columns of the design matrix.
478
+ """
479
+ return self.formulaic_contrasts.cond(**kwargs)
480
+
481
+ def contrast(self, *args, **kwargs):
482
+ """Get a contrast for a simple pairwise comparison."""
483
+ return self.formulaic_contrasts.contrast(*args, **kwargs)
484
+
396
485
  def fit_size_factors(
397
486
  self,
398
- fit_type: Literal["ratio", "poscounts", "iterative"] = "ratio",
399
- control_genes: Optional[
400
- Union[np.ndarray, List[str], List[int], pd.Index]
401
- ] = None,
487
+ fit_type: Literal["ratio", "poscounts", "iterative"] | None = None,
488
+ control_genes: np.ndarray | list[str] | list[int] | pd.Index | None = None,
402
489
  ) -> None:
403
490
  """Fit sample-wise deseq2 normalization (size) factors.
404
491
  Parameters
@@ -411,16 +498,27 @@ class deconveil_fit:
411
498
  are used. (default: ``None``).
412
499
  """
413
500
 
501
+ if fit_type is None:
502
+ fit_type = self.size_factors_fit_type
414
503
  if not self.quiet:
415
504
  print("Fitting size factors...", file=sys.stderr)
416
505
 
417
506
  start = time.time()
418
507
 
508
+ if control_genes is None:
509
+ # Check whether control genes were specified at initialization
510
+ if hasattr(self, "control_genes"):
511
+ control_genes = self.control_genes
512
+ if not self.quiet:
513
+ print(
514
+ f"Using {control_genes} as control genes, passed at"
515
+ " deconveil_fit initialization"
516
+ )
517
+
419
518
  # If control genes are provided, set a mask where those genes are True
420
519
  if control_genes is not None:
421
520
  _control_mask = np.zeros(self.data["counts"].shape[1], dtype=bool)
422
521
 
423
- # Use AnnData internal indexing to get gene index array
424
522
  # Allows bool/int/var_name to be provided
425
523
  _control_mask[self._normalize_indices((slice(None), control_genes))[1]] = (
426
524
  True
@@ -500,6 +598,9 @@ class deconveil_fit:
500
598
  # Check that size factors are available. If not, compute them.
501
599
  if "size_factors" not in self.obsm:
502
600
  self.fit_size_factors()
601
+
602
+ counts = self.data["counts"]
603
+
503
604
 
504
605
  # Exclude genes with all zeroes
505
606
  self.varm["non_zero"] = ~(self.data["counts"] == 0).all(axis=0)
@@ -514,17 +615,22 @@ class deconveil_fit:
514
615
 
515
616
  # Convert to numpy for speed
516
617
  design_matrix = self.obsm["design_matrix"].values
618
+ size_factors = np.asarray(self.obsm["size_factors"]).reshape(-1)
517
619
  counts=self.data["counts"].to_numpy()
518
620
  cnv=self.data["cnv"].to_numpy()
519
-
520
- # with a GLM (using rough dispersion estimates).
621
+
622
+ # mu_hat is initialized differently depending on the number of different factor
623
+ # groups. If there are as many different factor combinations as design factors
624
+ # (intercept included), it is fitted with a linear model, otherwise it is fitted
625
+ # with a GLM (using rough dispersion estimates).
626
+
521
627
  if (
522
628
  len(self.obsm["design_matrix"].value_counts())
523
629
  == self.obsm["design_matrix"].shape[-1]
524
630
  ):
525
631
  mu_hat_ = self.inference.lin_reg_mu(
526
632
  counts=counts[:, self.non_zero_idx],
527
- size_factors=self.obsm["size_factors"],
633
+ size_factors=size_factors,
528
634
  design_matrix=design_matrix,
529
635
  min_mu=self.min_mu,
530
636
  )
@@ -532,18 +638,20 @@ class deconveil_fit:
532
638
  _, mu_hat_, _, _ = self.inference.irls_glm(
533
639
  counts=counts[:, self.non_zero_idx],
534
640
  cnv=cnv[:, self.non_zero_idx],
535
- size_factors=self.obsm["size_factors"],
641
+ size_factors=size_factors,
536
642
  design_matrix=design_matrix,
537
643
  disp=self.varm["_MoM_dispersions"][self.non_zero_idx],
538
644
  min_mu=self.min_mu,
539
645
  beta_tol=self.beta_tol,
540
646
  )
647
+
541
648
  mu_param_name = "_vst_mu_hat" if vst else "_mu_hat"
542
649
  disp_param_name = "genewise_dispersions"
543
650
 
544
651
  self.layers[mu_param_name] = np.full((self.n_obs, self.n_vars), np.nan)
545
652
  self.layers[mu_param_name][:, self.varm["non_zero"]] = mu_hat_
546
653
 
654
+ # Estimate per-gene dispersion via MLE (α_g)
547
655
  if not self.quiet:
548
656
  print("Fitting dispersions...", file=sys.stderr)
549
657
  start = time.time()
@@ -560,6 +668,7 @@ class deconveil_fit:
560
668
  if not self.quiet:
561
669
  print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr)
562
670
 
671
+ # Store results
563
672
  self.varm[disp_param_name] = np.full(self.n_vars, np.nan)
564
673
  self.varm[disp_param_name][self.varm["non_zero"]] = np.clip(
565
674
  dispersions_, self.min_disp, self.max_disp
@@ -609,7 +718,7 @@ class deconveil_fit:
609
718
  """Return the dispersion trend function at x."""
610
719
  if self.uns["disp_function_type"] == "parametric":
611
720
  return dispersion_trend(x, self.uns["trend_coeffs"])
612
- elif self.disp_function_type == "mean":
721
+ elif self.uns["disp_function_type"] == "mean":
613
722
  return np.full_like(x, self.uns["mean_disp"])
614
723
 
615
724
 
@@ -731,8 +840,7 @@ class deconveil_fit:
731
840
  design_matrix = self.obsm["design_matrix"].values
732
841
  counts=self.data["counts"].to_numpy()
733
842
  cnv=self.data["cnv"].to_numpy()
734
- cnv = cnv / 2
735
- cnv = cnv + 0.1
843
+ cnv = (cnv / 2) + 0.1
736
844
 
737
845
  if not self.quiet:
738
846
  print("Fitting LFCs...", file=sys.stderr)
@@ -740,7 +848,7 @@ class deconveil_fit:
740
848
  mle_lfcs_, mu_, hat_diagonals_, converged_ = self.inference.irls_glm(
741
849
  counts=counts[:, self.non_zero_idx],
742
850
  cnv=cnv[:, self.non_zero_idx],
743
- size_factors=self.obsm["size_factors"],
851
+ size_factors=np.asarray(self.obsm["size_factors"]).reshape(-1),
744
852
  design_matrix=design_matrix,
745
853
  disp=self.varm["dispersions"][self.non_zero_idx],
746
854
  min_mu=self.min_mu,
@@ -863,9 +971,10 @@ class deconveil_fit:
863
971
  """
864
972
  # Check that size_factors are available. If not, compute them.
865
973
  if "normed_counts" not in self.layers:
866
- self.fit_size_factors()
974
+ self.fit_size_factors(fit_type=self.size_factors_fit_type)
867
975
 
868
976
  normed_counts = self.layers["normed_counts"]
977
+
869
978
  rde = self.inference.fit_rough_dispersions(
870
979
  normed_counts,
871
980
  self.obsm["design_matrix"].values,
@@ -1106,9 +1215,7 @@ class deconveil_fit:
1106
1215
  ),
1107
1216
  cnv=self.data["cnv"],
1108
1217
  metadata=self.metadata,
1109
- design_factors=self.design_factors,
1110
- continuous_factors=self.continuous_factors,
1111
- ref_level=self.ref_level,
1218
+ design=self.design,
1112
1219
  min_mu=self.min_mu,
1113
1220
  min_disp=self.min_disp,
1114
1221
  max_disp=self.max_disp,
@@ -210,9 +210,9 @@ class DefInference(inference.Inference):
210
210
  ridge_factor: np.ndarray,
211
211
  contrast: np.ndarray,
212
212
  lfc_null: np.ndarray,
213
- alt_hypothesis: Optional[
214
- Literal["greaterAbs", "lessAbs", "greater", "less"]
215
- ] = None,
213
+ alt_hypothesis: (
214
+ Literal["greaterAbs", "lessAbs", "greater", "less"] | None
215
+ ) = None,
216
216
  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
217
217
  num_genes = mu.shape[1]
218
218
  with parallel_backend(self._backend, inner_max_num_threads=1):