DeConveil 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/PKG-INFO +4 -1
- {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/SOURCES.txt +2 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/requires.txt +4 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/PKG-INFO +4 -1
- {deconveil-0.1.3 → deconveil-0.2.0}/README.md +60 -7
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/__init__.py +1 -0
- deconveil-0.2.0/deconveil/__version__.py +1 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/dds.py +170 -63
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/default_inference.py +3 -3
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/ds.py +82 -170
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/grid_search.py +1 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/inference.py +4 -4
- deconveil-0.2.0/deconveil/nb_regression_fit.py +313 -0
- deconveil-0.2.0/deconveil/simulate_gene_dosage.py +589 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/utils_fit.py +173 -129
- {deconveil-0.1.3 → deconveil-0.2.0}/setup.py +4 -1
- deconveil-0.1.3/deconveil/__version__.py +0 -1
- {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/dependency_links.txt +0 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/DeConveil.egg-info/top_level.txt +0 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/LICENSE +0 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/utils_clustering.py +0 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/utils_plot.py +0 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/deconveil/utils_processing.py +0 -0
- {deconveil-0.1.3 → deconveil-0.2.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: DeConveil
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
|
|
5
5
|
Home-page: https://github.com/caravagnalab/DeConveil
|
|
6
6
|
Author: Katsiaryna Davydzenka
|
|
@@ -18,6 +18,9 @@ Requires-Dist: formulaic-contrasts>=0.2.0
|
|
|
18
18
|
Requires-Dist: matplotlib>=3.6.2
|
|
19
19
|
Requires-Dist: seaborn>=0.12.2
|
|
20
20
|
Requires-Dist: pydeseq2>=0.4.12
|
|
21
|
+
Requires-Dist: rpy2>=3.5.0
|
|
22
|
+
Provides-Extra: stan
|
|
23
|
+
Requires-Dist: cmdstanpy>=1.2.0; extra == "stan"
|
|
21
24
|
Provides-Extra: dev
|
|
22
25
|
Requires-Dist: pytest>=6.2.4; extra == "dev"
|
|
23
26
|
Requires-Dist: pre-commit>=2.13.0; extra == "dev"
|
|
@@ -13,6 +13,8 @@ deconveil/default_inference.py
|
|
|
13
13
|
deconveil/ds.py
|
|
14
14
|
deconveil/grid_search.py
|
|
15
15
|
deconveil/inference.py
|
|
16
|
+
deconveil/nb_regression_fit.py
|
|
17
|
+
deconveil/simulate_gene_dosage.py
|
|
16
18
|
deconveil/utils_clustering.py
|
|
17
19
|
deconveil/utils_fit.py
|
|
18
20
|
deconveil/utils_plot.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: DeConveil
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: An extension of PyDESeq2/DESeq2 designed to account for genome aneuploidy
|
|
5
5
|
Home-page: https://github.com/caravagnalab/DeConveil
|
|
6
6
|
Author: Katsiaryna Davydzenka
|
|
@@ -18,6 +18,9 @@ Requires-Dist: formulaic-contrasts>=0.2.0
|
|
|
18
18
|
Requires-Dist: matplotlib>=3.6.2
|
|
19
19
|
Requires-Dist: seaborn>=0.12.2
|
|
20
20
|
Requires-Dist: pydeseq2>=0.4.12
|
|
21
|
+
Requires-Dist: rpy2>=3.5.0
|
|
22
|
+
Provides-Extra: stan
|
|
23
|
+
Requires-Dist: cmdstanpy>=1.2.0; extra == "stan"
|
|
21
24
|
Provides-Extra: dev
|
|
22
25
|
Requires-Dist: pytest>=6.2.4; extra == "dev"
|
|
23
26
|
Requires-Dist: pre-commit>=2.13.0; extra == "dev"
|
|
@@ -1,30 +1,83 @@
|
|
|
1
1
|
# DeConveil
|
|
2
2
|
|
|
3
|
-
<img src="docs/
|
|
3
|
+
<img src="docs/logo.png" align="right" width="300">
|
|
4
4
|
|
|
5
5
|
#
|
|
6
6
|
[](https://pypi.org/project/DeConveil)
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
## Introduction
|
|
9
|
+
|
|
10
|
+
The goal of *DeConveil* is to extend Differential Gene Expression (DGE) testing by accounting for genome aneuploidy.
|
|
9
11
|
This computational framework extends traditional DGE analysis by integrating DNA Copy Number Variation (CNV) data.
|
|
10
12
|
This approach adjusts for dosage effects and categorizes genes as *dosage-sensitive (DSG)*, *dosage-insensitive (DIG)*, and *dosage-compensated (DCG)*, separating the expression changes caused by CNVs from other alterations in transcriptional regulation.
|
|
11
13
|
To perform this gene separation we need to carry out DGE testing using both *PyDESeq2 (CN-naive)* and *DeConveil (CN-aware)* methods.
|
|
12
14
|
|
|
15
|
+
In addition to the core *DeConveil* framework, the package also provides a complementary *Negative Binomial (NB) regression model*, which can be used independently as an alternative inference and analysis strategy.
|
|
16
|
+
|
|
13
17
|
You can download the results of our analysis from [deconveilCaseStudies](https://github.com/kdavydzenka/deconveilCaseStudies)
|
|
14
18
|
|
|
19
|
+
## Inference methods
|
|
20
|
+
|
|
21
|
+
*DeConveil* provides two complementary approaches for modeling gene expression in the presence of genome aneuploidy.
|
|
22
|
+
|
|
23
|
+
### 1) Core DeConveil framework (default)
|
|
24
|
+
|
|
25
|
+
The main *DeConveil* framework extends *DESeq2/PyDESeq2* by incorporating copy-number information.
|
|
26
|
+
This approach is designed for standard DGE analysis while accounting for dosage-dependent effects and is the default and recommended workflow.
|
|
27
|
+
|
|
28
|
+
### 2) Complementary Negative Binomial regression (Stan-based)
|
|
15
29
|
|
|
16
|
-
|
|
30
|
+
*DeConveil* also provides a complementary *NB regression model*, implemented in Stan and accessed via `cmdstanpy`.
|
|
31
|
+
This model is applied only to tumor samples and is designed to test dosage sensitivity and dosage compensation by directly modeling the relationship between gene expression and CNV.
|
|
32
|
+
|
|
33
|
+
The Stan-based NB regression can be used independently of the core *DeConveil* pipeline and is intended for users who want:
|
|
34
|
+
- a focused analysis of dosage-dependent expression in tumor samples;
|
|
35
|
+
- Bayesian inference;
|
|
36
|
+
- explicit uncertainty quantification.
|
|
37
|
+
|
|
38
|
+
The Stan-based NB regression is optional and does not affect the core *DeConveil* workflow.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
17
41
|
|
|
18
42
|
**Pre-required installations before running DeConveil**
|
|
19
43
|
|
|
20
|
-
Python
|
|
44
|
+
### Python dependencies
|
|
45
|
+
|
|
46
|
+
Python libraries required for the core *DeConveil* framework include `pydeseq2`:
|
|
21
47
|
|
|
22
48
|
`pip install pydeseq2`
|
|
23
49
|
|
|
50
|
+
`DeConveil` can be installed from PyPI using `pip`:
|
|
51
|
+
|
|
24
52
|
`pip install DeConveil`
|
|
25
53
|
|
|
26
|
-
|
|
54
|
+
`DeConveil` can also be installed from Bioconda with `conda`:
|
|
55
|
+
|
|
56
|
+
`conda install -c bioconda deconveil`
|
|
57
|
+
|
|
58
|
+
### R dependencies (required)
|
|
59
|
+
|
|
60
|
+
*DeConveil* relies on the R package `stageR` (via `rpy2`) for stage-wise multiple testing and FDR control.
|
|
61
|
+
A working `R` installation and the `stageR` package are required.
|
|
62
|
+
The package can be installed from Bioconductor:
|
|
63
|
+
|
|
64
|
+
`BiocManager::install("stageR")`
|
|
65
|
+
|
|
66
|
+
### Optional Stan support
|
|
67
|
+
|
|
68
|
+
The complementary NB regression requires the Python package `cmdstanpy` and a working installation of `CmdStan`.
|
|
69
|
+
To enable Stan support, install DeConveil with the stan extra:
|
|
70
|
+
|
|
71
|
+
`pip install DeConveil[stan]`
|
|
72
|
+
|
|
73
|
+
Then install CmdStan:
|
|
74
|
+
|
|
75
|
+
`python -m cmdstanpy.install_cmdstan`
|
|
76
|
+
|
|
77
|
+
If Stan support is not installed, the core DeConveil framework remains fully functional.
|
|
78
|
+
|
|
27
79
|
|
|
80
|
+
## Data
|
|
28
81
|
|
|
29
82
|
**Input data**
|
|
30
83
|
|
|
@@ -49,7 +102,7 @@ These data frames are further processed to separate gene groups using `define_ge
|
|
|
49
102
|
A tutorial of the analysis workflow is available in `test_deconveil.ipynb`
|
|
50
103
|
|
|
51
104
|
|
|
52
|
-
|
|
105
|
+
### Citation
|
|
53
106
|
|
|
54
107
|
[](https://doi.org/10.1101/2025.03.29.646108)
|
|
55
108
|
|
|
@@ -58,7 +111,7 @@ If you use `DeConveil`, cite:
|
|
|
58
111
|
K. Davydzenka, G. Caravagna, G. Sanguinetti. Extending differential gene expression testing to handle genome aneuploidy in cancer. [bioRxiv preprint](https://doi.org/10.1101/2025.03.29.646108), 2025.
|
|
59
112
|
|
|
60
113
|
|
|
61
|
-
|
|
114
|
+
### Copyright and contacts
|
|
62
115
|
|
|
63
116
|
Katsiaryna Davydzenka, Cancer Data Science (CDS) Laboratory.
|
|
64
117
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0"
|
|
@@ -5,6 +5,7 @@ from typing import List, Literal, Optional, Union, cast
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
+
from formulaic_contrasts import FormulaicContrasts # type: ignore[import-untyped]
|
|
8
9
|
from scipy.optimize import minimize
|
|
9
10
|
from scipy.special import polygamma # type: ignore
|
|
10
11
|
from scipy.stats import f # type: ignore
|
|
@@ -17,7 +18,7 @@ from deconveil.utils_fit import fit_rough_dispersions
|
|
|
17
18
|
from deconveil.utils_fit import fit_moments_dispersions2
|
|
18
19
|
from deconveil.utils_fit import grid_fit_beta
|
|
19
20
|
from deconveil.utils_fit import irls_glm
|
|
20
|
-
from deconveil.
|
|
21
|
+
from deconveil.utils_processing import replace_underscores
|
|
21
22
|
|
|
22
23
|
from pydeseq2.preprocessing import deseq2_norm_fit
|
|
23
24
|
from pydeseq2.preprocessing import deseq2_norm_transform
|
|
@@ -25,7 +26,6 @@ from pydeseq2.utils import dispersion_trend
|
|
|
25
26
|
from pydeseq2.utils import mean_absolute_deviation
|
|
26
27
|
from pydeseq2.utils import n_or_more_replicates
|
|
27
28
|
from pydeseq2.utils import nb_nll
|
|
28
|
-
from pydeseq2.utils import replace_underscores
|
|
29
29
|
from pydeseq2.utils import robust_method_of_moments_disp
|
|
30
30
|
from pydeseq2.utils import test_valid_counts
|
|
31
31
|
from pydeseq2.utils import trimmed_mean
|
|
@@ -43,23 +43,20 @@ class deconveil_fit:
|
|
|
43
43
|
cnv : pandas.DataFrame
|
|
44
44
|
Discrete numbers. One column per gene, rows are indexed by sample barcodes.
|
|
45
45
|
|
|
46
|
-
|
|
47
46
|
metadata : pandas.DataFrame
|
|
48
47
|
DataFrame containing sample metadata.
|
|
49
48
|
Must be indexed by sample barcodes.
|
|
50
49
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
not in ``continuous_factors`` will be considered categorical (default: ``None``).
|
|
50
|
+
design : str or pandas.DataFrame
|
|
51
|
+
Model design. Can be either a pandas DataFrame representing a design matrix, or
|
|
52
|
+
a formulaic formula in the format ``'x + z'`` or ``'~x+z'``.
|
|
53
|
+
If a design matrix is provided, deconveil_stats built from this deconveil_fit will
|
|
54
|
+
only support contrasts in the form of numeric vectors.
|
|
55
|
+
(Default: ``'~condition'``).
|
|
58
56
|
|
|
59
|
-
|
|
60
|
-
An optional list of
|
|
61
|
-
|
|
62
|
-
we're testing, e.g. ``["condition", "A"]``. (default: ``None``).
|
|
57
|
+
design_factors : str or list, optional
|
|
58
|
+
Deprecated. An optional list of factors to include in the design matrix.
|
|
59
|
+
(default: ``None``)
|
|
63
60
|
|
|
64
61
|
fit_type: str
|
|
65
62
|
Either ``"parametric"`` or ``"mean"`` for the type of fitting of dispersions to
|
|
@@ -67,6 +64,20 @@ class deconveil_fit:
|
|
|
67
64
|
robust gamma-family GLM. ``"mean"``: use the mean of gene-wise dispersion
|
|
68
65
|
estimates. Will set the fit type for the DEA and the vst transformation. If
|
|
69
66
|
needed, it can be set separately for each method. (default: ``"parametric"``).
|
|
67
|
+
|
|
68
|
+
size_factors_fit_type : str
|
|
69
|
+
The normalization method to use: ``"ratio"``, ``"poscounts"`` or ``"iterative"``.
|
|
70
|
+
``"ratio"``: fit size factors using the median-of-ratios method. ``"poscounts"``:
|
|
71
|
+
fit size factors using the method implemented in DESeq2 for the case where there
|
|
72
|
+
may be few or no genes which have no zero values.
|
|
73
|
+
``"iterative"``: fit size factors iteratively. (default: ``"ratio"``).
|
|
74
|
+
|
|
75
|
+
control_genes : ndarray, list, or pandas.Index, optional
|
|
76
|
+
Genes to use as control genes for size factor fitting. If provided, size factors
|
|
77
|
+
will be fit using only these genes. This is useful when certain genes are known
|
|
78
|
+
to be invariant across conditions (e.g., housekeeping genes). Any valid AnnData
|
|
79
|
+
indexer (bool array, integer positions, or gene name strings) can be used.
|
|
80
|
+
(default: ``None``).
|
|
70
81
|
|
|
71
82
|
min_mu : float
|
|
72
83
|
Threshold for mean estimates. (default: ``0.5``).
|
|
@@ -119,27 +130,36 @@ class deconveil_fit:
|
|
|
119
130
|
filtered_genes: numpy.ndarray
|
|
120
131
|
Genes whose log means are different from -∞, computed in
|
|
121
132
|
preprocessing.deseq2_norm_fit().
|
|
133
|
+
|
|
134
|
+
factor_storage : dict
|
|
135
|
+
A dictionary storing metadata for each factor processed by the custom
|
|
136
|
+
materializer (only if ``design`` is input as a formula).
|
|
137
|
+
|
|
138
|
+
variable_to_factors : dict
|
|
139
|
+
A dictionary mapping variable names to factor names (only if ``design`` is input
|
|
140
|
+
as a formula).
|
|
122
141
|
|
|
123
142
|
"""
|
|
124
143
|
|
|
125
144
|
def __init__(
|
|
126
145
|
self,
|
|
127
146
|
*,
|
|
128
|
-
counts:
|
|
129
|
-
cnv:
|
|
130
|
-
metadata:
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
ref_level: Optional[List[str]] = None,
|
|
147
|
+
counts: pd.DataFrame | None = None,
|
|
148
|
+
cnv: pd.DataFrame | None = None,
|
|
149
|
+
metadata: pd.DataFrame | None = None,
|
|
150
|
+
design: str | pd.DataFrame = "~condition",
|
|
151
|
+
design_factors: str | list[str] | None = None,
|
|
134
152
|
fit_type: Literal["parametric", "mean"] = "parametric",
|
|
153
|
+
size_factors_fit_type: Literal["ratio", "poscounts", "iterative"] = "ratio",
|
|
154
|
+
control_genes: np.ndarray | list[str] | list[int] | pd.Index | None = None,
|
|
135
155
|
min_mu: float = 0.5,
|
|
136
156
|
min_disp: float = 1e-8,
|
|
137
157
|
max_disp: float = 10.0,
|
|
138
158
|
refit_cooks: bool = True,
|
|
139
159
|
min_replicates: int = 7,
|
|
140
160
|
beta_tol: float = 1e-8,
|
|
141
|
-
n_cpus:
|
|
142
|
-
inference:
|
|
161
|
+
n_cpus: int | None = None,
|
|
162
|
+
inference: Inference | None = None,
|
|
143
163
|
quiet: bool = False,
|
|
144
164
|
) -> None:
|
|
145
165
|
|
|
@@ -159,27 +179,43 @@ class deconveil_fit:
|
|
|
159
179
|
|
|
160
180
|
self.metadata = metadata
|
|
161
181
|
self.fit_type = fit_type
|
|
182
|
+
self.design = design
|
|
183
|
+
self.obsm={}
|
|
184
|
+
|
|
162
185
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
186
|
+
if design_factors is not None:
|
|
187
|
+
warnings.warn(
|
|
188
|
+
"design_factors are deprecated and will soon be removed"
|
|
189
|
+
"Please consider providing a formulaic formula using the design argument instead",
|
|
190
|
+
DeprecationWarning,
|
|
191
|
+
stacklevel=2,
|
|
192
|
+
)
|
|
193
|
+
design_factors = (
|
|
194
|
+
design_factors if isinstance(design_factors, list) else [design_factors]
|
|
195
|
+
)
|
|
196
|
+
self.design = "~" + " + ".join(design_factors)
|
|
197
|
+
|
|
198
|
+
if not (
|
|
199
|
+
isinstance(self.design, (str | pd.DataFrame)) or isinstance(self.design, str)
|
|
200
|
+
):
|
|
201
|
+
raise ValueError(
|
|
202
|
+
"design must be a string representing a formulaic formula, or a pandas DataFrame."
|
|
203
|
+
)
|
|
167
204
|
|
|
168
|
-
self.
|
|
205
|
+
if isinstance(self.design, str):
|
|
206
|
+
# Keep track of the categorical factors used in the model specification,
|
|
207
|
+
# including variable and factor names, by generating a custom materializer.
|
|
208
|
+
self.formulaic_contrasts = FormulaicContrasts(self.metadata, self.design)
|
|
209
|
+
self.obsm["design_matrix"] = self.formulaic_contrasts.design_matrix
|
|
210
|
+
else:
|
|
211
|
+
self.obsm["design_matrix"] = self.design
|
|
169
212
|
|
|
213
|
+
if self.obsm["design_matrix"].isna().any().any():
|
|
214
|
+
raise ValueError("NaNs are not allowed in the design.")
|
|
215
|
+
|
|
216
|
+
# Check that the design matrix has full rank
|
|
217
|
+
self._check_full_rank_design()
|
|
170
218
|
|
|
171
|
-
# Build the design matrix
|
|
172
|
-
self.design_matrix = build_design_matrix(
|
|
173
|
-
metadata=self.metadata,
|
|
174
|
-
design_factors=self.design_factors,
|
|
175
|
-
continuous_factors=self.continuous_factors,
|
|
176
|
-
ref_level=ref_level,
|
|
177
|
-
expanded=False,
|
|
178
|
-
intercept=True,
|
|
179
|
-
)
|
|
180
|
-
|
|
181
|
-
self.obsm={}
|
|
182
|
-
self.obsm["design_matrix"] = self.design_matrix
|
|
183
219
|
self.min_mu = min_mu
|
|
184
220
|
self.min_disp = min_disp
|
|
185
221
|
self.n_obs=self.data["counts"].shape[0]
|
|
@@ -187,15 +223,17 @@ class deconveil_fit:
|
|
|
187
223
|
self.var_names=self.data["counts"].columns
|
|
188
224
|
self.max_disp = np.maximum(max_disp, self.n_obs)
|
|
189
225
|
self.refit_cooks = refit_cooks
|
|
190
|
-
self.ref_level = ref_level
|
|
191
226
|
self.min_replicates = min_replicates
|
|
192
227
|
self.beta_tol = beta_tol
|
|
193
228
|
self.quiet = quiet
|
|
194
|
-
self.
|
|
229
|
+
self.size_factors_fit_type = size_factors_fit_type
|
|
230
|
+
self.control_genes = control_genes
|
|
231
|
+
self.logmeans: np.ndarray | None = None
|
|
195
232
|
self.filtered_genes = None
|
|
196
233
|
self.uns={}
|
|
197
234
|
self.varm={}
|
|
198
235
|
self.layers={}
|
|
236
|
+
self.filtered_genes: np.ndarray | None = None
|
|
199
237
|
|
|
200
238
|
|
|
201
239
|
if inference:
|
|
@@ -211,16 +249,42 @@ class deconveil_fit:
|
|
|
211
249
|
)
|
|
212
250
|
# Initialize the inference object.
|
|
213
251
|
self.inference = inference or DefInference(n_cpus=n_cpus)
|
|
252
|
+
|
|
253
|
+
@property
|
|
254
|
+
def variables(self):
|
|
255
|
+
"""Get the names of the variables used in the model definition."""
|
|
256
|
+
try:
|
|
257
|
+
return self.formulaic_contrasts.variables
|
|
258
|
+
except AttributeError:
|
|
259
|
+
raise ValueError(
|
|
260
|
+
"""Retrieving variables is only possible if the model was initialized
|
|
261
|
+
using a formula."""
|
|
262
|
+
) from None
|
|
214
263
|
|
|
215
264
|
|
|
216
265
|
def vst(
|
|
217
266
|
self,
|
|
218
267
|
use_design: bool = False,
|
|
219
|
-
fit_type:
|
|
268
|
+
fit_type: Literal["parametric", "mean"] | None = None,
|
|
220
269
|
) -> None:
|
|
221
270
|
|
|
222
271
|
"""Fit a variance stabilizing transformation, and apply it to normalized counts.
|
|
223
272
|
Results are stored in ``vst_counts``.
|
|
273
|
+
|
|
274
|
+
Parameters
|
|
275
|
+
----------
|
|
276
|
+
use_design : bool
|
|
277
|
+
Whether to use the full design matrix to fit dispersions and the trend curve.
|
|
278
|
+
If False, only an intercept is used. (default: ``False``).
|
|
279
|
+
|
|
280
|
+
fit_type: str
|
|
281
|
+
* ``None``: use the fit_type provided at initialization to fit
|
|
282
|
+
the dispersions trend curve.
|
|
283
|
+
* ``"parametric"``: fit a dispersion-mean relation via a robust
|
|
284
|
+
gamma-family GLM.
|
|
285
|
+
* ``"mean"``: use the mean of gene-wise dispersion estimates.
|
|
286
|
+
|
|
287
|
+
(default: ``None``).
|
|
224
288
|
"""
|
|
225
289
|
|
|
226
290
|
if fit_type is not None:
|
|
@@ -254,7 +318,9 @@ class deconveil_fit:
|
|
|
254
318
|
# Start by fitting median-of-ratio size factors if not already present,
|
|
255
319
|
# or if they were computed iteratively
|
|
256
320
|
if "size_factors" not in self.obsm or self.logmeans is None:
|
|
257
|
-
self.fit_size_factors(
|
|
321
|
+
self.fit_size_factors(
|
|
322
|
+
fit_type=self.size_factors_fit_type
|
|
323
|
+
)
|
|
258
324
|
|
|
259
325
|
if not hasattr(self, "vst_fit_type"):
|
|
260
326
|
self.vst_fit_type = self.fit_type
|
|
@@ -286,7 +352,7 @@ class deconveil_fit:
|
|
|
286
352
|
del self.obsm["design_matrix_buffer"]
|
|
287
353
|
|
|
288
354
|
|
|
289
|
-
def vst_transform(self, counts:
|
|
355
|
+
def vst_transform(self, counts: np.ndarray | None = None) -> np.ndarray:
|
|
290
356
|
|
|
291
357
|
"""Apply the variance stabilizing transformation.
|
|
292
358
|
Uses the results from the ``vst_fit`` method.
|
|
@@ -351,7 +417,7 @@ class deconveil_fit:
|
|
|
351
417
|
)
|
|
352
418
|
|
|
353
419
|
|
|
354
|
-
def deseq2(self, fit_type:
|
|
420
|
+
def deseq2(self, fit_type: Literal["parametric", "mean"] | None = None) -> None:
|
|
355
421
|
|
|
356
422
|
"""Perform dispersion and log fold-change (LFC) estimation.
|
|
357
423
|
|
|
@@ -372,8 +438,11 @@ class deconveil_fit:
|
|
|
372
438
|
if fit_type is not None:
|
|
373
439
|
self.fit_type = fit_type
|
|
374
440
|
print(f"Using {self.fit_type} fit type.")
|
|
441
|
+
|
|
375
442
|
# Compute DESeq2 normalization factors using the Median-of-ratios method
|
|
376
|
-
self.fit_size_factors(
|
|
443
|
+
self.fit_size_factors(
|
|
444
|
+
fit_type=self.size_factors_fit_type, control_genes=self.control_genes
|
|
445
|
+
)
|
|
377
446
|
# Fit an independent negative binomial model per gene
|
|
378
447
|
self.fit_genewise_dispersions()
|
|
379
448
|
# Fit a parameterized trend curve for dispersions, of the form
|
|
@@ -393,12 +462,30 @@ class deconveil_fit:
|
|
|
393
462
|
# for genes that had outliers replaced
|
|
394
463
|
self.refit()
|
|
395
464
|
|
|
465
|
+
def cond(self, **kwargs):
|
|
466
|
+
"""
|
|
467
|
+
Get a contrast vector representing a specific condition.
|
|
468
|
+
|
|
469
|
+
Parameters
|
|
470
|
+
----------
|
|
471
|
+
**kwargs
|
|
472
|
+
Column/value pairs.
|
|
473
|
+
|
|
474
|
+
Returns
|
|
475
|
+
-------
|
|
476
|
+
ndarray
|
|
477
|
+
A contrast vector that aligns to the columns of the design matrix.
|
|
478
|
+
"""
|
|
479
|
+
return self.formulaic_contrasts.cond(**kwargs)
|
|
480
|
+
|
|
481
|
+
def contrast(self, *args, **kwargs):
|
|
482
|
+
"""Get a contrast for a simple pairwise comparison."""
|
|
483
|
+
return self.formulaic_contrasts.contrast(*args, **kwargs)
|
|
484
|
+
|
|
396
485
|
def fit_size_factors(
|
|
397
486
|
self,
|
|
398
|
-
fit_type: Literal["ratio", "poscounts", "iterative"] =
|
|
399
|
-
control_genes:
|
|
400
|
-
Union[np.ndarray, List[str], List[int], pd.Index]
|
|
401
|
-
] = None,
|
|
487
|
+
fit_type: Literal["ratio", "poscounts", "iterative"] | None = None,
|
|
488
|
+
control_genes: np.ndarray | list[str] | list[int] | pd.Index | None = None,
|
|
402
489
|
) -> None:
|
|
403
490
|
"""Fit sample-wise deseq2 normalization (size) factors.
|
|
404
491
|
Parameters
|
|
@@ -411,16 +498,27 @@ class deconveil_fit:
|
|
|
411
498
|
are used. (default: ``None``).
|
|
412
499
|
"""
|
|
413
500
|
|
|
501
|
+
if fit_type is None:
|
|
502
|
+
fit_type = self.size_factors_fit_type
|
|
414
503
|
if not self.quiet:
|
|
415
504
|
print("Fitting size factors...", file=sys.stderr)
|
|
416
505
|
|
|
417
506
|
start = time.time()
|
|
418
507
|
|
|
508
|
+
if control_genes is None:
|
|
509
|
+
# Check whether control genes were specified at initialization
|
|
510
|
+
if hasattr(self, "control_genes"):
|
|
511
|
+
control_genes = self.control_genes
|
|
512
|
+
if not self.quiet:
|
|
513
|
+
print(
|
|
514
|
+
f"Using {control_genes} as control genes, passed at"
|
|
515
|
+
" deconveil_fit initialization"
|
|
516
|
+
)
|
|
517
|
+
|
|
419
518
|
# If control genes are provided, set a mask where those genes are True
|
|
420
519
|
if control_genes is not None:
|
|
421
520
|
_control_mask = np.zeros(self.data["counts"].shape[1], dtype=bool)
|
|
422
521
|
|
|
423
|
-
# Use AnnData internal indexing to get gene index array
|
|
424
522
|
# Allows bool/int/var_name to be provided
|
|
425
523
|
_control_mask[self._normalize_indices((slice(None), control_genes))[1]] = (
|
|
426
524
|
True
|
|
@@ -500,6 +598,9 @@ class deconveil_fit:
|
|
|
500
598
|
# Check that size factors are available. If not, compute them.
|
|
501
599
|
if "size_factors" not in self.obsm:
|
|
502
600
|
self.fit_size_factors()
|
|
601
|
+
|
|
602
|
+
counts = self.data["counts"]
|
|
603
|
+
|
|
503
604
|
|
|
504
605
|
# Exclude genes with all zeroes
|
|
505
606
|
self.varm["non_zero"] = ~(self.data["counts"] == 0).all(axis=0)
|
|
@@ -514,17 +615,22 @@ class deconveil_fit:
|
|
|
514
615
|
|
|
515
616
|
# Convert to numpy for speed
|
|
516
617
|
design_matrix = self.obsm["design_matrix"].values
|
|
618
|
+
size_factors = np.asarray(self.obsm["size_factors"]).reshape(-1)
|
|
517
619
|
counts=self.data["counts"].to_numpy()
|
|
518
620
|
cnv=self.data["cnv"].to_numpy()
|
|
519
|
-
|
|
520
|
-
|
|
621
|
+
|
|
622
|
+
# mu_hat is initialized differently depending on the number of different factor
|
|
623
|
+
# groups. If there are as many different factor combinations as design factors
|
|
624
|
+
# (intercept included), it is fitted with a linear model, otherwise it is fitted
|
|
625
|
+
# with a GLM (using rough dispersion estimates).
|
|
626
|
+
|
|
521
627
|
if (
|
|
522
628
|
len(self.obsm["design_matrix"].value_counts())
|
|
523
629
|
== self.obsm["design_matrix"].shape[-1]
|
|
524
630
|
):
|
|
525
631
|
mu_hat_ = self.inference.lin_reg_mu(
|
|
526
632
|
counts=counts[:, self.non_zero_idx],
|
|
527
|
-
size_factors=
|
|
633
|
+
size_factors=size_factors,
|
|
528
634
|
design_matrix=design_matrix,
|
|
529
635
|
min_mu=self.min_mu,
|
|
530
636
|
)
|
|
@@ -532,18 +638,20 @@ class deconveil_fit:
|
|
|
532
638
|
_, mu_hat_, _, _ = self.inference.irls_glm(
|
|
533
639
|
counts=counts[:, self.non_zero_idx],
|
|
534
640
|
cnv=cnv[:, self.non_zero_idx],
|
|
535
|
-
size_factors=
|
|
641
|
+
size_factors=size_factors,
|
|
536
642
|
design_matrix=design_matrix,
|
|
537
643
|
disp=self.varm["_MoM_dispersions"][self.non_zero_idx],
|
|
538
644
|
min_mu=self.min_mu,
|
|
539
645
|
beta_tol=self.beta_tol,
|
|
540
646
|
)
|
|
647
|
+
|
|
541
648
|
mu_param_name = "_vst_mu_hat" if vst else "_mu_hat"
|
|
542
649
|
disp_param_name = "genewise_dispersions"
|
|
543
650
|
|
|
544
651
|
self.layers[mu_param_name] = np.full((self.n_obs, self.n_vars), np.nan)
|
|
545
652
|
self.layers[mu_param_name][:, self.varm["non_zero"]] = mu_hat_
|
|
546
653
|
|
|
654
|
+
# Estimate per-gene dispersion via MLE (α_g)
|
|
547
655
|
if not self.quiet:
|
|
548
656
|
print("Fitting dispersions...", file=sys.stderr)
|
|
549
657
|
start = time.time()
|
|
@@ -560,6 +668,7 @@ class deconveil_fit:
|
|
|
560
668
|
if not self.quiet:
|
|
561
669
|
print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr)
|
|
562
670
|
|
|
671
|
+
# Store results
|
|
563
672
|
self.varm[disp_param_name] = np.full(self.n_vars, np.nan)
|
|
564
673
|
self.varm[disp_param_name][self.varm["non_zero"]] = np.clip(
|
|
565
674
|
dispersions_, self.min_disp, self.max_disp
|
|
@@ -609,7 +718,7 @@ class deconveil_fit:
|
|
|
609
718
|
"""Return the dispersion trend function at x."""
|
|
610
719
|
if self.uns["disp_function_type"] == "parametric":
|
|
611
720
|
return dispersion_trend(x, self.uns["trend_coeffs"])
|
|
612
|
-
elif self.disp_function_type == "mean":
|
|
721
|
+
elif self.uns["disp_function_type"] == "mean":
|
|
613
722
|
return np.full_like(x, self.uns["mean_disp"])
|
|
614
723
|
|
|
615
724
|
|
|
@@ -731,8 +840,7 @@ class deconveil_fit:
|
|
|
731
840
|
design_matrix = self.obsm["design_matrix"].values
|
|
732
841
|
counts=self.data["counts"].to_numpy()
|
|
733
842
|
cnv=self.data["cnv"].to_numpy()
|
|
734
|
-
cnv = cnv / 2
|
|
735
|
-
cnv = cnv + 0.1
|
|
843
|
+
cnv = (cnv / 2) + 0.1
|
|
736
844
|
|
|
737
845
|
if not self.quiet:
|
|
738
846
|
print("Fitting LFCs...", file=sys.stderr)
|
|
@@ -740,7 +848,7 @@ class deconveil_fit:
|
|
|
740
848
|
mle_lfcs_, mu_, hat_diagonals_, converged_ = self.inference.irls_glm(
|
|
741
849
|
counts=counts[:, self.non_zero_idx],
|
|
742
850
|
cnv=cnv[:, self.non_zero_idx],
|
|
743
|
-
size_factors=self.obsm["size_factors"],
|
|
851
|
+
size_factors=np.asarray(self.obsm["size_factors"]).reshape(-1),
|
|
744
852
|
design_matrix=design_matrix,
|
|
745
853
|
disp=self.varm["dispersions"][self.non_zero_idx],
|
|
746
854
|
min_mu=self.min_mu,
|
|
@@ -863,9 +971,10 @@ class deconveil_fit:
|
|
|
863
971
|
"""
|
|
864
972
|
# Check that size_factors are available. If not, compute them.
|
|
865
973
|
if "normed_counts" not in self.layers:
|
|
866
|
-
self.fit_size_factors()
|
|
974
|
+
self.fit_size_factors(fit_type=self.size_factors_fit_type)
|
|
867
975
|
|
|
868
976
|
normed_counts = self.layers["normed_counts"]
|
|
977
|
+
|
|
869
978
|
rde = self.inference.fit_rough_dispersions(
|
|
870
979
|
normed_counts,
|
|
871
980
|
self.obsm["design_matrix"].values,
|
|
@@ -1106,9 +1215,7 @@ class deconveil_fit:
|
|
|
1106
1215
|
),
|
|
1107
1216
|
cnv=self.data["cnv"],
|
|
1108
1217
|
metadata=self.metadata,
|
|
1109
|
-
|
|
1110
|
-
continuous_factors=self.continuous_factors,
|
|
1111
|
-
ref_level=self.ref_level,
|
|
1218
|
+
design=self.design,
|
|
1112
1219
|
min_mu=self.min_mu,
|
|
1113
1220
|
min_disp=self.min_disp,
|
|
1114
1221
|
max_disp=self.max_disp,
|
|
@@ -210,9 +210,9 @@ class DefInference(inference.Inference):
|
|
|
210
210
|
ridge_factor: np.ndarray,
|
|
211
211
|
contrast: np.ndarray,
|
|
212
212
|
lfc_null: np.ndarray,
|
|
213
|
-
alt_hypothesis:
|
|
214
|
-
Literal["greaterAbs", "lessAbs", "greater", "less"]
|
|
215
|
-
|
|
213
|
+
alt_hypothesis: (
|
|
214
|
+
Literal["greaterAbs", "lessAbs", "greater", "less"] | None
|
|
215
|
+
) = None,
|
|
216
216
|
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
|
|
217
217
|
num_genes = mu.shape[1]
|
|
218
218
|
with parallel_backend(self._backend, inner_max_num_threads=1):
|