DeConveil 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DeConveil/__init__.py +7 -0
- DeConveil/dds.py +1279 -0
- DeConveil/default_inference.py +284 -0
- DeConveil/ds.py +758 -0
- DeConveil/grid_search.py +195 -0
- DeConveil/inference.py +373 -0
- DeConveil/utils_CNaware.py +809 -0
- DeConveil-0.1.0.dist-info/LICENSE +21 -0
- DeConveil-0.1.0.dist-info/METADATA +35 -0
- DeConveil-0.1.0.dist-info/RECORD +12 -0
- DeConveil-0.1.0.dist-info/WHEEL +5 -0
- DeConveil-0.1.0.dist-info/top_level.txt +1 -0
DeConveil/ds.py
ADDED
@@ -0,0 +1,758 @@
import sys
import time
from typing import List
from typing import Literal
from typing import Optional

import numpy as np
import pandas as pd
from scipy.optimize import root_scalar  # type: ignore
from scipy.stats import f  # type: ignore
from scipy.stats import false_discovery_control  # type: ignore

from deconveil.dds import deconveil_fit
from deconveil.default_inference import DefInference
from deconveil.inference import Inference
from deconveil.grid_search import grid_fit_shrink_beta
from pydeseq2.utils import lowess
from pydeseq2.utils import wald_test
from pydeseq2.utils import make_MA_plot
from pydeseq2.utils import n_or_more_replicates


class deconveil_stats:
    """PyDESeq2 statistical tests for differential expression.

    Implements p-value estimation for differential gene expression according
    to the DESeq2 pipeline :cite:p:`DeseqStats-love2014moderated`.

    Also supports apeGLM log-fold change shrinkage :cite:p:`DeseqStats-zhu2019heavy`.

    Parameters
    ----------
    dds : deconveil_fit
        A fitted deconveil_fit object, for which dispersions and LFCs were
        already estimated.

    contrast : list or None
        A list of three strings, in the following format:
        ``['variable_of_interest', 'tested_level', 'ref_level']``.
        Names must correspond to the metadata passed to the deconveil_fit object.
        E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B'
        compared to 'condition A'.
        For continuous variables, the last two strings should be left empty, e.g.
        ``['measurement', '', '']``.
        If ``None``, the last variable from the design matrix is chosen
        as the variable of interest, and the reference level is picked
        alphabetically. (default: ``None``).

    alpha : float
        P-value and adjusted p-value significance threshold (usually 0.05).
        (default: ``0.05``).

    cooks_filter : bool
        Whether to filter p-values based on Cook's outliers. (default: ``True``).

    independent_filter : bool
        Whether to perform independent filtering to correct p-value trends.
        (default: ``True``).

    prior_LFC_var : ndarray
        Prior variance for LFCs, used for ridge regularization. (default: ``None``).

    lfc_null : float
        The (log2) log fold change under the null hypothesis. (default: ``0``).

    alt_hypothesis : str or None
        The alternative hypothesis for computing Wald p-values. By default, the
        Wald test assesses deviation of the estimated log fold change from the
        null hypothesis, as given by ``lfc_null``.
        One of ``["greaterAbs", "lessAbs", "greater", "less"]`` or ``None``.
        The alternative hypothesis corresponds to what the user wants to find
        rather than the null hypothesis. (default: ``None``).

    inference : Inference
        Implementation of inference routines object instance.
        (default:
        :class:`DefInference <deconveil.default_inference.DefInference>`).

    quiet : bool
        Suppress deseq2 status updates during fit.

    Attributes
    ----------
    base_mean : pandas.Series
        Genewise means of normalized counts.

    lfc_null : float
        The (log2) log fold change under the null hypothesis.

    alt_hypothesis : str or None
        The alternative hypothesis for computing Wald p-values.

    contrast_vector : ndarray
        Vector encoding the contrast (variable being tested).

    contrast_idx : int
        Index of the LFC column corresponding to the variable being tested.

    design_matrix : pandas.DataFrame
        A DataFrame with experiment design information (to split cohorts).
        Indexed by sample barcodes. Depending on the contrast that is provided to
        the DeseqStats object, it may differ from the DeseqDataSet design matrix,
        as the reference level may need to be adapted.

    LFC : pandas.DataFrame
        Estimated log-fold change between conditions and intercept, in natural
        log scale.

    SE : pandas.Series
        Standard LFC error.

    statistics : pandas.Series
        Wald statistics.

    p_values : pandas.Series
        P-values estimated from Wald statistics.

    padj : pandas.Series
        P-values adjusted for multiple testing.

    results_df : pandas.DataFrame
        Summary of the statistical analysis.

    shrunk_LFCs : bool
        Whether LFCs are shrunk.

    n_processes : int
        Number of threads to use for multiprocessing.

    quiet : bool
        Suppress deseq2 status updates during fit.

    References
    ----------
    .. bibliography::
        :keyprefix: DeseqStats-
    """

    def __init__(
        self,
        dds: deconveil_fit,
        contrast: Optional[List[str]] = None,
        alpha: float = 0.05,
        cooks_filter: bool = True,
        independent_filter: bool = True,
        prior_LFC_var: Optional[np.ndarray] = None,
        lfc_null: float = 0.0,
        alt_hypothesis: Optional[
            Literal["greaterAbs", "lessAbs", "greater", "less"]
        ] = None,
        inference: Optional[Inference] = None,
        quiet: bool = False,
    ) -> None:
        assert (
            "LFC" in dds.varm
        ), "Please provide a fitted deconveil_fit object by first running the `deseq2` method."

        self.dds = dds

        self.alpha = alpha
        self.cooks_filter = cooks_filter
        self.independent_filter = independent_filter
        self.base_mean = self.dds.varm["_normed_means"].copy()
        self.prior_LFC_var = prior_LFC_var

        if lfc_null < 0 and alt_hypothesis in {"greaterAbs", "lessAbs"}:
            raise ValueError(
                f"The alternative hypothesis being {alt_hypothesis}, please provide a "
                f"positive lfc_null value (got {lfc_null})."
            )
        self.lfc_null = lfc_null
        self.alt_hypothesis = alt_hypothesis

        # Check the validity of the contrast (if provided) or build it.
        self._build_contrast(contrast)

        # Initialize the design matrix and LFCs. If the chosen reference level is
        # the same as in dds, keep them unchanged. Otherwise, change the
        # reference level.
        self.design_matrix = self.dds.obsm["design_matrix"].copy()
        self.LFC = self.dds.varm["LFC"].copy()

        # Build a contrast vector corresponding to the variable and levels of interest
        self._build_contrast_vector()

        # Set a flag to indicate that LFCs are unshrunk
        self.shrunk_LFCs = False
        self.quiet = quiet

        # Initialize the inference object.
        self.inference = inference or DefInference()
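
    # Minimal construction sketch (hypothetical variable names; assumes a
    # fitted `deconveil_fit` object `dds` whose metadata contains a categorical
    # "condition" factor with levels "A" and "B"):
    #
    #     stat_res = deconveil_stats(dds, contrast=["condition", "B", "A"])
    #     stat_res.summary()  # fills in stat_res.results_df
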
    def summary(
        self,
        **kwargs,
    ) -> None:
        """Run the statistical analysis.

        The results are stored in the ``results_df`` attribute.

        Parameters
        ----------
        **kwargs
            Keyword arguments: providing new values for ``lfc_null`` or
            ``alt_hypothesis`` will override the corresponding ``DeseqStats``
            attributes.
        """
        new_lfc_null = kwargs.get("lfc_null", "default")
        new_alt_hypothesis = kwargs.get("alt_hypothesis", "default")

        rerun_summary = False
        if new_lfc_null == "default":
            lfc_null = self.lfc_null
        else:
            lfc_null = new_lfc_null
        if new_alt_hypothesis == "default":
            alt_hypothesis = self.alt_hypothesis
        else:
            alt_hypothesis = new_alt_hypothesis
        if lfc_null < 0 and alt_hypothesis in {"greaterAbs", "lessAbs"}:
            raise ValueError(
                f"The alternative hypothesis being {alt_hypothesis}, please provide a "
                f"positive lfc_null value (got {lfc_null})."
            )

        if (
            not hasattr(self, "p_values")
            or self.lfc_null != lfc_null
            or self.alt_hypothesis != alt_hypothesis
        ):
            # Estimate p-values with Wald test
            self.lfc_null = lfc_null
            self.alt_hypothesis = alt_hypothesis
            rerun_summary = True
            self.run_wald_test()

        if self.cooks_filter:
            # Filter p-values based on Cook's outliers
            self._cooks_filtering()

        if not hasattr(self, "padj") or rerun_summary:
            if self.independent_filter:
                # Compute adjusted p-values and correct the p-value trend
                self._independent_filtering()
            else:
                # Compute adjusted p-values using the Benjamini-Hochberg method,
                # without correcting the p-value trend.
                self._p_value_adjustment()

        # Store the results in a DataFrame, in log2 scale for LFCs.
        self.results_df = pd.DataFrame(index=self.dds.var_names)
        self.results_df["baseMean"] = self.base_mean
        self.results_df["log2FoldChange"] = self.LFC @ self.contrast_vector / np.log(2)
        self.results_df["lfcSE"] = self.SE / np.log(2)
        self.results_df["stat"] = self.statistics
        self.results_df["pvalue"] = self.p_values
        self.results_df["padj"] = self.padj

        if not self.quiet:
            if self.contrast[1] == self.contrast[2] == "":
                # The factor is continuous
                print(f"Log2 fold change & Wald test p-value: {self.contrast[0]}")
            else:
                # The factor is categorical
                print(
                    f"Log2 fold change & Wald test p-value: "
                    f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}"
                )
            print(self.results_df)
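
    # Sketch of overriding the null hypothesis without rebuilding the object
    # (assumed call pattern, mirroring the kwargs handling above):
    #
    #     stat_res.summary(lfc_null=1.0, alt_hypothesis="greaterAbs")
    #     hits = stat_res.results_df[stat_res.results_df["padj"] < 0.05]
    #
    # This tests H0: |log2FC| <= 1 instead of H0: log2FC == 0, and recomputes
    # the Wald statistics, p-values, and adjusted p-values.
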
    def run_wald_test(self) -> None:
        """Perform a Wald test.

        Get gene-wise p-values for gene over/under-expression.
        """
        num_vars = self.design_matrix.shape[1]

        # Raise a warning if LFCs are shrunk.
        if self.shrunk_LFCs:
            if not self.quiet:
                print(
                    "Note: running Wald test on shrunk LFCs. "
                    "Some sequencing datasets show better performance with the "
                    "testing separated from the use of the LFC prior.",
                    file=sys.stderr,
                )

        mu = (
            np.exp(self.design_matrix @ self.LFC.T)
            .multiply(self.dds.obsm["size_factors"], 0)
            .values
        )

        # Set regularization factors.
        if self.prior_LFC_var is not None:
            ridge_factor = np.diag(1 / self.prior_LFC_var**2)
        else:
            ridge_factor = np.diag(np.repeat(1e-6, num_vars))

        design_matrix = self.design_matrix.values
        LFCs = self.LFC.values

        if not self.quiet:
            print("Running Wald tests...", file=sys.stderr)
        start = time.time()
        pvals, stats, se = self.inference.wald_test(
            design_matrix=design_matrix,
            disp=self.dds.varm["dispersions"],
            lfc=LFCs,
            mu=mu,
            ridge_factor=ridge_factor,
            contrast=self.contrast_vector,
            lfc_null=np.log(2) * self.lfc_null,  # Convert log2 to natural log
            alt_hypothesis=self.alt_hypothesis,
        )
        end = time.time()
        if not self.quiet:
            print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr)

        self.p_values: pd.Series = pd.Series(pvals, index=self.dds.var_names)
        self.statistics: pd.Series = pd.Series(stats, index=self.dds.var_names)
        self.SE: pd.Series = pd.Series(se, index=self.dds.var_names)

        # Account for possible all_zeroes due to outlier refitting in DESeqDataSet
        if self.dds.refit_cooks and self.dds.varm["replaced"].sum() > 0:
            self.SE.loc[self.dds.new_all_zeroes_genes] = 0.0
            self.statistics.loc[self.dds.new_all_zeroes_genes] = 0.0
            self.p_values.loc[self.dds.new_all_zeroes_genes] = 1.0
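
    # For reference, a two-sided Wald test on one gene reduces to the following
    # sketch (illustration only, not the inference backend's implementation):
    #
    #     from scipy.stats import norm
    #     stat = (contrast_vector @ lfc - lfc_null) / se
    #     pval = 2.0 * norm.sf(np.abs(stat))
    #
    # with `lfc`, `se`, and `lfc_null` on the natural-log scale, which is why
    # `lfc_null` is multiplied by np.log(2) above.
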
    def lfc_shrink(self, coeff: Optional[str] = None, adapt: bool = True) -> None:
        """LFC shrinkage with an apeGLM prior :cite:p:`DeseqStats-zhu2019heavy`.

        Shrinks LFCs using a heavy-tailed Cauchy prior, leaving p-values unchanged.

        Parameters
        ----------
        coeff : str or None
            The LFC coefficient to shrink. If set to ``None``, the method will try
            to shrink the coefficient corresponding to the ``contrast`` attribute.
            If the desired coefficient is not available, it may be set from the
            :class:`pydeseq2.dds.DeseqDataSet` argument ``ref_level``.
            (default: ``None``).

        adapt : bool
            Whether to use the MLE estimates of LFC to adapt the prior. If False,
            the prior scale is set to 1. (default: ``True``).
        """
        if self.contrast[1] == self.contrast[2] == "":
            # The factor being tested is continuous
            contrast_level = self.contrast[0]
        else:
            # The factor being tested is categorical
            contrast_level = (
                f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}"
            )

        if coeff is not None:
            if coeff not in self.LFC.columns:
                split_coeff = coeff.split("_")
                if len(split_coeff) == 4:
                    raise KeyError(
                        f"The coeff argument '{coeff}' should be one of the LFC "
                        f"columns. The available LFC coeffs are "
                        f"{self.LFC.columns[1:]}. If the desired coefficient is "
                        f"not available, please set "
                        f"`ref_level = [{split_coeff[0]}, {split_coeff[3]}]` "
                        f"in DeseqDataSet and rerun."
                    )
                else:
                    raise KeyError(
                        f"The coeff argument '{coeff}' should be one of the LFC "
                        f"columns. The available LFC coeffs are "
                        f"{self.LFC.columns[1:]}. If the desired coefficient is "
                        f"not available, please set the appropriate `ref_level` "
                        f"in DeseqDataSet and rerun."
                    )
        elif contrast_level not in self.LFC.columns:
            raise KeyError(
                f"lfc_shrink's coeff argument was set to None, but the coefficient "
                f"corresponding to the contrast {self.contrast} is not available. "
                f"The available LFC coeffs are {self.LFC.columns[1:]}. "
                f"If the desired coefficient is not available, please set "
                f"`ref_level = [{self.contrast[0]}, {self.contrast[2]}]` "
                f"in DeseqDataSet and rerun."
            )
        else:
            coeff = contrast_level

        coeff_idx = self.LFC.columns.get_loc(coeff)

        size = 1.0 / self.dds.varm["dispersions"]
        offset = np.log(self.dds.obsm["size_factors"])

        counts = self.dds.data["counts"]
        if isinstance(counts, pd.DataFrame):
            # Positional slicing below requires an ndarray (counts may be
            # stored as a DataFrame, as handled in `_cooks_filtering`).
            counts = counts.to_numpy()
        cnv = self.dds.data["cnv"].to_numpy()
        # Offset by 0.1 to avoid taking the log of zero copy numbers.
        cnv = np.log(cnv + 0.1)

        # Set priors
        prior_no_shrink_scale = 15
        prior_scale = 1
        if adapt:
            prior_var = self._fit_prior_var(coeff_idx=coeff_idx)
            prior_scale = np.minimum(np.sqrt(prior_var), 1)

        design_matrix = self.design_matrix.values

        if not self.quiet:
            print("Fitting MAP LFCs...", file=sys.stderr)
        start = time.time()
        lfcs, inv_hessians, l_bfgs_b_converged_ = self.inference.lfc_shrink_nbinom_glm(
            design_matrix=design_matrix,
            counts=counts[:, self.dds.non_zero_idx],
            cnv=cnv[:, self.dds.non_zero_idx],
            size=size[self.dds.non_zero_idx],
            offset=offset,
            prior_no_shrink_scale=prior_no_shrink_scale,
            prior_scale=prior_scale,
            optimizer="L-BFGS-B",
            shrink_index=coeff_idx,
        )
        end = time.time()
        if not self.quiet:
            print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr)

        self.LFC.iloc[:, coeff_idx].update(
            pd.Series(
                np.array(lfcs)[:, coeff_idx],
                index=self.dds.non_zero_genes,
            )
        )

        self.SE.update(
            pd.Series(
                np.array(
                    [
                        np.sqrt(np.abs(inv_hess[coeff_idx, coeff_idx]))
                        for inv_hess in inv_hessians
                    ]
                ),
                index=self.dds.non_zero_genes,
            )
        )

        self._LFC_shrink_converged = pd.Series(np.nan, index=self.dds.var_names)
        self._LFC_shrink_converged.update(
            pd.Series(l_bfgs_b_converged_, index=self.dds.non_zero_genes)
        )

        # Set a flag to indicate that LFCs were shrunk
        self.shrunk_LFCs = True

        # Replace in results dataframe, if it exists
        if hasattr(self, "results_df"):
            self.results_df["log2FoldChange"] = self.LFC.iloc[:, coeff_idx] / np.log(2)
            self.results_df["lfcSE"] = self.SE / np.log(2)
            # Get the corresponding factor, tested and reference levels of the
            # shrunk coefficient. Categorical coeffs are of the form
            # "factor_A_vs_B", and continuous coeffs of the form "factor".
            split_coeff = coeff.split("_")
            if len(split_coeff) == 1:
                # The factor is continuous
                print(f"Shrunk log2 fold change & Wald test p-value: {coeff}")
            else:
                # The factor is categorical: "factor" is split_coeff[0], "A" is
                # split_coeff[1] and "B" is split_coeff[3]
                print(
                    f"Shrunk log2 fold change & Wald test p-value: "
                    f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}"
                )

            print(self.results_df)
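
    # Shrinkage usage sketch (coefficient names follow the
    # "factor_tested_vs_ref" convention used above; example levels assumed):
    #
    #     stat_res.lfc_shrink(coeff="condition_B_vs_A")
    #
    # p-values are left unchanged; only `log2FoldChange` and `lfcSE` in
    # `results_df` are replaced by their MAP (shrunk) counterparts.
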
    def _independent_filtering(self) -> None:
        """Compute adjusted p-values using independent filtering.

        Corrects the p-value trend (see :cite:p:`DeseqStats-love2014moderated`).
        """
        # Check that p-values are available. If not, compute them.
        if not hasattr(self, "p_values"):
            self.run_wald_test()

        lower_quantile = np.mean(self.base_mean == 0)

        if lower_quantile < 0.95:
            upper_quantile = 0.95
        else:
            upper_quantile = 1

        theta = np.linspace(lower_quantile, upper_quantile, 50)
        cutoffs = np.quantile(self.base_mean, theta)

        result = pd.DataFrame(
            np.nan, index=self.dds.var_names, columns=np.arange(len(theta))
        )

        for i, cutoff in enumerate(cutoffs):
            use = (self.base_mean >= cutoff) & (~self.p_values.isna())
            U2 = self.p_values[use]
            if not U2.empty:
                result.loc[use, i] = false_discovery_control(U2, method="bh")
        num_rej = (result < self.alpha).sum(0).values
        lowess_res = lowess(theta, num_rej, frac=1 / 5)

        if num_rej.max() <= 10:
            j = 0
        else:
            residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0]
            thresh = lowess_res.max() - np.sqrt(np.mean(residual**2))
            if np.any(num_rej > thresh):
                j = np.where(num_rej > thresh)[0][0]
            else:
                j = 0

        self.padj = result.loc[:, j]
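
    # Intuition for the procedure above: genes are thresholded at 50 base-mean
    # quantile cutoffs; for each cutoff, BH is applied to the genes that pass
    # and the number of discoveries at level `alpha` is recorded. The retained
    # column `j` is the first cutoff whose discovery count exceeds a
    # lowess-smoothed threshold, trading filtered-out genes for gained power.
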
    def _p_value_adjustment(self) -> None:
        """Compute adjusted p-values using the Benjamini-Hochberg method.

        Does not correct the p-value trend.
        This method and `_independent_filtering` are mutually exclusive.
        """
        if not hasattr(self, "p_values"):
            # Estimate p-values with Wald test
            self.run_wald_test()

        self.padj = pd.Series(np.nan, index=self.dds.var_names)
        self.padj.loc[~self.p_values.isna()] = false_discovery_control(
            self.p_values.dropna(), method="bh"
        )
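
    # Benjamini-Hochberg in isolation, for intuition (standalone sketch):
    #
    #     import numpy as np
    #     from scipy.stats import false_discovery_control
    #     false_discovery_control(np.array([0.01, 0.02, 0.03, 0.5]), method="bh")
    #     # -> array([0.04, 0.04, 0.04, 0.5])
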
    def _cooks_filtering(self) -> None:
        """Filter p-values based on Cook's outliers."""
        # Check that p-values are available. If not, compute them.
        if not hasattr(self, "p_values"):
            self.run_wald_test()

        num_samples = self.dds.n_obs
        num_vars = self.design_matrix.shape[-1]
        cooks_cutoff = f.ppf(0.99, num_vars, num_samples - num_vars)

        # As in DESeq2, only take samples with 3 or more replicates when looking
        # for the max Cook's distance.
        use_for_max = n_or_more_replicates(self.dds.obsm["design_matrix"], 3).values

        # Take into account whether we already replaced outliers.
        if self.dds.refit_cooks and self.dds.varm["refitted"].sum() > 0:
            cooks_layer = self.dds.layers["replace_cooks"]
        else:
            cooks_layer = self.dds.layers["cooks"]
        # Layers are ndarrays, so filter rows with positional boolean indexing.
        filtered_cooks_layer = cooks_layer[use_for_max, :]

        cooks_outlier = (filtered_cooks_layer > cooks_cutoff).any(axis=0).copy()

        # Find the position of the maximum Cook's distance for each outlier gene.
        pos = cooks_layer[:, cooks_outlier].argmax(axis=0)

        counts = self.dds.data["counts"]
        if isinstance(counts, pd.DataFrame):
            counts = counts.to_numpy()

        # If for a gene there are 3 samples or more with higher counts than the
        # sample carrying the maximum Cook's distance, don't count this gene as
        # an outlier.
        cooks_outlier[cooks_outlier] = (
            counts[:, cooks_outlier] > counts[pos, cooks_outlier]
        ).sum(axis=0) < 3

        # Assign NaN to p-values for outlier genes.
        cooks_outlier_index = self.dds.var_names[cooks_outlier]
        self.p_values.loc[cooks_outlier_index] = np.nan
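
    # The Cook's distance cutoff used above is the 0.99 quantile of an
    # F(num_vars, num_samples - num_vars) distribution. For a hypothetical
    # design with 10 samples and 2 coefficients:
    #
    #     from scipy.stats import f
    #     f.ppf(0.99, 2, 8)  # ~ 8.65
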
    def _fit_prior_var(
        self, coeff_idx: int, min_var: float = 1e-6, max_var: float = 400.0
    ) -> float:
        """Estimate the prior variance of the apeGLM model.

        Returns shrinkage factors.

        Parameters
        ----------
        coeff_idx : int
            Index of the coefficient to shrink.

        min_var : float
            Lower bound for prior variance. (default: ``1e-6``).

        max_var : float
            Upper bound for prior variance. (default: ``400``).

        Returns
        -------
        float
            Estimated prior variance.
        """
        keep = ~self.LFC.iloc[:, coeff_idx].isna()
        S = self.LFC[keep].iloc[:, coeff_idx] ** 2
        D = self.SE[keep] ** 2

        def objective(a: float) -> float:
            # Equation to solve
            coeff = 1 / (2 * (a + D) ** 2)
            return ((S - D) * coeff).sum() / coeff.sum() - a

        # The prior variance is the zero of the above function.
        if objective(min_var) < 0:
            return min_var
        else:
            return root_scalar(objective, bracket=[min_var, max_var]).root
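
    # The root found above satisfies the weighted moment equation
    #
    #     a = sum_i w_i * (S_i - D_i) / sum_i w_i,  w_i = 1 / (2 * (a + D_i)**2),
    #
    # i.e. the prior variance `a` equals the precision-weighted excess of the
    # squared MLE LFCs (S) over their squared standard errors (D).
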
    def _build_contrast(self, contrast: Optional[List[str]] = None) -> None:
        """Check the validity of the contrast (if provided).

        If not provided, build a default contrast, corresponding to the last
        column of the design matrix.
        A contrast should be a list of three strings, in the following format:
        ``['variable_of_interest', 'tested_level', 'reference_level']``.
        Names must correspond to the metadata passed to the DeseqDataSet.
        E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B'
        compared to 'condition A'.
        For continuous variables, the last two strings will be left empty, e.g.
        ``['measurement', '', '']``.
        If None, the last variable from the design matrix is chosen as the
        variable of interest, and the reference level is picked alphabetically.

        Parameters
        ----------
        contrast : list or None
            A list of three strings, in the following format:
            ``['variable_of_interest', 'tested_level', 'reference_level']``.
            (default: ``None``).
        """
        if contrast is not None:  # Test contrast if provided
            if len(contrast) != 3:
                raise ValueError("The contrast should contain three strings.")
            if contrast[0] not in self.dds.design_factors:
                raise KeyError(
                    f"The contrast variable ('{contrast[0]}') should be one "
                    f"of the design factors."
                )
            if not (contrast[1] == contrast[2] == ""):
                # The contrast factor is categorical, so we should check that the
                # tested and reference levels are valid.
                if contrast[1] not in self.dds.obs[contrast[0]].values:
                    raise KeyError(
                        f"The tested level ('{contrast[1]}') should correspond to "
                        f"one of the levels of '{contrast[0]}'"
                    )
                if contrast[2] not in self.dds.obs[contrast[0]].values:
                    raise KeyError(
                        f"The reference level ('{contrast[2]}') should correspond "
                        f"to one of the levels of '{contrast[0]}'"
                    )
            self.contrast = contrast
        else:  # Build contrast if None
            factor = self.dds.design_factors[-1]
            # Check whether this factor is categorical or continuous.
            if (
                self.dds.continuous_factors is not None
                and factor in self.dds.continuous_factors
            ):
                # The factor is continuous
                self.contrast = [factor, "", ""]
            else:
                # The factor is categorical
                factor_col = next(
                    col
                    for col in self.dds.obsm["design_matrix"].columns
                    if col.startswith(factor)
                )
                split_col = factor_col.split("_")
                self.contrast = [split_col[0], split_col[1], split_col[-1]]

    def _build_contrast_vector(self) -> None:
        """Build a vector corresponding to the desired contrast.

        Allows testing any pair of levels without refitting LFCs.
        """
        factor = self.contrast[0]
        alternative = self.contrast[1]
        ref = self.contrast[2]
        if ref == alternative == "":
            # "factor" is a continuous variable
            contrast_level = factor
        else:
            contrast_level = f"{factor}_{alternative}_vs_{ref}"

        self.contrast_vector = np.zeros(self.LFC.shape[-1])
        if contrast_level in self.design_matrix.columns:
            self.contrast_idx = self.LFC.columns.get_loc(contrast_level)
            self.contrast_vector[self.contrast_idx] = 1
        elif f"{factor}_{ref}_vs_{alternative}" in self.design_matrix.columns:
            # Reference and alternative are inverted
            self.contrast_idx = self.LFC.columns.get_loc(
                f"{factor}_{ref}_vs_{alternative}"
            )
            self.contrast_vector[self.contrast_idx] = -1
        else:
            # Need to change the reference: get any column corresponding to the
            # desired factor and extract the old reference level.
            old_ref = next(
                col for col in self.LFC.columns if col.startswith(factor)
            ).split("_vs_")[-1]
            new_alternative_idx = self.LFC.columns.get_loc(
                f"{factor}_{alternative}_vs_{old_ref}"
            )
            new_ref_idx = self.LFC.columns.get_loc(f"{factor}_{ref}_vs_{old_ref}")
            self.contrast_vector[new_alternative_idx] = 1
            self.contrast_vector[new_ref_idx] = -1
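
    # Example of the re-referencing trick above (hypothetical "condition"
    # factor with levels A/B/C and old reference A): testing B vs C without
    # refitting uses
    #
    #     beta_{B vs C} = beta_{B vs A} - beta_{C vs A},
    #
    # so the contrast vector gets +1 at "condition_B_vs_A" and -1 at
    # "condition_C_vs_A".
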
    def plot_MA(self, log: bool = True, save_path: Optional[str] = None, **kwargs):
        """Create a log-ratio (M) vs average (A) plot using matplotlib.

        Useful for looking at log fold-change versus mean expression
        between two groups/samples/etc.
        Uses matplotlib to emulate the ``plotMA()`` function in DESeq2 in R.

        Parameters
        ----------
        log : bool
            Whether or not to log scale x and y axes (``default=True``).

        save_path : str or None
            The path where to save the plot. If left None, the plot won't be saved
            (``default=None``).

        **kwargs
            Matplotlib keyword arguments for the scatter plot.
        """
        # Raise an error if results_df is missing
        if not hasattr(self, "results_df"):
            raise AttributeError(
                "Trying to make an MA plot but p-values were not computed yet. "
                "Please run the summary() method first."
            )

        make_MA_plot(
            self.results_df,
            padj_thresh=self.alpha,
            log=log,
            save_path=save_path,
            lfc_null=self.lfc_null,
            alt_hypothesis=self.alt_hypothesis,
            **kwargs,
        )
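
# End-to-end usage sketch (hypothetical names throughout: `counts_df`,
# `cnv_df`, and `metadata` DataFrames are assumed, and the exact
# `deconveil_fit` constructor arguments may differ from this guess):
#
#     dds = deconveil_fit(counts=counts_df, cnv=cnv_df, metadata=metadata,
#                         design_factors="condition")
#     dds.deseq2()
#     stat_res = deconveil_stats(dds, contrast=["condition", "B", "A"])
#     stat_res.summary()
#     stat_res.lfc_shrink(coeff="condition_B_vs_A")
#     stat_res.plot_MA(save_path="MA_plot.png")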