DeConveil 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deconveil/__init__.py +1 -0
- deconveil/__version__.py +1 -1
- deconveil/dds.py +169 -62
- deconveil/default_inference.py +3 -3
- deconveil/ds.py +82 -170
- deconveil/grid_search.py +1 -0
- deconveil/inference.py +4 -4
- deconveil/nb_regression_fit.py +313 -0
- deconveil/simulate_gene_dosage.py +589 -0
- deconveil/utils_fit.py +173 -129
- {deconveil-0.1.4.dist-info → deconveil-0.2.0.dist-info}/METADATA +4 -1
- deconveil-0.2.0.dist-info/RECORD +18 -0
- {deconveil-0.1.4.dist-info → deconveil-0.2.0.dist-info}/WHEEL +1 -1
- deconveil-0.1.4.dist-info/RECORD +0 -16
- {deconveil-0.1.4.dist-info → deconveil-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {deconveil-0.1.4.dist-info → deconveil-0.2.0.dist-info}/top_level.txt +0 -0
deconveil/ds.py
CHANGED
|
@@ -132,16 +132,16 @@ class deconveil_stats:
|
|
|
132
132
|
def __init__(
|
|
133
133
|
self,
|
|
134
134
|
dds: deconveil_fit,
|
|
135
|
-
contrast:
|
|
135
|
+
contrast: list[str] | np.ndarray,
|
|
136
136
|
alpha: float = 0.05,
|
|
137
137
|
cooks_filter: bool = True,
|
|
138
138
|
independent_filter: bool = True,
|
|
139
|
-
prior_LFC_var:
|
|
139
|
+
prior_LFC_var: np.ndarray | None = None,
|
|
140
140
|
lfc_null: float = 0.0,
|
|
141
|
-
alt_hypothesis:
|
|
142
|
-
Literal["greaterAbs", "lessAbs", "greater", "less"]
|
|
143
|
-
|
|
144
|
-
inference:
|
|
141
|
+
alt_hypothesis: (
|
|
142
|
+
Literal["greaterAbs", "lessAbs", "greater", "less"] | None
|
|
143
|
+
) = None,
|
|
144
|
+
inference: Inference | None = None,
|
|
145
145
|
quiet: bool = False,
|
|
146
146
|
) -> None:
|
|
147
147
|
assert (
|
|
@@ -164,24 +164,67 @@ class deconveil_stats:
|
|
|
164
164
|
self.lfc_null = lfc_null
|
|
165
165
|
self.alt_hypothesis = alt_hypothesis
|
|
166
166
|
|
|
167
|
-
# Check the validity of the contrast (if provided) or build it.
|
|
168
|
-
self._build_contrast(contrast)
|
|
169
|
-
|
|
170
167
|
# Initialize the design matrix and LFCs. If the chosen reference levels are the
|
|
171
168
|
# same as in dds, keep them unchanged. Otherwise, change reference level.
|
|
172
169
|
self.design_matrix = self.dds.obsm["design_matrix"].copy()
|
|
173
170
|
self.LFC = self.dds.varm["LFC"].copy()
|
|
174
171
|
|
|
175
|
-
#
|
|
176
|
-
self.
|
|
177
|
-
|
|
172
|
+
# Check the validity of the contrast (if provided) or build it.
|
|
173
|
+
self.contrast: list[str] | np.ndarray
|
|
174
|
+
if contrast is None:
|
|
175
|
+
raise ValueError(
|
|
176
|
+
"""Default contrasts are no longer supported.
|
|
177
|
+
The "contrast" argument must be provided."""
|
|
178
|
+
)
|
|
179
|
+
elif isinstance(contrast, np.ndarray):
|
|
180
|
+
if contrast.shape[0] != self.dds.obsm["design_matrix"].shape[1]:
|
|
181
|
+
raise ValueError(
|
|
182
|
+
"The contrast vector must have the same length as the design matrix."
|
|
183
|
+
)
|
|
184
|
+
self.contrast = contrast
|
|
185
|
+
self.contrast_vector = contrast
|
|
186
|
+
else:
|
|
187
|
+
self.contrast = contrast
|
|
188
|
+
self._build_contrast_vector()
|
|
189
|
+
|
|
178
190
|
# Set a flag to indicate that LFCs are unshrunk
|
|
179
191
|
self.shrunk_LFCs = False
|
|
180
192
|
self.quiet = quiet
|
|
181
193
|
|
|
194
|
+
if inference:
|
|
195
|
+
if n_cpus:
|
|
196
|
+
if hasattr(inference, "n_cpus"):
|
|
197
|
+
inference.n_cpus = n_cpus
|
|
198
|
+
else:
|
|
199
|
+
warnings.warn(
|
|
200
|
+
"The provided inference object does not have an n_cpus "
|
|
201
|
+
"attribute, cannot override `n_cpus`.",
|
|
202
|
+
UserWarning,
|
|
203
|
+
stacklevel=2,
|
|
204
|
+
)
|
|
205
|
+
|
|
182
206
|
# Initialize the inference object.
|
|
183
207
|
self.inference = inference or DefInference()
|
|
184
208
|
|
|
209
|
+
# If the `refit_cooks` attribute of the dds object is True, check that outliers
|
|
210
|
+
# were actually refitted.
|
|
211
|
+
#if self.dds.refit_cooks and "replaced" not in self.dds.var:
|
|
212
|
+
#raise AttributeError(
|
|
213
|
+
#"dds has 'refit_cooks' set to True but Cooks outliers have not been "
|
|
214
|
+
#"refitted. Please run 'dds.refit()' first or set 'dds.refit_cooks' "
|
|
215
|
+
#"to False."
|
|
216
|
+
#)
|
|
217
|
+
if self.dds.refit_cooks and "replaced" not in getattr(self.dds, "varm", {}):
|
|
218
|
+
raise AttributeError(
|
|
219
|
+
"dds has 'refit_cooks' set to True but Cooks outliers have not been "
|
|
220
|
+
"refitted. Please run 'dds.refit()' first or set 'dds.refit_cooks' "
|
|
221
|
+
"to False."
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
@property
|
|
225
|
+
def variables(self):
|
|
226
|
+
"""Get the names of the variables used in the model definition."""
|
|
227
|
+
return self.dds.variables
|
|
185
228
|
|
|
186
229
|
def summary(
|
|
187
230
|
self,
|
|
@@ -249,9 +292,12 @@ class deconveil_stats:
|
|
|
249
292
|
self.results_df["padj"] = self.padj
|
|
250
293
|
|
|
251
294
|
if not self.quiet:
|
|
252
|
-
if self.contrast
|
|
253
|
-
# The
|
|
254
|
-
print(
|
|
295
|
+
if isinstance(self.contrast, np.ndarray):
|
|
296
|
+
# The contrast vector was directly provided
|
|
297
|
+
print(
|
|
298
|
+
"Log2 fold change & Wald test p-value, contrast vector: "
|
|
299
|
+
f"{self.contrast}"
|
|
300
|
+
)
|
|
255
301
|
else:
|
|
256
302
|
# The factor is categorical
|
|
257
303
|
print(
|
|
@@ -259,6 +305,7 @@ class deconveil_stats:
|
|
|
259
305
|
f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}"
|
|
260
306
|
)
|
|
261
307
|
print(self.results_df)
|
|
308
|
+
|
|
262
309
|
|
|
263
310
|
def run_wald_test(self) -> None:
|
|
264
311
|
"""Perform a Wald test.
|
|
@@ -288,7 +335,7 @@ class deconveil_stats:
|
|
|
288
335
|
ridge_factor = np.diag(1 / self.prior_LFC_var**2)
|
|
289
336
|
else:
|
|
290
337
|
ridge_factor = np.diag(np.repeat(1e-6, num_vars))
|
|
291
|
-
|
|
338
|
+
|
|
292
339
|
design_matrix = self.design_matrix.values
|
|
293
340
|
LFCs = self.LFC.values
|
|
294
341
|
|
|
@@ -320,70 +367,38 @@ class deconveil_stats:
|
|
|
320
367
|
self.p_values.loc[self.dds.new_all_zeroes_genes] = 1.0
|
|
321
368
|
|
|
322
369
|
|
|
323
|
-
def lfc_shrink(self, coeff:
|
|
370
|
+
def lfc_shrink(self, coeff: str, adapt: bool = True) -> None:
|
|
324
371
|
"""LFC shrinkage with an apeGLM prior :cite:p:`DeseqStats-zhu2019heavy`.
|
|
325
372
|
|
|
326
373
|
Shrinks LFCs using a heavy-tailed Cauchy prior, leaving p-values unchanged.
|
|
327
374
|
|
|
328
375
|
Parameters
|
|
329
376
|
----------
|
|
330
|
-
coeff : str
|
|
331
|
-
The LFC coefficient to shrink.
|
|
332
|
-
shrink the coefficient corresponding to the ``contrast`` attribute.
|
|
333
|
-
If the desired coefficient is not available, it may be set from the
|
|
334
|
-
:class:`pydeseq2.dds.DeseqDataSet` argument ``ref_level``.
|
|
377
|
+
coeff : str
|
|
378
|
+
The LFC coefficient to shrink. Must be one of the columns of the LFC matrix.
|
|
335
379
|
(default: ``None``).
|
|
380
|
+
|
|
336
381
|
adapt: bool
|
|
337
382
|
Whether to use the MLE estimates of LFC to adapt the prior. If False, the
|
|
338
383
|
prior scale is set to 1. (``default=True``)
|
|
339
384
|
"""
|
|
340
|
-
if self.contrast[1] == self.contrast[2] == "":
|
|
341
|
-
# The factor being tested is continuous
|
|
342
|
-
contrast_level = self.contrast[0]
|
|
343
|
-
else:
|
|
344
|
-
# The factor being tested is categorical
|
|
345
|
-
contrast_level = (
|
|
346
|
-
f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}"
|
|
347
|
-
)
|
|
348
385
|
|
|
349
|
-
if coeff
|
|
350
|
-
if coeff not in self.LFC.columns:
|
|
351
|
-
split_coeff = coeff.split("_")
|
|
352
|
-
if len(split_coeff) == 4:
|
|
353
|
-
raise KeyError(
|
|
354
|
-
f"The coeff argument '{coeff}' should be one the LFC columns. "
|
|
355
|
-
f"The available LFC coeffs are {self.LFC.columns[1:]}. "
|
|
356
|
-
f"If the desired coefficient is not available, please set "
|
|
357
|
-
f"`ref_level = [{split_coeff[0]}, {split_coeff[3]}]` "
|
|
358
|
-
f"in DeseqDataSet and rerun."
|
|
359
|
-
)
|
|
360
|
-
else:
|
|
361
|
-
raise KeyError(
|
|
362
|
-
f"The coeff argument '{coeff}' should be one the LFC columns. "
|
|
363
|
-
f"The available LFC coeffs are {self.LFC.columns[1:]}. "
|
|
364
|
-
f"If the desired coefficient is not available, please set the "
|
|
365
|
-
f"appropriate`ref_level` in DeseqDataSet and rerun."
|
|
366
|
-
)
|
|
367
|
-
elif contrast_level not in self.LFC.columns:
|
|
386
|
+
if coeff not in self.LFC.columns:
|
|
368
387
|
raise KeyError(
|
|
369
|
-
f"
|
|
370
|
-
f"
|
|
371
|
-
|
|
372
|
-
f"If the desired coefficient is not available, please set "
|
|
373
|
-
f"`ref_level = [{self.contrast[0]}, {self.contrast[2]}]` "
|
|
374
|
-
f"in DeseqDataSet and rerun."
|
|
375
|
-
)
|
|
376
|
-
else:
|
|
377
|
-
coeff = contrast_level
|
|
388
|
+
f"coeff '{coeff}' must be one of the LFC columns.\n"
|
|
389
|
+
f"Available: {list(self.LFC.columns)}"
|
|
390
|
+
)
|
|
378
391
|
|
|
379
392
|
coeff_idx = self.LFC.columns.get_loc(coeff)
|
|
380
393
|
|
|
394
|
+
design_matrix = self.design_matrix.values
|
|
381
395
|
size = 1.0 / self.dds.varm["dispersions"]
|
|
382
|
-
offset = np.log(self.dds.obsm["size_factors"])
|
|
396
|
+
offset = np.log(self.dds.obsm["size_factors"])
|
|
383
397
|
|
|
384
398
|
counts=self.dds.data["counts"]
|
|
385
399
|
cnv=self.dds.data["cnv"].to_numpy()
|
|
386
|
-
cnv = cnv + 0.1
|
|
400
|
+
cnv = (cnv / 2) + 0.1
|
|
401
|
+
#cnv = cnv + 0.1
|
|
387
402
|
cnv = np.log(cnv)
|
|
388
403
|
|
|
389
404
|
# Set priors
|
|
@@ -444,24 +459,10 @@ class deconveil_stats:
|
|
|
444
459
|
if hasattr(self, "results_df"):
|
|
445
460
|
self.results_df["log2FoldChange"] = self.LFC.iloc[:, coeff_idx] / np.log(2)
|
|
446
461
|
self.results_df["lfcSE"] = self.SE / np.log(2)
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
# Categorical coeffs are of the form "factor_A_vs_B", and continuous coeffs
|
|
451
|
-
# of the form "factor".
|
|
452
|
-
if len(split_coeff) == 1:
|
|
453
|
-
# The factor is continuous
|
|
454
|
-
print(f"Shrunk log2 fold change & Wald test p-value: " f"{coeff}")
|
|
455
|
-
else:
|
|
456
|
-
# The factor is categorical
|
|
457
|
-
# Categorical coeffs are of the form "factor_A_vs_B", hence "factor"
|
|
458
|
-
# is split_coeff[0], "A" is split_coeff[1] and "B" split_coeff[3]
|
|
459
|
-
print(
|
|
460
|
-
f"Shrunk log2 fold change & Wald test p-value: "
|
|
461
|
-
f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}"
|
|
462
|
-
)
|
|
462
|
+
if not self.quiet:
|
|
463
|
+
print(f"Shrunk log2 fold change & Wald test p-value: {coeff}")
|
|
464
|
+
print(self.results_df)
|
|
463
465
|
|
|
464
|
-
print(self.results_df)
|
|
465
466
|
|
|
466
467
|
|
|
467
468
|
def _independent_filtering(self) -> None:
|
|
@@ -534,7 +535,6 @@ class deconveil_stats:
|
|
|
534
535
|
|
|
535
536
|
# As in DESeq2, only take samples with 3 or more replicates when looking for
|
|
536
537
|
# max cooks.
|
|
537
|
-
#use_for_max = n_or_more_replicates(self.design_matrix, 3)
|
|
538
538
|
use_for_max = n_or_more_replicates(self.dds.obsm["design_matrix"], 3).values
|
|
539
539
|
|
|
540
540
|
# If for a gene there are 3 samples or more that have more counts than the
|
|
@@ -614,70 +614,7 @@ class deconveil_stats:
|
|
|
614
614
|
return min_var
|
|
615
615
|
else:
|
|
616
616
|
return root_scalar(objective, bracket=[min_var, max_var]).root
|
|
617
|
-
|
|
618
|
-
def _build_contrast(self, contrast: Optional[List[str]] = None) -> None:
|
|
619
|
-
"""Check the validity of the contrast (if provided).
|
|
620
|
-
|
|
621
|
-
If not, build a default
|
|
622
|
-
contrast, corresponding to the last column of the design matrix.
|
|
623
|
-
A contrast should be a list of three strings, in the following format:
|
|
624
|
-
``['variable_of_interest', 'tested_level', 'reference_level']``.
|
|
625
|
-
Names must correspond to the metadata data passed to the DeseqDataSet.
|
|
626
|
-
E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B'
|
|
627
|
-
compared to 'condition A'.
|
|
628
|
-
For continuous variables, the last two strings will be left empty, e.g.
|
|
629
|
-
``['measurement', '', ''].
|
|
630
|
-
If None, the last variable from the design matrix
|
|
631
|
-
is chosen as the variable of interest, and the reference level is picked
|
|
632
|
-
alphabetically.
|
|
633
|
-
|
|
634
|
-
Parameters
|
|
635
|
-
----------
|
|
636
|
-
contrast : list or None
|
|
637
|
-
A list of three strings, in the following format:
|
|
638
|
-
``['variable_of_interest', 'tested_level', 'reference_level']``.
|
|
639
|
-
(default: ``None``).
|
|
640
|
-
"""
|
|
641
|
-
if contrast is not None: # Test contrast if provided
|
|
642
|
-
if len(contrast) != 3:
|
|
643
|
-
raise ValueError("The contrast should contain three strings.")
|
|
644
|
-
if contrast[0] not in self.dds.design_factors:
|
|
645
|
-
raise KeyError(
|
|
646
|
-
f"The contrast variable ('{contrast[0]}') should be one "
|
|
647
|
-
f"of the design factors."
|
|
648
|
-
)
|
|
649
|
-
if not (contrast[1] == contrast[2] == ""):
|
|
650
|
-
# The contrast factor is categorical, so we should check that the tested
|
|
651
|
-
# and reference levels are valid.
|
|
652
|
-
if contrast[1] not in self.dds.obs[contrast[0]].values:
|
|
653
|
-
raise KeyError(
|
|
654
|
-
f"The tested level ('{contrast[1]}') should correspond to "
|
|
655
|
-
f"one of the levels of '{contrast[0]}'"
|
|
656
|
-
)
|
|
657
|
-
if contrast[2] not in self.dds.obs[contrast[0]].values:
|
|
658
|
-
raise KeyError(
|
|
659
|
-
f"The reference level ('{contrast[2]}') should correspond to "
|
|
660
|
-
f"one of the levels of '{contrast[0]}'"
|
|
661
|
-
)
|
|
662
|
-
self.contrast = contrast
|
|
663
|
-
else: # Build contrast if None
|
|
664
|
-
factor = self.dds.design_factors[-1]
|
|
665
|
-
# Check whether this factor is categorical or continuous.
|
|
666
|
-
if (
|
|
667
|
-
self.dds.continuous_factors is not None
|
|
668
|
-
and factor in self.dds.continuous_factors
|
|
669
|
-
):
|
|
670
|
-
# The factor is continuous
|
|
671
|
-
self.contrast = [factor, "", ""]
|
|
672
|
-
else:
|
|
673
|
-
# The factor is categorical
|
|
674
|
-
factor_col = next(
|
|
675
|
-
col
|
|
676
|
-
for col in self.dds.obsm["design_matrix"].columns
|
|
677
|
-
if col.startswith(factor)
|
|
678
|
-
)
|
|
679
|
-
split_col = factor_col.split("_")
|
|
680
|
-
self.contrast = [split_col[0], split_col[1], split_col[-1]]
|
|
617
|
+
|
|
681
618
|
|
|
682
619
|
def _build_contrast_vector(self) -> None:
|
|
683
620
|
"""
|
|
@@ -688,34 +625,9 @@ class deconveil_stats:
|
|
|
688
625
|
factor = self.contrast[0]
|
|
689
626
|
alternative = self.contrast[1]
|
|
690
627
|
ref = self.contrast[2]
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
else:
|
|
695
|
-
contrast_level = f"{factor}_{alternative}_vs_{ref}"
|
|
696
|
-
|
|
697
|
-
self.contrast_vector = np.zeros(self.LFC.shape[-1])
|
|
698
|
-
if contrast_level in self.design_matrix.columns:
|
|
699
|
-
self.contrast_idx = self.LFC.columns.get_loc(contrast_level)
|
|
700
|
-
self.contrast_vector[self.contrast_idx] = 1
|
|
701
|
-
elif f"{factor}_{ref}_vs_{alternative}" in self.design_matrix.columns:
|
|
702
|
-
# Reference and alternative are inverted
|
|
703
|
-
self.contrast_idx = self.LFC.columns.get_loc(
|
|
704
|
-
f"{factor}_{ref}_vs_{alternative}"
|
|
705
|
-
)
|
|
706
|
-
self.contrast_vector[self.contrast_idx] = -1
|
|
707
|
-
else:
|
|
708
|
-
# Need to change reference
|
|
709
|
-
# Get any column corresponding to the desired factor and extract old ref
|
|
710
|
-
old_ref = next(
|
|
711
|
-
col for col in self.LFC.columns if col.startswith(factor)
|
|
712
|
-
).split("_vs_")[-1]
|
|
713
|
-
new_alternative_idx = self.LFC.columns.get_loc(
|
|
714
|
-
f"{factor}_{alternative}_vs_{old_ref}"
|
|
715
|
-
)
|
|
716
|
-
new_ref_idx = self.LFC.columns.get_loc(f"{factor}_{ref}_vs_{old_ref}")
|
|
717
|
-
self.contrast_vector[new_alternative_idx] = 1
|
|
718
|
-
self.contrast_vector[new_ref_idx] = -1
|
|
628
|
+
self.contrast_vector = self.dds.contrast(
|
|
629
|
+
column=factor, baseline=ref, group_to_compare=alternative
|
|
630
|
+
)
|
|
719
631
|
|
|
720
632
|
|
|
721
633
|
def plot_MA(self, log: bool = True, save_path: Optional[str] = None, **kwargs):
|
deconveil/grid_search.py
CHANGED
|
@@ -67,6 +67,7 @@ def grid_fit_beta(
|
|
|
67
67
|
raise ValueError("Beta is not properly initialized or has an unexpected shape.")
|
|
68
68
|
|
|
69
69
|
|
|
70
|
+
#mu = np.maximum(size_factors[:, None] * np.exp(design_matrix @ beta.T), min_mu)
|
|
70
71
|
mu = np.maximum(cnv * size_factors[:, None] * np.exp(design_matrix @ beta.T), min_mu)
|
|
71
72
|
return vec_nb_nll(counts, mu, disp) + 0.5 * (1e-6 * beta**2).sum(1)
|
|
72
73
|
|
deconveil/inference.py
CHANGED
|
@@ -261,9 +261,9 @@ class Inference(ABC):
|
|
|
261
261
|
ridge_factor: np.ndarray,
|
|
262
262
|
contrast: np.ndarray,
|
|
263
263
|
lfc_null: np.ndarray,
|
|
264
|
-
alt_hypothesis:
|
|
265
|
-
Literal["greaterAbs", "lessAbs", "greater", "less"]
|
|
266
|
-
|
|
264
|
+
alt_hypothesis: (
|
|
265
|
+
Literal["greaterAbs", "lessAbs", "greater", "less"] | None
|
|
266
|
+
) = None,
|
|
267
267
|
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
|
|
268
268
|
"""Run Wald test for differential expression.
|
|
269
269
|
|
|
@@ -320,7 +320,7 @@ class Inference(ABC):
|
|
|
320
320
|
prior_scale: float,
|
|
321
321
|
optimizer: str,
|
|
322
322
|
shrink_index: int,
|
|
323
|
-
) ->
|
|
323
|
+
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
|
|
324
324
|
"""Fit a negative binomial MAP LFC using an apeGLM prior.
|
|
325
325
|
|
|
326
326
|
Only the LFC is shrunk, and not the intercept.
|