DeConveil 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
DeConveil/ds.py ADDED
@@ -0,0 +1,758 @@
1
+ import sys
2
+ import time
3
+ from typing import List
4
+ from typing import Literal
5
+ from typing import Optional
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from scipy.optimize import root_scalar # type: ignore
10
+ from scipy.stats import f # type: ignore
11
+ from scipy.stats import false_discovery_control # type: ignore
12
+
13
+ from deconveil.dds import deconveil_fit
14
+ from deconveil.default_inference import DefInference
15
+ from deconveil.inference import Inference
16
+ from deconveil.grid_search import grid_fit_shrink_beta
17
+ from pydeseq2.utils import lowess
18
+ from pydeseq2.utils import wald_test
19
+ from pydeseq2.utils import make_MA_plot
20
+ from pydeseq2.utils import n_or_more_replicates
21
+
22
+
23
+ class deconveil_stats:
24
+ """PyDESeq2 statistical tests for differential expression.
25
+
26
+ Implements p-value estimation for differential gene expression according
27
+ to the DESeq2 pipeline :cite:p:`DeseqStats-love2014moderated`.
28
+
29
+ Also supports apeGLM log-fold change shrinkage :cite:p:`DeseqStats-zhu2019heavy`.
30
+
31
+ Parameters
32
+ ----------
33
+ dds : pydeseq2CN_data
34
+ pydeseq2CN_data for which dispersion and LFCs were already estimated.
35
+
36
+ contrast : list or None
37
+ A list of three strings, in the following format:
38
+ ``['variable_of_interest', 'tested_level', 'ref_level']``.
39
+ Names must correspond to the metadata data passed to the pydeseq2CN_data.
40
+ E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B' compared
41
+ to 'condition A'.
42
+ For continuous variables, the last two strings should be left empty, e.g.
43
+ ``['measurement', '', '']``.
44
+ If ``None``, the last variable from the design matrix is chosen
45
+ as the variable of interest, and the reference level is picked alphabetically.
46
+ (default: ``None``).
47
+
48
+ alpha : float
49
+ P-value and adjusted p-value significance threshold (usually 0.05).
50
+ (default: ``0.05``).
51
+
52
+ cooks_filter : bool
+ Whether to filter p-values of genes flagged as Cook's outliers.
+ (default: ``True``).
+
+ independent_filter : bool
53
+ Whether to perform independent filtering to correct p-value trends.
54
+ (default: ``True``).
55
+
56
+ prior_LFC_var : ndarray
57
+ Prior variance for LFCs, used for ridge regularization. (default: ``None``).
58
+
59
+ lfc_null : float
60
+ The (log2) log fold change under the null hypothesis. (default: ``0``).
61
+
62
+ alt_hypothesis : str or None
63
+ The alternative hypothesis for computing wald p-values. By default, the normal
64
+ Wald test assesses deviation of the estimated log fold change from the null
65
+ hypothesis, as given by ``lfc_null``.
66
+ One of ``["greaterAbs", "lessAbs", "greater", "less"]`` or ``None``.
67
+ The alternative hypothesis corresponds to what the user wants to find rather
68
+ than the null hypothesis. (default: ``None``).
69
+
70
+ inference : Inference
71
+ Implementation of inference routines object instance.
72
+ (default:
73
+ :class:`DefInference <deconveil.default_inference.DefInference>`).
74
+
75
+ quiet : bool
76
+ Suppress deseq2 status updates during fit.
77
+
78
+ Attributes
79
+ ----------
80
+ base_mean : pandas.Series
81
+ Genewise means of normalized counts.
82
+
83
+ lfc_null : float
84
+ The (log2) log fold change under the null hypothesis.
85
+
86
+ alt_hypothesis : str or None
87
+ The alternative hypothesis for computing wald p-values.
88
+
89
+ contrast_vector : ndarray
90
+ Vector encoding the contrast (variable being tested).
91
+
92
+ contrast_idx : int
93
+ Index of the LFC column corresponding to the variable being tested.
94
+
95
+ design_matrix : pandas.DataFrame
96
+ A DataFrame with experiment design information (to split cohorts).
97
+ Indexed by sample barcodes. Depending on the contrast that is provided to the
98
+ DeseqStats object, it may differ from the DeseqDataSet design matrix, as the
99
+ reference level may need to be adapted.
100
+
101
+ LFC : pandas.DataFrame
102
+ Estimated log-fold change between conditions and intercept, in natural log scale.
103
+
104
+ SE : pandas.Series
105
+ Standard LFC error.
106
+
107
+ statistics : pandas.Series
108
+ Wald statistics.
109
+
110
+ p_values : pandas.Series
111
+ P-values estimated from Wald statistics.
112
+
113
+ padj : pandas.Series
114
+ P-values adjusted for multiple testing.
115
+
116
+ results_df : pandas.DataFrame
117
+ Summary of the statistical analysis.
118
+
119
+ shrunk_LFCs : bool
120
+ Whether LFCs are shrunk.
121
+
122
+ n_processes : int
123
+ Number of threads to use for multiprocessing.
124
+
125
+ quiet : bool
126
+ Suppress deseq2 status updates during fit.
127
+
128
+ References
129
+ ----------
130
+ .. bibliography::
131
+ :keyprefix: DeseqStats-
132
+ """
133
+
134
+ def __init__(
135
+ self,
136
+ dds: deconveil_fit,
137
+ contrast: Optional[List[str]] = None,
138
+ alpha: float = 0.05,
139
+ cooks_filter: bool = True,
140
+ independent_filter: bool = True,
141
+ prior_LFC_var: Optional[np.ndarray] = None,
142
+ lfc_null: float = 0.0,
143
+ alt_hypothesis: Optional[
144
+ Literal["greaterAbs", "lessAbs", "greater", "less"]
145
+ ] = None,
146
+ inference: Optional[Inference] = None,
147
+ quiet: bool = False,
148
+ ) -> None:
149
+ assert (
150
+ "LFC" in dds.varm
151
+ ), "Please provide a fitted pydeseq2CN_data by first running the `deseq2` method."
152
+
153
+ self.dds = dds
154
+
155
+ self.alpha = alpha
156
+ self.cooks_filter = cooks_filter
157
+ self.independent_filter = independent_filter
158
+ self.base_mean = self.dds.varm["_normed_means"].copy()
159
+ self.prior_LFC_var = prior_LFC_var
160
+
161
+ if lfc_null < 0 and alt_hypothesis in {"greaterAbs", "lessAbs"}:
162
+ raise ValueError(
163
+ f"The alternative hypothesis being {alt_hypothesis}, please provide a",
164
+ f"positive lfc_null value (got {lfc_null}).",
165
+ )
166
+ self.lfc_null = lfc_null
167
+ self.alt_hypothesis = alt_hypothesis
168
+
169
+ # Check the validity of the contrast (if provided) or build it.
170
+ self._build_contrast(contrast)
171
+
172
+ # Initialize the design matrix and LFCs. If the chosen reference level are the
173
+ # same as in dds, keep them unchanged. Otherwise, change reference level.
174
+ self.design_matrix = self.dds.obsm["design_matrix"].copy()
175
+ self.LFC = self.dds.varm["LFC"].copy()
176
+
177
+ # Build a contrast vector corresponding to the variable and levels of interest
178
+ self._build_contrast_vector()
179
+
180
+ # Set a flag to indicate that LFCs are unshrunk
181
+ self.shrunk_LFCs = False
182
+ self.quiet = quiet
183
+
184
+ # Initialize the inference object.
185
+ self.inference = inference or DefInference()
186
+
187
+
188
+ def summary(
189
+ self,
190
+ **kwargs,
191
+ ) -> None:
192
+ """Run the statistical analysis.
193
+
194
+ The results are stored in the ``results_df`` attribute.
195
+
196
+ Parameters
197
+ ----------
198
+ **kwargs
199
+ Keyword arguments: providing new values for ``lfc_null`` or
200
+ ``alt_hypothesis`` will override the corresponding ``DeseqStat`` attributes.
201
+ """
202
+ new_lfc_null = kwargs.get("lfc_null", "default")
203
+ new_alt_hypothesis = kwargs.get("alt_hypothesis", "default")
204
+
205
+ rerun_summary = False
206
+ if new_lfc_null == "default":
207
+ lfc_null = self.lfc_null
208
+ else:
209
+ lfc_null = new_lfc_null
210
+ if new_alt_hypothesis == "default":
211
+ alt_hypothesis = self.alt_hypothesis
212
+ else:
213
+ alt_hypothesis = new_alt_hypothesis
214
+ if lfc_null < 0 and alt_hypothesis in {"greaterAbs", "lessAbs"}:
215
+ raise ValueError(
216
+ f"The alternative hypothesis being {alt_hypothesis}, please provide a",
217
+ f"positive lfc_null value (got {lfc_null}).",
218
+ )
219
+
220
+ if (
221
+ not hasattr(self, "p_values")
222
+ or self.lfc_null != lfc_null
223
+ or self.alt_hypothesis != alt_hypothesis
224
+ ):
225
+ # Estimate p-values with Wald test
226
+ self.lfc_null = lfc_null
227
+ self.alt_hypothesis = alt_hypothesis
228
+ rerun_summary = True
229
+ self.run_wald_test()
230
+
231
+ if self.cooks_filter:
232
+ # Filter p-values based on Cooks outliers
233
+ self._cooks_filtering()
234
+
235
+ if not hasattr(self, "padj") or rerun_summary:
236
+ if self.independent_filter:
237
+ # Compute adjusted p-values and correct p-value trend
238
+ self._independent_filtering()
239
+ else:
240
+ # Compute adjusted p-values using the Benjamini-Hochberg method, without
241
+ # correcting the p-value trend.
242
+ self._p_value_adjustment()
243
+
244
+ # Store the results in a DataFrame, in log2 scale for LFCs.
245
+ self.results_df = pd.DataFrame(index=self.dds.var_names)
246
+ self.results_df["baseMean"] = self.base_mean
247
+ self.results_df["log2FoldChange"] = self.LFC @ self.contrast_vector / np.log(2)
248
+ self.results_df["lfcSE"] = self.SE / np.log(2)
249
+ self.results_df["stat"] = self.statistics
250
+ self.results_df["pvalue"] = self.p_values
251
+ self.results_df["padj"] = self.padj
252
+
253
+ if not self.quiet:
254
+ if self.contrast[1] == self.contrast[2] == "":
255
+ # The factor is continuous
256
+ print(f"Log2 fold change & Wald test p-value: " f"{self.contrast[0]}")
257
+ else:
258
+ # The factor is categorical
259
+ print(
260
+ f"Log2 fold change & Wald test p-value: "
261
+ f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}"
262
+ )
263
+ print(self.results_df)
264
+
265
    def run_wald_test(self) -> None:
        """Perform a Wald test.

        Get gene-wise p-values for gene over/under-expression.

        Delegates the per-gene tests to ``self.inference.wald_test`` and stores
        the results in the ``p_values``, ``statistics`` and ``SE`` attributes
        (all indexed by gene name).
        """
        num_vars = self.design_matrix.shape[1]

        # Raise a warning if LFCs are shrunk.
        if self.shrunk_LFCs:
            if not self.quiet:
                print(
                    "Note: running Wald test on shrunk LFCs. "
                    "Some sequencing datasets show better performance with the testing "
                    "separated from the use of the LFC prior.",
                    file=sys.stderr,
                )

        # Fitted means: exp(X @ beta) scaled per-sample by the size factors.
        mu = (
            np.exp(self.design_matrix @ self.LFC.T)
            .multiply(self.dds.obsm["size_factors"], 0)
            .values
        )

        # Set regularization factors.
        if self.prior_LFC_var is not None:
            # Ridge penalty from the user-provided prior LFC variance.
            ridge_factor = np.diag(1 / self.prior_LFC_var**2)
        else:
            # Tiny default ridge term for numerical stability.
            ridge_factor = np.diag(np.repeat(1e-6, num_vars))

        design_matrix = self.design_matrix.values
        LFCs = self.LFC.values

        if not self.quiet:
            print("Running Wald tests...", file=sys.stderr)
        start = time.time()
        pvals, stats, se = self.inference.wald_test(
            design_matrix=design_matrix,
            disp=self.dds.varm["dispersions"],
            lfc=LFCs,
            mu=mu,
            ridge_factor=ridge_factor,
            contrast=self.contrast_vector,
            lfc_null=np.log(2) * self.lfc_null,  # Convert log2 to natural log
            alt_hypothesis=self.alt_hypothesis,
        )
        end = time.time()
        if not self.quiet:
            print(f"... done in {end-start:.2f} seconds.\n", file=sys.stderr)

        self.p_values: pd.Series = pd.Series(pvals, index=self.dds.var_names)
        self.statistics: pd.Series = pd.Series(stats, index=self.dds.var_names)
        self.SE: pd.Series = pd.Series(se, index=self.dds.var_names)

        # Account for possible all_zeroes due to outlier refitting in DESeqDataSet:
        # genes that became all-zero after replacement get SE/stat 0 and p-value 1.
        if self.dds.refit_cooks and self.dds.varm["replaced"].sum() > 0:
            self.SE.loc[self.dds.new_all_zeroes_genes] = 0.0
            self.statistics.loc[self.dds.new_all_zeroes_genes] = 0.0
            self.p_values.loc[self.dds.new_all_zeroes_genes] = 1.0
323
+
324
+
325
+ def lfc_shrink(self, coeff: Optional[str] = None, adapt: bool = True) -> None:
326
+ """LFC shrinkage with an apeGLM prior :cite:p:`DeseqStats-zhu2019heavy`.
327
+
328
+ Shrinks LFCs using a heavy-tailed Cauchy prior, leaving p-values unchanged.
329
+
330
+ Parameters
331
+ ----------
332
+ coeff : str or None
333
+ The LFC coefficient to shrink. If set to ``None``, the method will try to
334
+ shrink the coefficient corresponding to the ``contrast`` attribute.
335
+ If the desired coefficient is not available, it may be set from the
336
+ :class:`pydeseq2.dds.DeseqDataSet` argument ``ref_level``.
337
+ (default: ``None``).
338
+ adapt: bool
339
+ Whether to use the MLE estimates of LFC to adapt the prior. If False, the
340
+ prior scale is set to 1. (``default=True``)
341
+ """
342
+ if self.contrast[1] == self.contrast[2] == "":
343
+ # The factor being tested is continuous
344
+ contrast_level = self.contrast[0]
345
+ else:
346
+ # The factor being tested is categorical
347
+ contrast_level = (
348
+ f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}"
349
+ )
350
+
351
+ if coeff is not None:
352
+ if coeff not in self.LFC.columns:
353
+ split_coeff = coeff.split("_")
354
+ if len(split_coeff) == 4:
355
+ raise KeyError(
356
+ f"The coeff argument '{coeff}' should be one the LFC columns. "
357
+ f"The available LFC coeffs are {self.LFC.columns[1:]}. "
358
+ f"If the desired coefficient is not available, please set "
359
+ f"`ref_level = [{split_coeff[0]}, {split_coeff[3]}]` "
360
+ f"in DeseqDataSet and rerun."
361
+ )
362
+ else:
363
+ raise KeyError(
364
+ f"The coeff argument '{coeff}' should be one the LFC columns. "
365
+ f"The available LFC coeffs are {self.LFC.columns[1:]}. "
366
+ f"If the desired coefficient is not available, please set the "
367
+ f"appropriate`ref_level` in DeseqDataSet and rerun."
368
+ )
369
+ elif contrast_level not in self.LFC.columns:
370
+ raise KeyError(
371
+ f"lfc_shrink's coeff argument was set to None, but the coefficient "
372
+ f"corresponding to the contrast {self.contrast} is not available."
373
+ f"The available LFC coeffs are {self.LFC.columns[1:]}. "
374
+ f"If the desired coefficient is not available, please set "
375
+ f"`ref_level = [{self.contrast[0]}, {self.contrast[2]}]` "
376
+ f"in DeseqDataSet and rerun."
377
+ )
378
+ else:
379
+ coeff = contrast_level
380
+
381
+ coeff_idx = self.LFC.columns.get_loc(coeff)
382
+
383
+ size = 1.0 / self.dds.varm["dispersions"]
384
+ offset = np.log(self.dds.obsm["size_factors"])
385
+
386
+ counts=self.dds.data["counts"]
387
+ cnv=self.dds.data["cnv"].to_numpy()
388
+ cnv = cnv + 0.1
389
+ cnv = np.log(cnv)
390
+
391
+ # Set priors
392
+ prior_no_shrink_scale = 15
393
+ prior_scale = 1
394
+ if adapt:
395
+ prior_var = self._fit_prior_var(coeff_idx=coeff_idx)
396
+ prior_scale = np.minimum(np.sqrt(prior_var), 1)
397
+
398
+ design_matrix = self.design_matrix.values
399
+
400
+ if not self.quiet:
401
+ print("Fitting MAP LFCs...", file=sys.stderr)
402
+ start = time.time()
403
+ lfcs, inv_hessians, l_bfgs_b_converged_ = self.inference.lfc_shrink_nbinom_glm(
404
+ design_matrix=design_matrix,
405
+ counts=counts[:, self.dds.non_zero_idx],
406
+ cnv=cnv[:, self.dds.non_zero_idx],
407
+ size=size[self.dds.non_zero_idx],
408
+ offset=offset,
409
+ prior_no_shrink_scale=prior_no_shrink_scale,
410
+ prior_scale=prior_scale,
411
+ optimizer="L-BFGS-B",
412
+ shrink_index=coeff_idx,
413
+ )
414
+ end = time.time()
415
+ if not self.quiet:
416
+ print(f"... done in {end-start:.2f} seconds.\n", file=sys.stderr)
417
+
418
+ self.LFC.iloc[:, coeff_idx].update(
419
+ pd.Series(
420
+ np.array(lfcs)[:, coeff_idx],
421
+ index=self.dds.non_zero_genes,
422
+ )
423
+ )
424
+
425
+ self.SE.update(
426
+ pd.Series(
427
+ np.array(
428
+ [
429
+ np.sqrt(np.abs(inv_hess[coeff_idx, coeff_idx]))
430
+ for inv_hess in inv_hessians
431
+ ]
432
+ ),
433
+ index=self.dds.non_zero_genes,
434
+ )
435
+ )
436
+
437
+ self._LFC_shrink_converged = pd.Series(np.nan, index=self.dds.var_names)
438
+ self._LFC_shrink_converged.update(
439
+ pd.Series(l_bfgs_b_converged_, index=self.dds.non_zero_genes)
440
+ )
441
+
442
+ # Set a flag to indicate that LFCs were shrunk
443
+ self.shrunk_LFCs = True
444
+
445
+ # Replace in results dataframe, if it exists
446
+ if hasattr(self, "results_df"):
447
+ self.results_df["log2FoldChange"] = self.LFC.iloc[:, coeff_idx] / np.log(2)
448
+ self.results_df["lfcSE"] = self.SE / np.log(2)
449
+ # Get the corresponding factor, tested and reference levels of the shrunk
450
+ # coefficient
451
+ split_coeff = coeff.split("_")
452
+ # Categorical coeffs are of the form "factor_A_vs_B", and continuous coeffs
453
+ # of the form "factor".
454
+ if len(split_coeff) == 1:
455
+ # The factor is continuous
456
+ print(f"Shrunk log2 fold change & Wald test p-value: " f"{coeff}")
457
+ else:
458
+ # The factor is categorical
459
+ # Categorical coeffs are of the form "factor_A_vs_B", hence "factor"
460
+ # is split_coeff[0], "A" is split_coeff[1] and "B" split_coeff[3]
461
+ print(
462
+ f"Shrunk log2 fold change & Wald test p-value: "
463
+ f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}"
464
+ )
465
+
466
+ print(self.results_df)
467
+
468
+
469
    def _independent_filtering(self) -> None:
        """Compute adjusted p-values using independent filtering.

        Corrects p-value trend (see :cite:p:`DeseqStats-love2014moderated`).

        Genes below a base-mean cutoff are excluded from the BH adjustment;
        the cutoff is picked among 50 base-mean quantiles so as to maximize
        (approximately) the number of rejections at level ``alpha``.
        Sets the ``padj`` attribute.
        """
        # Check that p-values are available. If not, compute them.
        if not hasattr(self, "p_values"):
            self.run_wald_test()

        # Fraction of genes with zero normalized mean: no candidate cutoff
        # below this quantile can change anything.
        lower_quantile = np.mean(self.base_mean == 0)

        if lower_quantile < 0.95:
            upper_quantile = 0.95
        else:
            upper_quantile = 1

        # Candidate filtering thresholds: 50 quantiles of the base mean.
        theta = np.linspace(lower_quantile, upper_quantile, 50)
        cutoffs = np.quantile(self.base_mean, theta)

        # One column of BH-adjusted p-values per candidate cutoff.
        result = pd.DataFrame(
            np.nan, index=self.dds.var_names, columns=np.arange(len(theta))
        )

        for i, cutoff in enumerate(cutoffs):
            # Only genes passing the cutoff (and with a p-value) are adjusted.
            use = (self.base_mean >= cutoff) & (~self.p_values.isna())
            U2 = self.p_values[use]
            if not U2.empty:
                result.loc[use, i] = false_discovery_control(U2, method="bh")
        # Number of rejections at each cutoff, smoothed with lowess.
        num_rej = (result < self.alpha).sum(0).values
        lowess_res = lowess(theta, num_rej, frac=1 / 5)

        if num_rej.max() <= 10:
            # Too few rejections overall: do not filter (use the first column).
            j = 0
        else:
            # Pick the first cutoff whose rejection count comes within one RMS
            # residual of the lowess curve's maximum.
            residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0]
            thresh = lowess_res.max() - np.sqrt(np.mean(residual**2))
            if np.any(num_rej > thresh):
                j = np.where(num_rej > thresh)[0][0]
            else:
                j = 0

        self.padj = result.loc[:, j]
511
+
512
+ def _p_value_adjustment(self) -> None:
513
+ """Compute adjusted p-values using the Benjamini-Hochberg method.
514
+
515
+ Does not correct the p-value trend.
516
+ This method and the `_independent_filtering` are mutually exclusive.
517
+ """
518
+ if not hasattr(self, "p_values"):
519
+ # Estimate p-values with Wald test
520
+ self.run_wald_test()
521
+
522
+ self.padj = pd.Series(np.nan, index=self.dds.var_names)
523
+ self.padj.loc[~self.p_values.isna()] = false_discovery_control(
524
+ self.p_values.dropna(), method="bh"
525
+ )
526
+
527
    def _cooks_filtering(self) -> None:
        """Filter p-values based on Cooks outliers.

        Sets to NaN the p-values of genes whose Cook's distance exceeds the
        99% F-quantile in some sample, unless 3 or more other samples show
        higher counts than the flagged one.
        """
        # Check that p-values are available. If not, compute them.
        if not hasattr(self, "p_values"):
            self.run_wald_test()

        num_samples = self.dds.n_obs
        num_vars = self.design_matrix.shape[-1]
        # DESeq2's default outlier threshold: the 0.99 quantile of an
        # F(num_vars, num_samples - num_vars) distribution.
        cooks_cutoff = f.ppf(0.99, num_vars, num_samples - num_vars)

        # As in DESeq2, only take samples with 3 or more replicates when looking for
        # max cooks.
        #use_for_max = n_or_more_replicates(self.design_matrix, 3)
        use_for_max = n_or_more_replicates(self.dds.obsm["design_matrix"], 3).values

        # If for a gene there are 3 samples or more that have more counts than the
        # maximum cooks sample, don't count this gene as an outlier.

        # Take into account whether we already replaced outliers

        if self.dds.refit_cooks and self.dds.varm["refitted"].sum() > 0:
            cooks_layer = self.dds.layers["replace_cooks"]
            # NOTE(review): `.loc` assumes this layer is a DataFrame, while the
            # other branch indexes positionally — confirm the two layer types.
            filtered_cooks_layer = cooks_layer.loc[use_for_max, :]

        else:
            cooks_layer = self.dds.layers["cooks"]
            filtered_cooks_layer = cooks_layer[use_for_max, :]

        # Boolean mask over genes: any retained sample above the cutoff.
        cooks_outlier = (filtered_cooks_layer > cooks_cutoff).any(axis=0).copy()

        # Find the position of the maximum cooks distance for each outlier gene
        # NOTE(review): the argmax below runs over raw counts, whereas upstream
        # DESeq2 takes it over the Cook's distances — confirm this is intended.

        if isinstance(self.dds.data["counts"], pd.DataFrame):
            pos = self.dds.data["counts"].iloc[:, cooks_outlier].values.argmax(axis=0)
        else:
            pos = self.dds.data["counts"][:, cooks_outlier].argmax(axis=0)  # Use NumPy indexing

        # Filter out genes where 3 or more samples exceed the maximum cooks distance
        if isinstance(self.dds.data["counts"], pd.DataFrame):
            # NOTE(review): `.iloc[pos, cooks_outlier]` yields a 2-D frame of
            # shape (len(pos), n_outliers), whereas the NumPy branch pairs
            # `pos` with the mask element-wise — verify both branches agree.
            cooks_outlier[cooks_outlier] = (
                (self.dds.data["counts"].iloc[:, cooks_outlier].values
                > self.dds.data["counts"].iloc[pos, cooks_outlier].values)
                .sum(axis=0) < 3
            )
        else:
            cooks_outlier[cooks_outlier] = (
                (self.dds.data["counts"][:, cooks_outlier]
                > self.dds.data["counts"][pos, cooks_outlier])
                .sum(axis=0) < 3
            )

        # Assign NaN to p-values for outlier genes
        cooks_outlier_index = self.dds.var_names[cooks_outlier]  # Assuming var_names is the gene index
        self.p_values.loc[cooks_outlier_index] = np.nan
581
+
582
+ def _fit_prior_var(
583
+ self, coeff_idx: str, min_var: float = 1e-6, max_var: float = 400.0
584
+ ) -> float:
585
+ """Estimate the prior variance of the apeGLM model.
586
+
587
+ Returns shrinkage factors.
588
+
589
+ Parameters
590
+ ----------
591
+ coeff_idx : str
592
+ Index of the coefficient to shrink.
593
+
594
+ min_var : float
595
+ Lower bound for prior variance. (default: ``1e-6``).
596
+
597
+ max_var : float
598
+ Upper bound for prior variance. (default: ``400``).
599
+
600
+ Returns
601
+ -------
602
+ float
603
+ Estimated prior variance.
604
+ """
605
+ keep = ~self.LFC.iloc[:, coeff_idx].isna()
606
+ S = self.LFC[keep].iloc[:, coeff_idx] ** 2
607
+ D = self.SE[keep] ** 2
608
+
609
+ def objective(a: float) -> float:
610
+ # Equation to solve
611
+ coeff = 1 / (2 * (a + D) ** 2)
612
+ return ((S - D) * coeff).sum() / coeff.sum() - a
613
+
614
+ # The prior variance is the zero of the above function.
615
+ if objective(min_var) < 0:
616
+ return min_var
617
+ else:
618
+ return root_scalar(objective, bracket=[min_var, max_var]).root
619
+
620
+ def _build_contrast(self, contrast: Optional[List[str]] = None) -> None:
621
+ """Check the validity of the contrast (if provided).
622
+
623
+ If not, build a default
624
+ contrast, corresponding to the last column of the design matrix.
625
+ A contrast should be a list of three strings, in the following format:
626
+ ``['variable_of_interest', 'tested_level', 'reference_level']``.
627
+ Names must correspond to the metadata data passed to the DeseqDataSet.
628
+ E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B'
629
+ compared to 'condition A'.
630
+ For continuous variables, the last two strings will be left empty, e.g.
631
+ ``['measurement', '', ''].
632
+ If None, the last variable from the design matrix
633
+ is chosen as the variable of interest, and the reference level is picked
634
+ alphabetically.
635
+
636
+ Parameters
637
+ ----------
638
+ contrast : list or None
639
+ A list of three strings, in the following format:
640
+ ``['variable_of_interest', 'tested_level', 'reference_level']``.
641
+ (default: ``None``).
642
+ """
643
+ if contrast is not None: # Test contrast if provided
644
+ if len(contrast) != 3:
645
+ raise ValueError("The contrast should contain three strings.")
646
+ if contrast[0] not in self.dds.design_factors:
647
+ raise KeyError(
648
+ f"The contrast variable ('{contrast[0]}') should be one "
649
+ f"of the design factors."
650
+ )
651
+ if not (contrast[1] == contrast[2] == ""):
652
+ # The contrast factor is categorical, so we should check that the tested
653
+ # and reference levels are valid.
654
+ if contrast[1] not in self.dds.obs[contrast[0]].values:
655
+ raise KeyError(
656
+ f"The tested level ('{contrast[1]}') should correspond to "
657
+ f"one of the levels of '{contrast[0]}'"
658
+ )
659
+ if contrast[2] not in self.dds.obs[contrast[0]].values:
660
+ raise KeyError(
661
+ f"The reference level ('{contrast[2]}') should correspond to "
662
+ f"one of the levels of '{contrast[0]}'"
663
+ )
664
+ self.contrast = contrast
665
+ else: # Build contrast if None
666
+ factor = self.dds.design_factors[-1]
667
+ # Check whether this factor is categorical or continuous.
668
+ if (
669
+ self.dds.continuous_factors is not None
670
+ and factor in self.dds.continuous_factors
671
+ ):
672
+ # The factor is continuous
673
+ self.contrast = [factor, "", ""]
674
+ else:
675
+ # The factor is categorical
676
+ factor_col = next(
677
+ col
678
+ for col in self.dds.obsm["design_matrix"].columns
679
+ if col.startswith(factor)
680
+ )
681
+ split_col = factor_col.split("_")
682
+ self.contrast = [split_col[0], split_col[1], split_col[-1]]
683
+
684
+ def _build_contrast_vector(self) -> None:
685
+ """
686
+ Build a vector corresponding to the desired contrast.
687
+
688
+ Allows to test any pair of levels without refitting LFCs.
689
+ """
690
+ factor = self.contrast[0]
691
+ alternative = self.contrast[1]
692
+ ref = self.contrast[2]
693
+ if ref == alternative == "":
694
+ # "factor" is a continuous variable
695
+ contrast_level = factor
696
+ else:
697
+ contrast_level = f"{factor}_{alternative}_vs_{ref}"
698
+
699
+ self.contrast_vector = np.zeros(self.LFC.shape[-1])
700
+ if contrast_level in self.design_matrix.columns:
701
+ self.contrast_idx = self.LFC.columns.get_loc(contrast_level)
702
+ self.contrast_vector[self.contrast_idx] = 1
703
+ elif f"{factor}_{ref}_vs_{alternative}" in self.design_matrix.columns:
704
+ # Reference and alternative are inverted
705
+ self.contrast_idx = self.LFC.columns.get_loc(
706
+ f"{factor}_{ref}_vs_{alternative}"
707
+ )
708
+ self.contrast_vector[self.contrast_idx] = -1
709
+ else:
710
+ # Need to change reference
711
+ # Get any column corresponding to the desired factor and extract old ref
712
+ old_ref = next(
713
+ col for col in self.LFC.columns if col.startswith(factor)
714
+ ).split("_vs_")[-1]
715
+ new_alternative_idx = self.LFC.columns.get_loc(
716
+ f"{factor}_{alternative}_vs_{old_ref}"
717
+ )
718
+ new_ref_idx = self.LFC.columns.get_loc(f"{factor}_{ref}_vs_{old_ref}")
719
+ self.contrast_vector[new_alternative_idx] = 1
720
+ self.contrast_vector[new_ref_idx] = -1
721
+
722
+
723
+ def plot_MA(self, log: bool = True, save_path: Optional[str] = None, **kwargs):
724
+ """
725
+ Create an log ratio (M)-average (A) plot using matplotlib.
726
+
727
+ Useful for looking at log fold-change versus mean expression
728
+ between two groups/samples/etc.
729
+ Uses matplotlib to emulate the ``make_MA()`` function in DESeq2 in R.
730
+
731
+ Parameters
732
+ ----------
733
+ log : bool
734
+ Whether or not to log scale x and y axes (``default=True``).
735
+
736
+ save_path : str or None
737
+ The path where to save the plot. If left None, the plot won't be saved
738
+ (``default=None``).
739
+
740
+ **kwargs
741
+ Matplotlib keyword arguments for the scatter plot.
742
+ """
743
+ # Raise an error if results_df are missing
744
+ if not hasattr(self, "results_df"):
745
+ raise AttributeError(
746
+ "Trying to make an MA plot but p-values were not computed yet. "
747
+ "Please run the summary() method first."
748
+ )
749
+
750
+ make_MA_plot(
751
+ self.results_df,
752
+ padj_thresh=self.alpha,
753
+ log=log,
754
+ save_path=save_path,
755
+ lfc_null=self.lfc_null,
756
+ alt_hypothesis=self.alt_hypothesis,
757
+ **kwargs,
758
+ )