DeConveil 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,195 @@
1
+ from typing import Optional
2
+
3
+ import numpy as np
4
+ from scipy.special import gammaln # type: ignore
5
+
6
+ from deconveil import utils_CNaware
7
+
8
+
9
def grid_fit_beta(
    counts: np.ndarray,
    size_factors: np.ndarray,
    design_matrix: np.ndarray,
    disp: float,
    cnv: np.ndarray,
    min_mu: float = 0.5,
    grid_length: int = 60,
    min_beta: float = -30,
    max_beta: float = 30,
) -> np.ndarray:
    """Find the best pair of GLM coefficients by grid search.

    Performs a coarse 2D grid search over ``[min_beta, max_beta]`` followed by
    a refined search around the coarse optimum, minimizing the negative
    binomial negative log-likelihood plus a small ridge penalty
    (``0.5 * 1e-6 * beta**2``). Only two-column design matrices are supported,
    since each candidate is a pair of coefficients.

    Parameters
    ----------
    counts : ndarray
        Raw counts for a given gene, one value per sample.

    size_factors : ndarray
        DESeq2 normalization factors, one value per sample.

    design_matrix : ndarray
        Design matrix with two columns.

    disp : float
        Gene-wise dispersion used in the negative binomial likelihood.

    cnv : ndarray
        Copy-number factor multiplied into the fitted mean. A 1-D array is
        treated as one value per sample (reshaped to a column so it lines up
        with ``size_factors``); scalars and column arrays are used as given.

    min_mu : float
        Lower threshold applied to the fitted means. (default: ``0.5``).

    grid_length : int
        Number of grid points per axis. (default: ``60``).

    min_beta : float
        Lower bound of the coarse grid. (default: ``-30``).

    max_beta : float
        Upper bound of the coarse grid. (default: ``30``).

    Returns
    -------
    ndarray
        Array of shape ``(2,)`` holding the coefficients at the grid optimum.
        The refined grid extends one coarse step around the coarse optimum,
        so the result may lie slightly outside ``[min_beta, max_beta]``.
    """
    counts_col = np.asarray(counts)[:, None]
    n_samples = counts_col.shape[0]

    # Terms of the negative binomial NLL that do not depend on the candidate
    # coefficients are computed once and reused for every grid row.
    inv_disp = 1.0 / disp
    log_binom = (
        gammaln(counts_col + inv_disp)
        - gammaln(counts_col + 1)
        - gammaln(inv_disp)
    )
    nll_const = n_samples * inv_disp * np.log(disp)

    # A 1-D cnv multiplied by a column of size factors would broadcast to an
    # (n, n) matrix; make it a column so the product stays per-sample.
    cnv_factor = np.asarray(cnv)
    if cnv_factor.ndim == 1:
        cnv_factor = cnv_factor[:, None]
    scale = cnv_factor * size_factors[:, None]

    def loss(beta: np.ndarray) -> np.ndarray:
        # Penalized NLL for a batch of candidates, one candidate per row.
        if beta is None or len(beta.shape) < 2:
            raise ValueError("Beta is not properly initialized or has an unexpected shape.")

        mu = np.maximum(scale * np.exp(design_matrix @ beta.T), min_mu)
        nll = nll_const + (
            -log_binom
            + (counts_col + inv_disp) * np.log(mu + inv_disp)
            - counts_col * np.log(mu)
        ).sum(0)
        return nll + 0.5 * (1e-6 * beta**2).sum(1)

    def scan(xs: np.ndarray, ys: np.ndarray) -> np.ndarray:
        # Evaluate the loss on the Cartesian grid xs x ys, one row per x.
        values = np.empty((len(xs), len(ys)))
        for i, x in enumerate(xs):
            values[i, :] = loss(np.column_stack((np.full_like(ys, x), ys)))
        return values

    x_grid = np.linspace(min_beta, max_beta, grid_length)
    y_grid = np.linspace(min_beta, max_beta, grid_length)
    coarse = scan(x_grid, y_grid)
    best_x, best_y = np.unravel_index(np.argmin(coarse, axis=None), coarse.shape)

    # Refine within one coarse step on each side of the coarse optimum.
    delta = x_grid[1] - x_grid[0]
    fine_x_grid = np.linspace(
        x_grid[best_x] - delta, x_grid[best_x] + delta, grid_length
    )
    fine_y_grid = np.linspace(
        y_grid[best_y] - delta, y_grid[best_y] + delta, grid_length
    )
    fine = scan(fine_x_grid, fine_y_grid)
    best_x, best_y = np.unravel_index(np.argmin(fine, axis=None), fine.shape)

    return np.array([fine_x_grid[best_x], fine_y_grid[best_y]])
95
+
96
+
97
def grid_fit_shrink_beta(
    counts: np.ndarray,
    cnv: np.ndarray,
    offset: np.ndarray,
    design_matrix: np.ndarray,
    size: np.ndarray,
    prior_no_shrink_scale: float,
    prior_scale: float,
    scale_cnst: float,
    grid_length: int = 60,
    min_beta: float = -30,
    max_beta: float = 30,
) -> np.ndarray:
    """Find the best MAP coefficient pair by grid search.

    Runs a coarse 2D grid search over ``[min_beta, max_beta]`` and then a
    refined search one coarse step around the coarse optimum, minimizing
    ``utils_CNaware.nbinomFn(...) / scale_cnst`` (the apeGLM-prior objective).

    Parameters
    ----------
    counts : ndarray
        Raw counts for a given gene.

    cnv : ndarray
        Copy-number values forwarded unchanged to ``utils_CNaware.nbinomFn``
        (presumably per-sample copy-number factors; confirm against that
        function).

    offset : ndarray
        Natural logarithm of size factor.

    design_matrix : ndarray
        Design matrix.

    size : ndarray
        Size parameter of NB family (inverse of dispersion).

    prior_no_shrink_scale : float
        Prior variance for the intercept.

    prior_scale : float
        Prior variance for the LFC coefficient.

    scale_cnst : float
        Scaling factor dividing the objective.

    grid_length : int
        Number of grid points per axis. (default: ``60``).

    min_beta : float
        Lower bound of the coarse grid. (default: ``-30``).

    max_beta : float
        Upper bound of the coarse grid. (default: ``30``).

    Returns
    -------
    ndarray
        Array of shape ``(2,)`` with the coefficients at the grid optimum.
    """

    def scaled_objective(candidate: np.ndarray) -> float:
        # Objective evaluated at a single (x, y) candidate.
        raw = utils_CNaware.nbinomFn(
            candidate,
            design_matrix,
            counts,
            cnv,
            size,
            offset,
            prior_no_shrink_scale,
            prior_scale,
        )
        return raw / scale_cnst

    def scan(xs: np.ndarray, ys: np.ndarray) -> np.ndarray:
        # Fill a (grid_length, grid_length) table in row-major order.
        table = np.zeros((grid_length, grid_length))
        for row, col in np.ndindex(len(xs), len(ys)):
            table[row, col] = scaled_objective(np.array([xs[row], ys[col]]))
        return table

    def argmin_2d(table: np.ndarray):
        # Row/column indices of the first smallest entry.
        return np.unravel_index(np.argmin(table, axis=None), table.shape)

    # Coarse pass over the full range.
    coarse_x = np.linspace(min_beta, max_beta, grid_length)
    coarse_y = np.linspace(min_beta, max_beta, grid_length)
    row, col = argmin_2d(scan(coarse_x, coarse_y))

    # Fine pass: one coarse step on each side of the coarse optimum.
    step = coarse_x[1] - coarse_x[0]
    fine_x = np.linspace(coarse_x[row] - step, coarse_x[row] + step, grid_length)
    fine_y = np.linspace(coarse_y[col] - step, coarse_y[col] + step, grid_length)
    row, col = argmin_2d(scan(fine_x, fine_y))

    return np.array([fine_x[row], fine_y[col]])
DeConveil/inference.py ADDED
@@ -0,0 +1,373 @@
1
+ from abc import ABC
2
+ from abc import abstractmethod
3
+ from typing import Literal
4
+ from typing import Optional
5
+ from typing import Tuple
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+
11
class Inference(ABC):
    """Abstract interface for the DESeq2-style inference routines.

    Concrete subclasses must implement every method below; the class itself
    cannot be instantiated.
    """

    @abstractmethod
    def lin_reg_mu(
        self,
        counts: np.ndarray,
        size_factors: np.ndarray,
        design_matrix: np.ndarray,
        min_mu: float,
    ) -> np.ndarray:
        """Estimate negative binomial means with a linear regression.

        The result is used to initialize the gene-wise dispersion models.

        Parameters
        ----------
        counts : ndarray
            Raw counts.

        size_factors : ndarray
            Sample-wise scaling factors (obtained from median-of-ratios).

        design_matrix : ndarray
            Design matrix.

        min_mu : float
            Lower threshold for fitted means, for numerical stability.

        Returns
        -------
        ndarray
            Estimated mean.
        """

    @abstractmethod
    def irls_glm(
        self,
        counts: np.ndarray,
        size_factors: np.ndarray,
        design_matrix: np.ndarray,
        disp: np.ndarray,
        cnv: np.ndarray,
        min_mu: float,
        beta_tol: float,
        min_beta: float = -30,
        max_beta: float = 30,
        optimizer: Literal["BFGS", "L-BFGS-B"] = "L-BFGS-B",
        maxiter: int = 250,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        r"""Fit a NB GLM with log-link to predict counts from the design matrix.

        See equations (1-2) in the DESeq2 paper.

        Parameters
        ----------
        counts : ndarray
            Raw counts.

        size_factors : ndarray
            Sample-wise scaling factors (obtained from median-of-ratios).

        design_matrix : ndarray
            Design matrix.

        disp : ndarray
            Gene-wise dispersion prior.

        cnv : ndarray
            Copy-number values supplied to the model; how they enter the fit
            is defined by the implementing class.

        min_mu : float
            Lower bound on estimated means, to ensure numerical stability.

        beta_tol : float
            Stopping criterion for IRWLS:
            :math:`\vert dev - dev_{old}\vert / \vert dev + 0.1 \vert < \beta_{tol}`.

        min_beta : float
            Lower-bound on LFC. (default: ``-30``).

        max_beta : float
            Upper-bound on LFC. (default: ``30``).

        optimizer : str
            Optimizing method to use in case IRLS starts diverging.
            Accepted values: 'BFGS' or 'L-BFGS-B'.
            NB: only 'L-BFGS-B' ensures that LFCs will
            lie in the [min_beta, max_beta] range. (default: ``'L-BFGS-B'``).

        maxiter : int
            Maximum number of IRLS iterations to perform before switching to
            the fallback optimizer. (default: ``250``).

        Returns
        -------
        beta: ndarray
            Fitted (basemean, lfc) coefficients of negative binomial GLM.

        mu: ndarray
            Means estimated from size factors and beta:
            :math:`\mu = s_{ij} \exp(\beta^t X)`.

        H: ndarray
            Diagonal of the :math:`W^{1/2} X (X^t W X)^-1 X^t W^{1/2}`
            covariance matrix.

        converged: ndarray
            Whether IRLS or the optimizer converged. If not and if dimension
            allows it, perform grid search.
        """

    @abstractmethod
    def alpha_mle(
        self,
        counts: np.ndarray,
        design_matrix: np.ndarray,
        mu: np.ndarray,
        alpha_hat: np.ndarray,
        min_disp: float,
        max_disp: float,
        prior_disp_var: Optional[float] = None,
        cr_reg: bool = True,
        prior_reg: bool = False,
        optimizer: Literal["BFGS", "L-BFGS-B"] = "L-BFGS-B",
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Estimate the dispersion parameter of a negative binomial GLM.

        Parameters
        ----------
        counts : ndarray
            Raw counts.

        design_matrix : ndarray
            Design matrix.

        mu : ndarray
            Mean estimation for the NB model.

        alpha_hat : ndarray
            Initial dispersion estimate.

        min_disp : float
            Lower threshold for dispersion parameters.

        max_disp : float
            Upper threshold for dispersion parameters.

        prior_disp_var : float, optional
            Prior dispersion variance. (default: ``None``).

        cr_reg : bool
            Whether to use Cox-Reid regularization. (default: ``True``).

        prior_reg : bool
            Whether to use prior log-residual regularization.
            (default: ``False``).

        optimizer : str
            Optimizing method to use. Accepted values: 'BFGS' or 'L-BFGS-B'.
            (default: ``'L-BFGS-B'``).

        Returns
        -------
        ndarray
            Dispersion estimate.

        ndarray
            Whether L-BFGS-B converged. If not, dispersion is estimated
            using grid search.
        """

    @abstractmethod
    def fit_rough_dispersions(
        self, normed_counts: np.ndarray, design_matrix: np.ndarray
    ) -> np.ndarray:
        """Compute 'rough dispersion' estimates from a linear model.

        Follows the R code. Used as initial estimates in
        :meth:`DeseqDataSet.fit_genewise_dispersions()
        <pydeseq2.dds.DeseqDataSet.fit_genewise_dispersions>`.

        Parameters
        ----------
        normed_counts : ndarray
            Array of deseq2-normalized read counts. Rows: samples, columns: genes.

        design_matrix : ndarray
            Experiment design information (to split cohorts), with intercept.

        Returns
        -------
        ndarray
            Estimated dispersion parameter for each gene.
        """

    @abstractmethod
    def fit_moments_dispersions2(
        self, normed_counts: np.ndarray, size_factors: np.ndarray
    ) -> np.ndarray:
        """Compute dispersion estimates based on moments.

        Follows the R code. Used as initial estimates in
        :meth:`DeseqDataSet.fit_genewise_dispersions()
        <pydeseq2.dds.DeseqDataSet.fit_genewise_dispersions>`.

        Parameters
        ----------
        normed_counts : ndarray
            Array of deseq2-normalized read counts. Rows: samples, columns: genes.

        size_factors : ndarray
            DESeq2 normalization factors.

        Returns
        -------
        ndarray
            Estimated dispersion parameter for each gene.
        """

    @abstractmethod
    def dispersion_trend_gamma_glm(
        self, covariates: pd.Series, targets: pd.Series
    ) -> Tuple[np.ndarray, np.ndarray, bool]:
        """Fit a gamma GLM on gene dispersions.

        Implementations should concatenate the intercept themselves, and the
        first returned coefficient should be the intercept.

        Parameters
        ----------
        covariates : pd.Series
            Covariates for the regression (num_genes,).

        targets : pd.Series
            Targets for the regression (num_genes,).

        Returns
        -------
        coeffs : ndarray
            Coefficients of the regression.

        predictions : ndarray
            Predictions of the regression.

        converged : bool
            Whether the optimization converged.
        """

    @abstractmethod
    def wald_test(
        self,
        design_matrix: np.ndarray,
        disp: np.ndarray,
        lfc: np.ndarray,
        mu: np.ndarray,
        ridge_factor: np.ndarray,
        contrast: np.ndarray,
        lfc_null: np.ndarray,
        alt_hypothesis: Optional[
            Literal["greaterAbs", "lessAbs", "greater", "less"]
        ] = None,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run Wald test for differential expression.

        Computes Wald statistics, standard error and p-values from
        dispersion and LFC estimates.

        Parameters
        ----------
        design_matrix : ndarray
            Design matrix.

        disp : ndarray
            Dispersion estimate.

        lfc : ndarray
            Log-fold change estimate (in natural log scale).

        mu : ndarray
            Mean estimation for the NB model.

        ridge_factor : ndarray
            Regularization factors.

        contrast : ndarray
            Vector encoding the contrast that is being tested.

        lfc_null : ndarray
            The (log2) log fold change under the null hypothesis.

        alt_hypothesis : str or None
            The alternative hypothesis for computing wald p-values.
            (default: ``None``).

        Returns
        -------
        wald_p_value : ndarray
            Estimated p-value.

        wald_statistic : ndarray
            Wald statistic.

        wald_se : ndarray
            Standard error of the Wald statistic.
        """

    @abstractmethod
    def lfc_shrink_nbinom_glm(
        self,
        design_matrix: np.ndarray,
        counts: np.ndarray,
        cnv: np.ndarray,
        size: np.ndarray,
        offset: np.ndarray,
        prior_no_shrink_scale: float,
        prior_scale: float,
        optimizer: str,
        shrink_index: int,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Fit a negative binomial MAP LFC using an apeGLM prior.

        Only the LFC is shrunk, not the intercept.

        Parameters
        ----------
        design_matrix : ndarray
            Design matrix.

        counts : ndarray
            Raw counts.

        cnv : ndarray
            Copy-number values supplied to the model; how they enter the fit
            is defined by the implementing class.

        size : ndarray
            Size parameter of NB family (inverse of dispersion).

        offset : ndarray
            Natural logarithm of size factor.

        prior_no_shrink_scale : float
            Prior variance for the intercept.

        prior_scale : float
            Prior variance for the LFC parameter.

        optimizer : str
            Optimizing method to use in case IRLS starts diverging.
            Accepted values: 'L-BFGS-B', 'BFGS' or 'Newton-CG'.

        shrink_index : int
            Index of the LFC coordinate to shrink.

        Returns
        -------
        beta: ndarray
            2-element array, containing the intercept (first) and the LFC (second).

        inv_hessian: ndarray
            Inverse of the Hessian of the objective at the estimated MAP LFC.

        converged: ndarray
            Whether L-BFGS-B converged for each optimization problem.
        """
368
+
369
+
370
+
371
+
372
+
373
+