edgepython 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1066 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""
|
|
3
|
+
Low-level dispersion estimation functions for edgePython.
|
|
4
|
+
|
|
5
|
+
Port of edgeR's adjustedProfileLik, maximizeInterpolant,
|
|
6
|
+
condLogLikDerDelta, condLogLikDerSize, dispCoxReid,
|
|
7
|
+
dispCoxReidInterpolateTagwise, dispCoxReidSplineTrend, dispBinTrend, etc.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import warnings
|
|
12
|
+
from scipy.special import gammaln, digamma, polygamma
|
|
13
|
+
from scipy.optimize import minimize_scalar, minimize
|
|
14
|
+
from scipy.interpolate import CubicSpline
|
|
15
|
+
from numba import njit
|
|
16
|
+
|
|
17
|
+
from .utils import (expand_as_matrix, systematic_subset, moving_average_by_col,
|
|
18
|
+
cut_with_min_n)
|
|
19
|
+
from .expression import ave_log_cpm
|
|
20
|
+
from .limma_port import is_fullrank
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def adjusted_profile_lik_grid(grid_dispersions, y, design, offset, weights=None):
    """Evaluate APL at multiple dispersion grid points efficiently.

    Optimized version that avoids per-call overhead of glm_fit by directly
    calling mglm_one_group and precomputing shared quantities.

    Parameters
    ----------
    grid_dispersions : ndarray of shape (ngrid,)
        Grid of dispersion values.
    y : ndarray (ngenes, nlibs)
        Count matrix.
    design : ndarray (nlibs, ncoefs)
        Design matrix.
    offset : ndarray (ngenes, nlibs)
        Offset matrix.
    weights : ndarray (ngenes, nlibs), optional
        Observation weights.

    Returns
    -------
    ndarray of shape (ngenes, ngrid) — APL values.
    """
    # NOTE: _expand_to_matrix is imported but not used in this function.
    from .glm_fit import mglm_one_group, _expand_to_matrix
    from .utils import design_as_factor

    # Normalize inputs: a 1-D y is treated as a single gene; a 1-D offset or
    # weight vector is broadcast to every gene row.
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ngenes, nlibs = y.shape
    design = np.asarray(design, dtype=np.float64)
    if design.ndim == 1:
        design = design.reshape(-1, 1)
    ncoefs = design.shape[1]

    offset = np.asarray(offset, dtype=np.float64)
    if offset.ndim == 1:
        offset = np.tile(offset, (ngenes, 1))

    if weights is not None:
        w = np.asarray(weights, dtype=np.float64)
        if w.ndim == 1:
            w = np.tile(w, (ngenes, 1))
    else:
        w = np.ones_like(y)

    grid_dispersions = np.asarray(grid_dispersions, dtype=np.float64)
    ngrid = len(grid_dispersions)

    # Precompute group structure (same for all grid points).  When the number
    # of distinct design rows equals the number of coefficients, the design is
    # one-way and each group can be fitted independently via mglm_one_group.
    group = design_as_factor(design)
    unique_groups = np.unique(group)
    ngroups = len(unique_groups)
    is_oneway = ngroups == ncoefs

    # Precompute group column indices
    group_cols = [np.where(group == grp)[0] for grp in unique_groups]

    # Check if design is indicator (no back-solve needed).
    # NOTE(review): is_indicator is computed but never read below — presumably
    # intended for a fast path that was not wired up; confirm before removing.
    first_of_group = np.array([cols[0] for cols in group_cols])
    design_unique = design[first_of_group]
    is_indicator = (np.sum(design_unique == 1) == ngroups and
                    np.sum(design_unique == 0) == (ngroups - 1) * ngroups)

    # Precompute gammaln(y+1) — same for all dispersions
    lgamma_y1 = gammaln(y + 1)

    # Output
    apl = np.empty((ngenes, ngrid), dtype=np.float64)

    for gi in range(ngrid):
        d = grid_dispersions[gi]
        disp_scalar = np.float64(d)

        if is_oneway:
            # Fit each group with mglm_one_group directly
            mu = np.empty_like(y)
            for g_idx, cols in enumerate(group_cols):
                y_g = y[:, cols]
                off_g = offset[:, cols]
                w_g = w[:, cols]
                disp_g = np.full_like(y_g, disp_scalar)
                b = mglm_one_group(y_g, dispersion=disp_g, offset=off_g,
                                   weights=w_g)
                # Clip the linear predictor to avoid overflow in exp().
                for jj in cols:
                    mu[:, jj] = np.exp(np.clip(b + offset[:, jj], -500, 500))
        else:
            # General case: fall back to full glm_fit
            from .glm_fit import glm_fit
            fit = glm_fit(y, design=design, dispersion=d, offset=offset,
                          weights=weights, prior_count=0)
            mu = fit['fitted.values']

        # NB log-likelihood (vectorized).
        # NOTE(review): r*log(r) overflows for d == 0 (r = 1e300); the grid is
        # presumably strictly positive — confirm at call sites.
        mu_safe = np.maximum(mu, 1e-300)
        r = 1.0 / max(d, 1e-300)

        ll = np.sum(w * (gammaln(y + r) - gammaln(r) - lgamma_y1
                         + r * np.log(r) + y * np.log(mu_safe)
                         - (r + y) * np.log(r + mu_safe)), axis=1)

        # Cox-Reid adjustment: -0.5 * log|X'WX| with NB working weights
        working_w = w * mu_safe / (1.0 + d * mu_safe)
        working_w = np.maximum(working_w, 1e-300)

        XtWX = np.einsum('gj,jk,jl->gkl', working_w, design, design)

        # Closed-form determinants for 1x1 / 2x2; slogdet for larger designs.
        if ncoefs == 1:
            logdet = np.log(np.maximum(XtWX[:, 0, 0], 1e-300))
        elif ncoefs == 2:
            det = XtWX[:, 0, 0] * XtWX[:, 1, 1] - XtWX[:, 0, 1] ** 2
            logdet = np.log(np.maximum(det, 1e-300))
        else:
            sign, logdet = np.linalg.slogdet(XtWX)
            # Non-positive determinant: drop the adjustment rather than emit NaN.
            logdet = np.where(sign > 0, logdet, 0.0)

        apl[:, gi] = ll - 0.5 * logdet

    return apl
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def adjusted_profile_lik(dispersion, y, design, offset, weights=None,
                         start=None, get_coef=False):
    """Tagwise Cox-Reid adjusted profile log-likelihoods for the dispersion.

    Port of edgeR's adjustedProfileLik (C code reimplemented).

    Parameters
    ----------
    dispersion : float or ndarray
        Dispersion value(s).  A scalar is broadcast to every gene.
    y : ndarray
        Count matrix (genes x samples).
    design : ndarray
        Design matrix.
    offset : ndarray
        Offset matrix.
    weights : ndarray, optional
        Observation weights.
    start : ndarray, optional
        Starting coefficients for GLM fit.
    get_coef : bool
        If True, return coefficients along with APL.

    Returns
    -------
    ndarray of adjusted profile log-likelihoods (one per gene),
    or dict with 'apl' and 'beta' if get_coef=True.
    """
    # Normalize inputs: 1-D y means a single gene; 1-D offset/weights are
    # broadcast across genes.
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ngenes, nlibs = y.shape
    design = np.asarray(design, dtype=np.float64)
    if design.ndim == 1:
        design = design.reshape(-1, 1)
    ncoefs = design.shape[1]

    offset = np.asarray(offset, dtype=np.float64)
    if offset.ndim == 1:
        offset = np.tile(offset, (ngenes, 1))

    dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if len(dispersion) == 1:
        disp = np.full(ngenes, dispersion[0])
    else:
        disp = dispersion

    if weights is not None:
        w = np.asarray(weights, dtype=np.float64)
        if w.ndim == 1:
            w = np.tile(w, (ngenes, 1))
    else:
        w = np.ones_like(y)

    # Fit GLM to get mu (prior_count=0: the APL must use unshrunk fits)
    from .glm_fit import glm_fit
    fit = glm_fit(y, design=design, dispersion=disp, offset=offset,
                  weights=weights, prior_count=0, start=start)
    mu = fit['fitted.values']
    beta = fit.get('unshrunk.coefficients', fit['coefficients'])

    # Compute adjusted profile log-likelihood for all genes (vectorized)
    mu_safe = np.maximum(mu, 1e-300)  # (ngenes, nlibs)
    r = 1.0 / np.maximum(disp, 1e-300)  # (ngenes,) NB size parameter
    # Genes with dispersion 0 fall back to the Poisson log-likelihood below.
    is_nb = disp > 0

    # NB log-likelihood (vectorized)
    r_col = r[:, None]  # (ngenes, 1)
    ll = np.zeros(ngenes)
    if np.any(is_nb):
        nb = is_nb
        ll[nb] = np.sum(w[nb] * (gammaln(y[nb] + r_col[nb]) - gammaln(r_col[nb])
                                 - gammaln(y[nb] + 1)
                                 + r_col[nb] * np.log(r_col[nb]) + y[nb] * np.log(mu_safe[nb])
                                 - (r_col[nb] + y[nb]) * np.log(r_col[nb] + mu_safe[nb])), axis=1)
    if np.any(~is_nb):
        pois = ~is_nb
        ll[pois] = np.sum(w[pois] * (y[pois] * np.log(mu_safe[pois])
                                     - mu_safe[pois] - gammaln(y[pois] + 1)), axis=1)

    # Cox-Reid adjustment: -0.5 * log|X'WX| (vectorized)
    # Working weights: mu / (1 + d*mu) for NB, mu for Poisson
    disp_col = disp[:, None]  # (ngenes, 1)
    working_w = np.where(is_nb[:, None],
                         w * mu_safe / (1.0 + disp_col * mu_safe),
                         w * mu_safe)
    working_w = np.maximum(working_w, 1e-300)  # (ngenes, nlibs)

    # Compute X'WX for all genes at once using einsum
    # XtWX[g, k, l] = sum_j working_w[g,j] * design[j,k] * design[j,l]
    XtWX = np.einsum('gj,jk,jl->gkl', working_w, design, design)  # (ngenes, ncoefs, ncoefs)

    # Log determinant for all genes; closed forms for 1x1 / 2x2 are cheaper
    # than slogdet.
    if ncoefs == 1:
        logdet = np.log(np.maximum(XtWX[:, 0, 0], 1e-300))
    elif ncoefs == 2:
        det = XtWX[:, 0, 0] * XtWX[:, 1, 1] - XtWX[:, 0, 1] ** 2
        logdet = np.log(np.maximum(det, 1e-300))
    else:
        sign, logdet = np.linalg.slogdet(XtWX)
        # Degenerate (non-positive-definite) X'WX: skip the adjustment.
        logdet = np.where(sign > 0, logdet, 0.0)

    cr_adj = -0.5 * logdet
    apl = ll + cr_adj

    if get_coef:
        return {'apl': apl, 'beta': beta}
    return apl
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
@njit(cache=True)
def _fmm_spline(n, x, y, b, c, d):
    """Forsythe-Malcolm-Moler cubic spline (matches R's splines.c / edgeR's fmm_spline).

    Computes coefficients b, c, d such that in segment i:
        S(t) = y[i] + b[i]*t + c[i]*t^2 + d[i]*t^3
    where t = x_eval - x[i].

    Parameters
    ----------
    n : int
        Number of knots; must equal len(x) == len(y).
    x : ndarray
        Strictly increasing knot positions.
    y : ndarray
        Values at the knots (read-only here).
    b, c, d : ndarray
        Output coefficient arrays of length n, overwritten in place.
        They are also used as scratch storage for the tridiagonal solve.
    """
    # Degenerate cases: nothing to do for fewer than 2 points; a straight
    # line for exactly 2 points.
    if n < 2:
        return
    if n < 3:
        t = (y[1] - y[0]) / (x[1] - x[0])
        b[0] = t
        b[1] = t
        c[0] = c[1] = d[0] = d[1] = 0.0
        return

    nm1 = n - 1

    # Set up tridiagonal system
    # Using d for offdiagonal, b for diagonal, c for RHS
    d[0] = x[1] - x[0]
    c[1] = (y[1] - y[0]) / d[0]
    for i in range(1, nm1):
        d[i] = x[i + 1] - x[i]
        b[i] = 2.0 * (d[i - 1] + d[i])
        c[i + 1] = (y[i + 1] - y[i]) / d[i]
        c[i] = c[i + 1] - c[i]

    # End conditions (FMM: match third derivatives at the ends using a cubic
    # through the four nearest points; falls back to zero curvature for n == 3)
    b[0] = -d[0]
    b[nm1] = -d[nm1 - 1]
    c[0] = 0.0
    c[nm1] = 0.0
    if n > 3:
        c[0] = c[2] / (x[3] - x[1]) - c[1] / (x[2] - x[0])
        c[nm1] = c[nm1 - 1] / (x[nm1] - x[nm1 - 2]) - c[nm1 - 2] / (x[nm1 - 1] - x[nm1 - 3])
        c[0] = c[0] * d[0] * d[0] / (x[3] - x[0])
        c[nm1] = -c[nm1] * d[nm1 - 1] * d[nm1 - 1] / (x[nm1] - x[nm1 - 3])

    # Gaussian elimination (forward sweep of the tridiagonal solve)
    for i in range(1, n):
        t = d[i - 1] / b[i - 1]
        b[i] = b[i] - t * d[i - 1]
        c[i] = c[i] - t * c[i - 1]

    # Backward substitution
    c[nm1] = c[nm1] / b[nm1]
    for i in range(nm1 - 1, -1, -1):
        c[i] = (c[i] - d[i] * c[i + 1]) / b[i]

    # Compute polynomial coefficients from the solved second derivatives
    b[nm1] = (y[nm1] - y[nm1 - 1]) / d[nm1 - 1] + d[nm1 - 1] * (c[nm1 - 1] + 2.0 * c[nm1])
    for i in range(nm1):
        b[i] = (y[i + 1] - y[i]) / d[i] - d[i] * (c[i + 1] + 2.0 * c[i])
        d[i] = (c[i + 1] - c[i]) / d[i]
        c[i] = 3.0 * c[i]
    c[nm1] = 3.0 * c[nm1]
    d[nm1] = d[nm1 - 1]
|
313
|
+
|
|
314
|
+
|
|
315
|
+
@njit(cache=True)
def _maximize_interpolant_kernel(x, y_mat, ngenes, npts, result):
    """Numba kernel: FMM spline + analytical max (matches edgeR's C find_max).

    For each gene, fits an FMM cubic spline, finds the grid point with the
    highest value, then analytically solves for the maximum on the two
    neighbouring segments by finding roots of the derivative (a quadratic).
    This is O(npts) per gene with no discretisation artifacts.

    Parameters
    ----------
    x : ndarray (npts,)
        Grid points (sorted, unique).
    y_mat : ndarray (ngenes, npts)
        Function values at the grid points, one row per gene.
    ngenes, npts : int
        Dimensions of y_mat.
    result : ndarray (ngenes,)
        Output: maximizing x value per gene, written in place.
    """
    # Scratch arrays reused across genes (spline coefficients + row copy).
    b = np.empty(npts)
    c = np.empty(npts)
    d = np.empty(npts)
    y_row = np.empty(npts)

    for g in range(ngenes):
        # Copy row (fmm_spline modifies y in-place via c)
        for i in range(npts):
            y_row[i] = y_mat[g, i]

        # Find coarse grid maximum
        maxed = y_row[0]
        maxed_at = 0
        for i in range(1, npts):
            if y_row[i] > maxed:
                maxed = y_row[i]
                maxed_at = i
        x_max = x[maxed_at]

        # Fit FMM spline: S(t) = y[i] + b[i]*t + c[i]*t^2 + d[i]*t^3
        _fmm_spline(npts, x, y_row, b, c, d)

        # Check left segment (maxed_at - 1)
        if maxed_at > 0:
            seg = maxed_at - 1
            lb = b[seg]
            lc = c[seg]
            ld = d[seg]

            # Derivative: b + 2c*t + 3d*t^2 = 0
            # Discriminant: (2c)^2 - 4*(3d)*b = 4*(c^2 - 3*d*b)
            delta = lc * lc - 3.0 * ld * lb
            if delta >= 0.0:
                # Solution for maximum (not minimum); degenerate cubic
                # (ld == 0) falls back to 0, which is rejected below.
                numerator = -lc - np.sqrt(delta)
                chosen_sol = numerator / (3.0 * ld) if ld != 0.0 else 0.0

                # Only accept a stationary point strictly inside the segment.
                seg_width = x[maxed_at] - x[seg]
                if chosen_sol > 0.0 and chosen_sol < seg_width:
                    # Horner evaluation of the cubic at the stationary point.
                    temp = ((ld * chosen_sol + lc) * chosen_sol + lb) * chosen_sol + y_row[seg]
                    if temp > maxed:
                        maxed = temp
                        x_max = chosen_sol + x[seg]

        # Check right segment (maxed_at)
        if maxed_at < npts - 1:
            seg = maxed_at
            rb = b[seg]
            rc = c[seg]
            rd = d[seg]

            delta = rc * rc - 3.0 * rd * rb
            if delta >= 0.0:
                numerator = -rc - np.sqrt(delta)
                chosen_sol = numerator / (3.0 * rd) if rd != 0.0 else 0.0

                seg_width = x[seg + 1] - x[seg]
                if chosen_sol > 0.0 and chosen_sol < seg_width:
                    temp = ((rd * chosen_sol + rc) * chosen_sol + rb) * chosen_sol + y_row[seg]
                    if temp > maxed:
                        maxed = temp
                        x_max = chosen_sol + x[seg]

        result[g] = x_max
|
|
389
|
+
|
|
390
|
+
def maximize_interpolant(x, y):
    """Locate the spline-interpolated maximum of each row of ``y`` over grid ``x``.

    Port of edgeR's maximizeInterpolant. Fits an FMM cubic spline per row and
    solves analytically for the maximum on the segments adjacent to the best
    grid point, matching R's C implementation exactly.

    Parameters
    ----------
    x : ndarray
        Grid points (sorted, unique).
    y : ndarray
        Log-likelihood matrix (genes x grid points); a 1-D input is treated
        as a single row.

    Returns
    -------
    ndarray of maximizing x values (one per gene).
    """
    # Copy the grid so the numba kernel can treat it as private storage.
    grid = np.array(x, dtype=np.float64)
    values = np.asarray(y, dtype=np.float64)
    if values.ndim == 1:
        values = values.reshape(1, -1)

    nrows = values.shape[0]
    npoints = grid.shape[0]

    argmax_out = np.empty(nrows, dtype=np.float64)
    _maximize_interpolant_kernel(grid, values, nrows, npoints, argmax_out)
    return argmax_out
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def cond_log_lik_der_size(y, r, der=0):
    """Derivatives of conditional log-likelihood w.r.t. r=1/dispersion.

    Port of edgeR's condLogLikDerSize.

    Parameters
    ----------
    y : ndarray
        Count matrix (genes x samples); a 1-D input is treated as one gene.
    r : float or ndarray
        NB size parameter(s), r = 1/dispersion. A scalar (or length-1 array)
        is broadcast across all genes.
    der : int
        Derivative order: 0 (log-likelihood), 1, or 2.

    Returns
    -------
    ndarray with one value per gene.

    Raises
    ------
    ValueError
        If der is not 0, 1, or 2.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    # FIX: accept scalar r as well as per-gene vectors. Previously a plain
    # float crashed on r[:, None]; atleast_1d makes broadcasting uniform.
    r = np.atleast_1d(np.asarray(r, dtype=np.float64))
    n = y.shape[1]
    m = np.mean(y, axis=1)

    if der == 0:
        # Conditional log-likelihood (likelihood given the row sum)
        return (np.sum(gammaln(y + r[:, None]), axis=1) +
                gammaln(n * r) - gammaln(n * (m + r)) - n * gammaln(r))
    elif der == 1:
        # First derivative w.r.t. r
        return (np.sum(digamma(y + r[:, None]), axis=1) +
                n * digamma(n * r) - n * digamma(n * (m + r)) - n * digamma(r))
    elif der == 2:
        # Second derivative w.r.t. r
        return (np.sum(polygamma(1, y + r[:, None]), axis=1) +
                n**2 * polygamma(1, n * r) - n**2 * polygamma(1, n * (m + r)) -
                n * polygamma(1, r))
    else:
        raise ValueError(f"der must be 0, 1, or 2, got {der}")
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def cond_log_lik_der_delta(y, delta, der=0):
    """Derivatives of conditional log-likelihood w.r.t. delta=dispersion/(1+dispersion).

    Port of edgeR's condLogLikDerDelta. The delta derivatives are obtained
    from the size (r = 1/delta - 1) derivatives via the chain rule.
    """
    counts = np.asarray(y, dtype=np.float64)
    if counts.ndim == 1:
        counts = counts.reshape(1, -1)

    delta = np.atleast_1d(np.asarray(delta, dtype=np.float64))
    # Reparameterize: delta = dispersion/(1+dispersion)  =>  r = 1/delta - 1
    size = (1.0 / delta) - 1.0

    if der == 0:
        return cond_log_lik_der_size(counts, size, der=0)
    if der == 1:
        # d/d(delta) = d/dr * dr/d(delta), with dr/d(delta) = -delta^-2
        return -cond_log_lik_der_size(counts, size, der=1) / delta ** 2
    if der == 2:
        # Second-order chain rule: f'' (dr/ddelta)^2 + f' d2r/ddelta2
        first = cond_log_lik_der_size(counts, size, der=1)
        second = cond_log_lik_der_size(counts, size, der=2)
        return 2.0 * first / delta ** 3 + second / delta ** 4
    raise ValueError(f"der must be 0, 1, or 2, got {der}")
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def common_cond_log_lik_der_delta(y_split, delta, der=0):
    """Sum of conditional log-likelihoods across groups.

    Port of edgeR's commonCondLogLikDerDelta: accumulates the per-group
    conditional log-likelihood (or its derivative) over every group in
    ``y_split`` at a shared delta.
    """
    running = 0.0
    for group_counts in y_split:
        contribution = cond_log_lik_der_delta(group_counts, delta, der=der)
        running = running + np.sum(contribution)
    return running
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def disp_cox_reid(y, design=None, offset=None, weights=None, ave_log_cpm_vals=None,
                  interval=(0, 4), tol=1e-5, min_row_sum=5, subset=10000):
    """Cox-Reid APL estimator of common dispersion.

    Port of edgeR's dispCoxReid.

    Parameters
    ----------
    y : ndarray
        Count matrix.
    design : ndarray, optional
        Design matrix; defaults to an intercept-only design.
    offset : ndarray, optional
        Offset; defaults to log column sums (log library sizes).
    weights : ndarray, optional
        Weights.
    ave_log_cpm_vals : ndarray, optional
        Pre-computed AveLogCPM values.
    interval : tuple
        Non-negative search interval for the dispersion.
    tol : float
        Optimization tolerance.
    min_row_sum : int
        Minimum row sum; rows below it are dropped.
    subset : int
        Number of genes to subset for speed.

    Returns
    -------
    float : estimated common dispersion.

    Raises
    ------
    ValueError
        If the interval is negative or no rows survive filtering.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)

    if design is None:
        design = np.ones((y.shape[1], 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

    if offset is None:
        offset = np.log(y.sum(axis=0))
    offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)

    # FIX: normalize array-like inputs up-front. Previously `weights` was only
    # converted inside the row-filtering branch, so a list passed when all rows
    # were kept crashed on `weights.ndim` in the subset branch; likewise a list
    # ave_log_cpm_vals broke boolean indexing.
    if weights is not None:
        weights = np.asarray(weights, dtype=np.float64)
    if ave_log_cpm_vals is not None:
        ave_log_cpm_vals = np.asarray(ave_log_cpm_vals, dtype=np.float64)

    if interval[0] < 0:
        raise ValueError("please give a non-negative interval for the dispersion")

    # Apply min row count: drop genes with too few total counts
    row_sums = y.sum(axis=1)
    keep = row_sums >= min_row_sum
    if not np.all(keep):
        y = y[keep]
        offset = offset[keep]
        if weights is not None and weights.ndim == 2:
            weights = weights[keep]
        if ave_log_cpm_vals is not None:
            ave_log_cpm_vals = ave_log_cpm_vals[keep]

    if y.shape[0] < 1:
        raise ValueError("no data rows with required number of counts")

    # Subsetting: take a systematic abundance-ordered subset when the data is
    # at least twice as large as the requested subset size
    if subset is not None and subset <= y.shape[0] / 2:
        if ave_log_cpm_vals is None:
            ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)
        i = systematic_subset(subset, ave_log_cpm_vals)
        y = y[i]
        offset = offset[i]
        if weights is not None and weights.ndim == 2:
            weights = weights[i]

    # Objective: negative total APL, optimized on the fourth-root scale
    # (par**4) which stabilizes the search near zero, as in edgeR.
    def fun(par):
        disp = par ** 4
        return -np.sum(adjusted_profile_lik(disp, y, design, offset, weights=weights))

    lo = interval[0] ** 0.25
    hi = interval[1] ** 0.25
    if lo == 0:
        lo = 1e-10
    result = minimize_scalar(fun, bounds=(lo, hi), method='bounded',
                             options={'xatol': tol})
    return result.x ** 4
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def disp_cox_reid_interpolate_tagwise(y, design, offset=None, dispersion=None,
                                      trend=True, ave_log_cpm_vals=None,
                                      min_row_sum=5, prior_df=10, span=0.3,
                                      grid_npts=11, grid_range=(-6, 6),
                                      weights=None):
    """Estimate tagwise NB dispersions using Cox-Reid APL with interpolation.

    Port of edgeR's dispCoxReidInterpolateTagwise.

    Parameters
    ----------
    y : ndarray
        Count matrix.
    design : ndarray
        Design matrix.
    offset : ndarray, optional
        Offset; defaults to log column sums.
    dispersion : float or ndarray
        Starting dispersion(s); a scalar is broadcast to all genes.
    trend : bool
        Smooth the APL along abundance rather than using the global mean.
    ave_log_cpm_vals : ndarray, optional
        Average log CPM.
    min_row_sum : int
        Minimum row sum; genes below it keep their starting dispersion.
    prior_df : float
        Prior degrees of freedom for squeezing toward the trend.
    span : float
        Span for the moving-average smoother.
    grid_npts : int
        Number of grid points.
    grid_range : tuple
        Grid range in log2 fold units around the starting dispersion.
    weights : ndarray, optional
        Weights.

    Returns
    -------
    ndarray of tagwise dispersions.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    design = np.asarray(design, dtype=np.float64)
    if design.ndim == 1:
        design = design.reshape(-1, 1)
    ncoefs = design.shape[1]

    if offset is None:
        offset = np.log(y.sum(axis=0))
    offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)
    else:
        ave_log_cpm_vals = np.asarray(ave_log_cpm_vals, dtype=np.float64)

    dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if len(dispersion) == 1:
        dispersion = np.full(ntags, dispersion[0])
    elif len(dispersion) != ntags:
        raise ValueError("length of dispersion doesn't match nrow(y)")
    else:
        # FIX: np.asarray can alias the caller's array, and the in-place
        # update below (dispersion[keep] = ...) would then mutate the
        # caller's data. Copy so this function has no side effect on input.
        dispersion = dispersion.copy()

    # Apply min_row_sum: recurse on the well-expressed genes only, leaving
    # low-count genes at their starting dispersion
    row_sums = y.sum(axis=1)
    keep = row_sums >= min_row_sum
    if not np.all(keep):
        if np.any(keep):
            dispersion[keep] = disp_cox_reid_interpolate_tagwise(
                y[keep], design, offset=offset[keep],
                dispersion=dispersion[keep],
                ave_log_cpm_vals=ave_log_cpm_vals[keep],
                grid_npts=grid_npts, min_row_sum=0,
                prior_df=prior_df, span=span, trend=trend,
                weights=weights[keep] if weights is not None and np.ndim(weights) == 2 else weights)
        return dispersion

    # Posterior profile likelihood.
    # NOTE(review): divides by residual df (nlibs - ncoefs); a saturated
    # design (nlibs == ncoefs) would divide by zero — confirm callers filter
    # that case upstream.
    prior_n = prior_df / (nlibs - ncoefs)
    spline_pts = np.linspace(grid_range[0], grid_range[1], grid_npts)
    apl = np.zeros((ntags, grid_npts))

    # APL on a log2-fold grid around each gene's starting dispersion
    for i in range(grid_npts):
        spline_disp = dispersion * 2 ** spline_pts[i]
        apl[:, i] = adjusted_profile_lik(spline_disp, y, design, offset, weights=weights)

    if trend:
        # Smooth APL along abundance: sort by AveLogCPM, moving-average, unsort
        o = np.argsort(ave_log_cpm_vals)
        oo = np.argsort(o)
        width = int(np.floor(span * ntags))
        width = max(width, 1)
        apl_smooth = moving_average_by_col(apl[o], width=width)[oo]
    else:
        apl_smooth = np.tile(np.mean(apl, axis=0), (ntags, 1))

    # Squeeze each gene's APL toward the smoothed trend with weight prior_n
    apl_smooth = (apl + prior_n * apl_smooth) / (1 + prior_n)

    # Tagwise maximization on the log2-fold grid
    d = maximize_interpolant(spline_pts, apl_smooth)
    return dispersion * 2 ** d
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def _ns_basis_with_knots(x, internal_knots, boundary_knots):
|
|
676
|
+
"""Create natural cubic spline basis matching R's cbind(1, ns(x, knots=knots)).
|
|
677
|
+
|
|
678
|
+
Uses the truncated power basis from ESL (Hastie et al.) eq 5.4-5.5.
|
|
679
|
+
|
|
680
|
+
Parameters
|
|
681
|
+
----------
|
|
682
|
+
x : array
|
|
683
|
+
Data values.
|
|
684
|
+
internal_knots : array
|
|
685
|
+
Internal knot positions.
|
|
686
|
+
boundary_knots : array of length 2
|
|
687
|
+
[lower, upper] boundary knots.
|
|
688
|
+
|
|
689
|
+
Returns
|
|
690
|
+
-------
|
|
691
|
+
ndarray of shape (n, len(internal_knots) + 2)
|
|
692
|
+
Basis matrix including intercept column.
|
|
693
|
+
"""
|
|
694
|
+
x = np.asarray(x, dtype=np.float64)
|
|
695
|
+
n = len(x)
|
|
696
|
+
internal_knots = np.asarray(internal_knots, dtype=np.float64)
|
|
697
|
+
|
|
698
|
+
all_knots = np.sort(np.concatenate([[boundary_knots[0]],
|
|
699
|
+
internal_knots,
|
|
700
|
+
[boundary_knots[1]]]))
|
|
701
|
+
K = len(all_knots)
|
|
702
|
+
ncols = K # = len(internal_knots) + 2
|
|
703
|
+
|
|
704
|
+
basis = np.zeros((n, ncols))
|
|
705
|
+
basis[:, 0] = 1.0
|
|
706
|
+
basis[:, 1] = x
|
|
707
|
+
|
|
708
|
+
if K > 2:
|
|
709
|
+
xi_K = all_knots[-1]
|
|
710
|
+
xi_Km1 = all_knots[-2]
|
|
711
|
+
|
|
712
|
+
def d_func(xi_j):
|
|
713
|
+
return (np.maximum(x - xi_j, 0) ** 3 -
|
|
714
|
+
np.maximum(x - xi_K, 0) ** 3) / (xi_K - xi_j)
|
|
715
|
+
|
|
716
|
+
d_Km1 = d_func(xi_Km1)
|
|
717
|
+
for j in range(K - 2):
|
|
718
|
+
basis[:, 2 + j] = d_func(all_knots[j]) - d_Km1
|
|
719
|
+
|
|
720
|
+
return basis
|
|
721
|
+
|
|
722
|
+
|
|
723
|
+
def disp_cox_reid_spline_trend(y, design, offset=None, df=5, subset=10000,
                               ave_log_cpm_vals=None, method_optim='Nelder-Mead'):
    """Estimate spline trend dispersion.

    Faithful port of edgeR's dispCoxReidSplineTrend.
    Fits: dispersion = exp(X @ par - abundance) where X is a natural spline
    basis, optimized via Nelder-Mead on adjusted profile likelihood.

    Parameters
    ----------
    y : ndarray
        Count matrix.
    design : ndarray
        Design matrix.
    offset : ndarray, optional
        Offset; defaults to zeros.
    df : int
        Spline degrees of freedom; the basis gets df + 1 columns
        (df - 1 internal knots plus intercept and linear term),
        matching len(par0).
    subset : int
        Number of genes in the systematic subset used for optimization.
    ave_log_cpm_vals : ndarray, optional
        Pre-computed AveLogCPM values.
    method_optim : str
        scipy.optimize.minimize method.

    Returns
    -------
    dict with 'dispersion' and 'AveLogCPM'.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if offset is None:
        offset = np.zeros(nlibs)
    offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset)

    # All-zero genes cannot inform the trend; fit on the non-zero rows only
    all_zero = y.sum(axis=1) == 0
    abundance_nonzero = ave_log_cpm_vals[~all_zero]
    y_nonzero = y[~all_zero]
    offset_nonzero = offset[~all_zero]

    # Systematic abundance-ordered subset used during optimization
    i = systematic_subset(subset, abundance_nonzero)

    # Too few usable genes for a trend: fall back to a common dispersion
    if len(abundance_nonzero) < 2:
        common_disp = disp_cox_reid(y_nonzero, design, offset=offset_nonzero)
        disp = np.full(ntags, common_disp)
        return {'dispersion': disp, 'AveLogCPM': ave_log_cpm_vals}

    # Knot placement matching R: weighted mix of quantile and equally-spaced
    p1 = np.arange(1, df) / df
    knots1 = np.quantile(abundance_nonzero, p1)
    r = np.array([np.min(abundance_nonzero), np.max(abundance_nonzero)])
    knots2 = r[0] + p1 * (r[1] - r[0])
    knots = 0.3 * knots1 + 0.7 * knots2

    # Build natural spline basis: cbind(1, ns(abundance, knots=knots))
    X = _ns_basis_with_knots(abundance_nonzero, knots, boundary_knots=r)

    # Objective: negative sum of adjusted profile likelihoods; any numerical
    # failure inside the APL evaluation is penalized with a large value so
    # Nelder-Mead steps away from it
    def fun(par, y_sub, design, offset_sub, abundance_sub, X_sub):
        eta = X_sub @ par
        dispersion = np.exp(eta - abundance_sub)
        try:
            apl = adjusted_profile_lik(dispersion, y_sub, design, offset_sub)
            return -np.sum(apl)
        except Exception:
            return 1e10

    # Initial parameters matching R: intercept set so that the starting
    # dispersion is roughly 0.1 at the median abundance
    par0 = np.zeros(df + 1)
    par0[0] = np.median(abundance_nonzero[i]) + np.log(0.1)

    result = minimize(fun, par0, args=(y_nonzero[i], design,
                                       offset_nonzero[i], abundance_nonzero[i],
                                       X[i]),
                      method=method_optim)

    # Evaluate fitted dispersions for all genes
    disp_nonzero = np.exp(X @ result.x - abundance_nonzero)

    # All-zero genes get the dispersion of the lowest-abundance fitted gene
    disp = np.full(ntags, np.nan)
    disp[all_zero] = disp_nonzero[np.argmin(abundance_nonzero)] if len(disp_nonzero) > 0 else 0.1
    disp[~all_zero] = disp_nonzero

    return {'dispersion': disp, 'AveLogCPM': ave_log_cpm_vals}
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
def disp_cox_reid_power_trend(y, design, offset=None, ave_log_cpm_vals=None,
                              subset=10000, method_optim='Nelder-Mead'):
    """Estimate power trend dispersion.

    Faithful port of edgeR's dispCoxReidPowerTrend.
    Fits the parametric model: dispersion = exp(a + b*AveLogCPM) + exp(c)
    by maximizing the Cox-Reid adjusted profile likelihood via Nelder-Mead.

    Returns
    -------
    dict with 'dispersion' and 'AveLogCPM'.
    """
    counts = np.asarray(y, dtype=np.float64)
    if counts.ndim == 1:
        counts = counts.reshape(1, -1)

    # Default offset is the log library size, broadcast to the count matrix.
    if offset is None:
        offset = np.log(counts.sum(axis=0))
    offset_mat = expand_as_matrix(np.asarray(offset, dtype=np.float64),
                                  counts.shape)

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(counts, offset=offset_mat)
    abundance_full = ave_log_cpm_vals

    # Genes with zero total count carry no information about the trend;
    # exclude them from the fit (they still receive fitted dispersions below).
    nonzero = counts.sum(axis=1) > 0
    abund_nz = abundance_full[nonzero]
    counts_nz = counts[nonzero]
    offset_nz = offset_mat[nonzero]

    # Systematic subset of genes for efficiency.
    keep = systematic_subset(subset, abund_nz)

    def neg_sum_apl(par, y_sub, design_mat, off_sub, abund_sub):
        """Negative summed adjusted profile likelihood of the power model."""
        model_disp = np.exp(par[0] + par[1] * abund_sub) + np.exp(par[2])
        try:
            apl = adjusted_profile_lik(model_disp, y_sub, design_mat, off_sub)
        except Exception:
            # Penalize parameter values where the APL cannot be evaluated.
            return 1e10
        return -np.sum(apl)

    # Starting values: dispersion ~0.1, flat trend, tiny additive floor.
    start = np.array([np.log(0.1), 0.0, -5.0])
    opt = minimize(neg_sum_apl, start,
                   args=(counts_nz[keep], design,
                         offset_nz[keep], abund_nz[keep]),
                   method=method_optim)

    # Evaluate the fitted trend at every gene's abundance.
    a, b, c = opt.x
    dispersion = np.exp(a + b * abundance_full) + np.exp(c)

    return {'dispersion': dispersion, 'AveLogCPM': abundance_full}
|
+
|
|
853
|
+
def disp_bin_trend(y, design=None, offset=None, df=5, span=0.3,
                   min_n=400, method_bin='CoxReid', method_trend='spline',
                   ave_log_cpm_vals=None, weights=None):
    """Estimate dispersion trend by binning.

    Port of edgeR's dispBinTrend: genes are grouped into abundance bins,
    a common (Cox-Reid) dispersion is estimated per bin, and a smooth
    trend is fitted through the per-bin estimates.

    Parameters
    ----------
    y : array-like
        Count matrix (genes x libraries); a 1-D input is treated as one gene.
    design : array-like or None
        Design matrix; defaults to an intercept-only column of ones.
    offset : array-like or None
        Log-scale offsets; defaults to log column sums (library sizes).
    df : int
        Degrees of freedom for the natural-spline trend fit.
    span, method_bin, method_trend
        Accepted for API compatibility with edgeR's dispBinTrend.
        NOTE(review): these three are not used by this implementation.
    ave_log_cpm_vals : array or None
        Precomputed average log-CPM per gene; computed if None.
    weights : array or None
        Optional observation weights; a 2-D weights matrix is subset by gene
        alongside the counts.

    Returns
    -------
    dict with 'dispersion', 'AveLogCPM', 'bin.AveLogCPM', 'bin.dispersion'.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    # Genes with at least one nonzero count; all-zero genes are excluded
    # from binning and trend fitting.
    pos = y.sum(axis=1) > 0
    if not np.any(pos):
        # Degenerate case: nothing to estimate from.
        # NOTE(review): this early return omits the 'bin.*' keys that the
        # normal paths include.
        return {'AveLogCPM': ave_log_cpm_vals,
                'dispersion': np.zeros(ntags)}
    npostags = np.sum(pos)

    if design is None:
        design = np.ones((nlibs, 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

    if offset is None:
        offset = np.log(y.sum(axis=0))
    offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)

    # Define bins: nbins grows as npostags^0.4 (capped at 1000), while
    # keeping at least min_n (>= 50) genes per bin.
    if npostags < 100:
        nbins = 1
    else:
        nbins = int(np.floor(npostags ** 0.4))
        nbins = min(nbins, 1000)
        min_n = min(min_n, npostags // nbins)
        if min_n < 50:
            nbins = npostags // 50
            min_n = 50

    nbins = max(nbins, 1)

    if nbins == 1:
        # Single bin: the trend collapses to one common dispersion value.
        d = disp_cox_reid(y[pos], design, offset=offset[pos],
                          weights=weights[pos] if weights is not None and np.ndim(weights) == 2 else weights,
                          min_row_sum=0, ave_log_cpm_vals=ave_log_cpm_vals[pos])
        return {'AveLogCPM': ave_log_cpm_vals,
                'dispersion': np.full(ntags, d),
                'bin.AveLogCPM': np.array([np.mean(ave_log_cpm_vals[pos])]),
                'bin.dispersion': np.array([d])}

    # Assign nonzero genes to abundance bins; group 0 marks all-zero genes.
    # Assumes cut_with_min_n returns 1-based group labels — the loop below
    # iterates groups 1..nbins. TODO confirm against cut_with_min_n.
    groups = np.zeros(ntags, dtype=int)
    bins_info = cut_with_min_n(ave_log_cpm_vals[pos], intervals=nbins, min_n=min_n)
    groups[pos] = bins_info['group']

    # Per-bin common dispersion (bin_d) and mean abundance (bin_a).
    bin_d = np.zeros(nbins)
    bin_a = np.zeros(nbins)
    for i in range(1, nbins + 1):
        bin_mask = groups == i
        if np.sum(bin_mask) == 0:
            # Empty bin: leaves bin_d/bin_a at 0 for this slot.
            continue
        bin_ave = ave_log_cpm_vals[bin_mask]
        w_bin = None
        if weights is not None and np.ndim(weights) == 2:
            w_bin = weights[bin_mask]
        try:
            bin_d[i - 1] = disp_cox_reid(y[bin_mask], design, offset=offset[bin_mask],
                                         weights=w_bin, min_row_sum=0,
                                         ave_log_cpm_vals=bin_ave)
        except Exception:
            # Best-effort fallback when the per-bin estimate fails.
            bin_d[i - 1] = 0.1
        bin_a[i - 1] = np.mean(bin_ave)

    # If few bins, use linear interpolation on the sqrt scale
    # (sqrt stabilizes the fit; squared back afterwards).
    if nbins < 7:
        from scipy.interpolate import interp1d
        f = interp1d(bin_a, np.sqrt(np.maximum(bin_d, 0)),
                     fill_value='extrapolate', kind='linear')
        dispersion = f(ave_log_cpm_vals) ** 2
        return {'AveLogCPM': ave_log_cpm_vals, 'dispersion': dispersion,
                'bin.AveLogCPM': bin_a, 'bin.dispersion': bin_d}

    # Natural spline + OLS matching R's dispBinTrend:
    # ns(bin.A, df=df, knots=0.3*quantile+0.7*equispaced, intercept=TRUE)
    # then lm.fit(basisbins, sqrt(bin.d))
    p1 = np.arange(1, df) / df
    knots1 = np.quantile(bin_a, p1)
    r = np.array([np.min(bin_a), np.max(bin_a)])
    knots2 = r[0] + p1 * (r[1] - r[0])
    # Weighted mix of quantile-based and equally-spaced knots, as in edgeR.
    knots = 0.3 * knots1 + 0.7 * knots2

    try:
        basisbins = _ns_basis_with_knots(bin_a, knots, boundary_knots=r)
        # Least-squares fit on the sqrt-dispersion scale.
        beta = np.linalg.lstsq(basisbins, np.sqrt(np.maximum(bin_d, 0)),
                               rcond=None)[0]
        basisall = _ns_basis_with_knots(ave_log_cpm_vals, knots,
                                        boundary_knots=r)
        # Square back to the dispersion scale; clamp to be non-negative.
        dispersion = np.maximum((basisall @ beta) ** 2, 0)
    except Exception:
        # Fallback: flat trend at the mean of the per-bin estimates.
        dispersion = np.full(ntags, np.mean(bin_d))

    return {'AveLogCPM': ave_log_cpm_vals, 'dispersion': dispersion,
            'bin.AveLogCPM': bin_a, 'bin.dispersion': bin_d}
|
964
|
+
|
|
965
|
+
def disp_pearson(y, design=None, offset=None, subset=10000,
                 ave_log_cpm_vals=None):
    """Pearson estimator of common dispersion.

    Port of edgeR's dispPearson.
    """
    counts = np.asarray(y, dtype=np.float64)
    if counts.ndim == 1:
        counts = counts.reshape(1, -1)

    # Intercept-only design unless one is supplied.
    if design is None:
        design = np.ones((counts.shape[1], 1))
    design = np.asarray(design, dtype=np.float64)

    # Default offset: log library sizes, broadcast to matrix shape.
    if offset is None:
        offset = np.log(counts.sum(axis=0))
    offset = expand_as_matrix(np.asarray(offset, dtype=np.float64),
                              counts.shape)

    ntags, nlibs = counts.shape
    df_res = nlibs - design.shape[1]
    if df_res <= 0:
        warnings.warn("No residual df: setting dispersion to NA")
        return np.nan

    # Optionally restrict to a systematic subset of genes for speed.
    if subset is not None and subset < ntags:
        if ave_log_cpm_vals is None:
            ave_log_cpm_vals = ave_log_cpm(counts, offset=offset)
        keep = systematic_subset(subset, ave_log_cpm_vals)
        counts = counts[keep]
        offset = offset[keep]
        ntags = counts.shape[0]

    def scaled_pearson_minus_one(d):
        """Pearson chi-square / residual df, minus 1; its root is the estimate."""
        from .glm_fit import glm_fit
        fit = glm_fit(counts, design=design, dispersion=d, offset=offset,
                      prior_count=0)
        mu = fit['fitted.values']
        # NB variance function: mu + d*mu^2.
        chi2 = np.sum((counts - mu) ** 2 / (mu + d * mu ** 2))
        return (chi2 / (ntags * df_res) - 1)

    # Root-find on a fixed bracket; fall back to 0.1 if no root is found.
    try:
        from scipy.optimize import brentq
        estimate = brentq(scaled_pearson_minus_one, 0.001, 10.0, xtol=1e-5)
    except Exception:
        estimate = 0.1

    return max(estimate, 0)
+
|
|
1017
|
+
|
|
1018
|
+
def disp_deviance(y, design=None, offset=None, subset=10000,
                  ave_log_cpm_vals=None):
    """Deviance estimator of common dispersion.

    Port of edgeR's dispDeviance.
    """
    counts = np.asarray(y, dtype=np.float64)
    if counts.ndim == 1:
        counts = counts.reshape(1, -1)

    # Intercept-only design unless one is supplied.
    if design is None:
        design = np.ones((counts.shape[1], 1))
    design = np.asarray(design, dtype=np.float64)

    # Default offset: log library sizes, broadcast to matrix shape.
    if offset is None:
        offset = np.log(counts.sum(axis=0))
    offset = expand_as_matrix(np.asarray(offset, dtype=np.float64),
                              counts.shape)

    ntags, nlibs = counts.shape
    df_res = nlibs - design.shape[1]
    if df_res <= 0:
        warnings.warn("No residual df: setting dispersion to NA")
        return np.nan

    # Optionally restrict to a systematic subset of genes for speed.
    if subset is not None and subset < ntags:
        if ave_log_cpm_vals is None:
            ave_log_cpm_vals = ave_log_cpm(counts, offset=offset)
        keep = systematic_subset(subset, ave_log_cpm_vals)
        counts = counts[keep]
        offset = offset[keep]
        ntags = counts.shape[0]

    def scaled_deviance_minus_one(d):
        """Total deviance / residual df, minus 1; its root is the estimate."""
        from .glm_fit import glm_fit
        from .glm_levenberg import nbinom_deviance
        fit = glm_fit(counts, design=design, dispersion=d, offset=offset,
                      prior_count=0)
        dev = nbinom_deviance(counts, fit['fitted.values'], d)
        return np.sum(dev) / (ntags * df_res) - 1

    # Root-find on a fixed bracket; fall back to 0.1 if no root is found.
    try:
        from scipy.optimize import brentq
        estimate = brentq(scaled_deviance_minus_one, 0.001, 10.0, xtol=1e-5)
    except Exception:
        estimate = 0.1

    return max(estimate, 0)