edgepython-0.2.0-py3-none-any.whl

@@ -0,0 +1,1066 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Low-level dispersion estimation functions for edgePython.
4
+
5
+ Port of edgeR's adjustedProfileLik, maximizeInterpolant,
6
+ condLogLikDerDelta, condLogLikDerSize, dispCoxReid,
7
+ dispCoxReidInterpolateTagwise, dispCoxReidSplineTrend, dispBinTrend, etc.
8
+ """
9
+
10
+ import numpy as np
11
+ import warnings
12
+ from scipy.special import gammaln, digamma, polygamma
13
+ from scipy.optimize import minimize_scalar, minimize
15
+ from numba import njit
16
+
17
+ from .utils import (expand_as_matrix, systematic_subset, moving_average_by_col,
18
+ cut_with_min_n)
19
+ from .expression import ave_log_cpm
21
+
22
+
23
+ def adjusted_profile_lik_grid(grid_dispersions, y, design, offset, weights=None):
24
+ """Evaluate APL at multiple dispersion grid points efficiently.
25
+
26
+ Optimized version that avoids per-call overhead of glm_fit by directly
27
+ calling mglm_one_group and precomputing shared quantities.
28
+
29
+ Parameters
30
+ ----------
31
+ grid_dispersions : ndarray of shape (ngrid,)
32
+ Grid of strictly positive dispersion values.
33
+ y : ndarray (ngenes, nlibs)
34
+ Count matrix.
35
+ design : ndarray (nlibs, ncoefs)
36
+ Design matrix.
37
+ offset : ndarray (ngenes, nlibs)
38
+ Offset matrix.
39
+ weights : ndarray (ngenes, nlibs), optional
40
+ Observation weights.
41
+
42
+ Returns
43
+ -------
44
+ ndarray of shape (ngenes, ngrid) — APL values.
45
+ """
46
+ from .glm_fit import mglm_one_group
47
+ from .utils import design_as_factor
48
+
49
+ y = np.asarray(y, dtype=np.float64)
50
+ if y.ndim == 1:
51
+ y = y.reshape(1, -1)
52
+ ngenes, nlibs = y.shape
53
+ design = np.asarray(design, dtype=np.float64)
54
+ if design.ndim == 1:
55
+ design = design.reshape(-1, 1)
56
+ ncoefs = design.shape[1]
57
+
58
+ offset = np.asarray(offset, dtype=np.float64)
59
+ if offset.ndim == 1:
60
+ offset = np.tile(offset, (ngenes, 1))
61
+
62
+ if weights is not None:
63
+ w = np.asarray(weights, dtype=np.float64)
64
+ if w.ndim == 1:
65
+ w = np.tile(w, (ngenes, 1))
66
+ else:
67
+ w = np.ones_like(y)
68
+
69
+ grid_dispersions = np.asarray(grid_dispersions, dtype=np.float64)
70
+ ngrid = len(grid_dispersions)
71
+
72
+ # Precompute group structure (same for all grid points)
73
+ group = design_as_factor(design)
74
+ unique_groups = np.unique(group)
75
+ ngroups = len(unique_groups)
76
+ is_oneway = ngroups == ncoefs
77
+
78
+ # Precompute group column indices
79
+ group_cols = [np.where(group == grp)[0] for grp in unique_groups]
80
+
81
+ # Check whether the design is a pure group-indicator matrix (precomputed for a potential fast path; not used below)
82
+ first_of_group = np.array([cols[0] for cols in group_cols])
83
+ design_unique = design[first_of_group]
84
+ is_indicator = (np.sum(design_unique == 1) == ngroups and
85
+ np.sum(design_unique == 0) == (ngroups - 1) * ngroups)
86
+
87
+ # Precompute gammaln(y+1) — same for all dispersions
88
+ lgamma_y1 = gammaln(y + 1)
89
+
90
+ # Output
91
+ apl = np.empty((ngenes, ngrid), dtype=np.float64)
92
+
93
+ for gi in range(ngrid):
94
+ d = grid_dispersions[gi]
95
+ disp_scalar = np.float64(d)
96
+
97
+ if is_oneway:
98
+ # Fit each group with mglm_one_group directly
99
+ mu = np.empty_like(y)
100
+ for g_idx, cols in enumerate(group_cols):
101
+ y_g = y[:, cols]
102
+ off_g = offset[:, cols]
103
+ w_g = w[:, cols]
104
+ disp_g = np.full_like(y_g, disp_scalar)
105
+ b = mglm_one_group(y_g, dispersion=disp_g, offset=off_g,
106
+ weights=w_g)
107
+ for jj in cols:
108
+ mu[:, jj] = np.exp(np.clip(b + offset[:, jj], -500, 500))
109
+ else:
110
+ # General case: fall back to full glm_fit
111
+ from .glm_fit import glm_fit
112
+ fit = glm_fit(y, design=design, dispersion=d, offset=offset,
113
+ weights=weights, prior_count=0)
114
+ mu = fit['fitted.values']
115
+
116
+ # NB log-likelihood (vectorized)
117
+ mu_safe = np.maximum(mu, 1e-300)
118
+ r = 1.0 / max(d, 1e-300)
119
+
120
+ ll = np.sum(w * (gammaln(y + r) - gammaln(r) - lgamma_y1
121
+ + r * np.log(r) + y * np.log(mu_safe)
122
+ - (r + y) * np.log(r + mu_safe)), axis=1)
123
+
124
+ # Cox-Reid adjustment: -0.5 * log|X'WX|
125
+ working_w = w * mu_safe / (1.0 + d * mu_safe)
126
+ working_w = np.maximum(working_w, 1e-300)
127
+
128
+ XtWX = np.einsum('gj,jk,jl->gkl', working_w, design, design)
129
+
130
+ if ncoefs == 1:
131
+ logdet = np.log(np.maximum(XtWX[:, 0, 0], 1e-300))
132
+ elif ncoefs == 2:
133
+ det = XtWX[:, 0, 0] * XtWX[:, 1, 1] - XtWX[:, 0, 1] ** 2
134
+ logdet = np.log(np.maximum(det, 1e-300))
135
+ else:
136
+ sign, logdet = np.linalg.slogdet(XtWX)
137
+ logdet = np.where(sign > 0, logdet, 0.0)
138
+
139
+ apl[:, gi] = ll - 0.5 * logdet
140
+
141
+ return apl
142
+
143
+
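+ # Illustrative consistency sketch (not part of the edgeR port): the grid
+ # evaluator should agree with column-wise calls to adjusted_profile_lik
+ # (defined below; the forward reference only resolves at call time). The toy
+ # data are an assumption for demonstration only.
+ def _example_apl_grid_consistency():
+     rng = np.random.default_rng(0)
+     y = rng.poisson(20.0, size=(5, 4)).astype(np.float64)
+     design = np.ones((4, 1))
+     offset = np.zeros(4)
+     grid = np.geomspace(0.01, 1.0, 7)
+     apl_grid = adjusted_profile_lik_grid(grid, y, design, offset)  # (5, 7)
+     apl_cols = np.column_stack(
+         [adjusted_profile_lik(d, y, design, offset) for d in grid])
+     print(np.max(np.abs(apl_grid - apl_cols)))  # expected: small (same model, different fitting path)
+
+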
144
+ def adjusted_profile_lik(dispersion, y, design, offset, weights=None,
145
+ start=None, get_coef=False):
146
+ """Tagwise Cox-Reid adjusted profile log-likelihoods for the dispersion.
147
+
148
+ Port of edgeR's adjustedProfileLik (C code reimplemented).
149
+
150
+ Parameters
151
+ ----------
152
+ dispersion : float or ndarray
153
+ Dispersion value(s).
154
+ y : ndarray
155
+ Count matrix (genes x samples).
156
+ design : ndarray
157
+ Design matrix.
158
+ offset : ndarray
159
+ Offset matrix.
160
+ weights : ndarray, optional
161
+ Observation weights.
162
+ start : ndarray, optional
163
+ Starting coefficients for GLM fit.
164
+ get_coef : bool
165
+ If True, return coefficients along with APL.
166
+
167
+ Returns
168
+ -------
169
+ ndarray of adjusted profile log-likelihoods (one per gene),
170
+ or dict with 'apl' and 'beta' if get_coef=True.
171
+ """
172
+ y = np.asarray(y, dtype=np.float64)
173
+ if y.ndim == 1:
174
+ y = y.reshape(1, -1)
175
+ ngenes, nlibs = y.shape
176
+ design = np.asarray(design, dtype=np.float64)
177
+ if design.ndim == 1:
178
+ design = design.reshape(-1, 1)
179
+ ncoefs = design.shape[1]
180
+
181
+ offset = np.asarray(offset, dtype=np.float64)
182
+ if offset.ndim == 1:
183
+ offset = np.tile(offset, (ngenes, 1))
184
+
185
+ dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
186
+ if len(dispersion) == 1:
187
+ disp = np.full(ngenes, dispersion[0])
188
+ else:
189
+ disp = dispersion
190
+
191
+ if weights is not None:
192
+ w = np.asarray(weights, dtype=np.float64)
193
+ if w.ndim == 1:
194
+ w = np.tile(w, (ngenes, 1))
195
+ else:
196
+ w = np.ones_like(y)
197
+
198
+ # Fit GLM to get mu
199
+ from .glm_fit import glm_fit
200
+ fit = glm_fit(y, design=design, dispersion=disp, offset=offset,
201
+ weights=weights, prior_count=0, start=start)
202
+ mu = fit['fitted.values']
203
+ beta = fit.get('unshrunk.coefficients', fit['coefficients'])
204
+
205
+ # Compute adjusted profile log-likelihood for all genes (vectorized)
206
+ mu_safe = np.maximum(mu, 1e-300) # (ngenes, nlibs)
207
+ r = 1.0 / np.maximum(disp, 1e-300) # (ngenes,)
208
+ is_nb = disp > 0
209
+
210
+ # NB log-likelihood (vectorized)
211
+ r_col = r[:, None] # (ngenes, 1)
212
+ ll = np.zeros(ngenes)
213
+ if np.any(is_nb):
214
+ nb = is_nb
215
+ ll[nb] = np.sum(w[nb] * (gammaln(y[nb] + r_col[nb]) - gammaln(r_col[nb])
216
+ - gammaln(y[nb] + 1)
217
+ + r_col[nb] * np.log(r_col[nb]) + y[nb] * np.log(mu_safe[nb])
218
+ - (r_col[nb] + y[nb]) * np.log(r_col[nb] + mu_safe[nb])), axis=1)
219
+ if np.any(~is_nb):
220
+ pois = ~is_nb
221
+ ll[pois] = np.sum(w[pois] * (y[pois] * np.log(mu_safe[pois])
222
+ - mu_safe[pois] - gammaln(y[pois] + 1)), axis=1)
223
+
224
+ # Cox-Reid adjustment: -0.5 * log|X'WX| (vectorized)
225
+ # Working weights: mu / (1 + d*mu) for NB, mu for Poisson
226
+ disp_col = disp[:, None] # (ngenes, 1)
227
+ working_w = np.where(is_nb[:, None],
228
+ w * mu_safe / (1.0 + disp_col * mu_safe),
229
+ w * mu_safe)
230
+ working_w = np.maximum(working_w, 1e-300) # (ngenes, nlibs)
231
+
232
+ # Compute X'WX for all genes at once using einsum
233
+ # XtWX[g, k, l] = sum_j working_w[g,j] * design[j,k] * design[j,l]
234
+ XtWX = np.einsum('gj,jk,jl->gkl', working_w, design, design) # (ngenes, ncoefs, ncoefs)
235
+
236
+ # Log determinant for all genes
237
+ if ncoefs == 1:
238
+ logdet = np.log(np.maximum(XtWX[:, 0, 0], 1e-300))
239
+ elif ncoefs == 2:
240
+ det = XtWX[:, 0, 0] * XtWX[:, 1, 1] - XtWX[:, 0, 1] ** 2
241
+ logdet = np.log(np.maximum(det, 1e-300))
242
+ else:
243
+ sign, logdet = np.linalg.slogdet(XtWX)
244
+ logdet = np.where(sign > 0, logdet, 0.0)
245
+
246
+ cr_adj = -0.5 * logdet
247
+ apl = ll + cr_adj
248
+
249
+ if get_coef:
250
+ return {'apl': apl, 'beta': beta}
251
+ return apl
252
+
253
+
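+ # Illustrative usage sketch (assumption: NB counts simulated as a
+ # gamma-Poisson mixture). Scanning a few dispersion values, the summed APL
+ # should peak near the simulating dispersion of 0.2.
+ def _example_adjusted_profile_lik():
+     rng = np.random.default_rng(1)
+     phi = 0.2
+     y = rng.poisson(rng.gamma(1.0 / phi, 50.0 * phi, size=(200, 6)))
+     design = np.ones((6, 1))
+     offset = np.zeros(6)
+     for d in (0.05, 0.2, 0.8):
+         print(d, adjusted_profile_lik(d, y, design, offset).sum())
+
+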
254
+ @njit(cache=True)
255
+ def _fmm_spline(n, x, y, b, c, d):
256
+ """Forsythe-Malcolm-Moler cubic spline (matches R's splines.c / edgeR's fmm_spline).
257
+
258
+ Computes coefficients b, c, d such that in segment i:
259
+ S(t) = y[i] + b[i]*t + c[i]*t^2 + d[i]*t^3
260
+ where t = x_eval - x[i].
261
+ """
262
+ if n < 2:
263
+ return
264
+ if n < 3:
265
+ t = (y[1] - y[0]) / (x[1] - x[0])
266
+ b[0] = t
267
+ b[1] = t
268
+ c[0] = c[1] = d[0] = d[1] = 0.0
269
+ return
270
+
271
+ nm1 = n - 1
272
+
273
+ # Set up tridiagonal system
274
+ # Using d for offdiagonal, b for diagonal, c for RHS
275
+ d[0] = x[1] - x[0]
276
+ c[1] = (y[1] - y[0]) / d[0]
277
+ for i in range(1, nm1):
278
+ d[i] = x[i + 1] - x[i]
279
+ b[i] = 2.0 * (d[i - 1] + d[i])
280
+ c[i + 1] = (y[i + 1] - y[i]) / d[i]
281
+ c[i] = c[i + 1] - c[i]
282
+
283
+ # End conditions (FMM: match third derivatives)
284
+ b[0] = -d[0]
285
+ b[nm1] = -d[nm1 - 1]
286
+ c[0] = 0.0
287
+ c[nm1] = 0.0
288
+ if n > 3:
289
+ c[0] = c[2] / (x[3] - x[1]) - c[1] / (x[2] - x[0])
290
+ c[nm1] = c[nm1 - 1] / (x[nm1] - x[nm1 - 2]) - c[nm1 - 2] / (x[nm1 - 1] - x[nm1 - 3])
291
+ c[0] = c[0] * d[0] * d[0] / (x[3] - x[0])
292
+ c[nm1] = -c[nm1] * d[nm1 - 1] * d[nm1 - 1] / (x[nm1] - x[nm1 - 3])
293
+
294
+ # Gaussian elimination
295
+ for i in range(1, n):
296
+ t = d[i - 1] / b[i - 1]
297
+ b[i] = b[i] - t * d[i - 1]
298
+ c[i] = c[i] - t * c[i - 1]
299
+
300
+ # Backward substitution
301
+ c[nm1] = c[nm1] / b[nm1]
302
+ for i in range(nm1 - 1, -1, -1):
303
+ c[i] = (c[i] - d[i] * c[i + 1]) / b[i]
304
+
305
+ # Compute polynomial coefficients
306
+ b[nm1] = (y[nm1] - y[nm1 - 1]) / d[nm1 - 1] + d[nm1 - 1] * (c[nm1 - 1] + 2.0 * c[nm1])
307
+ for i in range(nm1):
308
+ b[i] = (y[i + 1] - y[i]) / d[i] - d[i] * (c[i + 1] + 2.0 * c[i])
309
+ d[i] = (c[i + 1] - c[i]) / d[i]
310
+ c[i] = 3.0 * c[i]
311
+ c[nm1] = 3.0 * c[nm1]
312
+ d[nm1] = d[nm1 - 1]
313
+
314
+
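+ # Illustrative check on a toy grid (an assumption, not edgeR test data): a
+ # cubic spline interpolates its knots, so evaluating segment i at its right
+ # endpoint must recover y[i + 1].
+ def _example_fmm_spline():
+     x = np.linspace(0.0, 3.0, 7)
+     y = np.sin(x)
+     n = len(x)
+     b = np.empty(n)
+     c = np.empty(n)
+     d = np.empty(n)
+     _fmm_spline(n, x, y, b, c, d)
+     i = 2
+     t = x[i + 1] - x[i]
+     s = y[i] + b[i] * t + c[i] * t ** 2 + d[i] * t ** 3
+     print(s, y[i + 1])  # should agree to rounding error
+
+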
315
+ @njit(cache=True)
316
+ def _maximize_interpolant_kernel(x, y_mat, ngenes, npts, result):
317
+ """Numba kernel: FMM spline + analytical max (matches edgeR's C find_max).
318
+
319
+ For each gene, fits an FMM cubic spline, finds the grid point with the
320
+ highest value, then analytically solves for the maximum on the two
321
+ neighbouring segments by finding roots of the derivative (a quadratic).
322
+ This is O(npts) per gene with no discretisation artifacts.
323
+ """
324
+ b = np.empty(npts)
325
+ c = np.empty(npts)
326
+ d = np.empty(npts)
327
+ y_row = np.empty(npts)
328
+
329
+ for g in range(ngenes):
330
+ # Copy the row into a contiguous working buffer (_fmm_spline only reads y; it overwrites b, c and d)
331
+ for i in range(npts):
332
+ y_row[i] = y_mat[g, i]
333
+
334
+ # Find coarse grid maximum
335
+ maxed = y_row[0]
336
+ maxed_at = 0
337
+ for i in range(1, npts):
338
+ if y_row[i] > maxed:
339
+ maxed = y_row[i]
340
+ maxed_at = i
341
+ x_max = x[maxed_at]
342
+
343
+ # Fit FMM spline: S(t) = y[i] + b[i]*t + c[i]*t^2 + d[i]*t^3
344
+ _fmm_spline(npts, x, y_row, b, c, d)
345
+
346
+ # Check left segment (maxed_at - 1)
347
+ if maxed_at > 0:
348
+ seg = maxed_at - 1
349
+ lb = b[seg]
350
+ lc = c[seg]
351
+ ld = d[seg]
352
+
353
+ # Derivative: b + 2c*t + 3d*t^2 = 0
354
+ # Discriminant: (2c)^2 - 4*(3d)*b = 4*(c^2 - 3*d*b)
355
+ delta = lc * lc - 3.0 * ld * lb
356
+ if delta >= 0.0:
357
+ # Root with negative second derivative, i.e. the maximum (not the minimum)
358
+ numerator = -lc - np.sqrt(delta)
359
+ chosen_sol = numerator / (3.0 * ld) if ld != 0.0 else 0.0
360
+
361
+ seg_width = x[maxed_at] - x[seg]
362
+ if chosen_sol > 0.0 and chosen_sol < seg_width:
363
+ temp = ((ld * chosen_sol + lc) * chosen_sol + lb) * chosen_sol + y_row[seg]
364
+ if temp > maxed:
365
+ maxed = temp
366
+ x_max = chosen_sol + x[seg]
367
+
368
+ # Check right segment (maxed_at)
369
+ if maxed_at < npts - 1:
370
+ seg = maxed_at
371
+ rb = b[seg]
372
+ rc = c[seg]
373
+ rd = d[seg]
374
+
375
+ delta = rc * rc - 3.0 * rd * rb
376
+ if delta >= 0.0:
377
+ numerator = -rc - np.sqrt(delta)
378
+ chosen_sol = numerator / (3.0 * rd) if rd != 0.0 else 0.0
379
+
380
+ seg_width = x[seg + 1] - x[seg]
381
+ if chosen_sol > 0.0 and chosen_sol < seg_width:
382
+ temp = ((rd * chosen_sol + rc) * chosen_sol + rb) * chosen_sol + y_row[seg]
383
+ if temp > maxed:
384
+ maxed = temp
385
+ x_max = chosen_sol + x[seg]
386
+
387
+ result[g] = x_max
388
+
389
+
390
+ def maximize_interpolant(x, y):
391
+ """Find the maximum of an interpolated function for each row.
392
+
393
+ Port of edgeR's maximizeInterpolant. Uses FMM cubic spline fitting
394
+ followed by analytical maximum finding on neighbouring segments,
395
+ closely following R's C implementation.
396
+
397
+ Parameters
398
+ ----------
399
+ x : ndarray
400
+ Grid points (sorted, unique).
401
+ y : ndarray
402
+ Log-likelihood matrix (genes x grid points).
403
+
404
+ Returns
405
+ -------
406
+ ndarray of maximizing x values (one per gene).
407
+ """
408
+ x = np.asarray(x, dtype=np.float64).copy()
409
+ y = np.asarray(y, dtype=np.float64)
410
+ if y.ndim == 1:
411
+ y = y.reshape(1, -1)
412
+
413
+ ngenes = y.shape[0]
414
+ npts = len(x)
415
+
416
+ result = np.empty(ngenes, dtype=np.float64)
417
+ _maximize_interpolant_kernel(x, y, ngenes, npts, result)
418
+ return result
419
+
420
+
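+ # Illustrative sketch: rows sampled from concave quadratics. FMM end
+ # conditions are exact for polynomials up to cubic, so the spline reproduces
+ # each curve and the returned maximizers match the analytic peaks.
+ def _example_maximize_interpolant():
+     x = np.linspace(-6.0, 6.0, 11)
+     peaks = np.array([-1.5, 0.0, 2.25])
+     y = -(x[None, :] - peaks[:, None]) ** 2
+     print(maximize_interpolant(x, y))  # approximately [-1.5, 0.0, 2.25]
+
+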
421
+ def cond_log_lik_der_size(y, r, der=0):
422
+ """Derivatives of conditional log-likelihood w.r.t. r=1/dispersion.
423
+
424
+ Port of edgeR's condLogLikDerSize.
425
+ """
426
+ y = np.asarray(y, dtype=np.float64)
+ r = np.atleast_1d(np.asarray(r, dtype=np.float64))  # accept scalar or per-gene r
427
+ if y.ndim == 1:
428
+ y = y.reshape(1, -1)
429
+ n = y.shape[1]
430
+ m = np.mean(y, axis=1)
431
+
432
+ if der == 0:
433
+ # Log-likelihood
434
+ return (np.sum(gammaln(y + r[:, None]), axis=1) +
435
+ gammaln(n * r) - gammaln(n * (m + r)) - n * gammaln(r))
436
+ elif der == 1:
437
+ # First derivative
438
+ return (np.sum(digamma(y + r[:, None]), axis=1) +
439
+ n * digamma(n * r) - n * digamma(n * (m + r)) - n * digamma(r))
440
+ elif der == 2:
441
+ # Second derivative
442
+ return (np.sum(polygamma(1, y + r[:, None]), axis=1) +
443
+ n**2 * polygamma(1, n * r) - n**2 * polygamma(1, n * (m + r)) -
444
+ n * polygamma(1, r))
445
+ else:
446
+ raise ValueError(f"der must be 0, 1, or 2, got {der}")
447
+
448
+
449
+ def cond_log_lik_der_delta(y, delta, der=0):
450
+ """Derivatives of conditional log-likelihood w.r.t. delta=dispersion/(1+dispersion).
451
+
452
+ Port of edgeR's condLogLikDerDelta.
453
+ """
454
+ y = np.asarray(y, dtype=np.float64)
455
+ if y.ndim == 1:
456
+ y = y.reshape(1, -1)
457
+
458
+ delta = np.atleast_1d(np.asarray(delta, dtype=np.float64))
459
+ r = (1.0 / delta) - 1.0
460
+
461
+ if der == 0:
462
+ return cond_log_lik_der_size(y, r, der=0)
463
+ elif der == 1:
464
+ return cond_log_lik_der_size(y, r, der=1) * (-delta**(-2))
465
+ elif der == 2:
466
+ return (cond_log_lik_der_size(y, r, der=1) * 2 * delta**(-3) +
467
+ cond_log_lik_der_size(y, r, der=2) * delta**(-4))
468
+ else:
469
+ raise ValueError(f"der must be 0, 1, or 2, got {der}")
470
+
471
+
472
+ def common_cond_log_lik_der_delta(y_split, delta, der=0):
473
+ """Sum of conditional log-likelihoods across groups.
474
+
475
+ Port of edgeR's commonCondLogLikDerDelta.
476
+ """
477
+ total = 0.0
478
+ for y_group in y_split:
479
+ total += np.sum(cond_log_lik_der_delta(y_group, delta, der=der))
480
+ return total
481
+
482
+
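+ # Illustrative sketch (assumption: toy one-group counts with equal library
+ # sizes, which is what the conditional likelihood presumes). Maximizing the
+ # summed conditional log-likelihood over delta = phi / (1 + phi) mirrors how
+ # edgeR's estimateCommonDisp uses these functions.
+ def _example_common_cond_log_lik():
+     rng = np.random.default_rng(2)
+     phi = 0.15
+     y = rng.poisson(rng.gamma(1.0 / phi, 30.0 * phi, size=(100, 4)))
+     deltas = np.linspace(0.01, 0.6, 60)
+     ll = [common_cond_log_lik_der_delta([y], d, der=0) for d in deltas]
+     best = deltas[int(np.argmax(ll))]
+     print(best / (1.0 - best))  # implied dispersion, roughly 0.15
+
+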
483
+ def disp_cox_reid(y, design=None, offset=None, weights=None, ave_log_cpm_vals=None,
484
+ interval=(0, 4), tol=1e-5, min_row_sum=5, subset=10000):
485
+ """Cox-Reid APL estimator of common dispersion.
486
+
487
+ Port of edgeR's dispCoxReid.
488
+
489
+ Parameters
490
+ ----------
491
+ y : ndarray
492
+ Count matrix.
493
+ design : ndarray, optional
494
+ Design matrix.
495
+ offset : ndarray, optional
496
+ Offset.
497
+ weights : ndarray, optional
498
+ Weights.
499
+ ave_log_cpm_vals : ndarray, optional
500
+ Pre-computed AveLogCPM values.
501
+ interval : tuple
502
+ Search interval for dispersion.
503
+ tol : float
504
+ Optimization tolerance.
505
+ min_row_sum : int
506
+ Rows whose total count is below this value are dropped before estimation.
507
+ subset : int
508
+ Number of rows to use; a systematic subset by abundance is taken when the data has at least twice this many rows.
509
+
510
+ Returns
511
+ -------
512
+ float : estimated common dispersion.
513
+ """
514
+ y = np.asarray(y, dtype=np.float64)
515
+ if y.ndim == 1:
516
+ y = y.reshape(1, -1)
517
+
518
+ if design is None:
519
+ design = np.ones((y.shape[1], 1))
520
+ else:
521
+ design = np.asarray(design, dtype=np.float64)
522
+ if design.ndim == 1:
523
+ design = design.reshape(-1, 1)
524
+
525
+ if offset is None:
526
+ offset = np.log(y.sum(axis=0))
527
+ offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)
+ if weights is not None:
+     weights = np.asarray(weights, dtype=np.float64)  # normalize early so the .ndim checks below are safe
528
+
529
+ if interval[0] < 0:
530
+ raise ValueError("please give a non-negative interval for the dispersion")
531
+
532
+ # Apply min row count
533
+ row_sums = y.sum(axis=1)
534
+ keep = row_sums >= min_row_sum
535
+ if not np.all(keep):
536
+ y = y[keep]
537
+ offset = offset[keep]
538
+ if weights is not None:
539
+ weights = np.asarray(weights)
540
+ if weights.ndim == 2:
541
+ weights = weights[keep]
542
+ if ave_log_cpm_vals is not None:
543
+ ave_log_cpm_vals = ave_log_cpm_vals[keep]
544
+
545
+ if y.shape[0] < 1:
546
+ raise ValueError("no data rows with required number of counts")
547
+
548
+ # Subsetting
549
+ if subset is not None and subset <= y.shape[0] / 2:
550
+ if ave_log_cpm_vals is None:
551
+ ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)
552
+ i = systematic_subset(subset, ave_log_cpm_vals)
553
+ y = y[i]
554
+ offset = offset[i]
555
+ if weights is not None and weights.ndim == 2:
556
+ weights = weights[i]
557
+
558
+ # Function to optimize
559
+ def fun(par):
560
+ disp = par ** 4
561
+ return -np.sum(adjusted_profile_lik(disp, y, design, offset, weights=weights))
562
+
563
+ # Optimize
564
+ lo = interval[0] ** 0.25
565
+ hi = interval[1] ** 0.25
566
+ if lo == 0:
567
+ lo = 1e-10
568
+ result = minimize_scalar(fun, bounds=(lo, hi), method='bounded',
569
+ options={'xatol': tol})
570
+ return result.x ** 4
571
+
572
+
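+ # Illustrative usage sketch (assumption: simulated NB counts with a
+ # two-group design); the estimate should land near the simulating value 0.1.
+ def _example_disp_cox_reid():
+     rng = np.random.default_rng(3)
+     phi = 0.1
+     mu = np.full((200, 6), 40.0)
+     mu[:, 3:] *= 2.0  # group effect
+     y = rng.poisson(rng.gamma(1.0 / phi, mu * phi))
+     design = np.column_stack([np.ones(6), np.repeat([0.0, 1.0], 3)])
+     print(disp_cox_reid(y, design=design))  # roughly 0.1
+
+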
573
+ def disp_cox_reid_interpolate_tagwise(y, design, offset=None, dispersion=None,
574
+ trend=True, ave_log_cpm_vals=None,
575
+ min_row_sum=5, prior_df=10, span=0.3,
576
+ grid_npts=11, grid_range=(-6, 6),
577
+ weights=None):
578
+ """Estimate tagwise NB dispersions using Cox-Reid APL with interpolation.
579
+
580
+ Port of edgeR's dispCoxReidInterpolateTagwise.
581
+
582
+ Parameters
583
+ ----------
584
+ y : ndarray
585
+ Count matrix.
586
+ design : ndarray
587
+ Design matrix.
588
+ offset : ndarray, optional
589
+ Offset.
590
+ dispersion : float or ndarray
591
+ Starting dispersion(s).
592
+ trend : bool
593
+ If True, smooth the per-gene likelihoods along abundance before maximizing, giving an abundance-dependent (trended) prior.
594
+ ave_log_cpm_vals : ndarray, optional
595
+ Average log CPM.
596
+ min_row_sum : int
597
+ Minimum row sum.
598
+ prior_df : float
599
+ Prior degrees of freedom.
600
+ span : float
601
+ Span for moving average.
602
+ grid_npts : int
603
+ Number of grid points.
604
+ grid_range : tuple
605
+ Range of the grid, in log2 steps around the starting dispersion(s).
606
+ weights : ndarray, optional
607
+ Weights.
608
+
609
+ Returns
610
+ -------
611
+ ndarray of tagwise dispersions.
612
+ """
613
+ y = np.asarray(y, dtype=np.float64)
614
+ if y.ndim == 1:
615
+ y = y.reshape(1, -1)
616
+ ntags, nlibs = y.shape
617
+
618
+ design = np.asarray(design, dtype=np.float64)
619
+ if design.ndim == 1:
620
+ design = design.reshape(-1, 1)
621
+ ncoefs = design.shape[1]
622
+
623
+ if offset is None:
624
+ offset = np.log(y.sum(axis=0))
625
+ offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)
626
+
627
+ if ave_log_cpm_vals is None:
628
+ ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)
629
+
630
+ dispersion = np.array(dispersion, dtype=np.float64, ndmin=1)  # copy: may be modified in place below
631
+ if len(dispersion) == 1:
632
+ dispersion = np.full(ntags, dispersion[0])
633
+ elif len(dispersion) != ntags:
634
+ raise ValueError("length of dispersion doesn't match nrow(y)")
635
+
636
+ # Apply min_row_sum
637
+ row_sums = y.sum(axis=1)
638
+ keep = row_sums >= min_row_sum
639
+ if not np.all(keep):
640
+ if np.any(keep):
641
+ dispersion[keep] = disp_cox_reid_interpolate_tagwise(
642
+ y[keep], design, offset=offset[keep],
643
+ dispersion=dispersion[keep],
644
+ ave_log_cpm_vals=ave_log_cpm_vals[keep],
645
+ grid_npts=grid_npts, min_row_sum=0,
646
+ prior_df=prior_df, span=span, trend=trend,
647
+ weights=weights[keep] if weights is not None and np.ndim(weights) == 2 else weights)
648
+ return dispersion
649
+
650
+ # Posterior profile likelihood
651
+ prior_n = prior_df / (nlibs - ncoefs)
652
+ spline_pts = np.linspace(grid_range[0], grid_range[1], grid_npts)
653
+ apl = np.zeros((ntags, grid_npts))
654
+
655
+ for i in range(grid_npts):
656
+ spline_disp = dispersion * 2 ** spline_pts[i]
657
+ apl[:, i] = adjusted_profile_lik(spline_disp, y, design, offset, weights=weights)
658
+
659
+ if trend:
660
+ o = np.argsort(ave_log_cpm_vals)
661
+ oo = np.argsort(o)
662
+ width = int(np.floor(span * ntags))
663
+ width = max(width, 1)
664
+ apl_smooth = moving_average_by_col(apl[o], width=width)[oo]
665
+ else:
666
+ apl_smooth = np.tile(np.mean(apl, axis=0), (ntags, 1))
667
+
668
+ apl_smooth = (apl + prior_n * apl_smooth) / (1 + prior_n)
669
+
670
+ # Tagwise maximization
671
+ d = maximize_interpolant(spline_pts, apl_smooth)
672
+ return dispersion * 2 ** d
673
+
674
+
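+ # Illustrative usage sketch (toy data): tagwise dispersions squeezed towards
+ # a common starting value; trend=False keeps the example small.
+ def _example_tagwise_dispersion():
+     rng = np.random.default_rng(4)
+     y = rng.poisson(rng.gamma(10.0, 3.0, size=(50, 4)))
+     design = np.ones((4, 1))
+     common = disp_cox_reid(y, design=design)
+     tagwise = disp_cox_reid_interpolate_tagwise(
+         y, design, dispersion=common, trend=False, prior_df=10)
+     print(tagwise.shape, np.median(tagwise))  # (50,), near the common value
+
+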
675
+ def _ns_basis_with_knots(x, internal_knots, boundary_knots):
676
+ """Create natural cubic spline basis matching R's cbind(1, ns(x, knots=knots)).
677
+
678
+ Uses the truncated power basis from ESL (Hastie et al.) eq 5.4-5.5.
679
+
680
+ Parameters
681
+ ----------
682
+ x : array
683
+ Data values.
684
+ internal_knots : array
685
+ Internal knot positions.
686
+ boundary_knots : array of length 2
687
+ [lower, upper] boundary knots.
688
+
689
+ Returns
690
+ -------
691
+ ndarray of shape (n, len(internal_knots) + 2)
692
+ Basis matrix including intercept column.
693
+ """
694
+ x = np.asarray(x, dtype=np.float64)
695
+ n = len(x)
696
+ internal_knots = np.asarray(internal_knots, dtype=np.float64)
697
+
698
+ all_knots = np.sort(np.concatenate([[boundary_knots[0]],
699
+ internal_knots,
700
+ [boundary_knots[1]]]))
701
+ K = len(all_knots)
702
+ ncols = K # = len(internal_knots) + 2
703
+
704
+ basis = np.zeros((n, ncols))
705
+ basis[:, 0] = 1.0
706
+ basis[:, 1] = x
707
+
708
+ if K > 2:
709
+ xi_K = all_knots[-1]
710
+ xi_Km1 = all_knots[-2]
711
+
712
+ def d_func(xi_j):
713
+ return (np.maximum(x - xi_j, 0) ** 3 -
714
+ np.maximum(x - xi_K, 0) ** 3) / (xi_K - xi_j)
715
+
716
+ d_Km1 = d_func(xi_Km1)
717
+ for j in range(K - 2):
718
+ basis[:, 2 + j] = d_func(all_knots[j]) - d_Km1
719
+
720
+ return basis
721
+
722
+
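+ # Illustrative check of the natural-spline property (toy knots are an
+ # assumption): beyond the boundary knots every column is linear in x, so on
+ # an equally spaced grid the second differences vanish there.
+ def _example_ns_basis():
+     x = np.linspace(-2.0, 14.0, 17)
+     basis = _ns_basis_with_knots(x, internal_knots=[3.0, 5.0, 7.0],
+                                  boundary_knots=[0.0, 10.0])
+     print(basis.shape)  # (17, 5): intercept + x + one column per internal knot
+     right = basis[x > 10.0]  # rows beyond the upper boundary knot
+     print(np.allclose(np.diff(right, n=2, axis=0), 0.0))  # True
+
+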
723
+ def disp_cox_reid_spline_trend(y, design, offset=None, df=5, subset=10000,
724
+ ave_log_cpm_vals=None, method_optim='Nelder-Mead'):
725
+ """Estimate spline trend dispersion.
726
+
727
+ Faithful port of edgeR's dispCoxReidSplineTrend.
728
+ Fits: dispersion = exp(X @ par - abundance) where X is a natural spline
729
+ basis, optimized via Nelder-Mead on adjusted profile likelihood.
730
+
731
+ Returns
732
+ -------
733
+ dict with 'dispersion' and 'AveLogCPM'.
734
+ """
735
+ y = np.asarray(y, dtype=np.float64)
736
+ if y.ndim == 1:
737
+ y = y.reshape(1, -1)
738
+ ntags, nlibs = y.shape
739
+
740
+ if offset is None:
741
+ offset = np.zeros(nlibs)
742
+ offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)
743
+
744
+ if ave_log_cpm_vals is None:
745
+ ave_log_cpm_vals = ave_log_cpm(y, offset=offset)
746
+
747
+ all_zero = y.sum(axis=1) == 0
748
+ abundance_nonzero = ave_log_cpm_vals[~all_zero]
749
+ y_nonzero = y[~all_zero]
750
+ offset_nonzero = offset[~all_zero]
751
+
752
+ i = systematic_subset(subset, abundance_nonzero)
753
+
754
+ if len(abundance_nonzero) < 2:
755
+ common_disp = disp_cox_reid(y_nonzero, design, offset=offset_nonzero)
756
+ disp = np.full(ntags, common_disp)
757
+ return {'dispersion': disp, 'AveLogCPM': ave_log_cpm_vals}
758
+
759
+ # Knot placement matching R: weighted mix of quantile and equally-spaced
760
+ p1 = np.arange(1, df) / df
761
+ knots1 = np.quantile(abundance_nonzero, p1)
762
+ r = np.array([np.min(abundance_nonzero), np.max(abundance_nonzero)])
763
+ knots2 = r[0] + p1 * (r[1] - r[0])
764
+ knots = 0.3 * knots1 + 0.7 * knots2
765
+
766
+ # Build natural spline basis: cbind(1, ns(abundance, knots=knots))
767
+ X = _ns_basis_with_knots(abundance_nonzero, knots, boundary_knots=r)
768
+
769
+ # Objective: negative sum of adjusted profile likelihoods
770
+ def fun(par, y_sub, design, offset_sub, abundance_sub, X_sub):
771
+ eta = X_sub @ par
772
+ dispersion = np.exp(eta - abundance_sub)
773
+ try:
774
+ apl = adjusted_profile_lik(dispersion, y_sub, design, offset_sub)
775
+ return -np.sum(apl)
776
+ except Exception:
777
+ return 1e10
778
+
779
+ # Initial parameters matching R
780
+ par0 = np.zeros(df + 1)
781
+ par0[0] = np.median(abundance_nonzero[i]) + np.log(0.1)
782
+
783
+ result = minimize(fun, par0, args=(y_nonzero[i], design,
784
+ offset_nonzero[i], abundance_nonzero[i],
785
+ X[i]),
786
+ method=method_optim)
787
+
788
+ # Evaluate fitted dispersions for all genes
789
+ disp_nonzero = np.exp(X @ result.x - abundance_nonzero)
790
+
791
+ disp = np.full(ntags, np.nan)
792
+ disp[all_zero] = disp_nonzero[np.argmin(abundance_nonzero)] if len(disp_nonzero) > 0 else 0.1
793
+ disp[~all_zero] = disp_nonzero
794
+
795
+ return {'dispersion': disp, 'AveLogCPM': ave_log_cpm_vals}
796
+
797
+
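+ # Illustrative usage sketch (assumption: constant true dispersion, so the
+ # fitted trend should be roughly flat). Nelder-Mead over the spline
+ # coefficients makes this the slowest estimator here; keep the gene count
+ # small when experimenting.
+ def _example_spline_trend():
+     rng = np.random.default_rng(5)
+     mu = rng.uniform(5.0, 500.0, size=(300, 1)) * np.ones((1, 4))
+     y = rng.poisson(rng.gamma(1.0 / 0.1, mu * 0.1))
+     design = np.ones((4, 1))
+     out = disp_cox_reid_spline_trend(y, design, offset=np.log(y.sum(axis=0)),
+                                      df=3)
+     print(out['dispersion'][:5], out['AveLogCPM'][:5])
+
+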
798
+ def disp_cox_reid_power_trend(y, design, offset=None, ave_log_cpm_vals=None,
799
+ subset=10000, method_optim='Nelder-Mead'):
800
+ """Estimate power trend dispersion.
801
+
802
+ Faithful port of edgeR's dispCoxReidPowerTrend.
803
+ Fits the parametric model: dispersion = exp(a + b*AveLogCPM) + exp(c)
804
+ by maximizing the Cox-Reid adjusted profile likelihood via Nelder-Mead.
805
+
806
+ Returns
807
+ -------
808
+ dict with 'dispersion' and 'AveLogCPM'.
809
+ """
810
+ y = np.asarray(y, dtype=np.float64)
811
+ if y.ndim == 1:
812
+ y = y.reshape(1, -1)
813
+ ntags = y.shape[0]
814
+
815
+ if offset is None:
816
+ offset = np.log(y.sum(axis=0))
817
+ offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)
818
+
819
+ if ave_log_cpm_vals is None:
820
+ ave_log_cpm_vals = ave_log_cpm(y, offset=offset)
821
+
822
+ abundance_full = ave_log_cpm_vals
823
+
824
+ # Exclude all-zero rows
825
+ all_zero = y.sum(axis=1) == 0
826
+ abundance_nonzero = abundance_full[~all_zero]
827
+ y_nonzero = y[~all_zero]
828
+ offset_nonzero = offset[~all_zero]
829
+
830
+ # Systematic subset for efficiency
831
+ i = systematic_subset(subset, abundance_nonzero)
832
+
833
+ # Objective: negative sum of adjusted profile likelihoods
834
+ def fun(par, y_sub, design, offset_sub, abundance_sub):
835
+ dispersion = np.exp(par[0] + par[1] * abundance_sub) + np.exp(par[2])
836
+ try:
837
+ apl = adjusted_profile_lik(dispersion, y_sub, design, offset_sub)
838
+ return -np.sum(apl)
839
+ except Exception:
840
+ return 1e10
841
+
842
+ par0 = np.array([np.log(0.1), 0.0, -5.0])
843
+ result = minimize(fun, par0, args=(y_nonzero[i], design,
844
+ offset_nonzero[i], abundance_nonzero[i]),
845
+ method=method_optim)
846
+
847
+ # Compute dispersion for all genes using fitted parameters
848
+ dispersion = np.exp(result.x[0] + result.x[1] * abundance_full) + np.exp(result.x[2])
849
+
850
+ return {'dispersion': dispersion, 'AveLogCPM': abundance_full}
851
+
852
+
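+ # Illustrative usage sketch (same kind of toy data as above, an assumption):
+ # the power trend has only three free parameters, so it is much cheaper than
+ # the spline trend and should recover a roughly constant 0.1 here.
+ def _example_power_trend():
+     rng = np.random.default_rng(6)
+     mu = rng.uniform(5.0, 500.0, size=(300, 1)) * np.ones((1, 4))
+     y = rng.poisson(rng.gamma(1.0 / 0.1, mu * 0.1))
+     design = np.ones((4, 1))
+     out = disp_cox_reid_power_trend(y, design)
+     print(out['dispersion'].min(), out['dispersion'].max())  # both near 0.1
+
+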
853
+ def disp_bin_trend(y, design=None, offset=None, df=5, span=0.3,
854
+ min_n=400, method_bin='CoxReid', method_trend='spline',
855
+ ave_log_cpm_vals=None, weights=None):
856
+ """Estimate dispersion trend by binning.
857
+
858
+ Port of edgeR's dispBinTrend.
859
+
860
+ Returns
861
+ -------
862
+ dict with 'dispersion', 'AveLogCPM', 'bin.AveLogCPM', 'bin.dispersion'.
863
+ """
864
+ y = np.asarray(y, dtype=np.float64)
865
+ if y.ndim == 1:
866
+ y = y.reshape(1, -1)
867
+ ntags, nlibs = y.shape
868
+
869
+ pos = y.sum(axis=1) > 0
870
+ if not np.any(pos):
871
+ return {'AveLogCPM': ave_log_cpm_vals,
872
+ 'dispersion': np.zeros(ntags)}
873
+ npostags = np.sum(pos)
874
+
875
+ if design is None:
876
+ design = np.ones((nlibs, 1))
877
+ else:
878
+ design = np.asarray(design, dtype=np.float64)
879
+ if design.ndim == 1:
880
+ design = design.reshape(-1, 1)
881
+
882
+ if offset is None:
883
+ offset = np.log(y.sum(axis=0))
884
+ offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)
885
+
886
+ if ave_log_cpm_vals is None:
887
+ ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)
888
+
889
+ # Define bins
890
+ if npostags < 100:
891
+ nbins = 1
892
+ else:
893
+ nbins = int(np.floor(npostags ** 0.4))
894
+ nbins = min(nbins, 1000)
895
+ min_n = min(min_n, npostags // nbins)
896
+ if min_n < 50:
897
+ nbins = npostags // 50
898
+ min_n = 50
899
+
900
+ nbins = max(nbins, 1)
901
+
902
+ if nbins == 1:
903
+ d = disp_cox_reid(y[pos], design, offset=offset[pos],
904
+ weights=weights[pos] if weights is not None and np.ndim(weights) == 2 else weights,
905
+ min_row_sum=0, ave_log_cpm_vals=ave_log_cpm_vals[pos])
906
+ return {'AveLogCPM': ave_log_cpm_vals,
907
+ 'dispersion': np.full(ntags, d),
908
+ 'bin.AveLogCPM': np.array([np.mean(ave_log_cpm_vals[pos])]),
909
+ 'bin.dispersion': np.array([d])}
910
+
911
+ groups = np.zeros(ntags, dtype=int)
912
+ bins_info = cut_with_min_n(ave_log_cpm_vals[pos], intervals=nbins, min_n=min_n)
913
+ groups[pos] = bins_info['group']
914
+
915
+ bin_d = np.zeros(nbins)
916
+ bin_a = np.zeros(nbins)
917
+ for i in range(1, nbins + 1):
918
+ bin_mask = groups == i
919
+ if np.sum(bin_mask) == 0:
920
+ continue
921
+ bin_ave = ave_log_cpm_vals[bin_mask]
922
+ w_bin = None
923
+ if weights is not None and np.ndim(weights) == 2:
924
+ w_bin = weights[bin_mask]
925
+ try:
926
+ bin_d[i - 1] = disp_cox_reid(y[bin_mask], design, offset=offset[bin_mask],
927
+ weights=w_bin, min_row_sum=0,
928
+ ave_log_cpm_vals=bin_ave)
929
+ except Exception:
930
+ bin_d[i - 1] = 0.1
931
+ bin_a[i - 1] = np.mean(bin_ave)
932
+
933
+ # If few bins, use linear interpolation
934
+ if nbins < 7:
935
+ from scipy.interpolate import interp1d
936
+ f = interp1d(bin_a, np.sqrt(np.maximum(bin_d, 0)),
937
+ fill_value='extrapolate', kind='linear')
938
+ dispersion = f(ave_log_cpm_vals) ** 2
939
+ return {'AveLogCPM': ave_log_cpm_vals, 'dispersion': dispersion,
940
+ 'bin.AveLogCPM': bin_a, 'bin.dispersion': bin_d}
941
+
942
+ # Natural spline + OLS matching R's dispBinTrend:
943
+ # ns(bin.A, df=df, knots=0.3*quantile+0.7*equispaced, intercept=TRUE)
944
+ # then lm.fit(basisbins, sqrt(bin.d))
945
+ p1 = np.arange(1, df) / df
946
+ knots1 = np.quantile(bin_a, p1)
947
+ r = np.array([np.min(bin_a), np.max(bin_a)])
948
+ knots2 = r[0] + p1 * (r[1] - r[0])
949
+ knots = 0.3 * knots1 + 0.7 * knots2
950
+
951
+ try:
952
+ basisbins = _ns_basis_with_knots(bin_a, knots, boundary_knots=r)
953
+ beta = np.linalg.lstsq(basisbins, np.sqrt(np.maximum(bin_d, 0)),
954
+ rcond=None)[0]
955
+ basisall = _ns_basis_with_knots(ave_log_cpm_vals, knots,
956
+ boundary_knots=r)
957
+ dispersion = np.maximum((basisall @ beta) ** 2, 0)
958
+ except Exception:
959
+ dispersion = np.full(ntags, np.mean(bin_d))
960
+
961
+ return {'AveLogCPM': ave_log_cpm_vals, 'dispersion': dispersion,
962
+ 'bin.AveLogCPM': bin_a, 'bin.dispersion': bin_d}
963
+
964
+
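+ # Illustrative usage sketch: with fewer than 100 expressed genes the
+ # function takes the single-bin path and broadcasts the common dispersion
+ # (the toy data below is an assumption chosen to hit that path).
+ def _example_bin_trend():
+     rng = np.random.default_rng(7)
+     y = rng.poisson(rng.gamma(1.0 / 0.2, 25.0 * 0.2, size=(80, 4)))
+     out = disp_bin_trend(y)
+     print(out['bin.AveLogCPM'], out['bin.dispersion'])  # one bin, phi ~ 0.2
+
+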
965
+ def disp_pearson(y, design=None, offset=None, subset=10000,
966
+ ave_log_cpm_vals=None):
967
+ """Pearson estimator of common dispersion.
968
+
969
+ Port of edgeR's dispPearson.
970
+ """
971
+ y = np.asarray(y, dtype=np.float64)
972
+ if y.ndim == 1:
973
+ y = y.reshape(1, -1)
974
+
975
+ if design is None:
976
+ design = np.ones((y.shape[1], 1))
977
+ design = np.asarray(design, dtype=np.float64)
978
+
979
+ if offset is None:
980
+ offset = np.log(y.sum(axis=0))
981
+ offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)
982
+
983
+ ntags, nlibs = y.shape
984
+ ncoefs = design.shape[1]
985
+ df_res = nlibs - ncoefs
986
+
987
+ if df_res <= 0:
988
+ warnings.warn("No residual df: setting dispersion to NA")
989
+ return np.nan
990
+
991
+ # Subsetting
992
+ if subset is not None and subset < ntags:
993
+ if ave_log_cpm_vals is None:
994
+ ave_log_cpm_vals = ave_log_cpm(y, offset=offset)
995
+ i = systematic_subset(subset, ave_log_cpm_vals)
996
+ y = y[i]
997
+ offset = offset[i]
998
+ ntags = y.shape[0]
999
+
1000
+ def pearson_disp(d):
1001
+ from .glm_fit import glm_fit
1002
+ fit = glm_fit(y, design=design, dispersion=d, offset=offset, prior_count=0)
1003
+ mu = fit['fitted.values']
1004
+ # Pearson chi-squared
1005
+ pearson = np.sum((y - mu) ** 2 / (mu + d * mu ** 2))
1006
+ return (pearson / (ntags * df_res) - 1)
1007
+
1008
+ # Root finding with Brent's method
1009
+ try:
1010
+ from scipy.optimize import brentq
1011
+ result = brentq(pearson_disp, 0.001, 10.0, xtol=1e-5)
1012
+ except Exception:
1013
+ warnings.warn("brentq failed to find a root in (0.001, 10); falling back to dispersion 0.1")
+ result = 0.1
1014
+
1015
+ return max(result, 0)
1016
+
1017
+
1018
+ def disp_deviance(y, design=None, offset=None, subset=10000,
1019
+ ave_log_cpm_vals=None):
1020
+ """Deviance estimator of common dispersion.
1021
+
1022
+ Port of edgeR's dispDeviance.
1023
+ """
1024
+ y = np.asarray(y, dtype=np.float64)
1025
+ if y.ndim == 1:
1026
+ y = y.reshape(1, -1)
1027
+
1028
+ if design is None:
1029
+ design = np.ones((y.shape[1], 1))
1030
+ design = np.asarray(design, dtype=np.float64)
1031
+
1032
+ if offset is None:
1033
+ offset = np.log(y.sum(axis=0))
1034
+ offset = expand_as_matrix(np.asarray(offset, dtype=np.float64), y.shape)
1035
+
1036
+ ntags, nlibs = y.shape
1037
+ ncoefs = design.shape[1]
1038
+ df_res = nlibs - ncoefs
1039
+
1040
+ if df_res <= 0:
1041
+ warnings.warn("No residual df: setting dispersion to NA")
1042
+ return np.nan
1043
+
1044
+ # Subsetting
1045
+ if subset is not None and subset < ntags:
1046
+ if ave_log_cpm_vals is None:
1047
+ ave_log_cpm_vals = ave_log_cpm(y, offset=offset)
1048
+ i = systematic_subset(subset, ave_log_cpm_vals)
1049
+ y = y[i]
1050
+ offset = offset[i]
1051
+ ntags = y.shape[0]
1052
+
1053
+ def dev_disp(d):
1054
+ from .glm_fit import glm_fit
1055
+ from .glm_levenberg import nbinom_deviance
1056
+ fit = glm_fit(y, design=design, dispersion=d, offset=offset, prior_count=0)
1057
+ dev = nbinom_deviance(y, fit['fitted.values'], d)
1058
+ return np.sum(dev) / (ntags * df_res) - 1
1059
+
1060
+ try:
1061
+ from scipy.optimize import brentq
1062
+ result = brentq(dev_disp, 0.001, 10.0, xtol=1e-5)
1063
+ except Exception:
1064
+ warnings.warn("brentq failed to find a root in (0.001, 10); falling back to dispersion 0.1")
+ result = 0.1
1065
+
1066
+ return max(result, 0)
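+
+
+ # Illustrative comparison sketch (one simulated dataset, an assumption): the
+ # Pearson and deviance estimators target the same quantity and should land
+ # in the same neighbourhood, though neither is edgeR's recommended default.
+ def _example_moment_estimators():
+     rng = np.random.default_rng(8)
+     y = rng.poisson(rng.gamma(1.0 / 0.15, 60.0 * 0.15, size=(300, 5)))
+     design = np.ones((5, 1))
+     print(disp_pearson(y, design=design), disp_deviance(y, design=design))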