edgepython 0.2.0__py3-none-any.whl

# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
"""
Dispersion estimation for edgePython.

Port of edgeR's estimateDisp, estimateCommonDisp, estimateTagwiseDisp,
estimateTrendedDisp, estimateGLMCommonDisp, estimateGLMTrendedDisp,
estimateGLMTagwiseDisp, and WLEB.
"""

import numpy as np
import warnings
from scipy.optimize import minimize_scalar

from .expression import ave_log_cpm
from .utils import (expand_as_matrix, moving_average_by_col, cut_with_min_n,
                    drop_empty_levels, systematic_subset)
from .smoothing import locfit_by_col, loess_by_col
from .limma_port import squeeze_var, choose_lowess_span
from .dispersion_lowlevel import (
    adjusted_profile_lik, adjusted_profile_lik_grid, maximize_interpolant,
    cond_log_lik_der_delta, common_cond_log_lik_der_delta,
    disp_cox_reid, disp_cox_reid_interpolate_tagwise,
    disp_cox_reid_spline_trend, disp_cox_reid_power_trend,
    disp_bin_trend, disp_pearson, disp_deviance
)


def estimate_disp(y, design=None, group=None, lib_size=None, offset=None,
                  prior_df=None, trend_method='locfit', tagwise=True,
                  span=None, legacy_span=False, min_row_sum=5,
                  grid_length=21, grid_range=(-10, 10), robust=False,
                  winsor_tail_p=(0.05, 0.1), tol=1e-6, weights=None):
    """Estimate common, trended and tagwise dispersions.

    Port of edgeR's estimateDisp.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    design : ndarray or str, optional
        Design matrix, or an R-style formula string (e.g.
        ``'~ group'``, ``'~ batch + condition'``) evaluated
        against DGEList sample metadata via patsy.
        If None, uses the classic edgeR approach.
    group : array-like, optional
        Group factor.
    lib_size : ndarray, optional
        Library sizes.
    offset : ndarray, optional
        Log-scale offsets.
    prior_df : float, optional
        Prior degrees of freedom.
    trend_method : str
        'locfit', 'loess', 'movingave', 'locfit.mixed', or 'none'.
    tagwise : bool
        Estimate tagwise dispersions.
    span : float, optional
        Span for smoothing.
    legacy_span : bool
        Use legacy span selection.
    min_row_sum : int
        Minimum row sum for a gene to be used in estimation.
    grid_length : int
        Number of grid points.
    grid_range : tuple
        Range for the dispersion grid (log2 scale around 0.1).
    robust : bool
        Robust estimation.
    winsor_tail_p : tuple
        Winsorization tail proportions.
    tol : float
        Optimization tolerance.
    weights : ndarray, optional
        Observation weights.

    Returns
    -------
    DGEList (if input is DGEList) or dict with common.dispersion,
    trended.dispersion, tagwise.dispersion, span, prior.df, prior.n.
    """
    # Resolve a formula string to a design matrix
    from .utils import _resolve_design
    design = _resolve_design(design, y)

    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist, get_offset
        dge = valid_dgelist(dge)
        group_val = dge['samples']['group'].values
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values

        if design is None:
            design = dge.get('design')
        else:
            dge['design'] = design

        d = estimate_disp(
            dge['counts'], design=design, group=group_val, lib_size=ls,
            offset=get_offset(dge), prior_df=prior_df,
            trend_method=trend_method, tagwise=tagwise, span=span,
            legacy_span=legacy_span, min_row_sum=min_row_sum,
            grid_length=grid_length, grid_range=grid_range,
            robust=robust, winsor_tail_p=winsor_tail_p, tol=tol,
            weights=dge.get('weights'))

        dge['common.dispersion'] = d['common.dispersion']
        dge['trended.dispersion'] = d['trended.dispersion']
        if tagwise:
            dge['tagwise.dispersion'] = d.get('tagwise.dispersion')
        dge['AveLogCPM'] = ave_log_cpm(dge)
        dge['trend.method'] = trend_method
        dge['prior.df'] = d.get('prior.df')
        dge['prior.n'] = d.get('prior.n')
        dge['span'] = d.get('span')
        return dge

    # Default method
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if ntags == 0:
        return {'span': span, 'prior.df': prior_df, 'prior.n': None}

    # Check trend_method
    valid_methods = ('none', 'loess', 'locfit', 'movingave', 'locfit.mixed')
    if trend_method not in valid_methods:
        raise ValueError(f"trend_method must be one of {valid_methods}")

    # Check group
    if group is None:
        group = np.ones(nlibs, dtype=int)
    group = drop_empty_levels(np.asarray(group))

    # Check lib_size
    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    # Build offset
    if offset is None:
        offset = np.log(lib_size)
    offset = np.asarray(offset, dtype=np.float64)
    offset_mat = expand_as_matrix(offset, y.shape)

    if weights is not None:
        w_mat = expand_as_matrix(np.asarray(weights, dtype=np.float64), y.shape)
    else:
        w_mat = np.ones_like(y)

    # Filter out genes with small counts
    sel = y.sum(axis=1) >= min_row_sum
    sely = y[sel]
    seloffset = offset_mat[sel]
    selweights = w_mat[sel]

    # Spline points: dispersion grid of 0.1 * 2^spline_pts
    spline_pts = np.linspace(grid_range[0], grid_range[1], grid_length)
    spline_disp = 0.1 * 2 ** spline_pts
    grid_vals = spline_disp / (1 + spline_disp)
    l0 = np.zeros((np.sum(sel), grid_length))

    if design is None:
        # Classic edgeR approach
        unique_groups = np.unique(group)
        group_idx = (group.astype(int)
                     if np.issubdtype(group.dtype, np.integer)
                     else np.searchsorted(unique_groups, group))
        if np.all(np.bincount(group_idx) <= 1):
            warnings.warn("There is no replication, setting dispersion to NA.")
            return {'common.dispersion': np.nan,
                    'trended.dispersion': np.nan,
                    'tagwise.dispersion': np.nan}

        if len(unique_groups) == 1:
            design_classic = np.ones((nlibs, 1))
        else:
            from .utils import _model_matrix_group
            design_classic = _model_matrix_group(group)

        # Equalize library sizes and estimate the common dispersion
        from .exact_test import equalize_lib_sizes, split_into_groups
        eq = equalize_lib_sizes(y, group=group, dispersion=0.01, lib_size=lib_size)
        y_pseudo = eq['pseudo.counts'][sel]
        y_split = split_into_groups(y_pseudo, group=group)

        # Optimize the common dispersion on the delta = disp/(1 + disp) scale
        result = minimize_scalar(
            lambda d: -common_cond_log_lik_der_delta(y_split, d, der=0),
            bounds=(1e-4, 100 / 101), method='bounded',
            options={'xatol': tol})
        delta = result.x
        disp = delta / (1 - delta)

        # Re-equalize with the estimated dispersion
        eq = equalize_lib_sizes(y, group=group, dispersion=disp, lib_size=lib_size)
        y_pseudo = eq['pseudo.counts'][sel]
        y_split = split_into_groups(y_pseudo, group=group)

        # Compute conditional log-likelihoods on the grid
        for j in range(grid_length):
            for grp_data in y_split:
                # y_split is already filtered; subset defensively otherwise
                data = grp_data[sel] if grp_data.shape[0] > np.sum(sel) else grp_data
                l0[:, j] += cond_log_lik_der_delta(data, grid_vals[j], der=0)
    else:
        # GLM edgeR approach
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

        if design.shape[1] >= nlibs:
            warnings.warn("No residual df: setting dispersion to NA")
            return {'common.dispersion': np.nan,
                    'trended.dispersion': np.nan,
                    'tagwise.dispersion': np.nan}

        # Compute the adjusted profile likelihood on the grid for all genes
        l0 = adjusted_profile_lik_grid(
            spline_disp, sely, design, seloffset, weights=selweights)

    # Calculate the common dispersion
    overall = maximize_interpolant(spline_pts, np.sum(l0, axis=0).reshape(1, -1))
    common_dispersion = 0.1 * 2 ** overall[0]

    # Allow a dispersion trend over abundance
    if trend_method != 'none':
        ave_lcpm = ave_log_cpm(y, lib_size=lib_size, dispersion=common_dispersion,
                               weights=weights)
        out_1 = WLEB(theta=spline_pts, loglik=l0, covariate=ave_lcpm[sel],
                     trend_method=trend_method, span=span, legacy_span=legacy_span,
                     overall=False, individual=False, m0_out=True)
        span = out_1['span']
        m0 = out_1['shared.loglik']
        disp_trend = 0.1 * 2 ** out_1['trend']
        # Genes filtered out by min_row_sum get the trend value at the
        # lowest retained abundance
        trended_dispersion = np.full(ntags, disp_trend[np.argmin(ave_lcpm[sel])])
        trended_dispersion[sel] = disp_trend
    else:
        ave_lcpm = None
        m0 = np.tile(np.mean(l0, axis=0), (np.sum(sel), 1))
        disp_trend = common_dispersion
        trended_dispersion = None

    # Are tagwise dispersions required?
    if not tagwise:
        return {'common.dispersion': common_dispersion,
                'trended.dispersion': trended_dispersion}

    # Calculate prior_df
    if prior_df is None:
        from .glm_fit import glm_fit
        if design is None:
            # Classic case: use the group-based design built above
            design_fit = design_classic
        else:
            design_fit = design
        glmfit = glm_fit(sely, offset=seloffset, weights=selweights,
                         design=design_fit, dispersion=disp_trend, prior_count=0)

        # Residual df, adjusted for observations fitted exactly to zero
        from .utils import residual_df
        zerofit = (glmfit['counts'] < 1e-4) & (glmfit['fitted.values'] < 1e-4)
        df_residual = residual_df(zerofit, design_fit)

        s2 = glmfit['deviance'] / np.maximum(df_residual, 1e-8)
        s2[df_residual == 0] = 0
        s2 = np.maximum(s2, 0)
        covariate = ave_lcpm[sel] if ave_lcpm is not None else None
        s2_fit = squeeze_var(s2, df=df_residual, covariate=covariate,
                             robust=robust, winsor_tail_p=winsor_tail_p)
        prior_df = s2_fit.get('df.prior', s2_fit.get('df_prior'))

    ncoefs = design.shape[1] if design is not None else design_classic.shape[1]
    prior_n = prior_df / (nlibs - ncoefs)

    # Initiate tagwise dispersions
    if trend_method != 'none':
        tagwise_dispersion = trended_dispersion.copy()
    else:
        tagwise_dispersion = np.full(ntags, common_dispersion)

    # Estimate tagwise dispersions via WLEB, capping very large prior_n
    too_large = np.atleast_1d(prior_n > 1e6)
    if not np.all(too_large):
        temp_n = np.atleast_1d(prior_n).copy()
        if np.any(too_large):
            temp_n[too_large] = 1e6

        out_2 = WLEB(theta=spline_pts, loglik=l0, prior_n=temp_n,
                     covariate=ave_lcpm[sel] if ave_lcpm is not None else None,
                     trend_method=trend_method, span=span, legacy_span=False,
                     overall=False, trend=False, m0=m0)
        tagwise_dispersion[sel] = 0.1 * 2 ** out_2['individual']

    if robust:
        # Expand per-gene prior df/n to the full gene set; filtered genes
        # get infinite prior df, i.e. complete shrinkage to the trend
        temp_df = prior_df
        temp_n = prior_n
        prior_df = np.full(ntags, np.inf)
        prior_n = np.full(ntags, np.inf)
        prior_df[sel] = temp_df
        prior_n[sel] = temp_n

    return {
        'common.dispersion': common_dispersion,
        'trended.dispersion': trended_dispersion,
        'tagwise.dispersion': tagwise_dispersion,
        'span': span,
        'prior.df': prior_df,
        'prior.n': prior_n
    }


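# Illustrative usage sketch (not part of the edgeR port itself): estimate
# dispersions directly from a count matrix. The simulated counts and the
# two-group design below are assumptions made purely for demonstration.
def _example_estimate_disp():
    rng = np.random.default_rng(0)
    counts = rng.negative_binomial(n=10, p=0.1, size=(200, 6))
    group = np.array([1, 1, 1, 2, 2, 2])
    design = np.column_stack([np.ones(6), (group == 2).astype(float)])
    out = estimate_disp(counts, design=design, group=group)
    # out['common.dispersion'] is a scalar; 'trended.dispersion' and
    # 'tagwise.dispersion' are per-gene arrays of length 200.
    return out

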
def WLEB(theta, loglik, prior_n=5, covariate=None, trend_method='locfit',
         span=None, legacy_span=False, overall=True, trend=True,
         individual=True, m0=None, m0_out=False):
    """Weighted likelihood empirical Bayes.

    Port of edgeR's WLEB.

    Parameters
    ----------
    theta : ndarray
        Grid of theta values.
    loglik : ndarray
        Log-likelihood matrix (genes x grid points).
    prior_n : float or ndarray
        Prior sample size.
    covariate : ndarray, optional
        Covariate for the trend.
    trend_method : str
        Smoothing method.
    span : float, optional
        Smoothing span.
    legacy_span : bool
        Use legacy span selection.
    overall : bool
        Compute the overall estimate.
    trend : bool
        Compute the trended estimate.
    individual : bool
        Compute individual estimates.
    m0 : ndarray, optional
        Pre-computed shared loglik.
    m0_out : bool
        Return the shared loglik.

    Returns
    -------
    dict with 'overall', 'trend', 'individual', 'span', 'shared.loglik'.
    """
    loglik = np.asarray(loglik, dtype=np.float64)
    if loglik.ndim == 1:
        loglik = loglik.reshape(1, -1)
    ntags = loglik.shape[0]
    theta = np.asarray(theta, dtype=np.float64)

    # Without a covariate there is nothing to smooth over
    if covariate is None:
        trend_method = 'none'

    # Set the span, matching R's WLEB formula exactly
    if span is None:
        if ntags <= 50:
            span = 1.0
        else:
            span = 0.25 + 0.75 * (50 / ntags) ** 0.5

    out = {'span': span}

    # Overall prior
    if overall:
        out['overall'] = maximize_interpolant(
            theta, np.sum(loglik, axis=0).reshape(1, -1))[0]

    # Trended prior
    if m0 is None:
        if trend_method == 'movingave':
            o = np.argsort(covariate)
            oo = np.argsort(o)
            width = int(np.floor(span * ntags))
            width = max(width, 1)
            m0 = moving_average_by_col(loglik[o], width=width)[oo]
        elif trend_method == 'loess':
            result = loess_by_col(loglik, x=covariate, span=span)
            m0 = result['fitted_values']
        elif trend_method == 'locfit':
            m0 = locfit_by_col(loglik, x=covariate, span=span, degree=0)
        elif trend_method == 'locfit.mixed':
            # Blend degree-0 and degree-1 locfit, weighting by position
            # within the covariate range via a Beta(2, 2) CDF
            deg0 = locfit_by_col(loglik, x=covariate, span=span, degree=0)
            deg1 = locfit_by_col(loglik, x=covariate, span=span, degree=1)
            from scipy.stats import beta as beta_dist
            r = np.array([np.min(covariate), np.max(covariate)])
            if r[1] - r[0] > 0:
                w = beta_dist.cdf((covariate - r[0]) / (r[1] - r[0]), 2, 2)
            else:
                w = np.full(len(covariate), 0.5)
            m0 = w[:, None] * deg0 + (1 - w[:, None]) * deg1
        else:
            # 'none': share the average log-likelihood across all genes
            m0 = np.tile(np.mean(loglik, axis=0), (ntags, 1))

    if trend:
        out['trend'] = maximize_interpolant(theta, m0)

    # Weighted empirical Bayes posterior estimates
    if individual:
        prior_n = np.atleast_1d(np.asarray(prior_n, dtype=np.float64))
        if len(prior_n) == 1:
            l0a = loglik + prior_n[0] * m0
        else:
            l0a = loglik + prior_n[:, None] * m0
        out['individual'] = maximize_interpolant(theta, l0a)

    if m0_out:
        out['shared.loglik'] = m0

    return out


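# Minimal sketch of the weighted-likelihood idea (illustrative, with made-up
# likelihoods): each gene's posterior score on the grid is its own
# log-likelihood plus prior_n copies of the shared log-likelihood, and the
# estimate is the interpolated maximizer of that sum.
def _example_wleb():
    theta = np.linspace(-5, 5, 11)
    # Two artificial genes whose likelihoods peak at theta = 1 and theta = -2
    loglik = np.vstack([-(theta - 1.0) ** 2, -(theta + 2.0) ** 2])
    # No covariate, so the shared curve is just the average log-likelihood
    out = WLEB(theta, loglik, prior_n=5)
    # With a large prior_n both estimates shrink towards the shared maximum
    return out['individual']

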
def estimate_common_disp(y, group=None, lib_size=None, tol=1e-6,
                         rowsum_filter=5, verbose=False):
    """Estimate common dispersion using exact conditional likelihood.

    Port of edgeR's estimateCommonDisp.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    group : array-like, optional
        Group factor.
    lib_size : ndarray, optional
        Library sizes.
    tol : float
        Optimization tolerance.
    rowsum_filter : int
        Minimum row sum.
    verbose : bool
        Print progress.

    Returns
    -------
    DGEList (if input is DGEList) or float.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist
        dge = valid_dgelist(dge)
        group = dge['samples']['group'].values
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values

        d = estimate_common_disp(dge['counts'], group=group, lib_size=ls,
                                 tol=tol, rowsum_filter=rowsum_filter, verbose=verbose)
        dge['common.dispersion'] = d
        dge['AveLogCPM'] = ave_log_cpm(dge, dispersion=d)
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if group is None:
        group = np.ones(nlibs, dtype=int)
    group = np.asarray(group)

    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    # Filter
    keep = y.sum(axis=1) >= rowsum_filter
    y_filt = y[keep]

    if y_filt.shape[0] == 0:
        warnings.warn("No genes pass rowsum filter")
        return 0.1

    # Equalize library sizes and split into groups
    from .exact_test import equalize_lib_sizes, split_into_groups

    # First pass with a rough dispersion
    eq = equalize_lib_sizes(y_filt, group=group, dispersion=0.01, lib_size=lib_size)
    y_pseudo = eq['pseudo.counts']
    y_split = split_into_groups(y_pseudo, group=group)

    # Optimize on the delta = disp/(1 + disp) scale
    result = minimize_scalar(
        lambda d: -common_cond_log_lik_der_delta(y_split, d, der=0),
        bounds=(1e-4, 100 / 101), method='bounded',
        options={'xatol': tol})
    delta = result.x
    disp = delta / (1 - delta)

    if verbose:
        print(f"Disp = {disp:.5f}, BCV = {np.sqrt(disp):.4f}")

    return disp


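# Usage sketch (illustrative; the simulated counts are an assumption for the
# demo): classic common-dispersion estimation on a raw count matrix.
def _example_estimate_common_disp():
    rng = np.random.default_rng(1)
    counts = rng.negative_binomial(n=20, p=0.2, size=(100, 4))
    group = np.array([1, 1, 2, 2])
    # Returns a float; with verbose=True it also prints Disp and BCV
    return estimate_common_disp(counts, group=group, verbose=True)

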
def estimate_tagwise_disp(y, group=None, lib_size=None, dispersion=None,
                          prior_df=10, trend='movingave', span=None,
                          method='grid', grid_length=11, grid_range=(-6, 6),
                          tol=1e-6, verbose=False):
    """Estimate tagwise dispersions using exact conditional likelihood.

    Port of edgeR's estimateTagwiseDisp.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    group : array-like, optional
        Group factor.
    lib_size : ndarray, optional
        Library sizes.
    dispersion : float or ndarray, optional
        Starting dispersion.
    prior_df : float
        Prior degrees of freedom.
    trend : str
        'movingave', 'loess', or 'none'.
    span : float, optional
        Smoothing span.
    method : str
        'grid' or 'optimize'. Only 'grid' is currently implemented.
    grid_length : int
        Number of grid points.
    grid_range : tuple
        Grid range.
    tol : float
        Tolerance.
    verbose : bool
        Print progress (currently unused).

    Returns
    -------
    DGEList (if input is DGEList) or ndarray of tagwise dispersions.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist
        dge = valid_dgelist(dge)
        group = dge['samples']['group'].values
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values

        if dispersion is None:
            dispersion = dge.get('common.dispersion')
        if dispersion is None:
            raise ValueError("No common.dispersion found. Run estimate_common_disp first.")

        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)

        td = estimate_tagwise_disp(
            dge['counts'], group=group, lib_size=ls, dispersion=dispersion,
            prior_df=prior_df, trend=trend, span=span, method=method,
            grid_length=grid_length, grid_range=grid_range, tol=tol)
        dge['tagwise.dispersion'] = td
        dge['prior.df'] = prior_df
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if group is None:
        group = np.ones(nlibs, dtype=int)
    group = np.asarray(group)

    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    if dispersion is None:
        dispersion = 0.1

    if span is None:
        span = (10 / ntags) ** 0.23 if ntags > 10 else 1.0

    # Equalize library sizes
    from .exact_test import equalize_lib_sizes, split_into_groups
    eq = equalize_lib_sizes(y, group=group, dispersion=dispersion, lib_size=lib_size)
    y_pseudo = eq['pseudo.counts']
    y_split = split_into_groups(y_pseudo, group=group)

    # Compute conditional log-likelihoods on a grid centred on the
    # starting dispersion
    spline_pts = np.linspace(grid_range[0], grid_range[1], grid_length)

    if np.isscalar(dispersion):
        disp_base = dispersion
    else:
        disp_base = np.median(dispersion)

    grid_disp = disp_base * 2 ** spline_pts
    grid_delta = grid_disp / (1 + grid_disp)

    l0 = np.zeros((ntags, grid_length))
    for j in range(grid_length):
        for grp_data in y_split:
            l0[:, j] += cond_log_lik_der_delta(grp_data, grid_delta[j], der=0)

    # Compute AveLogCPM as the smoothing covariate
    alc = ave_log_cpm(y, lib_size=lib_size)

    # Shrink towards the abundance trend via WLEB
    prior_n = prior_df / (nlibs - len(np.unique(group)))

    trend_method = trend if trend in ('movingave', 'loess') else 'none'
    out = WLEB(theta=spline_pts, loglik=l0, prior_n=prior_n,
               covariate=alc, trend_method=trend_method, span=span)

    tagwise_dispersion = disp_base * 2 ** out['individual']
    return tagwise_dispersion


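# Usage sketch (illustrative; the simulated data are an assumption): tagwise
# estimation normally follows a common-dispersion fit, which centres the grid.
def _example_estimate_tagwise_disp():
    rng = np.random.default_rng(2)
    counts = rng.negative_binomial(n=5, p=0.2, size=(300, 4))
    group = np.array([1, 1, 2, 2])
    common = estimate_common_disp(counts, group=group)
    return estimate_tagwise_disp(counts, group=group, dispersion=common,
                                 prior_df=10, trend='movingave')

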
def estimate_trended_disp(y, group=None, lib_size=None, ave_log_cpm_vals=None,
                          method='bin.spline', df=5, span=2/3):
    """Estimate trended dispersions using exact conditional likelihood.

    Port of edgeR's estimateTrendedDisp.

    Returns
    -------
    DGEList (if input is DGEList) or ndarray of trended dispersions.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist
        dge = valid_dgelist(dge)
        group_val = dge['samples']['group'].values
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values
        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)
        out = estimate_trended_disp(dge['counts'], group=group_val, lib_size=ls,
                                    ave_log_cpm_vals=dge['AveLogCPM'],
                                    method=method, df=df, span=span)
        dge['trended.dispersion'] = out
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if group is None:
        group = np.ones(nlibs, dtype=int)
    group = drop_empty_levels(np.asarray(group))

    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, lib_size=lib_size)

    # Bin genes by abundance and estimate a dispersion in each bin
    nbins = 50
    if nbins > ntags:
        nbins = max(1, ntags // 2)

    bins = cut_with_min_n(ave_log_cpm_vals, intervals=nbins,
                          min_n=max(1, ntags // nbins))
    disp_bins = []
    ave_bins = []

    for i in range(1, nbins + 1):
        mask = bins['group'] == i
        if np.sum(mask) == 0:
            # Skip empty bins rather than recording spurious zero estimates
            continue
        disp_bins.append(estimate_common_disp(y[mask], group=group,
                                              lib_size=lib_size,
                                              rowsum_filter=0))
        ave_bins.append(np.mean(ave_log_cpm_vals[mask]))

    disp_bins = np.asarray(disp_bins)
    ave_bins = np.asarray(ave_bins)

    # Fit a trend through the per-bin estimates
    if method == 'bin.spline':
        from scipy.interpolate import UnivariateSpline
        order = np.argsort(ave_bins)
        try:
            spl = UnivariateSpline(ave_bins[order],
                                   np.sqrt(np.maximum(disp_bins[order], 0)),
                                   k=min(3, len(ave_bins) - 1),
                                   s=len(ave_bins) * 0.1)
            trended = spl(ave_log_cpm_vals) ** 2
        except Exception:
            trended = np.full(ntags, np.mean(disp_bins))
    else:
        # bin.loess
        from scipy.interpolate import interp1d
        try:
            f = interp1d(ave_bins, np.sqrt(np.maximum(disp_bins, 0)),
                         fill_value='extrapolate')
            trended = f(ave_log_cpm_vals) ** 2
        except Exception:
            trended = np.full(ntags, np.mean(disp_bins))

    return np.maximum(trended, 0)


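# Usage sketch (illustrative; simulated counts are an assumption): the binned
# trend, i.e. per-abundance-bin common dispersions smoothed into a curve that
# is evaluated at every gene's AveLogCPM.
def _example_estimate_trended_disp():
    rng = np.random.default_rng(3)
    counts = rng.negative_binomial(n=8, p=0.15, size=(500, 4))
    group = np.array([1, 1, 2, 2])
    return estimate_trended_disp(counts, group=group, method='bin.spline')

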
def estimate_glm_common_disp(y, design=None, offset=None, method='CoxReid',
                             subset=10000, ave_log_cpm_vals=None, verbose=False,
                             weights=None):
    """Estimate common dispersion using the GLM approach.

    Port of edgeR's estimateGLMCommonDisp.

    Returns
    -------
    DGEList (if input is DGEList) or float.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist, get_offset
        dge = valid_dgelist(dge)
        alc = ave_log_cpm(dge, dispersion=0.05)
        offset_val = get_offset(dge)
        d = estimate_glm_common_disp(
            dge['counts'], design=design, offset=offset_val,
            method=method, subset=subset, ave_log_cpm_vals=alc,
            verbose=verbose, weights=dge.get('weights'))
        dge['common.dispersion'] = d
        dge['AveLogCPM'] = ave_log_cpm(dge, dispersion=d)
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)

    if design is None:
        design = np.ones((y.shape[1], 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

    if design.shape[1] >= y.shape[1]:
        warnings.warn("No residual df: setting dispersion to NA")
        return np.nan

    if offset is None:
        offset = np.log(y.sum(axis=0))

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)

    valid_methods = ('CoxReid', 'Pearson', 'deviance')
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}")

    if method != 'CoxReid' and weights is not None:
        warnings.warn("weights only supported by CoxReid method")

    if method == 'CoxReid':
        d = disp_cox_reid(y, design=design, offset=offset, subset=subset,
                          ave_log_cpm_vals=ave_log_cpm_vals, weights=weights)
    elif method == 'Pearson':
        d = disp_pearson(y, design=design, offset=offset, subset=subset,
                         ave_log_cpm_vals=ave_log_cpm_vals)
    else:
        d = disp_deviance(y, design=design, offset=offset, subset=subset,
                          ave_log_cpm_vals=ave_log_cpm_vals)

    if verbose:
        print(f"Disp = {d:.5f}, BCV = {np.sqrt(d):.4f}")

    return d


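# Usage sketch (illustrative; the design and counts are assumptions for the
# demo): Cox-Reid common dispersion under an intercept-plus-treatment design.
def _example_estimate_glm_common_disp():
    rng = np.random.default_rng(4)
    counts = rng.negative_binomial(n=10, p=0.1, size=(200, 6))
    design = np.column_stack([np.ones(6), np.repeat([0.0, 1.0], 3)])
    return estimate_glm_common_disp(counts, design=design, method='CoxReid')

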
def estimate_glm_trended_disp(y, design=None, offset=None,
                              ave_log_cpm_vals=None, method='auto',
                              weights=None):
    """Estimate trended dispersion using the GLM approach.

    Port of edgeR's estimateGLMTrendedDisp.

    Returns
    -------
    DGEList (if input is DGEList) or ndarray.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)
        from .dgelist import get_offset
        d = estimate_glm_trended_disp(
            dge['counts'], design=design, offset=get_offset(dge),
            ave_log_cpm_vals=dge['AveLogCPM'], method=method,
            weights=dge.get('weights'))
        dge['trended.dispersion'] = d
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if ntags == 0:
        return np.array([], dtype=np.float64)

    if design is None:
        design = np.ones((nlibs, 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

    if design.shape[1] >= nlibs:
        warnings.warn("No residual df: cannot estimate dispersion")
        return np.full(ntags, np.nan)

    if offset is None:
        offset = np.log(y.sum(axis=0))

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)

    if method == 'auto':
        method = 'power' if ntags < 200 else 'bin.spline'

    valid_methods = ('bin.spline', 'bin.loess', 'power', 'spline')
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}")

    if method in ('bin.spline', 'bin.loess'):
        mt = 'spline' if method == 'bin.spline' else 'loess'
        result = disp_bin_trend(y, design, offset=offset, method_trend=mt,
                                ave_log_cpm_vals=ave_log_cpm_vals, weights=weights)
    elif method == 'power':
        result = disp_cox_reid_power_trend(y, design, offset=offset,
                                           ave_log_cpm_vals=ave_log_cpm_vals)
    else:
        result = disp_cox_reid_spline_trend(y, design, offset=offset,
                                            ave_log_cpm_vals=ave_log_cpm_vals)

    return result['dispersion']


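# Usage sketch (illustrative; simulated data): with method='auto' the trend
# switches from the power fit to the binned spline at 200 or more genes.
def _example_estimate_glm_trended_disp():
    rng = np.random.default_rng(5)
    counts = rng.negative_binomial(n=10, p=0.1, size=(500, 6))
    design = np.column_stack([np.ones(6), np.repeat([0.0, 1.0], 3)])
    return estimate_glm_trended_disp(counts, design=design, method='auto')

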
def estimate_glm_tagwise_disp(y, design=None, offset=None, dispersion=None,
                              prior_df=10, trend=True, span=None,
                              ave_log_cpm_vals=None, weights=None):
    """Estimate tagwise dispersions using the GLM approach.

    Port of edgeR's estimateGLMTagwiseDisp.

    Returns
    -------
    DGEList (if input is DGEList) or ndarray.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if trend:
            dispersion = dge.get('trended.dispersion')
            if dispersion is None:
                raise ValueError("No trended.dispersion found. Run estimate_glm_trended_disp first.")
        else:
            if dispersion is None:
                dispersion = dge.get('common.dispersion')
            if dispersion is None:
                raise ValueError("No common.dispersion found. Run estimate_glm_common_disp first.")

        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)

        ntags = dge['counts'].shape[0]
        if span is None:
            span = (10 / ntags) ** 0.23 if ntags > 10 else 1.0
        dge['span'] = span

        from .dgelist import get_offset
        d = estimate_glm_tagwise_disp(
            dge['counts'], design=design, offset=get_offset(dge),
            dispersion=dispersion, prior_df=prior_df, trend=trend,
            span=span, ave_log_cpm_vals=dge['AveLogCPM'],
            weights=dge.get('weights'))
        dge['prior.df'] = prior_df
        dge['tagwise.dispersion'] = d
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if ntags == 0:
        return np.array([], dtype=np.float64)

    if design is None:
        design = np.ones((nlibs, 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

    if design.shape[1] >= nlibs:
        warnings.warn("No residual df: setting dispersion to NA")
        return np.full(ntags, np.nan)

    if offset is None:
        offset = np.log(y.sum(axis=0))

    if span is None:
        span = (10 / ntags) ** 0.23 if ntags > 10 else 1.0

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)

    tagwise = disp_cox_reid_interpolate_tagwise(
        y, design, offset=offset, dispersion=dispersion,
        trend=trend, prior_df=prior_df, span=span,
        ave_log_cpm_vals=ave_log_cpm_vals, weights=weights)

    return tagwise
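

# Usage sketch (illustrative; simulated data): the GLM pipeline chains the
# three estimators, with the trended dispersions seeding the tagwise
# shrinkage.
def _example_glm_pipeline():
    rng = np.random.default_rng(6)
    counts = rng.negative_binomial(n=10, p=0.1, size=(400, 6))
    design = np.column_stack([np.ones(6), np.repeat([0.0, 1.0], 3)])
    common = estimate_glm_common_disp(counts, design=design)
    trended = estimate_glm_trended_disp(counts, design=design)
    tagwise = estimate_glm_tagwise_disp(counts, design=design,
                                        dispersion=trended, prior_df=10,
                                        trend=True)
    return {'common': common, 'trended': trended, 'tagwise': tagwise}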