edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,987 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Essential limma functions ported for edgePython.
4
+
5
+ Port of limma's squeezeVar, contrastAsCoef, nonEstimable, is.fullrank,
6
+ chooseLowessSpan, and related utility functions.
7
+ """
8
+
9
+ import numpy as np
10
+ from scipy import stats, interpolate
11
+ import warnings
12
+
13
+
14
def squeeze_var(var, df, covariate=None, span=None, robust=False, winsor_tail_p=(0.05, 0.1), legacy=None):
    """Empirical Bayes moderation of genewise variances.

    Port of limma's squeezeVar(). Dispatches to one of three fitters:
    the classic fitFDist (optionally robust / trended) when `legacy` is
    True, or fitFDistUnequalDF1 when `legacy` is False.

    Parameters
    ----------
    var : array-like
        Genewise variances.
    df : array-like
        Residual degrees of freedom.
    covariate : array-like, optional
        Covariate for trended prior.
    span : float, optional
        Loess span. If provided, forces legacy=False.
    robust : bool
        Use robust estimation.
    winsor_tail_p : tuple
        Tail proportions for Winsorization when robust=True (legacy only).
    legacy : bool or None
        If True, use original limma algorithm (fitFDist).
        If False, use fitFDistUnequalDF1.
        If None (default), auto-detect based on whether df values are equal.

    Returns
    -------
    dict with keys: var_post, var_prior, df_prior
    """
    var = np.asarray(var, dtype=np.float64)
    n = len(var)

    if n == 0:
        raise ValueError("var is empty")
    # Too few genes to estimate a prior: return the data unshrunk.
    if n < 3:
        return {'var_post': var.copy(), 'var_prior': var.copy(), 'df_prior': 0.0}

    df = np.atleast_1d(np.asarray(df, dtype=np.float64))
    if len(df) == 1:
        df = np.full(n, df[0])

    # When df==0, guard against missing or infinite values in var
    var = var.copy()
    var[df == 0] = 0

    # Auto-detect legacy mode: equal positive df values -> legacy fitFDist
    if span is not None:
        legacy = False
    if legacy is None:
        dfp = df[df > 0]
        if len(dfp) > 0:
            legacy = (np.min(dfp) == np.max(dfp))
        else:
            legacy = True

    if legacy:
        # Original limma algorithm (fitFDist / fitFDistRobustly)
        ok = np.isfinite(var) & np.isfinite(df) & (df > 0)
        if not np.any(ok):
            return {'var_post': var, 'var_prior': np.nan, 'df_prior': 0.0}

        if covariate is not None:
            covariate = np.asarray(covariate, dtype=np.float64)

        if robust:
            # A constant covariate carries no trend information; drop it.
            cov_arg = covariate
            if cov_arg is not None and len(np.unique(cov_arg[ok])) < 2:
                cov_arg = None
            fit = _fit_f_dist_robustly(var, df, covariate=cov_arg,
                                       winsor_tail_p=winsor_tail_p)
            var_prior = fit['scale']
            df_prior = fit['df2_shrunk']
            var_post = _posterior_var(var, df, var_prior, df_prior)
            return {'var_post': var_post, 'var_prior': var_prior, 'df_prior': df_prior}

        # Estimate prior (non-robust)
        if covariate is None or len(np.unique(covariate[ok])) < 2:
            # No trend
            result = _fit_f_dist(var[ok], df[ok])
            df_prior = result['df2']
            var_prior = result['s2']
            var_post = _posterior_var(var, df, var_prior, df_prior)
            return {'var_post': var_post, 'var_prior': var_prior, 'df_prior': df_prior}
        else:
            # Trended prior: fit on the ok subset, then fill the excluded
            # genes by linear interpolation over the covariate.
            result = _fit_f_dist_trend(var[ok], df[ok], covariate[ok])
            var_prior_full = np.full(n, np.nan)
            var_prior_full[ok] = result['var_prior']
            if not np.all(ok):
                from scipy.interpolate import interp1d
                f = interp1d(covariate[ok], result['var_prior'], kind='linear',
                             bounds_error=False, fill_value='extrapolate')
                var_prior_full[~ok] = f(covariate[~ok])
            df_prior = result['df_prior']
            var_post = _posterior_var(var, df, var_prior_full, df_prior)
            return {'var_post': var_post, 'var_prior': var_prior_full, 'df_prior': df_prior}
    else:
        # New method: fitFDistUnequalDF1
        fit = _fit_f_dist_unequal_df1(var, df, covariate=covariate, span=span, robust=robust)
        # Robust fits return per-gene df2_shrunk; fall back to scalar df2.
        df_prior = fit.get('df2_shrunk')
        if df_prior is None:
            df_prior = fit['df2']
        scale = fit['scale']
        var_post = _posterior_var(var, df, scale, df_prior)
        return {'var_post': var_post, 'var_prior': scale, 'df_prior': df_prior}
118
+
119
+
120
+ def _posterior_var(var, df, var_prior, df_prior):
121
+ """Compute posterior variance: (df*var + df_prior*var_prior) / (df + df_prior)."""
122
+ var = np.asarray(var, dtype=np.float64)
123
+ df = np.atleast_1d(np.asarray(df, dtype=np.float64))
124
+ var_prior = np.atleast_1d(np.asarray(var_prior, dtype=np.float64))
125
+ if len(df) == 1:
126
+ df = np.full(len(var), df[0])
127
+ df_prior_val = np.atleast_1d(np.asarray(df_prior, dtype=np.float64))
128
+ if len(df_prior_val) == 1:
129
+ df_prior_val = np.full(len(var), df_prior_val[0])
130
+ if len(var_prior) == 1:
131
+ var_prior = np.full(len(var), var_prior[0])
132
+ total_df = df + df_prior_val
133
+ # Handle infinite df_prior: var_post = var_prior when df_prior is infinite
134
+ inf_mask = np.isinf(df_prior_val)
135
+ with np.errstate(invalid='ignore', divide='ignore'):
136
+ var_post = np.where(inf_mask, var_prior,
137
+ (df * var + df_prior_val * var_prior) / np.where(total_df == 0, 1, total_df))
138
+ var_post[total_df <= 0] = var[total_df <= 0]
139
+ return var_post
140
+
141
+
142
def _fit_f_dist(x, df1):
    """Fit a scaled F-distribution to data.

    Moment matching to estimate s2 (scale) and df2 (prior df).
    Faithful port of limma's fitFDist() (no-covariate case).

    Parameters
    ----------
    x : array-like
        Observed variances (one per gene).
    df1 : array-like or scalar
        First degrees of freedom of the F-distribution.

    Returns
    -------
    dict with keys 's2' (scale, floored at 1e-15) and 'df2' (second df;
    np.inf when the excess log-variance dispersion is non-positive).
    """
    x = np.asarray(x, dtype=np.float64)
    n = len(x)
    df1 = np.atleast_1d(np.asarray(df1, dtype=np.float64))

    # Degenerate sample sizes: nothing to moment-match.
    if n == 0:
        return {'s2': np.nan, 'df2': np.nan}
    if n == 1:
        return {'s2': float(x[0]), 'df2': 0.0}

    # Filter ok values: R uses df1 > 1e-15 and x > -1e-15
    ok_df1 = np.isfinite(df1) & (df1 > 1e-15)
    if len(df1) == 1:
        if not ok_df1[0]:
            return {'s2': np.nan, 'df2': np.nan}
        ok = np.full(n, True)
    else:
        ok = ok_df1
    ok = ok & np.isfinite(x) & (x > -1e-15)

    nok = int(np.sum(ok))
    if nok <= 1:
        if nok == 1:
            return {'s2': float(x[ok][0]), 'df2': 0.0}
        return {'s2': np.nan, 'df2': np.nan}

    x_ok = x[ok].copy()
    df1_ok = df1[ok] if len(df1) > 1 else df1

    # Clamp x: match R's pmax(x, 0), handle zeros, pmax(x, 1e-5 * median)
    x_ok = np.maximum(x_ok, 0.0)
    m = np.median(x_ok)
    if m == 0:
        m = 1.0
    x_ok = np.maximum(x_ok, 1e-5 * m)

    # Compute e = log(x) + logmdigamma(df1/2), matching R exactly.
    # E[e] = log(s2) - logmdigamma(df2/2) under the scaled-F model.
    z = np.log(x_ok)
    e = z + logmdigamma(df1_ok / 2)
    emean = np.mean(e)
    evar = np.sum((e - emean) ** 2) / (nok - 1)  # R uses /(nok - 1L)

    # Subtract trigamma(df1/2) contribution (sampling variance of log x)
    evar = evar - np.mean(_trigamma_safe(df1_ok / 2))

    # Positive excess dispersion => finite df2 via trigamma inversion.
    if evar > 0:
        df2 = 2.0 * _trigamma_inverse(evar)
        df2 = max(df2, 1e-6)
        if df2 > 1e15:
            df2 = np.inf
        s2 = float(np.exp(emean - logmdigamma(df2 / 2)))
    else:
        df2 = np.inf
        s2 = float(np.mean(x_ok))  # R: mean(x) for no-covariate case

    s2 = max(s2, 1e-15)

    return {'s2': s2, 'df2': df2}
205
+
206
+
207
def _fit_f_dist_trend(var, df, covariate):
    """Fit an F-distribution with trended prior variance.

    Faithful port of R's fitFDist() with covariate parameter.
    Uses natural spline basis + OLS regression, matching R's approach of
    fitting e = log(x) + logmdigamma(df1/2) on ns(covariate, df=splinedf).

    Parameters
    ----------
    var : array-like
        Observed variances.
    df : array-like or scalar
        First degrees of freedom.
    covariate : array-like
        Covariate driving the trend (same length as var).

    Returns
    -------
    dict with 'var_prior' (per-gene trended prior, ndarray) and
    'df_prior' (scalar second df, possibly np.inf).
    """
    n = len(var)
    var = np.asarray(var, dtype=np.float64).copy()
    df_arr = np.atleast_1d(np.asarray(df, dtype=np.float64))
    if len(df_arr) == 1:
        df_arr = np.full(n, df_arr[0])
    covariate = np.asarray(covariate, dtype=np.float64)

    # Handle infinite covariate values (matching R): map +/-inf just
    # outside the finite range so the spline basis stays well-defined.
    isfin = np.isfinite(covariate)
    if not np.all(isfin):
        if np.any(isfin):
            r = (np.min(covariate[isfin]), np.max(covariate[isfin]))
            covariate = covariate.copy()
            covariate[covariate == -np.inf] = r[0] - 1
            covariate[covariate == np.inf] = r[1] + 1
        else:
            covariate = np.sign(covariate)

    # Adaptive spline df (matching R: 1 + (nok>=3) + (nok>=6) + (nok>=30))
    splinedf = 1 + int(n >= 3) + int(n >= 6) + int(n >= 30)
    splinedf = min(splinedf, len(np.unique(covariate)))

    if splinedf < 2:
        # Fall back to scalar (no-covariate) fit, matching R's Recall()
        result = _fit_f_dist(var, df_arr)
        return {'var_prior': np.full(n, result['s2']), 'df_prior': result['df2']}

    # Clamp var: match R's pmax(x, 0), handle zeros, pmax(x, 1e-5 * median)
    var = np.maximum(var, 0.0)
    m = np.median(var)
    if m == 0:
        m = 1.0
    var = np.maximum(var, 1e-5 * m)

    # Compute e = log(x) + logmdigamma(df1/2), matching R exactly
    z = np.log(var)
    e = z + logmdigamma(df_arr / 2)

    # Fit natural spline basis + OLS (matching R's lm.fit(ns(...), e))
    basis = _natural_spline_basis(covariate, df=splinedf)
    coeffs, _, _, _ = np.linalg.lstsq(basis, e, rcond=None)
    emean = basis @ coeffs

    # Residual variance: R uses mean(fit$effects[-(1:rank)]^2) = RSS/(n-rank)
    resid = e - emean
    actual_rank = np.linalg.matrix_rank(basis)
    if n > actual_rank:
        evar = np.sum(resid ** 2) / (n - actual_rank)
    else:
        evar = 0.0

    # Subtract trigamma(df1/2) contribution (sampling variance of log x)
    evar = evar - np.mean(_trigamma_safe(df_arr / 2))

    # Estimate df2 and the trended scale s20
    if evar > 0:
        df2 = 2.0 * _trigamma_inverse(evar)
        if df2 > 1e15:
            df2 = np.inf
        s20 = np.exp(emean - logmdigamma(df2 / 2))
    else:
        df2 = np.inf
        s20 = np.exp(emean)

    return {'var_prior': s20, 'df_prior': df2}
279
+
280
+
281
+ def _natural_spline_basis(x, df):
282
+ """Create natural cubic spline basis matrix matching R's ns(x, df=df, intercept=TRUE).
283
+
284
+ Uses the truncated power basis representation from Hastie, Tibshirani &
285
+ Friedman (Elements of Statistical Learning, eq 5.4-5.5).
286
+
287
+ Parameters
288
+ ----------
289
+ x : array-like
290
+ Covariate values.
291
+ df : int
292
+ Number of basis functions (columns in the returned matrix).
293
+
294
+ Returns
295
+ -------
296
+ ndarray of shape (n, df)
297
+ """
298
+ x = np.asarray(x, dtype=np.float64)
299
+ n = len(x)
300
+
301
+ # Number of internal knots: R uses df - 1 - intercept = df - 2
302
+ n_internal = df - 2
303
+
304
+ # Boundary knots at range
305
+ a = np.min(x)
306
+ b = np.max(x)
307
+
308
+ if n_internal <= 0 or a == b:
309
+ # Linear basis only: [1, x]
310
+ basis = np.column_stack([np.ones(n), x])
311
+ return basis[:, :df]
312
+
313
+ # Internal knots at quantiles (matching R's quantile placement)
314
+ probs = np.linspace(0, 1, n_internal + 2)[1:-1]
315
+ internal_knots = np.quantile(x, probs)
316
+
317
+ # All knots sorted: [boundary_left, internal_1, ..., internal_K, boundary_right]
318
+ all_knots = np.sort(np.concatenate([[a], internal_knots, [b]]))
319
+ K = len(all_knots) # Total knots = df
320
+
321
+ # Build basis: [1, x, d_1-d_{K-1}, d_2-d_{K-1}, ..., d_{K-2}-d_{K-1}]
322
+ # where d_k(x) = [(x - xi_k)_+^3 - (x - xi_K)_+^3] / (xi_K - xi_k)
323
+ basis = np.zeros((n, df))
324
+ basis[:, 0] = 1.0
325
+ basis[:, 1] = x
326
+
327
+ if K > 2:
328
+ xi_K = all_knots[-1] # rightmost boundary knot
329
+ xi_Km1 = all_knots[-2] # second-to-last knot (K-1 in 1-indexed)
330
+
331
+ def d_func(xi_j):
332
+ return (np.maximum(x - xi_j, 0) ** 3 - np.maximum(x - xi_K, 0) ** 3) / (xi_K - xi_j)
333
+
334
+ d_Km1 = d_func(xi_Km1)
335
+
336
+ for j in range(K - 2):
337
+ d_j = d_func(all_knots[j])
338
+ basis[:, 2 + j] = d_j - d_Km1
339
+
340
+ return basis
341
+
342
+
343
def _fit_f_dist_robustly(x, df1, covariate=None, winsor_tail_p=(0.05, 0.1)):
    """Fit F-distribution with robust outlier detection.

    Port of limma's fitFDistRobustly().
    Returns dict with 'scale', 'df2', and 'df2_shrunk' (per-gene).

    Parameters
    ----------
    x : array-like
        Genewise variances.
    df1 : array-like or scalar
        Residual degrees of freedom.
    covariate : array-like, optional
        Covariate for trended prior.
    winsor_tail_p : tuple
        Tail proportions for Winsorization (lower, upper).

    Returns
    -------
    dict with keys: scale, df2, df2_shrunk
    """
    x = np.asarray(x, dtype=np.float64).copy()
    n = len(x)

    if n < 2:
        return {'scale': np.nan, 'df2': np.nan, 'df2_shrunk': np.full(max(n, 1), np.nan)}

    df1 = np.atleast_1d(np.asarray(df1, dtype=np.float64)).copy()
    if len(df1) == 1:
        df1 = np.full(n, df1[0])

    # With only two observations there is nothing to robustify:
    # fall back to the non-robust fitters.
    if n == 2:
        if covariate is None:
            result = _fit_f_dist(x, df1)
            return {'scale': result['s2'], 'df2': result['df2'],
                    'df2_shrunk': np.full(n, result['df2'])}
        else:
            result = _fit_f_dist_trend(x, df1, covariate)
            return {'scale': result['var_prior'], 'df2': result['df_prior'],
                    'df2_shrunk': np.full(n, result['df_prior'])}

    # Filter ok values
    ok = ~np.isnan(x) & np.isfinite(df1) & (df1 > 1e-6)

    if not np.all(ok):
        # Recursive call on ok subset; excluded genes get the global df2
        # and (for trended fits) an interpolated scale.
        df2_shrunk_full = np.empty(n)
        x_ok = x[ok]
        df1_ok = df1[ok]
        cov_ok = covariate[ok] if covariate is not None else None

        fit = _fit_f_dist_robustly(x_ok, df1_ok, covariate=cov_ok,
                                   winsor_tail_p=winsor_tail_p)

        df2_shrunk_full[ok] = fit['df2_shrunk']
        df2_shrunk_full[~ok] = fit['df2']

        if covariate is None:
            scale = fit['scale']
        else:
            # Interpolate log(scale) linearly over the covariate for the
            # excluded genes.
            scale_ok = np.atleast_1d(fit['scale'])
            scale = np.empty(n)
            scale[ok] = scale_ok
            from scipy.interpolate import interp1d
            f_interp = interp1d(covariate[ok], np.log(scale_ok), kind='linear',
                                bounds_error=False, fill_value='extrapolate')
            scale[~ok] = np.exp(f_interp(covariate[~ok]))

        return {'scale': scale, 'df2': fit['df2'], 'df2_shrunk': df2_shrunk_full}

    # All values ok from here
    m = np.median(x)
    if m <= 0:
        return {'scale': np.nan, 'df2': np.nan, 'df2_shrunk': np.full(n, np.nan)}

    # Floor tiny variances so log(x) below stays finite.
    small = x < m * 1e-12
    if np.any(small):
        x[small] = m * 1e-12

    # Non-robust initial fit
    if covariate is None:
        non_robust = _fit_f_dist(x, df1)
        nr_s20 = non_robust['s2']
        nr_df2 = non_robust['df2']
    else:
        non_robust = _fit_f_dist_trend(x, df1, covariate)
        nr_s20 = non_robust['var_prior']
        nr_df2 = non_robust['df_prior']

    # NaN (but not inf) df2 means the initial fit failed entirely.
    if not np.isfinite(nr_df2) and nr_df2 != np.inf:
        return {'scale': nr_s20, 'df2': nr_df2, 'df2_shrunk': np.full(n, 0.0)}

    # Winsor tail probabilities
    wtp = [float(winsor_tail_p[0]), float(winsor_tail_p[1])]
    prob = [wtp[0], 1.0 - wtp[1]]

    # Check if winsor_tail_p is too small for this sample size
    if all(p < 1.0 / n for p in wtp):
        return {'scale': nr_s20, 'df2': nr_df2, 'df2_shrunk': np.full(n, nr_df2)}

    # Unify df1 if vector with different values: transform observations
    # with smaller df1 onto the df1max scale by matching F tail areas.
    if np.min(df1) < np.max(df1) - 1e-14:
        df1max = np.max(df1)
        i = df1 < (df1max - 1e-14)
        if np.any(i):
            if covariate is None:
                s = nr_s20
            else:
                s = nr_s20[i]
            f_vals = x[i] / s
            d2 = nr_df2
            pupper = stats.f.logsf(f_vals, df1[i], d2)
            plower = stats.f.logcdf(f_vals, df1[i], d2)
            up = pupper < plower
            f_new = f_vals.copy()
            # Work on whichever tail is smaller to preserve precision.
            if np.any(up):
                f_new[up] = stats.f.isf(np.exp(np.clip(pupper[up], -500, 0)), df1max, d2)
            if np.any(~up):
                f_new[~up] = stats.f.ppf(np.exp(np.clip(plower[~up], -500, 0)), df1max, d2)
            x[i] = f_new * s
            df1_val = df1max
        else:
            df1_val = df1[0]
    else:
        df1_val = df1[0]

    z = np.log(x)

    if covariate is None:
        # Trimmed mean matching R's mean(z, trim=winsor.tail.p[2])
        ztrend = float(stats.trim_mean(z, proportiontocut=wtp[1]))
        zresid = z - ztrend
    else:
        from .weighted_lowess import weighted_lowess as _wlowess
        lo = _wlowess(covariate, z, span=0.4, iterations=4, npts=200)
        ztrend = lo['fitted']
        zresid = z - ztrend

    # Winsorize z-residuals
    zrq = np.quantile(zresid, prob)
    zwins = np.clip(zresid, zrq[0], zrq[1])
    zwmean = float(np.mean(zwins))
    zwvar = float(np.mean((zwins - zwmean) ** 2) * n / (n - 1))

    # Gauss-Legendre quadrature on [0,1] (128 nodes)
    gl_nodes_raw, gl_weights_raw = np.polynomial.legendre.leggauss(128)
    gl_nodes_01 = (gl_nodes_raw + 1.0) / 2.0
    gl_weights_01 = gl_weights_raw / 2.0

    # Map df2 in (0, inf) to (0, 1) and back for bounded root-finding.
    def linkfun(v):
        return v / (1.0 + v)

    def linkinv(v):
        return v / (1.0 - v)

    def winsorized_moments(d1, d2, wtp_arg):
        """Compute Winsorized mean and variance of log(F(d1, d2))."""
        fq = stats.f.ppf([wtp_arg[0], 1.0 - wtp_arg[1]], d1, d2)
        zq = np.log(fq)
        q = linkfun(fq)
        nodes = q[0] + (q[1] - q[0]) * gl_nodes_01
        fnodes = linkinv(nodes)
        znodes = np.log(fnodes)
        # Density transformed through the link; /(1-nodes)^2 is the Jacobian.
        f_dens = stats.f.pdf(fnodes, d1, d2) / (1.0 - nodes) ** 2
        q21 = q[1] - q[0]
        wtp_arr = np.array(wtp_arg)
        m_val = q21 * np.sum(gl_weights_01 * f_dens * znodes) + np.sum(zq * wtp_arr)
        v_val = (q21 * np.sum(gl_weights_01 * f_dens * (znodes - m_val) ** 2)
                 + np.sum((zq - m_val) ** 2 * wtp_arr))
        return {'mean': m_val, 'var': v_val}

    # Check df2=Inf case
    mom_inf = winsorized_moments(df1_val, np.inf, wtp)

    if mom_inf['var'] <= 0 or zwvar <= 0:
        return {'scale': nr_s20, 'df2': nr_df2, 'df2_shrunk': np.full(n, nr_df2)}

    funval_inf = np.log(zwvar / mom_inf['var'])

    if funval_inf <= 0:
        # df2 = Inf: observed variance <= theoretical at df2=Inf
        df2 = np.inf
        ztrendcorrected = ztrend + zwmean - mom_inf['mean']
        s20 = np.exp(ztrendcorrected)
        Fstat = np.exp(z - ztrendcorrected)
        # With df2=Inf the F statistic reduces to a scaled chi-square.
        TailP = stats.chi2.sf(Fstat * df1_val, df1_val)
        r = stats.rankdata(Fstat)
        EmpiricalTailProb = (n - r + 0.5) / n
        ProbNotOutlier = np.minimum(TailP / EmpiricalTailProb, 1.0)
        df_pooled = n * df1_val
        df2_shrunk = np.full(n, float(df2))
        O = ProbNotOutlier < 1
        if np.any(O):
            df2_shrunk[O] = ProbNotOutlier[O] * df_pooled
            # Monotonize: smaller tail p must not get larger df2.
            o = np.argsort(TailP)
            df2_shrunk[o] = np.maximum.accumulate(df2_shrunk[o])
        return {'scale': s20, 'df2': df2, 'df2_shrunk': df2_shrunk}

    # Check if non-robust already gives Inf
    if nr_df2 == np.inf:
        return {'scale': nr_s20, 'df2': nr_df2, 'df2_shrunk': np.full(n, nr_df2)}

    # Root-finding for df2 on the transformed (0,1) scale
    rbx = linkfun(nr_df2)

    def fun_root(par):
        d2 = linkinv(par)
        mom = winsorized_moments(df1_val, d2, wtp)
        if mom['var'] <= 0:
            return funval_inf
        return np.log(zwvar / mom['var'])

    funval_low = fun_root(rbx)

    if funval_low >= 0:
        df2 = nr_df2
    else:
        from scipy.optimize import brentq
        root = brentq(fun_root, rbx, 1.0 - 1e-10, xtol=1e-8)
        df2 = linkinv(root)

    mom = winsorized_moments(df1_val, df2, wtp)
    ztrendcorrected = ztrend + zwmean - mom['mean']
    s20 = np.exp(ztrendcorrected)
    Fstat = np.exp(z - ztrendcorrected)

    LogTailP = stats.f.logsf(Fstat, df1_val, df2)
    TailP = np.exp(LogTailP)
    r = stats.rankdata(Fstat)
    LogEmpiricalTailProb = np.log(n - r + 0.5) - np.log(n)
    LogProbNotOutlier = np.minimum(LogTailP - LogEmpiricalTailProb, 0.0)
    ProbNotOutlier = np.exp(LogProbNotOutlier)
    ProbOutlier = -np.expm1(LogProbNotOutlier)

    if np.any(LogProbNotOutlier < 0):
        minLogTailP = np.min(LogTailP)
        if minLogTailP == -np.inf:
            df2_outlier = 0.0
            df2_shrunk = ProbNotOutlier * df2
        else:
            # Choose df2_outlier so the most extreme gene has tail p = 0.5,
            # then refine once.
            df2_outlier = np.log(0.5) / minLogTailP * df2
            NewLogTailP = stats.f.logsf(np.max(Fstat), df1_val, df2_outlier)
            df2_outlier = np.log(0.5) / NewLogTailP * df2_outlier
            df2_shrunk = ProbNotOutlier * df2 + ProbOutlier * df2_outlier

        # Monotonize via cummax on ordered tail p-values
        o = np.argsort(LogTailP)
        df2_ordered = df2_shrunk[o].copy()
        m_arr = np.cumsum(df2_ordered) / np.arange(1, n + 1, dtype=np.float64)
        imin = int(np.argmin(m_arr))
        df2_ordered[:imin + 1] = m_arr[imin]
        df2_shrunk_final = np.empty(n)
        df2_shrunk_final[o] = np.maximum.accumulate(df2_ordered)
        df2_shrunk = df2_shrunk_final
    else:
        df2_shrunk = np.full(n, df2)

    return {'scale': s20, 'df2': df2, 'df2_shrunk': df2_shrunk}
601
+
602
+
603
+ def _digamma_safe(x):
604
+ """Safe digamma that handles arrays."""
605
+ from scipy.special import digamma
606
+ return digamma(np.asarray(x, dtype=np.float64))
607
+
608
+
609
+ def _trigamma_safe(x):
610
+ """Safe trigamma (polygamma of order 1)."""
611
+ from scipy.special import polygamma
612
+ return polygamma(1, np.asarray(x, dtype=np.float64))
613
+
614
+
615
+ def _trigamma_inverse(x):
616
+ """Inverse of the trigamma function.
617
+
618
+ Port of limma's trigammaInverse().
619
+ Uses Newton's method.
620
+ """
621
+ from scipy.special import polygamma
622
+
623
+ x = float(x)
624
+ if x > 1e7:
625
+ return 1.0 / x
626
+ if x < 1e-6:
627
+ return 1.0 / x
628
+
629
+ # Starting value
630
+ if x > 0.5:
631
+ y = 1.0 / x
632
+ else:
633
+ y = 1.0 / (x * (1 + x))
634
+
635
+ # Newton iterations
636
+ for _ in range(50):
637
+ tri = float(polygamma(1, y))
638
+ dif = tri * (1 - tri / x) / float(polygamma(2, y))
639
+ y = y + dif
640
+ if y <= 0:
641
+ y = x # reset
642
+ if abs(dif / y) < 1e-10:
643
+ break
644
+
645
+ return y
646
+
647
+
648
def logmdigamma(x):
    """Compute log(x) - digamma(x) avoiding subtractive cancellation.

    Port of statmod's logmdigamma(). Values >= 5 use a Bernoulli-series
    asymptotic expansion directly; smaller positive values are shifted up
    by 5 via the digamma recurrence before applying the same expansion.
    Non-positive inputs yield NaN. Scalars in, scalar out.
    """
    arr = np.asarray(x, dtype=np.float64)
    was_scalar = arr.ndim == 0
    arr = np.atleast_1d(arr)
    out = np.full_like(arr, np.nan)

    positive = arr > 0
    if np.any(positive):
        pos = arr[positive]
        vals = np.empty_like(pos)

        def _asymptotic(z):
            # log(z) - digamma(z) for large z via the Bernoulli series.
            u = 1.0 / (z * z)
            series = u * (-1.0/12 + u * (1.0/120 + u * (-1.0/252 + u * (
                1.0/240 + u * (-1.0/132 + u * (691.0/32760 + u * (
                    -1.0/12 + 3617.0/8160 * u)))))))
            return 1.0 / (2.0 * z) - series

        big = pos >= 5
        if np.any(big):
            vals[big] = _asymptotic(pos[big])
        if np.any(~big):
            z = pos[~big]
            shifted = z + 5.0
            # digamma recurrence: digamma(z) = digamma(z+5) - sum_{k=0}^{4} 1/(z+k)
            vals[~big] = (np.log(z / shifted) + _asymptotic(shifted)
                          + 1.0/z + 1.0/(z+1) + 1.0/(z+2) + 1.0/(z+3) + 1.0/(z+4))

        out[positive] = vals

    return float(out[0]) if was_scalar else out
691
+
692
+
693
+ def _p_adjust_bh(p):
694
+ """Benjamini-Hochberg p-value adjustment.
695
+
696
+ Port of R's p.adjust(method="BH").
697
+ """
698
+ p = np.asarray(p, dtype=np.float64)
699
+ n = len(p)
700
+ o = np.argsort(p)[::-1]
701
+ ro = np.argsort(o)
702
+ i_vals = np.arange(n, 0, -1, dtype=np.float64)
703
+ adjusted = np.minimum.accumulate(n / i_vals * p[o])
704
+ adjusted = np.minimum(adjusted, 1.0)
705
+ return adjusted[ro]
706
+
707
+
708
def _fit_f_dist_unequal_df1(x, df1, covariate=None, span=None, robust=True, prior_weights=None):
    """Fit a scaled F-distribution with unequal df1 values.

    Port of limma's fitFDistUnequalDF1().
    Uses MLE to estimate scale and df2 (prior df).

    Parameters
    ----------
    x : array-like
        Genewise variances (s2 values).
    df1 : array-like
        Residual degrees of freedom per gene.
    covariate : array-like, optional
        Covariate for trended prior (e.g. AveLogCPM).
    span : float, optional
        Loess span.
    robust : bool
        Robust estimation with outlier handling.
    prior_weights : array-like, optional
        Prior weights for each observation.

    Returns
    -------
    dict with keys: scale, df2, and optionally df2_shrunk, df2_outlier.
    """
    from scipy.optimize import minimize_scalar
    from scipy.special import gammaln

    x = np.asarray(x, dtype=np.float64).copy()
    df1 = np.atleast_1d(np.asarray(df1, dtype=np.float64)).copy()
    n = len(x)

    if len(df1) == 1:
        df1 = np.full(n, df1[0])

    if prior_weights is not None:
        prior_weights = np.asarray(prior_weights, dtype=np.float64).copy()

    # Handle NA values: zero them out and give them zero weight.
    na_mask = np.isnan(x)
    if np.any(na_mask):
        if prior_weights is None:
            prior_weights = (~na_mask).astype(np.float64)
        else:
            prior_weights[na_mask] = 0
        x[na_mask] = 0

    # Handle small df1: excluded via zero weight, df1 reset to 1 so the
    # likelihood terms below stay well-defined.
    small_df1 = df1 < 0.01
    if np.any(small_df1):
        if prior_weights is None:
            prior_weights = (~small_df1).astype(np.float64)
        else:
            prior_weights[small_df1] = 0
        df1[small_df1] = 1

    has_pw = prior_weights is not None

    # Identify informative observations (positive value, positive weight)
    informative = x > 0
    if has_pw:
        informative = informative & (prior_weights > 0)
    n_informative = int(np.sum(informative))

    if n_informative < 2:
        return {'scale': np.nan, 'df2': np.nan}

    # With exactly 2 informative points, fall back to the simplest fit.
    if n_informative == 2:
        covariate = None
        robust = False
        prior_weights = None
        has_pw = False

    m = np.median(x[informative])
    xpos = np.maximum(x, 1e-12 * m)
    z = np.log(xpos)
    d1 = df1 / 2.0
    e = z + logmdigamma(d1)
    # Precision weights: inverse sampling variance of log(x) given df1.
    w = 1.0 / _trigamma_safe(d1)
    if len(w) < n:
        w = np.full(n, w[0])
    if has_pw:
        w = w * prior_weights

    if covariate is None:
        emean = np.sum(w * e) / np.sum(w)
    else:
        covariate = np.asarray(covariate, dtype=np.float64)
        if span is None:
            span = choose_lowess_span(n, small_n=500)
        # Normalize weights: w / quantile(w, 0.75), clipped to [1e-8, 100]
        w_q75 = np.quantile(w, 0.75)
        loess_w = w / w_q75 if w_q75 > 0 else w.copy()
        loess_w = np.clip(loess_w, 1e-08, 100)

        from .weighted_lowess import weighted_lowess as _wlowess
        wl_result = _wlowess(covariate, e, weights=loess_w, span=span,
                             iterations=1, npts=200)
        emean = wl_result['fitted']

    d1x = d1 * xpos

    # MLE optimization for d2 = par/(1-par) over par in [0.5, 0.9998]
    def minus_twice_loglik(par):
        d2 = par / (1 - par)
        lmd2 = logmdigamma(d2)
        # Scale parametrized so that the trend emean is respected.
        d2s20 = d2 * np.exp(emean - lmd2)
        ll = (-(d1 + d2) * np.log1p(d1x / d2s20)
              - d1 * np.log(d2s20)
              + gammaln(d1 + d2) - gammaln(d2))
        if has_pw:
            return -2 * np.sum(prior_weights * ll)
        return -2 * np.sum(ll)

    opt = minimize_scalar(minus_twice_loglik, bounds=(0.5, 0.9998), method='bounded')
    d2 = opt.x / (1 - opt.x)
    s20 = np.exp(emean - logmdigamma(d2))

    if not robust:
        return {'scale': s20, 'df2': 2 * d2}

    # Robust estimation: detect and down-weight outliers
    df2 = 2 * d2
    f_stat = x / s20

    right_p = stats.f.sf(f_stat, df1, df2)
    left_p = 1 - right_p

    # Better computation for very small left p-values
    small_left = left_p < 0.001
    if np.any(small_left):
        df1_sub = df1[small_left] if len(df1) > 1 else df1
        left_p[small_left] = stats.f.cdf(f_stat[small_left], df1_sub, df2)

    two_sided_p = 2 * np.minimum(left_p, right_p)

    fdr = _p_adjust_bh(two_sided_p)
    # Only clearly significant genes (FDR <= 0.3) are down-weighted.
    fdr[fdr > 0.3] = 1

    if np.min(fdr) == 1:
        return {'scale': s20, 'df2': df2}

    # Re-fit with FDR as prior weights
    outpw = _fit_f_dist_unequal_df1(x, df1, covariate=covariate, span=span,
                                    robust=False, prior_weights=fdr)
    s20 = outpw['scale']
    df2 = outpw['df2']

    r = stats.rankdata(f_stat)
    uniform_p = (n - r + 0.5) / n
    prob_not_outlier = np.minimum(right_p / uniform_p, 1)

    if np.min(prob_not_outlier) == 1:
        return outpw

    i_min = int(np.argmin(right_p))
    min_right_p = right_p[i_min]

    if min_right_p == 0:
        df2_outlier = 0.0
        df2_shrunk = prob_not_outlier * df2
    else:
        # Choose df2_outlier so the most extreme gene has tail p = 0.5,
        # then refine once.
        df2_outlier = np.log(0.5) / np.log(min_right_p) * df2
        df1_i = df1[i_min] if len(df1) > 1 else df1[0]
        new_log_right_p = stats.f.logsf(f_stat[i_min], df1_i, df2_outlier)
        df2_outlier = np.log(0.5) / new_log_right_p * df2_outlier
        df2_shrunk = prob_not_outlier * df2 + (1 - prob_not_outlier) * df2_outlier

    # Monotonize df2_shrunk: smaller tail p must not get larger df2
    o = np.argsort(right_p)
    df2_ordered = df2_shrunk[o].copy()
    m_arr = np.cumsum(df2_ordered) / np.arange(1, n + 1, dtype=np.float64)
    imin = int(np.argmin(m_arr))
    df2_ordered[:imin + 1] = m_arr[imin]
    df2_shrunk_final = np.empty(n)
    df2_shrunk_final[o] = np.maximum.accumulate(df2_ordered)

    return {'scale': s20, 'df2': df2, 'df2_outlier': df2_outlier, 'df2_shrunk': df2_shrunk_final}
886
+
887
+
888
def non_estimable(x):
    """Return indices of non-estimable coefficients in a design matrix.

    Port of limma's nonEstimable(). Columns whose QR diagonal falls below
    a rank tolerance are flagged; returns None when all are estimable.

    NOTE(review): numpy's qr does not pivot columns, so which column of a
    dependent pair gets flagged follows column order — confirm against the
    R behavior if exact index parity matters.
    """
    mat = np.asarray(x, dtype=np.float64)
    ncols = mat.shape[1]
    if ncols == 0:
        return None
    diag = np.abs(np.diag(np.linalg.qr(mat)[1]))
    if len(diag) == 0:
        return np.arange(ncols)
    threshold = np.max(diag) * max(mat.shape) * np.finfo(np.float64).eps
    flagged = np.where(diag < threshold)[0]
    # Return coefficient indices, or None when the design is full rank
    return flagged if len(flagged) else None
907
+
908
+
909
def is_fullrank(x):
    """Return True when the matrix has full column rank.

    Port of limma's is.fullrank(). A 1-d input is treated as a single
    column.
    """
    mat = np.asarray(x, dtype=np.float64)
    if mat.ndim == 1:
        mat = mat.reshape(-1, 1)
    return np.linalg.matrix_rank(mat) == mat.shape[1]
918
+
919
+
920
def choose_lowess_span(n, small_n=25, min_span=0.2, power=1/3):
    """Pick a lowess span that shrinks slowly as n grows.

    Port of limma's chooseLowessSpan():
    span = min(min_span + (1 - min_span) * (small_n / n) ** power, 1).
    """
    shrink = (small_n / n) ** power
    return min(min_span + (1 - min_span) * shrink, 1.0)
927
+
928
+
929
def contrast_as_coef(design, contrast, first=False):
    """Reform a design matrix so that a contrast becomes a coefficient.

    Port of limma's contrastAsCoef().

    Parameters
    ----------
    design : ndarray
        Design matrix (n x p).
    contrast : array-like
        Contrast vector of length p.
    first : bool
        If True, put the contrast as the first column; otherwise last.

    Returns
    -------
    dict with 'design' (reformed design) and 'coef' (column index of the
    contrast coefficient).

    Raises
    ------
    ValueError
        If len(contrast) != p, or the contrast is all zeros (the rotation
        would otherwise divide by zero and return a NaN/inf design).
    """
    design = np.asarray(design, dtype=np.float64)
    contrast = np.asarray(contrast, dtype=np.float64).ravel()
    p = design.shape[1]

    if len(contrast) != p:
        raise ValueError("Length of contrast must equal number of columns in design")

    # QR decompose the contrast: Q rotates coefficient space so the
    # contrast direction becomes the first axis; R[0, 0] = +/-||contrast||.
    contrast_mat = contrast.reshape(-1, 1)
    Q, R_mat = np.linalg.qr(contrast_mat, mode='complete')
    r_val = R_mat[0, 0]
    if r_val == 0:
        raise ValueError("contrast is all zero")

    # Rotate the design (equivalent to R's t(qr.qty(qr, t(design)))).
    design_rotated = design @ Q

    # Backsolve: scale the contrast column by 1/R so its fitted
    # coefficient directly represents the contrast effect (e.g. the logFC).
    design_rotated[:, 0] = design_rotated[:, 0] / r_val

    if first:
        new_design = design_rotated
        coef = 0
    else:
        # Move contrast column (first) to last
        cols = list(range(1, p)) + [0]
        new_design = design_rotated[:, cols]
        coef = p - 1

    return {'design': new_design, 'coef': coef}
979
+
980
+
981
def logsumexp(x, y):
    """Compute log(exp(x) + exp(y)) elementwise, avoiding overflow.

    Helper used in zscoreNBinom. Delegates to np.logaddexp, which
    implements the same max-shift trick in C and, unlike the naive
    formula m + log(exp(x-m) + exp(y-m)), returns -inf (not NaN) when
    both arguments are -inf.
    """
    return np.logaddexp(x, y)