edgepython 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
edgepython/limma_port.py
ADDED
|
@@ -0,0 +1,987 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""
|
|
3
|
+
Essential limma functions ported for edgePython.
|
|
4
|
+
|
|
5
|
+
Port of limma's squeezeVar, contrastAsCoef, nonEstimable, is.fullrank,
|
|
6
|
+
chooseLowessSpan, and related utility functions.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from scipy import stats, interpolate
|
|
11
|
+
import warnings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def squeeze_var(var, df, covariate=None, span=None, robust=False, winsor_tail_p=(0.05, 0.1), legacy=None):
    """Empirical Bayes moderation of genewise variances.

    Port of limma's squeezeVar().

    Parameters
    ----------
    var : array-like
        Genewise variances.
    df : array-like
        Residual degrees of freedom.
    covariate : array-like, optional
        Covariate for trended prior.
    span : float, optional
        Loess span. If provided, forces legacy=False.
    robust : bool
        Use robust estimation.
    winsor_tail_p : tuple
        Tail proportions for Winsorization when robust=True (legacy only).
    legacy : bool or None
        If True, use original limma algorithm (fitFDist).
        If False, use fitFDistUnequalDF1.
        If None (default), auto-detect based on whether df values are equal.

    Returns
    -------
    dict with keys: var_post, var_prior, df_prior

    Raises
    ------
    ValueError
        If `var` is empty.
    """
    var = np.asarray(var, dtype=np.float64)
    n = len(var)

    if n == 0:
        raise ValueError("var is empty")
    # Too few genes to estimate a prior: return the data unshrunk.
    if n < 3:
        return {'var_post': var.copy(), 'var_prior': var.copy(), 'df_prior': 0.0}

    # Broadcast a scalar df to one value per gene.
    df = np.atleast_1d(np.asarray(df, dtype=np.float64))
    if len(df) == 1:
        df = np.full(n, df[0])

    # When df==0, guard against missing or infinite values in var
    var = var.copy()
    var[df == 0] = 0

    # Auto-detect legacy mode: an explicit span implies the new
    # fitFDistUnequalDF1 path; otherwise legacy is chosen when all
    # positive df values are equal (the classic limma setting).
    if span is not None:
        legacy = False
    if legacy is None:
        dfp = df[df > 0]
        if len(dfp) > 0:
            legacy = (np.min(dfp) == np.max(dfp))
        else:
            legacy = True

    if legacy:
        # Original limma algorithm (fitFDist / fitFDistRobustly)
        ok = np.isfinite(var) & np.isfinite(df) & (df > 0)
        if not np.any(ok):
            # Nothing usable: no shrinkage possible.
            return {'var_post': var, 'var_prior': np.nan, 'df_prior': 0.0}

        if covariate is not None:
            covariate = np.asarray(covariate, dtype=np.float64)

        if robust:
            # A covariate with fewer than 2 distinct usable values cannot
            # support a trend, so drop it.
            cov_arg = covariate
            if cov_arg is not None and len(np.unique(cov_arg[ok])) < 2:
                cov_arg = None
            fit = _fit_f_dist_robustly(var, df, covariate=cov_arg,
                                       winsor_tail_p=winsor_tail_p)
            var_prior = fit['scale']
            df_prior = fit['df2_shrunk']  # per-gene prior df (outlier-shrunk)
            var_post = _posterior_var(var, df, var_prior, df_prior)
            return {'var_post': var_post, 'var_prior': var_prior, 'df_prior': df_prior}

        # Estimate prior (non-robust)
        if covariate is None or len(np.unique(covariate[ok])) < 2:
            # No trend
            result = _fit_f_dist(var[ok], df[ok])
            df_prior = result['df2']
            var_prior = result['s2']
            var_post = _posterior_var(var, df, var_prior, df_prior)
            return {'var_post': var_post, 'var_prior': var_prior, 'df_prior': df_prior}
        else:
            # Trended prior
            result = _fit_f_dist_trend(var[ok], df[ok], covariate[ok])
            var_prior_full = np.full(n, np.nan)
            var_prior_full[ok] = result['var_prior']
            if not np.all(ok):
                # Fill the prior for excluded genes by linear interpolation
                # (with extrapolation) over the covariate.
                from scipy.interpolate import interp1d
                f = interp1d(covariate[ok], result['var_prior'], kind='linear',
                             bounds_error=False, fill_value='extrapolate')
                var_prior_full[~ok] = f(covariate[~ok])
            df_prior = result['df_prior']
            var_post = _posterior_var(var, df, var_prior_full, df_prior)
            return {'var_post': var_post, 'var_prior': var_prior_full, 'df_prior': df_prior}
    else:
        # New method: fitFDistUnequalDF1
        fit = _fit_f_dist_unequal_df1(var, df, covariate=covariate, span=span, robust=robust)
        # Prefer the robust per-gene prior df when available.
        df_prior = fit.get('df2_shrunk')
        if df_prior is None:
            df_prior = fit['df2']
        scale = fit['scale']
        var_post = _posterior_var(var, df, scale, df_prior)
        return {'var_post': var_post, 'var_prior': scale, 'df_prior': df_prior}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _posterior_var(var, df, var_prior, df_prior):
    """Shrink genewise variances toward the prior.

    Returns the degrees-of-freedom-weighted average
    (df*var + df_prior*var_prior) / (df + df_prior), with two special
    cases: the prior variance is returned wherever df_prior is infinite,
    and the raw variance is returned wherever the combined df is
    non-positive.
    """
    var = np.asarray(var, dtype=np.float64)
    size = len(var)

    def _expand(values):
        # Broadcast a scalar (or length-1 array) to one value per gene.
        arr = np.atleast_1d(np.asarray(values, dtype=np.float64))
        return np.full(size, arr[0]) if len(arr) == 1 else arr

    df = _expand(df)
    prior_df = _expand(df_prior)
    prior_var = _expand(var_prior)

    combined_df = df + prior_df
    # Avoid a 0/0 warning; the degenerate entries are overwritten below.
    denom = np.where(combined_df == 0, 1, combined_df)
    with np.errstate(invalid='ignore', divide='ignore'):
        blended = (df * var + prior_df * prior_var) / denom
        out = np.where(np.isinf(prior_df), prior_var, blended)
    degenerate = combined_df <= 0
    out[degenerate] = var[degenerate]
    return out
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _fit_f_dist(x, df1):
    """Fit a scaled F-distribution to data.

    Moment matching to estimate s2 (scale) and df2 (prior df).
    Faithful port of limma's fitFDist() (no-covariate case).

    Parameters
    ----------
    x : array-like
        Observed variances (assumed ~ s2 * F(df1, df2)).
    df1 : array-like or scalar
        First degrees of freedom (residual df per observation).

    Returns
    -------
    dict with keys: s2 (scale), df2 (second/prior degrees of freedom).
    """
    x = np.asarray(x, dtype=np.float64)
    n = len(x)
    df1 = np.atleast_1d(np.asarray(df1, dtype=np.float64))

    # Degenerate sample sizes: no spread to estimate df2 from.
    if n == 0:
        return {'s2': np.nan, 'df2': np.nan}
    if n == 1:
        return {'s2': float(x[0]), 'df2': 0.0}

    # Filter ok values: R uses df1 > 1e-15 and x > -1e-15
    ok_df1 = np.isfinite(df1) & (df1 > 1e-15)
    if len(df1) == 1:
        # Scalar df1: it either disqualifies everything or nothing.
        if not ok_df1[0]:
            return {'s2': np.nan, 'df2': np.nan}
        ok = np.full(n, True)
    else:
        ok = ok_df1
    ok = ok & np.isfinite(x) & (x > -1e-15)

    nok = int(np.sum(ok))
    if nok <= 1:
        if nok == 1:
            return {'s2': float(x[ok][0]), 'df2': 0.0}
        return {'s2': np.nan, 'df2': np.nan}

    x_ok = x[ok].copy()
    df1_ok = df1[ok] if len(df1) > 1 else df1

    # Clamp x: match R's pmax(x, 0), handle zeros, pmax(x, 1e-5 * median)
    x_ok = np.maximum(x_ok, 0.0)
    m = np.median(x_ok)
    if m == 0:
        m = 1.0
    x_ok = np.maximum(x_ok, 1e-5 * m)

    # Compute e = log(x) + logmdigamma(df1/2), matching R exactly
    # E[log x] = log(s2) - logmdigamma(df1/2) + logmdigamma(df2/2),
    # so e is an unbiased-ish estimate of log(s2) + logmdigamma(df2/2).
    z = np.log(x_ok)
    e = z + logmdigamma(df1_ok / 2)
    emean = np.mean(e)
    evar = np.sum((e - emean) ** 2) / (nok - 1)  # R uses /(nok - 1L)

    # Subtract trigamma(df1/2) contribution
    # (the sampling variance of log x given df1); what remains estimates
    # trigamma(df2/2), i.e. the spread attributable to the prior.
    evar = evar - np.mean(_trigamma_safe(df1_ok / 2))

    if evar > 0:
        # Invert trigamma to recover df2 from the excess spread.
        df2 = 2.0 * _trigamma_inverse(evar)
        df2 = max(df2, 1e-6)
        if df2 > 1e15:
            df2 = np.inf  # effectively a point-mass prior
        s2 = float(np.exp(emean - logmdigamma(df2 / 2)))
    else:
        # No excess spread beyond sampling noise: infinite prior df.
        df2 = np.inf
        s2 = float(np.mean(x_ok))  # R: mean(x) for no-covariate case

    # Guard against a zero/negative scale from numerical underflow.
    s2 = max(s2, 1e-15)

    return {'s2': s2, 'df2': df2}
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _fit_f_dist_trend(var, df, covariate):
    """Fit an F-distribution with trended prior variance.

    Faithful port of R's fitFDist() with covariate parameter.
    Uses natural spline basis + OLS regression, matching R's approach of
    fitting e = log(x) + logmdigamma(df1/2) on ns(covariate, df=splinedf).

    Parameters
    ----------
    var : array-like
        Observed variances.
    df : array-like or scalar
        Residual degrees of freedom.
    covariate : array-like
        Covariate (e.g. average log-expression) driving the trend.

    Returns
    -------
    dict with keys: var_prior (per-observation trended prior variance),
    df_prior (scalar prior degrees of freedom).
    """
    n = len(var)
    var = np.asarray(var, dtype=np.float64).copy()
    df_arr = np.atleast_1d(np.asarray(df, dtype=np.float64))
    if len(df_arr) == 1:
        df_arr = np.full(n, df_arr[0])
    covariate = np.asarray(covariate, dtype=np.float64)

    # Handle infinite covariate values (matching R): map +/-Inf just
    # outside the finite range so the spline fit stays well-posed.
    isfin = np.isfinite(covariate)
    if not np.all(isfin):
        if np.any(isfin):
            r = (np.min(covariate[isfin]), np.max(covariate[isfin]))
            covariate = covariate.copy()
            covariate[covariate == -np.inf] = r[0] - 1
            covariate[covariate == np.inf] = r[1] + 1
        else:
            # All infinite: collapse to their signs (-1/0/+1).
            covariate = np.sign(covariate)

    # Adaptive spline df (matching R: 1 + (nok>=3) + (nok>=6) + (nok>=30))
    splinedf = 1 + int(n >= 3) + int(n >= 6) + int(n >= 30)
    splinedf = min(splinedf, len(np.unique(covariate)))

    if splinedf < 2:
        # Fall back to scalar (no-covariate) fit, matching R's Recall()
        result = _fit_f_dist(var, df_arr)
        return {'var_prior': np.full(n, result['s2']), 'df_prior': result['df2']}

    # Clamp var: match R's pmax(x, 0), handle zeros, pmax(x, 1e-5 * median)
    var = np.maximum(var, 0.0)
    m = np.median(var)
    if m == 0:
        m = 1.0
    var = np.maximum(var, 1e-5 * m)

    # Compute e = log(x) + logmdigamma(df1/2), matching R exactly
    z = np.log(var)
    e = z + logmdigamma(df_arr / 2)

    # Fit natural spline basis + OLS (matching R's lm.fit(ns(...), e));
    # the fitted values become the covariate-dependent mean of e.
    basis = _natural_spline_basis(covariate, df=splinedf)
    coeffs, _, _, _ = np.linalg.lstsq(basis, e, rcond=None)
    emean = basis @ coeffs

    # Residual variance: R uses mean(fit$effects[-(1:rank)]^2) = RSS/(n-rank)
    resid = e - emean
    actual_rank = np.linalg.matrix_rank(basis)
    if n > actual_rank:
        evar = np.sum(resid ** 2) / (n - actual_rank)
    else:
        evar = 0.0

    # Subtract trigamma(df1/2) contribution
    # (sampling variance of log x); the remainder estimates trigamma(df2/2).
    evar = evar - np.mean(_trigamma_safe(df_arr / 2))

    # Estimate df2 and s20
    if evar > 0:
        df2 = 2.0 * _trigamma_inverse(evar)
        if df2 > 1e15:
            df2 = np.inf
        s20 = np.exp(emean - logmdigamma(df2 / 2))
    else:
        # No excess spread: infinite prior df; trend is exp(emean).
        df2 = np.inf
        s20 = np.exp(emean)

    return {'var_prior': s20, 'df_prior': df2}
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _natural_spline_basis(x, df):
|
|
282
|
+
"""Create natural cubic spline basis matrix matching R's ns(x, df=df, intercept=TRUE).
|
|
283
|
+
|
|
284
|
+
Uses the truncated power basis representation from Hastie, Tibshirani &
|
|
285
|
+
Friedman (Elements of Statistical Learning, eq 5.4-5.5).
|
|
286
|
+
|
|
287
|
+
Parameters
|
|
288
|
+
----------
|
|
289
|
+
x : array-like
|
|
290
|
+
Covariate values.
|
|
291
|
+
df : int
|
|
292
|
+
Number of basis functions (columns in the returned matrix).
|
|
293
|
+
|
|
294
|
+
Returns
|
|
295
|
+
-------
|
|
296
|
+
ndarray of shape (n, df)
|
|
297
|
+
"""
|
|
298
|
+
x = np.asarray(x, dtype=np.float64)
|
|
299
|
+
n = len(x)
|
|
300
|
+
|
|
301
|
+
# Number of internal knots: R uses df - 1 - intercept = df - 2
|
|
302
|
+
n_internal = df - 2
|
|
303
|
+
|
|
304
|
+
# Boundary knots at range
|
|
305
|
+
a = np.min(x)
|
|
306
|
+
b = np.max(x)
|
|
307
|
+
|
|
308
|
+
if n_internal <= 0 or a == b:
|
|
309
|
+
# Linear basis only: [1, x]
|
|
310
|
+
basis = np.column_stack([np.ones(n), x])
|
|
311
|
+
return basis[:, :df]
|
|
312
|
+
|
|
313
|
+
# Internal knots at quantiles (matching R's quantile placement)
|
|
314
|
+
probs = np.linspace(0, 1, n_internal + 2)[1:-1]
|
|
315
|
+
internal_knots = np.quantile(x, probs)
|
|
316
|
+
|
|
317
|
+
# All knots sorted: [boundary_left, internal_1, ..., internal_K, boundary_right]
|
|
318
|
+
all_knots = np.sort(np.concatenate([[a], internal_knots, [b]]))
|
|
319
|
+
K = len(all_knots) # Total knots = df
|
|
320
|
+
|
|
321
|
+
# Build basis: [1, x, d_1-d_{K-1}, d_2-d_{K-1}, ..., d_{K-2}-d_{K-1}]
|
|
322
|
+
# where d_k(x) = [(x - xi_k)_+^3 - (x - xi_K)_+^3] / (xi_K - xi_k)
|
|
323
|
+
basis = np.zeros((n, df))
|
|
324
|
+
basis[:, 0] = 1.0
|
|
325
|
+
basis[:, 1] = x
|
|
326
|
+
|
|
327
|
+
if K > 2:
|
|
328
|
+
xi_K = all_knots[-1] # rightmost boundary knot
|
|
329
|
+
xi_Km1 = all_knots[-2] # second-to-last knot (K-1 in 1-indexed)
|
|
330
|
+
|
|
331
|
+
def d_func(xi_j):
|
|
332
|
+
return (np.maximum(x - xi_j, 0) ** 3 - np.maximum(x - xi_K, 0) ** 3) / (xi_K - xi_j)
|
|
333
|
+
|
|
334
|
+
d_Km1 = d_func(xi_Km1)
|
|
335
|
+
|
|
336
|
+
for j in range(K - 2):
|
|
337
|
+
d_j = d_func(all_knots[j])
|
|
338
|
+
basis[:, 2 + j] = d_j - d_Km1
|
|
339
|
+
|
|
340
|
+
return basis
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _fit_f_dist_robustly(x, df1, covariate=None, winsor_tail_p=(0.05, 0.1)):
    """Fit F-distribution with robust outlier detection.

    Port of limma's fitFDistRobustly().
    Returns dict with 'scale', 'df2', and 'df2_shrunk' (per-gene).

    The fit is performed on Winsorized log-variances so that a small
    number of outlying genes cannot inflate the prior df estimate;
    outlying genes then receive individually shrunk df2 values.

    Parameters
    ----------
    x : array-like
        Genewise variances.
    df1 : array-like or scalar
        Residual degrees of freedom.
    covariate : array-like, optional
        Covariate for trended prior.
    winsor_tail_p : tuple
        Tail proportions for Winsorization (lower, upper).

    Returns
    -------
    dict with keys: scale, df2, df2_shrunk
    """
    x = np.asarray(x, dtype=np.float64).copy()
    n = len(x)

    if n < 2:
        # Not enough data for any fit.
        return {'scale': np.nan, 'df2': np.nan, 'df2_shrunk': np.full(max(n, 1), np.nan)}

    df1 = np.atleast_1d(np.asarray(df1, dtype=np.float64)).copy()
    if len(df1) == 1:
        df1 = np.full(n, df1[0])

    if n == 2:
        # Robust estimation is meaningless with two points: fall back to
        # the non-robust fits.
        if covariate is None:
            result = _fit_f_dist(x, df1)
            return {'scale': result['s2'], 'df2': result['df2'],
                    'df2_shrunk': np.full(n, result['df2'])}
        else:
            result = _fit_f_dist_trend(x, df1, covariate)
            return {'scale': result['var_prior'], 'df2': result['df_prior'],
                    'df2_shrunk': np.full(n, result['df_prior'])}

    # Filter ok values
    ok = ~np.isnan(x) & np.isfinite(df1) & (df1 > 1e-6)

    if not np.all(ok):
        # Recursive call on ok subset; excluded genes inherit the global
        # df2 and an interpolated scale.
        df2_shrunk_full = np.empty(n)
        x_ok = x[ok]
        df1_ok = df1[ok]
        cov_ok = covariate[ok] if covariate is not None else None

        fit = _fit_f_dist_robustly(x_ok, df1_ok, covariate=cov_ok,
                                   winsor_tail_p=winsor_tail_p)

        df2_shrunk_full[ok] = fit['df2_shrunk']
        df2_shrunk_full[~ok] = fit['df2']

        if covariate is None:
            scale = fit['scale']
        else:
            # Interpolate the trended scale on the log scale for the
            # excluded genes (extrapolating at the ends).
            scale_ok = np.atleast_1d(fit['scale'])
            scale = np.empty(n)
            scale[ok] = scale_ok
            from scipy.interpolate import interp1d
            f_interp = interp1d(covariate[ok], np.log(scale_ok), kind='linear',
                                bounds_error=False, fill_value='extrapolate')
            scale[~ok] = np.exp(f_interp(covariate[~ok]))

        return {'scale': scale, 'df2': fit['df2'], 'df2_shrunk': df2_shrunk_full}

    # All values ok from here
    m = np.median(x)
    if m <= 0:
        # Mostly-zero variances: nothing sensible to fit.
        return {'scale': np.nan, 'df2': np.nan, 'df2_shrunk': np.full(n, np.nan)}

    # Floor tiny variances so log(x) stays finite.
    small = x < m * 1e-12
    if np.any(small):
        x[small] = m * 1e-12

    # Non-robust initial fit
    if covariate is None:
        non_robust = _fit_f_dist(x, df1)
        nr_s20 = non_robust['s2']
        nr_df2 = non_robust['df2']
    else:
        non_robust = _fit_f_dist_trend(x, df1, covariate)
        nr_s20 = non_robust['var_prior']
        nr_df2 = non_robust['df_prior']

    # NaN df2 (but not +Inf) means the initial fit failed entirely.
    if not np.isfinite(nr_df2) and nr_df2 != np.inf:
        return {'scale': nr_s20, 'df2': nr_df2, 'df2_shrunk': np.full(n, 0.0)}

    # Winsor tail probabilities
    wtp = [float(winsor_tail_p[0]), float(winsor_tail_p[1])]
    prob = [wtp[0], 1.0 - wtp[1]]

    # Check if winsor_tail_p is too small for this sample size
    # (no observation would actually be Winsorized).
    if all(p < 1.0 / n for p in wtp):
        return {'scale': nr_s20, 'df2': nr_df2, 'df2_shrunk': np.full(n, nr_df2)}

    # Unify df1 if vector with different values: transform each x value
    # through its own F-distribution to the equivalent quantile of
    # F(df1max, df2), so a single df1 can be used below.
    if np.min(df1) < np.max(df1) - 1e-14:
        df1max = np.max(df1)
        i = df1 < (df1max - 1e-14)
        if np.any(i):
            if covariate is None:
                s = nr_s20
            else:
                s = nr_s20[i]
            f_vals = x[i] / s
            d2 = nr_df2
            # Work in log-probability and map through whichever tail is
            # better conditioned numerically.
            pupper = stats.f.logsf(f_vals, df1[i], d2)
            plower = stats.f.logcdf(f_vals, df1[i], d2)
            up = pupper < plower
            f_new = f_vals.copy()
            if np.any(up):
                f_new[up] = stats.f.isf(np.exp(np.clip(pupper[up], -500, 0)), df1max, d2)
            if np.any(~up):
                f_new[~up] = stats.f.ppf(np.exp(np.clip(plower[~up], -500, 0)), df1max, d2)
            x[i] = f_new * s
            df1_val = df1max
        else:
            df1_val = df1[0]
    else:
        df1_val = df1[0]

    z = np.log(x)

    if covariate is None:
        # Trimmed mean matching R's mean(z, trim=winsor.tail.p[2])
        ztrend = float(stats.trim_mean(z, proportiontocut=wtp[1]))
        zresid = z - ztrend
    else:
        # Robust lowess trend of log-variance on the covariate.
        from .weighted_lowess import weighted_lowess as _wlowess
        lo = _wlowess(covariate, z, span=0.4, iterations=4, npts=200)
        ztrend = lo['fitted']
        zresid = z - ztrend

    # Winsorize z-residuals
    zrq = np.quantile(zresid, prob)
    zwins = np.clip(zresid, zrq[0], zrq[1])
    zwmean = float(np.mean(zwins))
    # n/(n-1) converts the population variance to the sample variance.
    zwvar = float(np.mean((zwins - zwmean) ** 2) * n / (n - 1))

    # Gauss-Legendre quadrature on [0,1] (128 nodes)
    gl_nodes_raw, gl_weights_raw = np.polynomial.legendre.leggauss(128)
    gl_nodes_01 = (gl_nodes_raw + 1.0) / 2.0
    gl_weights_01 = gl_weights_raw / 2.0

    # Link maps (0, Inf) <-> (0, 1) so df2 can be root-found on a
    # bounded interval.
    def linkfun(v):
        return v / (1.0 + v)

    def linkinv(v):
        return v / (1.0 - v)

    def winsorized_moments(d1, d2, wtp_arg):
        """Compute Winsorized mean and variance of log(F(d1, d2))."""
        fq = stats.f.ppf([wtp_arg[0], 1.0 - wtp_arg[1]], d1, d2)
        zq = np.log(fq)
        q = linkfun(fq)
        # Integrate over the central (non-Winsorized) region via the
        # link-transformed nodes; the clipped tails contribute point
        # masses of size wtp at the quantile values zq.
        nodes = q[0] + (q[1] - q[0]) * gl_nodes_01
        fnodes = linkinv(nodes)
        znodes = np.log(fnodes)
        f_dens = stats.f.pdf(fnodes, d1, d2) / (1.0 - nodes) ** 2
        q21 = q[1] - q[0]
        wtp_arr = np.array(wtp_arg)
        m_val = q21 * np.sum(gl_weights_01 * f_dens * znodes) + np.sum(zq * wtp_arr)
        v_val = (q21 * np.sum(gl_weights_01 * f_dens * (znodes - m_val) ** 2)
                 + np.sum((zq - m_val) ** 2 * wtp_arr))
        return {'mean': m_val, 'var': v_val}

    # Check df2=Inf case
    mom_inf = winsorized_moments(df1_val, np.inf, wtp)

    if mom_inf['var'] <= 0 or zwvar <= 0:
        # Degenerate moments: keep the non-robust answer.
        return {'scale': nr_s20, 'df2': nr_df2, 'df2_shrunk': np.full(n, nr_df2)}

    funval_inf = np.log(zwvar / mom_inf['var'])

    if funval_inf <= 0:
        # df2 = Inf: observed variance <= theoretical at df2=Inf
        df2 = np.inf
        ztrendcorrected = ztrend + zwmean - mom_inf['mean']
        s20 = np.exp(ztrendcorrected)
        Fstat = np.exp(z - ztrendcorrected)
        # With df2=Inf, df1*F is chi-square on df1 degrees of freedom.
        TailP = stats.chi2.sf(Fstat * df1_val, df1_val)
        r = stats.rankdata(Fstat)
        EmpiricalTailProb = (n - r + 0.5) / n
        ProbNotOutlier = np.minimum(TailP / EmpiricalTailProb, 1.0)
        df_pooled = n * df1_val
        df2_shrunk = np.full(n, float(df2))
        O = ProbNotOutlier < 1
        if np.any(O):
            # Outliers get a finite df2 proportional to how unlikely
            # they are; enforce monotonicity in the tail p-values.
            df2_shrunk[O] = ProbNotOutlier[O] * df_pooled
            o = np.argsort(TailP)
            df2_shrunk[o] = np.maximum.accumulate(df2_shrunk[o])
        return {'scale': s20, 'df2': df2, 'df2_shrunk': df2_shrunk}

    # Check if non-robust already gives Inf
    if nr_df2 == np.inf:
        return {'scale': nr_s20, 'df2': nr_df2, 'df2_shrunk': np.full(n, nr_df2)}

    # Root-finding for df2: solve for the df2 whose theoretical
    # Winsorized variance matches the observed one.
    rbx = linkfun(nr_df2)

    def fun_root(par):
        d2 = linkinv(par)
        mom = winsorized_moments(df1_val, d2, wtp)
        if mom['var'] <= 0:
            return funval_inf
        return np.log(zwvar / mom['var'])

    funval_low = fun_root(rbx)

    if funval_low >= 0:
        # Observed spread already exceeds the model at the non-robust
        # df2: keep the non-robust estimate.
        df2 = nr_df2
    else:
        from scipy.optimize import brentq
        root = brentq(fun_root, rbx, 1.0 - 1e-10, xtol=1e-8)
        df2 = linkinv(root)

    mom = winsorized_moments(df1_val, df2, wtp)
    ztrendcorrected = ztrend + zwmean - mom['mean']
    s20 = np.exp(ztrendcorrected)
    Fstat = np.exp(z - ztrendcorrected)

    # Per-gene outlier probability from comparing the model tail
    # probability to the empirical one.
    LogTailP = stats.f.logsf(Fstat, df1_val, df2)
    TailP = np.exp(LogTailP)
    r = stats.rankdata(Fstat)
    LogEmpiricalTailProb = np.log(n - r + 0.5) - np.log(n)
    LogProbNotOutlier = np.minimum(LogTailP - LogEmpiricalTailProb, 0.0)
    ProbNotOutlier = np.exp(LogProbNotOutlier)
    ProbOutlier = -np.expm1(LogProbNotOutlier)

    if np.any(LogProbNotOutlier < 0):
        minLogTailP = np.min(LogTailP)
        if minLogTailP == -np.inf:
            # Most extreme gene has tail probability 0: outlier df2 is 0.
            df2_outlier = 0.0
            df2_shrunk = ProbNotOutlier * df2
        else:
            # Choose df2_outlier so the most extreme gene sits at the
            # median of its own F-distribution (two-step refinement).
            df2_outlier = np.log(0.5) / minLogTailP * df2
            NewLogTailP = stats.f.logsf(np.max(Fstat), df1_val, df2_outlier)
            df2_outlier = np.log(0.5) / NewLogTailP * df2_outlier
            df2_shrunk = ProbNotOutlier * df2 + ProbOutlier * df2_outlier

        # Monotonize via cummax on ordered tail p-values
        o = np.argsort(LogTailP)
        df2_ordered = df2_shrunk[o].copy()
        m_arr = np.cumsum(df2_ordered) / np.arange(1, n + 1, dtype=np.float64)
        imin = int(np.argmin(m_arr))
        df2_ordered[:imin + 1] = m_arr[imin]
        df2_shrunk_final = np.empty(n)
        df2_shrunk_final[o] = np.maximum.accumulate(df2_ordered)
        df2_shrunk = df2_shrunk_final
    else:
        df2_shrunk = np.full(n, df2)

    return {'scale': s20, 'df2': df2, 'df2_shrunk': df2_shrunk}
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def _digamma_safe(x):
    """Digamma wrapper that accepts scalars or array-likes, in float64."""
    from scipy.special import digamma
    values = np.asarray(x, dtype=np.float64)
    return digamma(values)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
def _trigamma_safe(x):
    """Trigamma (first polygamma) wrapper for scalars or array-likes, in float64."""
    from scipy.special import polygamma as _polygamma
    values = np.asarray(x, dtype=np.float64)
    return _polygamma(1, values)
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def _trigamma_inverse(x):
    """Inverse of the trigamma function.

    Port of limma's trigammaInverse(). Solves trigamma(y) = x for y > 0
    using Newton's method, with closed-form asymptotics at the extremes.

    Parameters
    ----------
    x : float
        Target trigamma value (positive).

    Returns
    -------
    float
        y such that polygamma(1, y) ~= x.
    """
    from scipy.special import polygamma

    x = float(x)
    # Asymptotic closed forms (see limma::trigammaInverse):
    # as y -> 0, trigamma(y) ~ 1/y^2, so y ~ 1/sqrt(x) for very large x.
    # (Previously this returned 1/x, which is off by orders of magnitude.)
    if x > 1e7:
        return 1.0 / (x ** 0.5)
    # As y -> Inf, trigamma(y) ~ 1/y, so y ~ 1/x for very small x.
    if x < 1e-6:
        return 1.0 / x

    # Starting value for Newton iteration.
    if x > 0.5:
        y = 1.0 / x
    else:
        y = 1.0 / (x * (1 + x))

    # Newton iterations on trigamma(y) = x; the update uses
    # d/dy trigamma(y) = polygamma(2, y).
    for _ in range(50):
        tri = float(polygamma(1, y))
        dif = tri * (1 - tri / x) / float(polygamma(2, y))
        y = y + dif
        if y <= 0:
            y = x  # reset if the step overshot into the invalid region
        if abs(dif / y) < 1e-10:
            break

    return y
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def logmdigamma(x):
    """Compute log(x) - digamma(x) avoiding subtractive cancellation.

    Port of statmod's logmdigamma(). Arguments >= 5 use an asymptotic
    (Bernoulli-series) expansion directly; smaller positive arguments
    are shifted up by 5 via the digamma recurrence before applying it.
    Non-positive inputs give NaN. Scalars in, scalar out.
    """
    x = np.asarray(x, dtype=np.float64)
    was_scalar = x.ndim == 0
    x = np.atleast_1d(x)
    out = np.full_like(x, np.nan)

    def _asymptotic(z):
        # log(z) - digamma(z) = 1/(2z) + 1/(12 z^2) - 1/(120 z^4) + ...
        u = 1.0 / (z * z)
        tail = u * (-1.0/12 + u * (1.0/120 + u * (-1.0/252 + u * (
            1.0/240 + u * (-1.0/132 + u * (691.0/32760 + u * (
            -1.0/12 + 3617.0/8160 * u)))))))
        return 1.0 / (2.0 * z) - tail

    positive = x > 0
    if np.any(positive):
        xp = x[positive]
        vals = np.empty_like(xp)
        big = xp >= 5
        if np.any(big):
            vals[big] = _asymptotic(xp[big])
        if np.any(~big):
            z = xp[~big]
            shifted = z + 5.0
            # digamma recurrence: digamma(z) = digamma(z+5) - sum 1/(z+k)
            vals[~big] = (np.log(z / shifted) + _asymptotic(shifted)
                          + 1.0/z + 1.0/(z + 1) + 1.0/(z + 2)
                          + 1.0/(z + 3) + 1.0/(z + 4))
        out[positive] = vals

    if was_scalar:
        return float(out[0])
    return out
|
|
691
|
+
|
|
692
|
+
|
|
693
|
+
def _p_adjust_bh(p):
    """Benjamini-Hochberg p-value adjustment.

    Port of R's p.adjust(method="BH"): each p-value is scaled by
    n/rank, a running minimum is taken from the largest p-value
    downward, and the result is capped at 1.
    """
    pvals = np.asarray(p, dtype=np.float64)
    n = len(pvals)
    desc = np.argsort(pvals)[::-1]
    ranks = np.arange(n, 0, -1, dtype=np.float64)
    scaled = (n / ranks) * pvals[desc]
    running_min = np.minimum.accumulate(scaled)
    capped = np.minimum(running_min, 1.0)
    out = np.empty(n)
    out[desc] = capped
    return out
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
def _fit_f_dist_unequal_df1(x, df1, covariate=None, span=None, robust=True, prior_weights=None):
|
|
709
|
+
"""Fit a scaled F-distribution with unequal df1 values.
|
|
710
|
+
|
|
711
|
+
Port of limma's fitFDistUnequalDF1().
|
|
712
|
+
Uses MLE to estimate scale and df2 (prior df).
|
|
713
|
+
|
|
714
|
+
Parameters
|
|
715
|
+
----------
|
|
716
|
+
x : array-like
|
|
717
|
+
Genewise variances (s2 values).
|
|
718
|
+
df1 : array-like
|
|
719
|
+
Residual degrees of freedom per gene.
|
|
720
|
+
covariate : array-like, optional
|
|
721
|
+
Covariate for trended prior (e.g. AveLogCPM).
|
|
722
|
+
span : float, optional
|
|
723
|
+
Loess span.
|
|
724
|
+
robust : bool
|
|
725
|
+
Robust estimation with outlier handling.
|
|
726
|
+
prior_weights : array-like, optional
|
|
727
|
+
Prior weights for each observation.
|
|
728
|
+
|
|
729
|
+
Returns
|
|
730
|
+
-------
|
|
731
|
+
dict with keys: scale, df2, and optionally df2_shrunk, df2_outlier.
|
|
732
|
+
"""
|
|
733
|
+
from scipy.optimize import minimize_scalar
|
|
734
|
+
from scipy.special import gammaln
|
|
735
|
+
|
|
736
|
+
x = np.asarray(x, dtype=np.float64).copy()
|
|
737
|
+
df1 = np.atleast_1d(np.asarray(df1, dtype=np.float64)).copy()
|
|
738
|
+
n = len(x)
|
|
739
|
+
|
|
740
|
+
if len(df1) == 1:
|
|
741
|
+
df1 = np.full(n, df1[0])
|
|
742
|
+
|
|
743
|
+
if prior_weights is not None:
|
|
744
|
+
prior_weights = np.asarray(prior_weights, dtype=np.float64).copy()
|
|
745
|
+
|
|
746
|
+
# Handle NA values
|
|
747
|
+
na_mask = np.isnan(x)
|
|
748
|
+
if np.any(na_mask):
|
|
749
|
+
if prior_weights is None:
|
|
750
|
+
prior_weights = (~na_mask).astype(np.float64)
|
|
751
|
+
else:
|
|
752
|
+
prior_weights[na_mask] = 0
|
|
753
|
+
x[na_mask] = 0
|
|
754
|
+
|
|
755
|
+
# Handle small df1
|
|
756
|
+
small_df1 = df1 < 0.01
|
|
757
|
+
if np.any(small_df1):
|
|
758
|
+
if prior_weights is None:
|
|
759
|
+
prior_weights = (~small_df1).astype(np.float64)
|
|
760
|
+
else:
|
|
761
|
+
prior_weights[small_df1] = 0
|
|
762
|
+
df1[small_df1] = 1
|
|
763
|
+
|
|
764
|
+
has_pw = prior_weights is not None
|
|
765
|
+
|
|
766
|
+
# Identify informative observations
|
|
767
|
+
informative = x > 0
|
|
768
|
+
if has_pw:
|
|
769
|
+
informative = informative & (prior_weights > 0)
|
|
770
|
+
n_informative = int(np.sum(informative))
|
|
771
|
+
|
|
772
|
+
if n_informative < 2:
|
|
773
|
+
return {'scale': np.nan, 'df2': np.nan}
|
|
774
|
+
|
|
775
|
+
if n_informative == 2:
|
|
776
|
+
covariate = None
|
|
777
|
+
robust = False
|
|
778
|
+
prior_weights = None
|
|
779
|
+
has_pw = False
|
|
780
|
+
|
|
781
|
+
m = np.median(x[informative])
|
|
782
|
+
xpos = np.maximum(x, 1e-12 * m)
|
|
783
|
+
z = np.log(xpos)
|
|
784
|
+
d1 = df1 / 2.0
|
|
785
|
+
e = z + logmdigamma(d1)
|
|
786
|
+
w = 1.0 / _trigamma_safe(d1)
|
|
787
|
+
if len(w) < n:
|
|
788
|
+
w = np.full(n, w[0])
|
|
789
|
+
if has_pw:
|
|
790
|
+
w = w * prior_weights
|
|
791
|
+
|
|
792
|
+
if covariate is None:
|
|
793
|
+
emean = np.sum(w * e) / np.sum(w)
|
|
794
|
+
else:
|
|
795
|
+
covariate = np.asarray(covariate, dtype=np.float64)
|
|
796
|
+
if span is None:
|
|
797
|
+
span = choose_lowess_span(n, small_n=500)
|
|
798
|
+
# Normalize weights: w / quantile(w, 0.75), clipped to [1e-8, 100]
|
|
799
|
+
w_q75 = np.quantile(w, 0.75)
|
|
800
|
+
loess_w = w / w_q75 if w_q75 > 0 else w.copy()
|
|
801
|
+
loess_w = np.clip(loess_w, 1e-08, 100)
|
|
802
|
+
|
|
803
|
+
from .weighted_lowess import weighted_lowess as _wlowess
|
|
804
|
+
wl_result = _wlowess(covariate, e, weights=loess_w, span=span,
|
|
805
|
+
iterations=1, npts=200)
|
|
806
|
+
emean = wl_result['fitted']
|
|
807
|
+
|
|
808
|
+
d1x = d1 * xpos
|
|
809
|
+
|
|
810
|
+
# MLE optimization for d2 = par/(1-par) over par in [0.5, 0.9998]
|
|
811
|
+
def minus_twice_loglik(par):
|
|
812
|
+
d2 = par / (1 - par)
|
|
813
|
+
lmd2 = logmdigamma(d2)
|
|
814
|
+
d2s20 = d2 * np.exp(emean - lmd2)
|
|
815
|
+
ll = (-(d1 + d2) * np.log1p(d1x / d2s20)
|
|
816
|
+
- d1 * np.log(d2s20)
|
|
817
|
+
+ gammaln(d1 + d2) - gammaln(d2))
|
|
818
|
+
if has_pw:
|
|
819
|
+
return -2 * np.sum(prior_weights * ll)
|
|
820
|
+
return -2 * np.sum(ll)
|
|
821
|
+
|
|
822
|
+
opt = minimize_scalar(minus_twice_loglik, bounds=(0.5, 0.9998), method='bounded')
|
|
823
|
+
d2 = opt.x / (1 - opt.x)
|
|
824
|
+
s20 = np.exp(emean - logmdigamma(d2))
|
|
825
|
+
|
|
826
|
+
if not robust:
|
|
827
|
+
return {'scale': s20, 'df2': 2 * d2}
|
|
828
|
+
|
|
829
|
+
# Robust estimation: detect and down-weight outliers
|
|
830
|
+
df2 = 2 * d2
|
|
831
|
+
f_stat = x / s20
|
|
832
|
+
|
|
833
|
+
right_p = stats.f.sf(f_stat, df1, df2)
|
|
834
|
+
left_p = 1 - right_p
|
|
835
|
+
|
|
836
|
+
# Better computation for very small left p-values
|
|
837
|
+
small_left = left_p < 0.001
|
|
838
|
+
if np.any(small_left):
|
|
839
|
+
df1_sub = df1[small_left] if len(df1) > 1 else df1
|
|
840
|
+
left_p[small_left] = stats.f.cdf(f_stat[small_left], df1_sub, df2)
|
|
841
|
+
|
|
842
|
+
two_sided_p = 2 * np.minimum(left_p, right_p)
|
|
843
|
+
|
|
844
|
+
fdr = _p_adjust_bh(two_sided_p)
|
|
845
|
+
fdr[fdr > 0.3] = 1
|
|
846
|
+
|
|
847
|
+
if np.min(fdr) == 1:
|
|
848
|
+
return {'scale': s20, 'df2': df2}
|
|
849
|
+
|
|
850
|
+
# Re-fit with FDR as prior weights
|
|
851
|
+
outpw = _fit_f_dist_unequal_df1(x, df1, covariate=covariate, span=span,
|
|
852
|
+
robust=False, prior_weights=fdr)
|
|
853
|
+
s20 = outpw['scale']
|
|
854
|
+
df2 = outpw['df2']
|
|
855
|
+
|
|
856
|
+
r = stats.rankdata(f_stat)
|
|
857
|
+
uniform_p = (n - r + 0.5) / n
|
|
858
|
+
prob_not_outlier = np.minimum(right_p / uniform_p, 1)
|
|
859
|
+
|
|
860
|
+
if np.min(prob_not_outlier) == 1:
|
|
861
|
+
return outpw
|
|
862
|
+
|
|
863
|
+
i_min = int(np.argmin(right_p))
|
|
864
|
+
min_right_p = right_p[i_min]
|
|
865
|
+
|
|
866
|
+
if min_right_p == 0:
|
|
867
|
+
df2_outlier = 0.0
|
|
868
|
+
df2_shrunk = prob_not_outlier * df2
|
|
869
|
+
else:
|
|
870
|
+
df2_outlier = np.log(0.5) / np.log(min_right_p) * df2
|
|
871
|
+
df1_i = df1[i_min] if len(df1) > 1 else df1[0]
|
|
872
|
+
new_log_right_p = stats.f.logsf(f_stat[i_min], df1_i, df2_outlier)
|
|
873
|
+
df2_outlier = np.log(0.5) / new_log_right_p * df2_outlier
|
|
874
|
+
df2_shrunk = prob_not_outlier * df2 + (1 - prob_not_outlier) * df2_outlier
|
|
875
|
+
|
|
876
|
+
# Monotonize df2_shrunk
|
|
877
|
+
o = np.argsort(right_p)
|
|
878
|
+
df2_ordered = df2_shrunk[o].copy()
|
|
879
|
+
m_arr = np.cumsum(df2_ordered) / np.arange(1, n + 1, dtype=np.float64)
|
|
880
|
+
imin = int(np.argmin(m_arr))
|
|
881
|
+
df2_ordered[:imin + 1] = m_arr[imin]
|
|
882
|
+
df2_shrunk_final = np.empty(n)
|
|
883
|
+
df2_shrunk_final[o] = np.maximum.accumulate(df2_ordered)
|
|
884
|
+
|
|
885
|
+
return {'scale': s20, 'df2': df2, 'df2_outlier': df2_outlier, 'df2_shrunk': df2_shrunk_final}
|
|
886
|
+
|
|
887
|
+
|
|
888
|
+
def non_estimable(x):
    """Identify non-estimable coefficients in a design matrix.

    Port of limma's nonEstimable().

    Parameters
    ----------
    x : array-like
        Design matrix (n_samples x n_coefficients).

    Returns
    -------
    ndarray of int or None
        Column indices of non-estimable (linearly dependent) coefficients,
        or None if all coefficients are estimable.
    """
    x = np.asarray(x, dtype=np.float64)
    p = x.shape[1]
    if p == 0:
        return None
    _, R = np.linalg.qr(x)
    d = np.abs(np.diag(R))
    if len(d) == 0:
        return np.arange(p)
    tol = np.max(d) * max(x.shape) * np.finfo(np.float64).eps
    # <= (not <) so that an all-zero design (tol == 0) flags every column.
    non_est = np.where(d <= tol)[0]
    # With fewer rows than columns, the reduced QR factor only covers the
    # first min(n, p) columns; the remaining columns are necessarily
    # non-estimable but have no diagonal entry, so add them explicitly.
    if len(d) < p:
        non_est = np.concatenate([non_est, np.arange(len(d), p)])
    if len(non_est) == 0:
        return None
    return non_est
|
|
907
|
+
|
|
908
|
+
|
|
909
|
+
def is_fullrank(x):
    """Check whether a matrix has full column rank.

    Port of limma's is.fullrank(). A 1-D input is treated as a single
    column vector.
    """
    mat = np.asarray(x, dtype=np.float64)
    if mat.ndim == 1:
        mat = mat[:, np.newaxis]
    ncols = mat.shape[1]
    return np.linalg.matrix_rank(mat) == ncols
|
|
918
|
+
|
|
919
|
+
|
|
920
|
+
def choose_lowess_span(n, small_n=25, min_span=0.2, power=1/3):
    """Select a lowess span appropriate for n observations.

    Port of limma's chooseLowessSpan(). The span shrinks from 1 toward
    ``min_span`` as n grows:
    min(min_span + (1 - min_span) * (small_n / n) ** power, 1).
    """
    shrink = (small_n / n) ** power
    candidate = min_span + (1.0 - min_span) * shrink
    return candidate if candidate < 1.0 else 1.0
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
def contrast_as_coef(design, contrast, first=False):
    """Reform a design matrix so that a contrast becomes a coefficient.

    Port of limma's contrastAsCoef().

    Parameters
    ----------
    design : ndarray
        Design matrix (n_samples x n_coefficients).
    contrast : array-like
        Contrast vector of length n_coefficients; must not be all zero.
    first : bool
        If True, put the contrast as the first column; otherwise last.

    Returns
    -------
    dict with 'design' (reformed design matrix) and 'coef' (column index
    of the contrast coefficient in the reformed design).

    Raises
    ------
    ValueError
        If the contrast length does not match the design, or the
        contrast is all zero.
    """
    design = np.asarray(design, dtype=np.float64)
    contrast = np.asarray(contrast, dtype=np.float64).ravel()
    p = design.shape[1]

    if len(contrast) != p:
        raise ValueError("Length of contrast must equal number of columns in design")
    # limma stops on a zero contrast; without this check r_val below is 0
    # and the rotated design silently fills with inf/nan.
    if not np.any(contrast):
        raise ValueError("contrast is all zero")

    # QR-decompose the contrast, rotate the design by Q, then divide the
    # contrast column by the scalar R factor so that the coefficient of
    # that column directly represents the contrast effect (limma's
    # qr.qty rotation followed by the backsolve step).
    Q, R_mat = np.linalg.qr(contrast.reshape(-1, 1), mode='complete')
    r_val = R_mat[0, 0]  # scalar R factor (= +/-||contrast||), nonzero here

    design_rotated = design @ Q
    design_rotated[:, 0] = design_rotated[:, 0] / r_val

    if first:
        return {'design': design_rotated, 'coef': 0}

    # Move the contrast column from the first to the last position.
    cols = list(range(1, p)) + [0]
    return {'design': design_rotated[:, cols], 'coef': p - 1}
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
def logsumexp(x, y):
    """Compute log(exp(x) + exp(y)) without overflow.

    Shifts both arguments by their elementwise maximum before
    exponentiating, so large inputs do not overflow.

    Helper used in zscoreNBinom.
    """
    peak = np.maximum(x, y)
    shifted_sum = np.exp(x - peak) + np.exp(y - peak)
    return peak + np.log(shifted_sum)
|