edgepython 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
edgepython/gene_sets.py
ADDED
|
@@ -0,0 +1,1215 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""
|
|
3
|
+
Gene set testing for edgePython.
|
|
4
|
+
|
|
5
|
+
Port of edgeR's camera, fry, roast, mroast, romer, goana, kegga.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import warnings
|
|
11
|
+
from scipy.stats import t as t_dist, norm as norm_dist, beta as beta_dist, rankdata
|
|
12
|
+
from statsmodels.stats.multitest import multipletests
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _zscore_t_hill(x, df):
|
|
16
|
+
"""Convert t-statistics to z-scores using Hill's approximation.
|
|
17
|
+
|
|
18
|
+
Port of limma's .zscoreTHill. This is the method used by R's camera
|
|
19
|
+
when approx=TRUE, method="hill".
|
|
20
|
+
"""
|
|
21
|
+
x = np.asarray(x, dtype=np.float64)
|
|
22
|
+
df = np.minimum(df, 1e100)
|
|
23
|
+
A = df - 0.5
|
|
24
|
+
B = 48.0 * A * A
|
|
25
|
+
z = A * np.log1p(x / df * x)
|
|
26
|
+
z = (((((-0.4 * z - 3.3) * z - 24.0) * z - 85.5) / (0.8 * z * z + 100.0 + B) + z + 3.0) / B + 1.0) * np.sqrt(z)
|
|
27
|
+
return z * np.sign(x)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# -----------------------------------------------------------------------
|
|
31
|
+
# Private helpers
|
|
32
|
+
# -----------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
def _zscore_glm(y, design, contrast):
    """Convert DGEGLM counts to NB z-scores under the null model.

    Port of edgeR's .zscoreGLM. A null GLM (the design with the tested
    contrast removed) is fitted with ``glm_fit`` and each count is turned
    into a z-score by ``zscore_nbinom`` using the null fitted means.

    Parameters
    ----------
    y : dict
        DGEGLM-like dict. 'counts' is required. 'dispersion' (default 0.05
        when absent), 'offset' and 'weights' are forwarded to the null fit.
        When 'average.ql.dispersion' is present the counts are first divided
        by ``max(1, s2.prior)`` ('s2.prior' defaults to 1.0).
    design : array_like
        Design matrix; columns are model coefficients.
    contrast : int or array_like
        0-based index of the design column to test, or a contrast vector
        over the design columns.

    Returns
    -------
    ndarray
        z-score matrix with the same shape as ``y['counts']``.
    """
    from .glm_fit import glm_fit
    from .limma_port import contrast_as_coef
    from .utils import zscore_nbinom

    counts = y['counts'].copy().astype(np.float64)
    n_genes, n_samples = counts.shape

    # QL scaling: shrink the counts by the prior quasi-likelihood variance.
    if y.get('average.ql.dispersion') is not None:
        s2_prior = y.get('s2.prior')
        if s2_prior is None:
            # A key that is present but unset would otherwise become NaN.
            s2_prior = 1.0
        s2_prior = np.atleast_1d(np.asarray(s2_prior, dtype=np.float64))
        if s2_prior.size == 1:
            s2_prior = np.full(n_genes, float(s2_prior.ravel()[0]))
        counts = counts / np.maximum(1.0, s2_prior)[:, np.newaxis]

    design = np.asarray(design, dtype=np.float64)
    p = design.shape[1]

    # Build the null design by removing the tested contrast.
    if isinstance(contrast, (int, np.integer)):
        contrast_idx = int(contrast)
        design0 = design[:, [i for i in range(p) if i != contrast_idx]]
    else:
        # A contrast vector is not a design column: reparametrise so that the
        # contrast becomes the last coefficient, then drop that column
        # (same treatment as _zscore_dge). Dropping the last column of the
        # untransformed design would fit the wrong null model.
        design0 = contrast_as_coef(design, contrast, first=False)['design'][:, :-1]

    dispersion = y.get('dispersion', 0.05)

    # Fit null model
    fit_null = glm_fit(counts, design=design0, dispersion=dispersion,
                       offset=y.get('offset'), weights=y.get('weights'),
                       prior_count=0)

    # Floor the fitted means so they stay strictly positive.
    mu = np.maximum(fit_null['fitted.values'], 1e-17)

    # NB size parameter = 1/dispersion, one value per gene.
    disp = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if disp.size == 1:
        disp = np.full(n_genes, float(disp.ravel()[0]))

    # Compute z-scores column by column (one sample at a time).
    z = np.zeros_like(counts)
    for j in range(n_samples):
        z[:, j] = zscore_nbinom(counts[:, j], size=1.0 / disp, mu=mu[:, j])

    return z
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _zscore_dge(y, design, contrast):
    """Turn DGEList counts into NB z-scores computed under the null model.

    Port of edgeR's .zscoreDGE. The design is stripped of the tested
    contrast, a null GLM is fitted with ``glm_fit``, and every raw count is
    converted to a z-score by ``zscore_nbinom`` using the null fitted means
    and a size parameter of 1/dispersion.

    Parameters
    ----------
    y : dict
        DGEList-like dict; 'counts' is read directly, dispersion and offset
        are obtained through ``get_dispersion`` / ``get_offset``.
    design : array_like
        Design matrix with at least two columns.
    contrast : int or array_like
        0-based design column index, or a contrast vector (handled through
        ``contrast_as_coef``).

    Returns
    -------
    ndarray
        z-score matrix shaped like ``y['counts']``.

    Raises
    ------
    ValueError
        If the design has fewer than two columns or no dispersion is found.
    """
    from .dgelist import get_dispersion, get_offset
    from .glm_fit import glm_fit
    from .limma_port import contrast_as_coef
    from .utils import zscore_nbinom

    raw = y['counts'].copy().astype(np.float64)
    full_design = np.asarray(design, dtype=np.float64)
    n_coef = full_design.shape[1]

    if n_coef < 2:
        raise ValueError("design matrix must have at least two columns")

    nb_dispersion = get_dispersion(y)
    if nb_dispersion is None:
        raise ValueError("Dispersion estimate not found. "
                         "Please estimate dispersions before gene set testing.")

    if isinstance(contrast, (int, np.integer)):
        # Tested coefficient is a design column: simply leave it out.
        dropped = int(contrast)
        null_design = full_design[:, [c for c in range(n_coef) if c != dropped]]
    else:
        # Contrast vector: reparametrise so the contrast is the last
        # coefficient, then remove that last column.
        reparam = contrast_as_coef(full_design, contrast, first=False)
        null_design = reparam['design'][:, :-1]

    null_fit = glm_fit(raw, design=null_design, dispersion=nb_dispersion,
                       offset=get_offset(y), prior_count=0)
    fitted = np.maximum(null_fit['fitted.values'], 1e-17)

    # Expand a common dispersion to one value per gene.
    per_gene_disp = np.atleast_1d(np.asarray(nb_dispersion, dtype=np.float64))
    if per_gene_disp.size == 1:
        per_gene_disp = np.full(raw.shape[0], float(per_gene_disp.ravel()[0]))

    _, n_samples = raw.shape
    zscores = np.zeros_like(raw)
    for col in range(n_samples):
        zscores[:, col] = zscore_nbinom(raw[:, col],
                                        size=1.0 / per_gene_disp,
                                        mu=fitted[:, col])
    return zscores
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _resolve_input(y, design, contrast):
|
|
148
|
+
"""Resolve input type and return z-score matrix, design, contrast.
|
|
149
|
+
|
|
150
|
+
Used by fry, roast, mroast, romer to dispatch DGEList/DGEGLM/matrix.
|
|
151
|
+
"""
|
|
152
|
+
is_dgeglm = isinstance(y, dict) and 'coefficients' in y and 'dispersion' in y
|
|
153
|
+
is_dgelist = isinstance(y, dict) and 'counts' in y and 'coefficients' not in y
|
|
154
|
+
|
|
155
|
+
if design is None and isinstance(y, dict):
|
|
156
|
+
design = y.get('design')
|
|
157
|
+
if design is None:
|
|
158
|
+
raise ValueError("design matrix must be provided")
|
|
159
|
+
design = np.asarray(design, dtype=np.float64)
|
|
160
|
+
|
|
161
|
+
if contrast is None:
|
|
162
|
+
contrast = design.shape[1] - 1
|
|
163
|
+
|
|
164
|
+
if is_dgeglm:
|
|
165
|
+
expr = _zscore_glm(y, design=design, contrast=contrast)
|
|
166
|
+
elif is_dgelist:
|
|
167
|
+
expr = _zscore_dge(y, design=design, contrast=contrast)
|
|
168
|
+
else:
|
|
169
|
+
expr = np.asarray(y, dtype=np.float64)
|
|
170
|
+
|
|
171
|
+
return expr, design, contrast
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _extract_effects(y, design, contrast):
|
|
175
|
+
"""QR decomposition of design to extract contrast effect and residuals.
|
|
176
|
+
|
|
177
|
+
Port of limma's .lmEffects (internal).
|
|
178
|
+
|
|
179
|
+
Returns
|
|
180
|
+
-------
|
|
181
|
+
dict with:
|
|
182
|
+
unscaledt : ndarray (G,) - unscaled t-statistics (contrast effect)
|
|
183
|
+
U : ndarray (df_residual, G) - residual effects
|
|
184
|
+
sigma2 : ndarray (G,) - residual variances
|
|
185
|
+
df_residual : int - residual degrees of freedom
|
|
186
|
+
"""
|
|
187
|
+
y = np.asarray(y, dtype=np.float64)
|
|
188
|
+
G, n = y.shape
|
|
189
|
+
design = np.asarray(design, dtype=np.float64)
|
|
190
|
+
p = design.shape[1]
|
|
191
|
+
df_residual = n - p
|
|
192
|
+
|
|
193
|
+
# Reorder design so contrast column is last
|
|
194
|
+
if isinstance(contrast, (int, np.integer)):
|
|
195
|
+
contrast_idx = int(contrast)
|
|
196
|
+
if contrast_idx < p - 1:
|
|
197
|
+
j = [i for i in range(p) if i != contrast_idx] + [contrast_idx]
|
|
198
|
+
design = design[:, j]
|
|
199
|
+
else:
|
|
200
|
+
contrast_vec = np.asarray(contrast, dtype=np.float64)
|
|
201
|
+
if contrast_vec.ndim == 1 and len(contrast_vec) == p:
|
|
202
|
+
nonzero = np.where(contrast_vec != 0)[0]
|
|
203
|
+
if len(nonzero) == 1 and contrast_vec[nonzero[0]] == 1:
|
|
204
|
+
contrast_idx = nonzero[0]
|
|
205
|
+
if contrast_idx < p - 1:
|
|
206
|
+
j = [i for i in range(p) if i != contrast_idx] + [contrast_idx]
|
|
207
|
+
design = design[:, j]
|
|
208
|
+
else:
|
|
209
|
+
QR_c = np.linalg.qr(contrast_vec.reshape(-1, 1))
|
|
210
|
+
design = (QR_c[0].T @ design.T).T
|
|
211
|
+
if QR_c[1][0, 0] < 0:
|
|
212
|
+
design[:, 0] = -design[:, 0]
|
|
213
|
+
design = np.column_stack([design[:, 1:], design[:, 0]])
|
|
214
|
+
|
|
215
|
+
# QR decomposition of design
|
|
216
|
+
Q_full, R_full = np.linalg.qr(design, mode='complete')
|
|
217
|
+
effects = Q_full.T @ y.T # n x G
|
|
218
|
+
|
|
219
|
+
unscaledt = effects[p - 1, :] # contrast row
|
|
220
|
+
# Check sign
|
|
221
|
+
R_reduced = np.linalg.qr(design, mode='reduced')[1]
|
|
222
|
+
if R_reduced[p - 1, p - 1] < 0:
|
|
223
|
+
unscaledt = -unscaledt
|
|
224
|
+
|
|
225
|
+
# Residual effects
|
|
226
|
+
U = effects[p:, :] # (n-p) x G
|
|
227
|
+
sigma2 = np.mean(U ** 2, axis=0)
|
|
228
|
+
|
|
229
|
+
return {
|
|
230
|
+
'unscaledt': unscaledt,
|
|
231
|
+
'U': U,
|
|
232
|
+
'sigma2': sigma2,
|
|
233
|
+
'df_residual': df_residual,
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# -----------------------------------------------------------------------
|
|
238
|
+
# camera.default
|
|
239
|
+
# -----------------------------------------------------------------------
|
|
240
|
+
|
|
241
|
+
def _camera_default(y, index, design, contrast, weights=None,
                    use_ranks=False, allow_neg_cor=False, inter_gene_cor=0.01,
                    trend_var=False, sort=True):
    """Standard camera test. Port of limma's camera.default.

    Parameters
    ----------
    y : array_like, shape (G, n)
        Expression (or z-score) matrix, genes in rows.
    index : dict or list of lists
        Gene sets as 0-based row indices. Dict keys are used as set names;
        list entries are named 'Set1', 'Set2', ...
    design : array_like, shape (n, p), or None
        Design matrix; None means an intercept-only design.
    contrast : int, array_like or None
        Design column index (negative values count from the end) or a
        contrast vector with ``p`` entries. None selects the last column.
    weights : optional
        Accepted for signature compatibility; not used by this function.
    use_ranks : bool
        Use the correlation-adjusted rank-sum test instead of the
        two-sample t-type test.
    allow_neg_cor : bool
        If False, the correlation (rank test) or variance inflation factor
        (t-type test) is floored at 0 / 1 respectively.
    inter_gene_cor : float or None
        Fixed inter-gene correlation. None or NaN estimates the variance
        inflation factor of each set from the residual effects.
    trend_var : bool
        Pass the row means of ``y`` to ``squeeze_var`` as covariate.
    sort : bool
        Sort the result by p-value (only when more than one set is tested).

    Returns
    -------
    DataFrame
        Columns NGenes, Direction, PValue, plus FDR when more than one set
        is tested.

    Raises
    ------
    ValueError
        If ``index`` is neither dict nor list, a column index is out of
        range, or a contrast vector has the wrong length or is all zero.
    """
    from .limma_port import squeeze_var

    y = np.asarray(y, dtype=np.float64)
    G, n = y.shape

    if design is None:
        design = np.ones((n, 1))
    design = np.asarray(design, dtype=np.float64)
    p = design.shape[1]
    df_residual = n - p

    fixed_cor = inter_gene_cor is not None and not (
        isinstance(inter_gene_cor, float) and np.isnan(inter_gene_cor))

    if not fixed_cor:
        df_camera = min(df_residual, G - 2)
    elif use_ranks:
        df_camera = np.inf
    else:
        df_camera = G - 2

    # Handle contrast: rearrange design so the contrast column is last
    if contrast is None:
        contrast = p - 1

    if isinstance(contrast, (int, np.integer)):
        contrast_idx = int(contrast)
        if not -p <= contrast_idx < p:
            raise ValueError(
                f"contrast column index {contrast_idx} is out of range "
                f"for a design with {p} columns")
        # Normalise Python-style negative indices.
        contrast_idx %= p
    else:
        contrast_vec = np.asarray(contrast, dtype=np.float64).ravel()
        if contrast_vec.size != p:
            raise ValueError(
                f"contrast vector has {contrast_vec.size} entries but the "
                f"design has {p} columns")
        nonzero = np.flatnonzero(contrast_vec)
        if nonzero.size == 0:
            raise ValueError("contrast vector is all zero")
        if nonzero.size == 1 and contrast_vec[nonzero[0]] == 1:
            # Unit vector: equivalent to selecting a single column.
            contrast_idx = int(nonzero[0])
        else:
            contrast_idx = None
            # Rotate the coefficient space with the FULL orthogonal factor of
            # the contrast (p x p). The default reduced mode returns a (p, 1)
            # Q and would collapse the design to a single column.
            Q_c, R_c = np.linalg.qr(contrast_vec.reshape(-1, 1), mode='complete')
            design = design @ Q_c
            if R_c[0, 0] < 0:
                # Q[:, 0] points along -contrast; flip to keep the sign.
                design[:, 0] = -design[:, 0]
            design = np.column_stack([design[:, 1:], design[:, 0]])

    if contrast_idx is not None and contrast_idx < p - 1:
        others = [i for i in range(p) if i != contrast_idx]
        design = design[:, others + [contrast_idx]]

    # QR decomposition of design
    Q_full, R_full = np.linalg.qr(design, mode='complete')
    effects = Q_full.T @ y.T  # n x G

    unscaledt = effects[p - 1, :]
    # Orient the contrast effect; the leading p rows of the complete R equal
    # the reduced R, so one factorisation is enough.
    if R_full[p - 1, p - 1] < 0:
        unscaledt = -unscaledt

    # Residual effects
    U = effects[p:, :]  # (n-p) x G
    sigma2 = np.mean(U ** 2, axis=0)

    # Normalize residuals for correlation estimation
    U_norm = (U / np.sqrt(np.maximum(sigma2, 1e-8))).T  # G x (n-p)

    # squeezeVar
    A = np.mean(y, axis=1) if trend_var else None
    sv = squeeze_var(sigma2, np.full(G, float(df_residual)), covariate=A)
    var_post = sv['var_post']
    df_prior_val = sv['df_prior']

    modt = unscaledt / np.sqrt(np.maximum(var_post, 1e-15))

    if use_ranks:
        Stat = modt.copy()
    else:
        # zscoreT: convert moderated t to z-scores using Hill's approximation
        # (matches R's limma: zscoreT(modt, df=df.total, approx=TRUE, method="hill"))
        if np.isscalar(df_prior_val) or (hasattr(df_prior_val, 'size') and df_prior_val.size == 1):
            dp = float(np.ravel(df_prior_val)[0])
        else:
            dp = float(np.median(df_prior_val))
        df_total = min(df_residual + dp, G * df_residual)
        Stat = _zscore_t_hill(modt, df_total)
        # Non-finite z-scores are replaced by 0.
        Stat = np.where(np.isfinite(Stat), Stat, 0.0)

    # Convert index format
    if isinstance(index, dict):
        set_names = list(index.keys())
        set_indices = list(index.values())
    elif isinstance(index, list):
        set_names = [f'Set{i+1}' for i in range(len(index))]
        set_indices = index
    else:
        raise ValueError("index must be a dict or list of lists")

    nsets = len(set_names)

    if not use_ranks:
        meanStat = np.mean(Stat)
        varStat = np.var(Stat, ddof=1)

    results = []
    for genes in set_indices:
        idx = np.asarray(genes, dtype=int)
        StatInSet = Stat[idx]
        m = len(StatInSet)
        m2 = G - m

        if fixed_cor:
            correlation = inter_gene_cor
            vif = 1 + (m - 1) * correlation
        elif m > 1:
            Uset = U_norm[idx, :]
            vif = m * np.mean(np.mean(Uset, axis=0) ** 2)
            correlation = (vif - 1) / (m - 1)
        else:
            vif = 1
            correlation = np.nan

        if use_ranks:
            if not allow_neg_cor:
                correlation = max(0, correlation)
            p_down, p_up = _rank_sum_test_with_correlation(
                idx, Stat, correlation, df_camera)
        else:
            if not allow_neg_cor:
                vif = max(1.0, vif)
            meanStatInSet = np.mean(StatInSet)
            # Difference between the in-set mean and the mean of the rest.
            delta = G / m2 * (meanStatInSet - meanStat)
            varStatPooled = ((G - 1) * varStat - delta ** 2 * m * m2 / G) / (G - 2)
            varStatPooled = max(varStatPooled, 1e-15)
            two_sample_t = delta / np.sqrt(varStatPooled * (vif / m + 1.0 / m2))
            p_down = t_dist.cdf(two_sample_t, df_camera)
            p_up = t_dist.sf(two_sample_t, df_camera)

        results.append({
            'NGenes': m,
            'Direction': 'Up' if p_up < p_down else 'Down',
            'PValue': 2 * min(p_down, p_up)
        })

    df = pd.DataFrame(results, index=set_names)
    if nsets > 1:
        _, fdr, _, _ = multipletests(df['PValue'].values, method='fdr_bh')
        df['FDR'] = fdr

    if sort and nsets > 1:
        df = df.sort_values('PValue')

    return df
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _rank_sum_test_with_correlation(iset, statistics, correlation, df):
|
|
398
|
+
"""Port of limma's rankSumTestWithCorrelation.
|
|
399
|
+
|
|
400
|
+
Wilcoxon rank-sum test adjusted for inter-gene correlation,
|
|
401
|
+
using the arcsin-based variance formula from limma.
|
|
402
|
+
"""
|
|
403
|
+
n = len(statistics)
|
|
404
|
+
n1 = len(iset)
|
|
405
|
+
n2 = n - n1
|
|
406
|
+
|
|
407
|
+
ranks = rankdata(statistics, method='average')
|
|
408
|
+
r1 = ranks[iset]
|
|
409
|
+
|
|
410
|
+
# U statistic (R convention: U = n1*n2 + n1*(n1+1)/2 - sum(r1))
|
|
411
|
+
U = n1 * n2 + n1 * (n1 + 1) / 2.0 - np.sum(r1)
|
|
412
|
+
mu = n1 * n2 / 2.0
|
|
413
|
+
|
|
414
|
+
# Variance formula using arcsin (matches R's limma exactly)
|
|
415
|
+
if correlation == 0 or n1 == 1:
|
|
416
|
+
sigma2 = n1 * n2 * (n + 1) / 12.0
|
|
417
|
+
else:
|
|
418
|
+
sigma2 = (np.arcsin(1.0) * n1 * n2
|
|
419
|
+
+ np.arcsin(0.5) * n1 * n2 * (n2 - 1)
|
|
420
|
+
+ np.arcsin(correlation / 2.0) * n1 * (n1 - 1) * n2 * (n2 - 1)
|
|
421
|
+
+ np.arcsin((correlation + 1.0) / 2.0) * n1 * (n1 - 1) * n2)
|
|
422
|
+
sigma2 = sigma2 / (2.0 * np.pi)
|
|
423
|
+
|
|
424
|
+
# Ties adjustment
|
|
425
|
+
unique_ranks = np.unique(ranks)
|
|
426
|
+
if len(unique_ranks) < len(ranks):
|
|
427
|
+
nties = np.array([np.sum(ranks == r) for r in unique_ranks])
|
|
428
|
+
adjustment = np.sum(nties * (nties + 1) * (nties - 1)) / (n * (n + 1) * (n - 1))
|
|
429
|
+
sigma2 = sigma2 * (1.0 - adjustment)
|
|
430
|
+
|
|
431
|
+
sigma2 = max(sigma2, 1e-15)
|
|
432
|
+
|
|
433
|
+
# Continuity correction (matching R)
|
|
434
|
+
z_lower = (U + 0.5 - mu) / np.sqrt(sigma2)
|
|
435
|
+
z_upper = (U - 0.5 - mu) / np.sqrt(sigma2)
|
|
436
|
+
|
|
437
|
+
if np.isinf(df):
|
|
438
|
+
p_down = norm_dist.sf(z_upper) # less = P(T > z_upper)
|
|
439
|
+
p_up = norm_dist.cdf(z_lower) # greater = P(T < z_lower)
|
|
440
|
+
else:
|
|
441
|
+
p_down = t_dist.sf(z_upper, df)
|
|
442
|
+
p_up = t_dist.cdf(z_lower, df)
|
|
443
|
+
|
|
444
|
+
return p_down, p_up
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
# -----------------------------------------------------------------------
|
|
448
|
+
# Public API
|
|
449
|
+
# -----------------------------------------------------------------------
|
|
450
|
+
|
|
451
|
+
def camera(y, index, design=None, contrast=None, weights=None,
           use_ranks=False, allow_neg_cor=False, inter_gene_cor=0.01,
           sort=True):
    """Competitive gene set test accounting for inter-gene correlation.

    Port of edgeR's camera (camera.DGEList + camera.DGEGLM + camera.default).

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        If DGEGLM (has 'coefficients' and 'dispersion'), counts are converted
        to NB z-scores under the null model before testing.
        If DGEList (has 'counts' but no 'coefficients'), counts are converted
        to NB z-scores via _zscore_dge (matching R's camera.DGEList).
        Otherwise ``y`` is converted to a float array and used directly as
        expression matrix.
    index : dict or list of lists
        Gene set indices. If dict, keys are set names and values are
        lists of gene indices (0-based).
    design : ndarray, optional
        Design matrix. Defaults to ``y['design']`` when ``y`` is a dict.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector. Defaults to the last
        design column.
    weights : ndarray, optional
        Forwarded to ``_camera_default``, which currently does not use it.
    use_ranks : bool
        Use rank-based test.
    allow_neg_cor : bool
        Allow negative inter-gene correlation.
    inter_gene_cor : float
        Fixed inter-gene correlation to use (default 0.01).
    sort : bool
        Sort results by p-value.

    Returns
    -------
    DataFrame with columns NGenes, Direction, PValue, and FDR when more
    than one gene set is tested.

    Raises
    ------
    ValueError
        If no design matrix is supplied or found in ``y``.
    """
    # All three input kinds (DGEGLM, DGEList, plain matrix) end in the same
    # camera.default call; only the expression matrix differs.
    expr, design, contrast = _resolve_input(y, design, contrast)
    return _camera_default(expr, index, design=design, contrast=contrast,
                           weights=weights, use_ranks=use_ranks,
                           allow_neg_cor=allow_neg_cor,
                           inter_gene_cor=inter_gene_cor,
                           trend_var=False, sort=sort)
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def fry(y, index, design=None, contrast=None, sort=True):
    """Fast analytical gene set test (rotation-free).

    Port of edgeR's fry.DGEList → limma's fry.default.

    DGEList/DGEGLM input is first converted to NB z-scores; the effects
    (contrast effect plus residual effects) are then used directly, without
    variance squeezing (standardize="none").

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        Expression data.
    index : dict or list of lists
        Gene set indices (0-based). Dict keys name the sets; list entries
        are named 'Set1', 'Set2', ...
    design : ndarray, optional
        Design matrix.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector.
    sort : bool
        Sort results by directional p-value (only with more than one set).

    Returns
    -------
    DataFrame with columns NGenes, Direction, PValue, FDR, PValue.Mixed, FDR.Mixed.

    Raises
    ------
    ValueError
        If ``index`` is neither a dict nor a list.
    """
    expr, design, contrast = _resolve_input(y, design, contrast)
    fit = _extract_effects(expr, design, contrast)

    contrast_effect = fit['unscaledt']   # (G,)
    resid_effects = fit['U']             # (df_residual, G)
    resid_df = fit['df_residual']

    if isinstance(index, dict):
        labels = list(index.keys())
        members = list(index.values())
    elif isinstance(index, list):
        labels = [f'Set{i+1}' for i in range(len(index))]
        members = index
    else:
        raise ValueError("index must be a dict or list of lists")

    def mixed_tail(block):
        """Mixed-test p-value of one set (SVD-based, as in R's .fryEffects).

        Returns 1.0 whenever the beta approximation is degenerate.
        """
        eig = np.linalg.svd(block, compute_uv=False) ** 2  # squared singular values
        d1 = len(eig)
        d = d1 - 1
        if not (d > 0 and eig[0] > eig[-1] + 1e-15):
            return 1.0
        spread = eig[0] - eig[-1]

        beta_mean = 1.0 / d1
        beta_var = d / (d1 * d1 * (d1 / 2.0 + 1.0))

        f_obs = (np.sum(block[:, 0] ** 2) - eig[-1]) / spread
        f_mean = (np.sum(eig) * beta_mean - eig[-1]) / spread

        cov = np.full((d1, d1), -beta_var / d)
        np.fill_diagonal(cov, beta_var)
        f_var = float(eig @ cov @ eig) / spread ** 2

        if not (f_var > 1e-30 and f_mean > 0 and f_mean < 1):
            return 1.0

        # Moment-match a beta distribution to the reference statistic.
        shape_sum = f_mean * (1.0 - f_mean) / f_var - 1.0
        shape_a = shape_sum * f_mean
        shape_b = shape_sum - shape_a
        if shape_a > 0 and shape_b > 0:
            return beta_dist.sf(f_obs, shape_a, shape_b)
        return 1.0

    n_sets = len(labels)
    t_stats = np.zeros(n_sets)
    mixed_p = np.zeros(n_sets)
    set_sizes = np.zeros(n_sets, dtype=int)

    for k in range(n_sets):
        genes = np.asarray(members[k], dtype=int)
        size = len(genes)
        set_sizes[k] = size

        # Effects of the set's genes: column 0 = contrast, columns 1: = residuals
        block = np.column_stack([
            contrast_effect[genes].reshape(-1, 1),
            resid_effects[:, genes].T,
        ])  # size x (resid_df + 1)

        # Directional test: mean contrast effect relative to the root mean
        # square of the averaged residual effects.
        avg = np.mean(block, axis=0)
        resid_ms = np.mean(avg[1:] ** 2)
        t_stats[k] = avg[0] / np.sqrt(resid_ms) if resid_ms > 1e-30 else 0.0

        # Sets with fewer than two genes get a placeholder here; single-gene
        # sets are filled in from the directional p-value below.
        mixed_p[k] = mixed_tail(block) if size > 1 else 0.0

    # Directional p-values (matching R: 2 * pt(-abs(t.stat), df=df.residual))
    p_directional = 2.0 * t_dist.sf(np.abs(t_stats), resid_df)
    direction = np.where(t_stats >= 0, 'Up', 'Down')

    # For single-gene sets, mixed p-value = directional p-value (matching R)
    singletons = set_sizes == 1
    mixed_p[singletons] = p_directional[singletons]

    rows = [
        {
            'NGenes': set_sizes[k],
            'Direction': direction[k],
            'PValue': p_directional[k],
            'PValue.Mixed': mixed_p[k],
        }
        for k in range(n_sets)
    ]
    table = pd.DataFrame(rows, index=labels)

    # FDR correction; with a single set the adjusted value equals the p-value
    if n_sets > 1:
        table['FDR'] = multipletests(table['PValue'].values, method='fdr_bh')[1]
        table['FDR.Mixed'] = multipletests(table['PValue.Mixed'].values, method='fdr_bh')[1]
    else:
        table['FDR'] = table['PValue'].values
        table['FDR.Mixed'] = table['PValue.Mixed'].values

    table = table[['NGenes', 'Direction', 'PValue', 'FDR', 'PValue.Mixed', 'FDR.Mixed']]

    if sort and n_sets > 1:
        table = table.sort_values('PValue')

    return table
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def roast(y, index, design=None, contrast=None, nrot=999,
          set_statistic='mean', sort=True, seed=None):
    """Rotation gene set test for a single gene set.

    Port of edgeR's roast.DGEList → limma's roast.default.

    For DGEList/DGEGLM input, counts are first converted to NB z-scores,
    then roast is applied with var.prior=1, df.prior=Inf (since z-scores
    are already standardized), so the contrast effects themselves serve as
    gene-wise statistics. The null distribution is obtained by projecting
    the residual effects of the set's genes onto random unit vectors.

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        Expression data.
    index : dict, list of lists, or list of ints
        Gene set indices (0-based). If dict or list of lists, tests first set.
        If list of ints, treats as single gene set.
    design : ndarray, optional
        Design matrix.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector.
    nrot : int
        Number of rotations (default 999).
    set_statistic : str
        Only 'mean' is implemented. Any other value triggers a warning and
        falls back to 'mean'.
    sort : bool
        Accepted for signature compatibility; the result always lists the
        rows Down, Up, UpOrDown, Mixed in that order.
    seed : optional
        Seed passed to ``numpy.random.default_rng`` for reproducible
        rotations. None (default) draws fresh entropy on every call.

    Returns
    -------
    DataFrame with rows Down/Up/UpOrDown/Mixed and columns Active.Prop,
    P.Value. The number of genes in the set is stored in
    ``result.attrs['ngenes']``.

    Raises
    ------
    ValueError
        If the gene set is empty or the model has no residual degrees of
        freedom (no rotation is possible in either case).
    """
    expr, design, contrast = _resolve_input(y, design, contrast)

    if set_statistic != 'mean':
        warnings.warn(
            f"set_statistic={set_statistic!r} is not implemented; "
            "using 'mean' instead",
            UserWarning, stacklevel=2)

    # Handle index format - roast tests a single gene set
    if isinstance(index, dict):
        if not index:
            raise ValueError("index contains no gene sets")
        idx = next(iter(index.values()))
    elif (isinstance(index, (list, tuple)) and len(index) > 0
            and isinstance(index[0], (list, tuple, np.ndarray))):
        idx = index[0]
    else:
        idx = index
    idx = np.atleast_1d(np.asarray(idx, dtype=int))

    m = len(idx)
    if m == 0:
        # With no genes every statistic is NaN and the rotation p-values
        # would collapse to 1/(nrot+1), i.e. look maximally significant.
        raise ValueError("gene set is empty")

    eff = _extract_effects(expr, design, contrast)
    df_residual = eff['df_residual']
    if df_residual < 1:
        raise ValueError("no residual degrees of freedom; rotation test is not possible")

    # For DGEList z-scores: var.prior=1, df.prior=Inf => var_post=1,
    # so the moderated t equals the unscaled contrast effect.
    t_set = eff['unscaledt'][idx]
    U_set = eff['U'][:, idx]  # (df_residual, m) residual effects of the set

    # Active proportions: genes of the set that are individually significant
    significant = 2 * t_dist.sf(np.abs(t_set), df_residual) < 0.05
    active_down = np.sum(significant & (t_set < 0)) / m
    active_up = np.sum(significant & (t_set > 0)) / m

    # Observed set statistics
    obs_mean = np.mean(t_set)
    obs_abs_mean = abs(obs_mean)  # max of the "up" and "down" statistics
    obs_mixed = np.mean(np.abs(t_set))

    # Rotations, processed in batches to bound the size of the
    # (rotations x genes) work matrix.
    rng = np.random.default_rng(seed)
    count_up = count_down = count_upordown = count_mixed = 0
    batch = max(1, min(nrot, 2_000_000 // m))
    for start in range(0, nrot, batch):
        size = min(batch, nrot - start)
        # Random unit vectors in the residual space
        rotations = rng.standard_normal((size, df_residual))
        rotations /= np.linalg.norm(rotations, axis=1, keepdims=True)

        rot_t = rotations @ U_set  # (size, m) rotated statistics
        rot_mean = rot_t.mean(axis=1)
        rot_mixed = np.abs(rot_t).mean(axis=1)

        count_up += int(np.count_nonzero(rot_mean >= obs_mean))
        count_down += int(np.count_nonzero(-rot_mean >= -obs_mean))
        count_upordown += int(np.count_nonzero(np.abs(rot_mean) >= obs_abs_mean))
        count_mixed += int(np.count_nonzero(rot_mixed >= obs_mixed))

    # P-values: the observed statistic counts as one of the nrot + 1 draws
    p_up = (count_up + 1) / (nrot + 1)
    p_down = (count_down + 1) / (nrot + 1)
    p_upordown = (count_upordown + 1) / (nrot + 1)
    p_mixed = (count_mixed + 1) / (nrot + 1)

    result = pd.DataFrame({
        'Active.Prop': [active_down, active_up, max(active_down, active_up), np.nan],
        'P.Value': [p_down, p_up, p_upordown, p_mixed],
    }, index=['Down', 'Up', 'UpOrDown', 'Mixed'])

    # Add ngenes as metadata
    result.attrs['ngenes'] = m

    return result
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
# R-style (p.adjust) method names, lower-cased, mapped to statsmodels names.
# Names not listed here (e.g. 'hommel', 'fdr_bh') are handed to statsmodels
# unchanged.
_MROAST_ADJUST_METHODS = {
    'bh': 'fdr_bh',
    'fdr': 'fdr_bh',
    'by': 'fdr_by',
    'bonferroni': 'bonferroni',
    'holm': 'holm',
    'hochberg': 'simes-hochberg',
}


def _mroast_adjust_pvalues(pvals, adjust_method):
    """Adjust p-values for multiple testing, leaving NaN entries as NaN."""
    pvals = np.asarray(pvals, dtype=float)
    adjusted = np.full(pvals.shape, np.nan)
    finite = ~np.isnan(pvals)
    n_finite = int(finite.sum())
    key = 'bh' if adjust_method is None else str(adjust_method).lower()

    # With zero or one testable set, or with adjustment switched off, the
    # adjusted values are the raw p-values.
    if n_finite <= 1 or key == 'none':
        adjusted[finite] = pvals[finite]
        return adjusted

    sm_method = _MROAST_ADJUST_METHODS.get(key, key)
    try:
        adjusted[finite] = multipletests(pvals[finite], method=sm_method)[1]
    except ValueError:
        # statsmodels rejects unknown method names; keep the historical BH
        # fallback but make it visible instead of silent.
        warnings.warn(
            f"mroast: adjust_method={adjust_method!r} not recognised; "
            "falling back to 'BH'.")
        adjusted[finite] = multipletests(pvals[finite], method='fdr_bh')[1]
    return adjusted


def mroast(y, index, design=None, contrast=None, nrot=999,
           set_statistic='mean', adjust_method='BH', midp=True, sort=True):
    """Rotation gene set test for multiple gene sets.

    Port of edgeR's mroast.DGEList → limma's mroast.default.

    All gene sets are scored against the same sequence of random rotations.

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        Expression data.
    index : dict or list of lists
        Gene set indices (0-based). Dict keys become the row names of the
        result; list entries are named 'Set1', 'Set2', ...
    design : ndarray, optional
        Design matrix.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector.
    nrot : int
        Number of rotations (default 999).
    set_statistic : str
        Only 'mean' is implemented here; any other value triggers a warning
        and the mean statistic is used.
    adjust_method : str
        P-value adjustment method (default 'BH'). R-style names ('BH', 'BY',
        'fdr', 'bonferroni', 'holm', 'hochberg', 'none') and statsmodels
        method names are accepted. Unrecognised names fall back to 'BH' with
        a warning.
    midp : bool
        If True (default) p-values are (count + 0.5) / (nrot + 1); otherwise
        (count + 1) / (nrot + 1).
    sort : bool
        Sort results by p-value.

    Returns
    -------
    DataFrame with columns NGenes, PropDown, PropUp, Direction, PValue, FDR,
    PValue.Mixed, FDR.Mixed. PValue is 2 * min(p_up, p_down) capped at 1.
    Empty gene sets get NaN proportions and p-values and are left out of the
    multiple-testing adjustment.

    Raises
    ------
    ValueError
        If `index` is neither a dict nor a list.
    """
    expr, design, contrast = _resolve_input(y, design, contrast)

    if set_statistic != 'mean':
        # The argument was previously ignored without notice.
        warnings.warn(
            f"mroast: set_statistic={set_statistic!r} is not implemented; "
            "using 'mean'.")

    # Convert index format
    if isinstance(index, dict):
        set_names = list(index.keys())
        raw_sets = list(index.values())
    elif isinstance(index, list):
        set_names = [f'Set{i+1}' for i in range(len(index))]
        raw_sets = index
    else:
        raise ValueError("index must be a dict or list of lists")
    # ravel() so that a bare scalar index still counts as a one-gene set.
    set_indices = [np.asarray(v, dtype=int).ravel() for v in raw_sets]

    nsets = len(set_names)

    eff = _extract_effects(expr, design, contrast)
    unscaledt = eff['unscaledt']
    U = eff['U']
    df_residual = eff['df_residual']

    # For DGEList z-scores: var.prior=1, df.prior=Inf => var_post=1
    modt = unscaledt.copy()

    # Per-gene significance calls used for the PropDown / PropUp columns.
    p_thresh = 0.05
    gene_pvals = 2 * t_dist.sf(np.abs(modt), df_residual)
    sig_down = (gene_pvals < p_thresh) & (modt < 0)
    sig_up = (gene_pvals < p_thresh) & (modt > 0)

    set_sizes = np.array([idx.size for idx in set_indices], dtype=int)
    empty = set_sizes == 0
    active_sets = [s for s in range(nsets) if set_sizes[s] > 0]

    # Observed statistics; entries for empty sets stay NaN.
    obs_up = np.full(nsets, np.nan)
    obs_mixed = np.full(nsets, np.nan)
    prop_down = np.full(nsets, np.nan)
    prop_up = np.full(nsets, np.nan)

    for s in active_sets:
        idx = set_indices[s]
        m = set_sizes[s]
        t_set = modt[idx]
        obs_up[s] = np.mean(t_set)
        obs_mixed[s] = np.mean(np.abs(t_set))
        prop_down[s] = np.count_nonzero(sig_down[idx]) / m
        prop_up[s] = np.count_nonzero(sig_up[idx]) / m
    obs_down = -obs_up

    # Shared rotation loop: one random direction per rotation, applied to
    # every non-empty set.
    count_up = np.zeros(nsets)
    count_down = np.zeros(nsets)
    count_mixed = np.zeros(nsets)

    rng = np.random.default_rng()
    for _ in range(nrot):
        rand_vec = rng.standard_normal(df_residual)
        rand_vec = rand_vec / np.linalg.norm(rand_vec)
        rot_t = rand_vec @ U  # (G,)

        for s in active_sets:
            rot_t_set = rot_t[set_indices[s]]
            rot_mean = np.mean(rot_t_set)

            if rot_mean >= obs_up[s]:
                count_up[s] += 1
            if -rot_mean >= obs_down[s]:
                count_down[s] += 1
            if np.mean(np.abs(rot_t_set)) >= obs_mixed[s]:
                count_mixed[s] += 1

    # P-values
    offset = 0.5 if midp else 1
    p_up_vals = (count_up + offset) / (nrot + 1)
    p_down_vals = (count_down + offset) / (nrot + 1)
    p_mixed_vals = (count_mixed + offset) / (nrot + 1)

    # An empty set has no statistic to compare, so its zero count must not
    # be turned into the smallest possible p-value.
    p_up_vals[empty] = np.nan
    p_down_vals[empty] = np.nan
    p_mixed_vals[empty] = np.nan

    # Two-sided directional p-value and direction
    p_dir = np.minimum(2 * np.minimum(p_up_vals, p_down_vals), 1.0)
    directions = np.where(p_up_vals < p_down_vals, 'Up', 'Down')
    if empty.any():
        directions = directions.astype(object)
        directions[empty] = np.nan

    # Multiple-testing correction over the non-empty sets
    fdr_dir = _mroast_adjust_pvalues(p_dir, adjust_method)
    fdr_mixed = _mroast_adjust_pvalues(p_mixed_vals, adjust_method)

    result_df = pd.DataFrame({
        'NGenes': set_sizes,
        'PropDown': prop_down,
        'PropUp': prop_up,
        'Direction': directions,
        'PValue': p_dir,
        'FDR': fdr_dir,
        'PValue.Mixed': p_mixed_vals,
        'FDR.Mixed': fdr_mixed,
    }, index=set_names)

    if sort and nsets > 1:
        # NaN p-values (empty sets) sort to the bottom.
        result_df = result_df.sort_values('PValue')

    return result_df
|
|
945
|
+
|
|
946
|
+
|
|
947
|
+
def romer(y, index, design=None, contrast=None, nrot=9999):
    """Rank-based rotation gene set enrichment test.

    Port of edgeR's romer.DGEList → limma's romer.default.

    For DGEList/DGEGLM input, counts are first converted to NB z-scores,
    then romer is applied. Unlike roast/mroast/fry, romer lets squeezeVar
    estimate its own variance prior from the z-score data.

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        Expression data.
    index : dict or list of lists
        Gene set indices (0-based). Dict keys become the row names of the
        result; list entries are named 'Set1', 'Set2', ...
    design : ndarray, optional
        Design matrix.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector.
    nrot : int
        Number of rotations (default 9999).

    Returns
    -------
    DataFrame with columns NGenes, Up, Down, Mixed (p-values), one row per
    gene set in input order. Each p-value is (count + 1) / (nrot + 1), where
    count is the number of rotations whose mean set rank is at least the
    observed one. Empty gene sets get NaN p-values.

    Raises
    ------
    ValueError
        If `index` is neither a dict nor a list.
    """
    from .limma_port import squeeze_var

    expr, design, contrast = _resolve_input(y, design, contrast)

    # Convert index format
    if isinstance(index, dict):
        set_names = list(index.keys())
        raw_sets = list(index.values())
    elif isinstance(index, list):
        set_names = [f'Set{i+1}' for i in range(len(index))]
        raw_sets = index
    else:
        raise ValueError("index must be a dict or list of lists")
    # ravel() so that a bare scalar index still counts as a one-gene set.
    set_indices = [np.asarray(v, dtype=int).ravel() for v in raw_sets]

    nsets = len(set_names)

    eff = _extract_effects(expr, design, contrast)
    unscaledt = eff['unscaledt']
    U = eff['U']
    sigma2 = eff['sigma2']
    df_residual = eff['df_residual']
    G = len(unscaledt)

    # squeezeVar to estimate prior (romer does its own variance moderation)
    sv = squeeze_var(sigma2, np.full(G, float(df_residual)))
    var_post = sv['var_post']

    # Moderated t-statistics
    sd_post = np.sqrt(np.maximum(var_post, 1e-15))
    modt = unscaledt / sd_post

    # Prior standard deviation, collapsed to one scalar (median if the
    # prior is reported per gene).
    s0 = np.sqrt(np.maximum(sv.get('var_prior', 1.0), 1e-15))
    s0 = float(s0) if np.isscalar(s0) else float(np.median(s0))

    # Shrink residuals: U_shrunk = U * s0 / sd_unshrunk
    sd_unshrunk = np.sqrt(np.maximum(sigma2, 1e-15))
    shrink_factor = s0 / np.maximum(sd_unshrunk, 1e-15)
    U_shrunk = U * shrink_factor[np.newaxis, :]

    # Ranks for the observed data
    #   Up:    high t   -> high rank
    #   Down:  low t    -> high rank
    #   Mixed: high |t| -> high rank
    # With average ranks, rankdata(-x) == n + 1 - rankdata(x), so the
    # "down" ranks are derived rather than computed with another sort.
    up_ranks = rankdata(modt)
    down_ranks = (up_ranks.size + 1) - up_ranks
    mixed_ranks = rankdata(np.abs(modt))

    set_sizes = np.array([idx.size for idx in set_indices], dtype=int)
    empty = set_sizes == 0
    active_sets = [s for s in range(nsets) if set_sizes[s] > 0]

    # Observed mean ranks per set; entries for empty sets stay NaN.
    obs_up = np.full(nsets, np.nan)
    obs_down = np.full(nsets, np.nan)
    obs_mixed = np.full(nsets, np.nan)

    for s in active_sets:
        idx = set_indices[s]
        obs_up[s] = np.mean(up_ranks[idx])
        obs_down[s] = np.mean(down_ranks[idx])
        obs_mixed[s] = np.mean(mixed_ranks[idx])

    # Rotation loop
    count_up = np.zeros(nsets)
    count_down = np.zeros(nsets)
    count_mixed = np.zeros(nsets)

    rng = np.random.default_rng()
    for _ in range(nrot):
        # Random unit direction in residual space
        rand_vec = rng.standard_normal(df_residual)
        rand_vec = rand_vec / np.linalg.norm(rand_vec)

        # Rotated statistics
        rot_resid = rand_vec @ U_shrunk  # (G,)
        rot_t = rot_resid / sd_post  # Approximate rotated moderated t

        # Ranks of the rotated statistics (same identity as above)
        rot_up_ranks = rankdata(rot_t)
        rot_down_ranks = (rot_up_ranks.size + 1) - rot_up_ranks
        rot_mixed_ranks = rankdata(np.abs(rot_t))

        for s in active_sets:
            idx = set_indices[s]
            if np.mean(rot_up_ranks[idx]) >= obs_up[s]:
                count_up[s] += 1
            if np.mean(rot_down_ranks[idx]) >= obs_down[s]:
                count_down[s] += 1
            if np.mean(rot_mixed_ranks[idx]) >= obs_mixed[s]:
                count_mixed[s] += 1

    # P-values
    p_up = (count_up + 1) / (nrot + 1)
    p_down = (count_down + 1) / (nrot + 1)
    p_mixed = (count_mixed + 1) / (nrot + 1)

    # An empty set has no rank statistic; its zero count must not be turned
    # into the smallest possible p-value.
    p_up[empty] = np.nan
    p_down[empty] = np.nan
    p_mixed[empty] = np.nan

    result_df = pd.DataFrame({
        'NGenes': set_sizes,
        'Up': p_up,
        'Down': p_down,
        'Mixed': p_mixed,
    }, index=set_names)

    return result_df
|
|
1085
|
+
|
|
1086
|
+
|
|
1087
|
+
def goana(de, species='Hs', **kwargs):
    """Run a gene ontology enrichment query through g:Profiler.

    Thin wrapper around ``GProfiler.profile`` from the optional
    gprofiler-official package (pip install gprofiler-official).

    Parameters
    ----------
    de : dict (DGELRT/DGEExact) or list
        A dict with a 'table' DataFrame: rows with PValue < 0.05 are kept
        (all rows if there is no 'PValue' column) and the row index supplies
        the gene identifiers. A list or ndarray is used as the gene
        identifiers directly.
    species : str
        Species code ('Hs', 'Mm', 'Rn', 'Dm', 'Sc', 'Ce', 'Dr'). Any other
        value is passed to g:Profiler unchanged as the organism name.
    **kwargs
        Additional arguments passed to GProfiler.profile(). 'sources'
        defaults to ['GO:BP', 'GO:MF', 'GO:CC'].

    Returns
    -------
    DataFrame with GO enrichment results. An empty DataFrame (with a
    warning) is returned when gprofiler-official is not installed, when the
    input cannot be interpreted, or when no genes are left to test.
    """
    try:
        from gprofiler import GProfiler
    except ImportError:
        install_hint = (
            "goana() requires gprofiler-official. Install with:\n"
            " pip install gprofiler-official\n"
            "Then:\n"
            " from gprofiler import GProfiler\n"
            " gp = GProfiler(return_dataframe=True)\n"
            " result = gp.profile(organism='hsapiens', query=gene_list)")
        warnings.warn(install_hint)
        return pd.DataFrame()

    # Translate short species codes into g:Profiler organism names.
    code_to_organism = {
        'Hs': 'hsapiens', 'Mm': 'mmusculus', 'Rn': 'rnorvegicus',
        'Dm': 'dmelanogaster', 'Sc': 'scerevisiae', 'Ce': 'celegans',
        'Dr': 'drerio',
    }
    organism = code_to_organism.get(species, species)

    # Work out which gene identifiers to query.
    if isinstance(de, (list, np.ndarray)):
        genes = list(de)
    elif isinstance(de, dict) and 'table' in de:
        tab = de['table']
        if not isinstance(tab, pd.DataFrame):
            genes = []
        elif 'PValue' in tab.columns:
            genes = list(tab[tab['PValue'] < 0.05].index)
        else:
            genes = list(tab.index)
    else:
        warnings.warn("goana: cannot extract gene list from input. "
                      "Provide a DGELRT/DGEExact dict or a list of gene IDs.")
        return pd.DataFrame()

    if not genes:
        warnings.warn("goana: no genes to test")
        return pd.DataFrame()

    # A caller-supplied 'sources' overrides the default GO namespaces.
    go_sources = kwargs.pop('sources', ['GO:BP', 'GO:MF', 'GO:CC'])
    profiler = GProfiler(return_dataframe=True)
    return profiler.profile(organism=organism, query=genes,
                            sources=go_sources, **kwargs)
|
|
1151
|
+
|
|
1152
|
+
|
|
1153
|
+
def kegga(de, species='Hs', **kwargs):
    """KEGG pathway enrichment analysis using g:Profiler.

    Wraps the gprofiler-official Python package for KEGG enrichment.
    Requires: pip install gprofiler-official

    Parameters
    ----------
    de : dict (DGELRT/DGEExact) or list
        If DGELRT/DGEExact dict (has 'table'), rows with PValue < 0.05 are
        kept (all rows if there is no 'PValue' column) and the row index
        supplies the gene identifiers.
        If list or ndarray, used directly as gene identifiers.
    species : str
        Species code. 'Hs' for human, 'Mm' for mouse, etc. Codes that are
        not in the built-in map are passed to g:Profiler unchanged.
    **kwargs
        Additional arguments passed to GProfiler.profile(). 'sources'
        defaults to ['KEGG'] and may be overridden, as in goana().

    Returns
    -------
    DataFrame with KEGG enrichment results. An empty DataFrame (with a
    warning) is returned when gprofiler-official is not installed, when the
    input cannot be interpreted, or when no genes are left to test.
    """
    try:
        from gprofiler import GProfiler
    except ImportError:
        warnings.warn(
            "kegga() requires gprofiler-official. Install with:\n"
            " pip install gprofiler-official\n"
            "Then:\n"
            " from gprofiler import GProfiler\n"
            " gp = GProfiler(return_dataframe=True)\n"
            " result = gp.profile(organism='hsapiens', query=gene_list, "
            "sources=['KEGG'])")
        return pd.DataFrame()

    # Map species codes
    species_map = {
        'Hs': 'hsapiens', 'Mm': 'mmusculus', 'Rn': 'rnorvegicus',
        'Dm': 'dmelanogaster', 'Sc': 'scerevisiae', 'Ce': 'celegans',
        'Dr': 'drerio',
    }
    organism = species_map.get(species, species)

    # Extract gene list
    if isinstance(de, dict) and 'table' in de:
        table = de['table']
        if isinstance(table, pd.DataFrame):
            sig = table[table['PValue'] < 0.05] if 'PValue' in table.columns else table
            gene_list = list(sig.index)
        else:
            gene_list = []
    elif isinstance(de, (list, np.ndarray)):
        gene_list = list(de)
    else:
        warnings.warn("kegga: cannot extract gene list from input. "
                      "Provide a DGELRT/DGEExact dict or a list of gene IDs.")
        return pd.DataFrame()

    if len(gene_list) == 0:
        warnings.warn("kegga: no genes to test")
        return pd.DataFrame()

    gp = GProfiler(return_dataframe=True)
    # Pop 'sources' out of kwargs so that a caller-supplied value overrides
    # the default instead of colliding with it as a duplicate keyword
    # argument (which raised TypeError).
    sources = kwargs.pop('sources', ['KEGG'])
    result = gp.profile(organism=organism, query=gene_list,
                        sources=sources, **kwargs)
    return result
|