edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1215 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Gene set testing for edgePython.
4
+
5
+ Port of edgeR's camera, fry, roast, mroast, romer, goana, kegga.
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import warnings
11
+ from scipy.stats import t as t_dist, norm as norm_dist, beta as beta_dist, rankdata
12
+ from statsmodels.stats.multitest import multipletests
13
+
14
+
15
def _zscore_t_hill(x, df):
    """Map t-statistics onto the standard normal scale with Hill's formula.

    Port of limma's .zscoreTHill, the transformation R's camera applies
    when approx=TRUE, method="hill".

    Parameters
    ----------
    x : array_like
        t-statistics.
    df : float or array_like
        Degrees of freedom; values above 1e100 are capped at 1e100.

    Returns
    -------
    ndarray
        z-scores carrying the sign of ``x``.
    """
    t_vals = np.asarray(x, dtype=np.float64)
    dof = np.minimum(df, 1e100)
    a = dof - 0.5
    b = 48.0 * a * a
    w = a * np.log1p(t_vals / dof * t_vals)

    # Horner-style evaluation of the correction polynomial, innermost first.
    poly = -0.4 * w - 3.3
    poly = poly * w - 24.0
    poly = poly * w - 85.5
    ratio = poly / (0.8 * w * w + 100.0 + b)
    magnitude = ((ratio + w + 3.0) / b + 1.0) * np.sqrt(w)

    # The formula works on |t|; restore the direction at the end.
    return magnitude * np.sign(t_vals)
28
+
29
+
30
+ # -----------------------------------------------------------------------
31
+ # Private helpers
32
+ # -----------------------------------------------------------------------
33
+
34
def _zscore_glm(y, design, contrast):
    """Convert DGEGLM counts to NB z-scores under the null model.

    Port of edgeR's .zscoreGLM. A null GLM (the design without the tested
    contrast) is fitted with ``glm_fit`` and every count is converted to a
    z-score with ``zscore_nbinom`` using the null fitted values.

    Parameters
    ----------
    y : dict
        DGEGLM-like dict. Reads 'counts' and, when present, 'dispersion'
        (default 0.05), 'offset', 'weights', 'average.ql.dispersion' and
        's2.prior'.
    design : array_like
        Design matrix (samples x coefficients).
    contrast : int or array_like
        0-based column index to remove, or a contrast vector.

    Returns
    -------
    ndarray
        z-scores with the same shape as ``y['counts']``.
    """
    from .glm_fit import glm_fit
    from .limma_port import contrast_as_coef
    from .utils import zscore_nbinom

    counts = y['counts'].copy().astype(np.float64)

    # QL scaling: shrink counts by max(1, s2.prior) when a QL fit is present.
    if y.get('average.ql.dispersion') is not None:
        s2_prior = np.atleast_1d(np.asarray(y.get('s2.prior', 1.0), dtype=np.float64))
        if s2_prior.size == 1:
            s2_prior = np.full(counts.shape[0], float(s2_prior.ravel()[0]))
        counts = counts / np.maximum(1.0, s2_prior)[:, np.newaxis]

    design = np.asarray(design, dtype=np.float64)
    p = design.shape[1]

    # Build the null design by removing the tested coefficient.
    if isinstance(contrast, (int, np.integer)):
        contrast_idx = int(contrast)
        design0 = design[:, [i for i in range(p) if i != contrast_idx]]
    else:
        # A contrast vector is not a design column: reparametrize so the
        # contrast becomes the last coefficient (same as _zscore_dge) and
        # only then drop that column. Dropping the last column of the
        # original design would test the wrong hypothesis.
        cac = contrast_as_coef(design, contrast, first=False)
        design0 = cac['design'][:, :-1]

    dispersion = y.get('dispersion', 0.05)
    offset = y.get('offset')
    w = y.get('weights')

    # Fit null model
    fit_null = glm_fit(counts, design=design0, dispersion=dispersion,
                       offset=offset, weights=w, prior_count=0)

    # Floor the fitted means at 1e-17 before converting to z-scores.
    mu = np.maximum(fit_null['fitted.values'], 1e-17)

    # NB size parameter = 1/dispersion; broadcast a scalar to one per gene.
    disp = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if disp.size == 1:
        disp = np.full(counts.shape[0], float(disp.ravel()[0]))

    # Compute z-scores column by column (one sample at a time).
    z = np.zeros_like(counts)
    for j in range(counts.shape[1]):
        z[:, j] = zscore_nbinom(counts[:, j], size=1.0 / disp, mu=mu[:, j])

    return z
85
+
86
+
87
def _zscore_dge(y, design, contrast):
    """Turn DGEList counts into NB z-scores computed under the null model.

    Port of edgeR's .zscoreDGE. The design is reduced to a null design
    (contrast removed), a GLM is fitted to it, and the raw counts are mapped
    to standard normal z-scores with ``zscore_nbinom`` (the mid-p negative
    binomial quantile residual method).

    Parameters
    ----------
    y : dict
        DGEList-like dict; 'counts' is read directly, dispersion and offset
        are obtained through ``get_dispersion`` / ``get_offset``.
    design : array_like
        Design matrix with at least two columns.
    contrast : int or array_like
        0-based column index, or a contrast vector.

    Returns
    -------
    ndarray
        z-score matrix shaped like the counts.

    Raises
    ------
    ValueError
        If the design has fewer than two columns or no dispersion is found.
    """
    from .glm_fit import glm_fit
    from .dgelist import get_dispersion, get_offset
    from .utils import zscore_nbinom
    from .limma_port import contrast_as_coef

    raw_counts = y['counts'].copy().astype(np.float64)
    full_design = np.asarray(design, dtype=np.float64)
    ncoef = full_design.shape[1]

    if ncoef < 2:
        raise ValueError("design matrix must have at least two columns")

    dispersion = get_dispersion(y)
    if dispersion is None:
        raise ValueError("Dispersion estimate not found. "
                         "Please estimate dispersions before gene set testing.")

    if isinstance(contrast, (int, np.integer)):
        # Scalar contrast: simply leave that column out.
        dropped = int(contrast)
        keep = [k for k in range(ncoef) if k != dropped]
        null_design = full_design[:, keep]
    else:
        # Vector contrast: reparametrize so it is the last coefficient,
        # then leave the last column out.
        reparam = contrast_as_coef(full_design, contrast, first=False)
        null_design = reparam['design'][:, :-1]

    lib_offset = get_offset(y)

    null_fit = glm_fit(raw_counts, design=null_design, dispersion=dispersion,
                       offset=lib_offset, prior_count=0)
    fitted = np.maximum(null_fit['fitted.values'], 1e-17)

    # One dispersion per gene (a scalar is broadcast); NB size = 1/dispersion.
    genewise_disp = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if genewise_disp.size == 1:
        genewise_disp = np.full(raw_counts.shape[0],
                                float(genewise_disp.ravel()[0]))

    _, nsamples = raw_counts.shape
    zscores = np.zeros_like(raw_counts)
    for col in range(nsamples):
        zscores[:, col] = zscore_nbinom(raw_counts[:, col],
                                        size=1.0 / genewise_disp,
                                        mu=fitted[:, col])

    return zscores
145
+
146
+
147
def _resolve_input(y, design, contrast):
    """Normalise the (y, design, contrast) triple for the gene set tests.

    Used by fry, roast, mroast, romer to dispatch DGEList/DGEGLM/matrix.

    A dict with both 'coefficients' and 'dispersion' is treated as a DGEGLM
    and goes through ``_zscore_glm``; a dict with 'counts' but no
    'coefficients' is treated as a DGEList and goes through ``_zscore_dge``;
    anything else is converted to a float64 array as-is.

    Returns
    -------
    tuple
        ``(expr, design, contrast)`` where ``design`` is a float64 ndarray
        and ``contrast`` defaults to the last design column.

    Raises
    ------
    ValueError
        If no design is given and none is stored in ``y``.
    """
    from_dict = isinstance(y, dict)
    has_coef = from_dict and 'coefficients' in y
    looks_like_glm = has_coef and 'dispersion' in y
    looks_like_dgelist = from_dict and 'counts' in y and not has_coef

    # Fall back to the design stored on the object, if any.
    if design is None and from_dict:
        design = y.get('design')
    if design is None:
        raise ValueError("design matrix must be provided")
    design = np.asarray(design, dtype=np.float64)

    # Default contrast: last coefficient of the design.
    if contrast is None:
        contrast = design.shape[1] - 1

    if looks_like_glm:
        return _zscore_glm(y, design=design, contrast=contrast), design, contrast
    if looks_like_dgelist:
        return _zscore_dge(y, design=design, contrast=contrast), design, contrast
    return np.asarray(y, dtype=np.float64), design, contrast
172
+
173
+
174
def _extract_effects(y, design, contrast):
    """QR decomposition of design to extract contrast effect and residuals.

    Port of limma's .lmEffects (internal).

    The design is rearranged so the tested contrast is its last column, the
    data are projected onto the orthogonal basis from the QR factorisation,
    and the row belonging to the contrast plus the residual rows are
    returned.

    Parameters
    ----------
    y : array_like, shape (G, n)
        Expression matrix, genes by samples.
    design : array_like, shape (n, p)
        Design matrix.
    contrast : int, array_like or None
        0-based column index (negative values count from the end), a
        contrast vector of length p, or None for the last column.

    Returns
    -------
    dict with:
        unscaledt : ndarray (G,) - unscaled t-statistics (contrast effect)
        U : ndarray (df_residual, G) - residual effects
        sigma2 : ndarray (G,) - residual variances
        df_residual : int - residual degrees of freedom

    Raises
    ------
    ValueError
        If the column index is out of range, the contrast vector does not
        have one entry per design column, or the contrast is all zero.
    """
    y = np.asarray(y, dtype=np.float64)
    G, n = y.shape
    design = np.asarray(design, dtype=np.float64)
    p = design.shape[1]
    df_residual = n - p

    if contrast is None:
        contrast = p - 1

    # Reorder / reparametrize the design so the contrast is the last column.
    if isinstance(contrast, (int, np.integer)):
        contrast_idx = int(contrast)
        if not -p <= contrast_idx < p:
            raise ValueError(
                f"contrast index {contrast_idx} is out of range for a "
                f"design with {p} columns")
        contrast_idx %= p
        if contrast_idx < p - 1:
            order = [i for i in range(p) if i != contrast_idx] + [contrast_idx]
            design = design[:, order]
    else:
        contrast_vec = np.asarray(contrast, dtype=np.float64).ravel()
        if contrast_vec.size != p:
            raise ValueError(
                "length of contrast must equal the number of design columns")
        nonzero = np.flatnonzero(contrast_vec)
        if nonzero.size == 0:
            raise ValueError("contrast is all zero")
        if nonzero.size == 1 and contrast_vec[nonzero[0]] == 1:
            # Unit vector: equivalent to selecting a single column.
            contrast_idx = int(nonzero[0])
            if contrast_idx < p - 1:
                order = [i for i in range(p) if i != contrast_idx] + [contrast_idx]
                design = design[:, order]
        else:
            # General contrast: rotate the coefficient space with the FULL
            # p x p orthogonal factor of the contrast. The reduced factor is
            # only p x 1 and would collapse the design to a single column.
            Q_c, R_c = np.linalg.qr(contrast_vec.reshape(-1, 1), mode='complete')
            design = design @ Q_c
            if R_c[0, 0] < 0:
                design[:, 0] = -design[:, 0]
            # The contrast is now column 0; move it to the end.
            design = np.column_stack([design[:, 1:], design[:, 0]])

    # QR decomposition of design; effects = Q' y, one row per basis vector.
    Q_full, R_full = np.linalg.qr(design, mode='complete')
    effects = Q_full.T @ y.T  # n x G

    unscaledt = effects[p - 1, :]  # contrast row
    # The sign of a QR basis vector is arbitrary; align it with the
    # coefficient using the matching diagonal entry of R.
    if R_full[p - 1, p - 1] < 0:
        unscaledt = -unscaledt

    # Residual effects
    U = effects[p:, :]  # (n-p) x G
    sigma2 = np.mean(U ** 2, axis=0)

    return {
        'unscaledt': unscaledt,
        'U': U,
        'sigma2': sigma2,
        'df_residual': df_residual,
    }
235
+
236
+
237
+ # -----------------------------------------------------------------------
238
+ # camera.default
239
+ # -----------------------------------------------------------------------
240
+
241
def _camera_default(y, index, design, contrast, weights=None,
                    use_ranks=False, allow_neg_cor=False, inter_gene_cor=0.01,
                    trend_var=False, sort=True):
    """Standard camera test. Port of limma's camera.default.

    Parameters
    ----------
    y : array_like, shape (G, n)
        Expression matrix, genes by samples.
    index : dict or list of lists
        Gene sets as 0-based row indices; dict keys become set names,
        list entries are named Set1, Set2, ...
    design : array_like or None
        Design matrix; None means an intercept-only design.
    contrast : int, array_like or None
        0-based column index (negative values count from the end), a
        contrast vector with one entry per design column, or None for the
        last column.
    weights : optional
        Accepted for interface compatibility; not used by this function.
    use_ranks : bool
        Use the correlation-adjusted rank-sum test instead of the
        two-sample t-test on z-scores.
    allow_neg_cor : bool
        If False, the correlation (rank test) or the variance inflation
        factor (t-test) is floored at 0 / 1 respectively.
    inter_gene_cor : float or None
        Fixed inter-gene correlation; None or NaN means it is estimated
        per set from the residuals.
    trend_var : bool
        Pass the row means as covariate to ``squeeze_var``.
    sort : bool
        Sort the result by p-value when there is more than one set.

    Returns
    -------
    DataFrame
        Columns NGenes, Direction, PValue, plus FDR when more than one set
        is tested.

    Raises
    ------
    ValueError
        If ``index`` is neither dict nor list, or the contrast is invalid.
    """
    from .limma_port import squeeze_var

    y = np.asarray(y, dtype=np.float64)
    G, n = y.shape

    if design is None:
        design = np.ones((n, 1))
    design = np.asarray(design, dtype=np.float64)
    p = design.shape[1]
    df_residual = n - p

    fixed_cor = inter_gene_cor is not None and not (
        isinstance(inter_gene_cor, float) and np.isnan(inter_gene_cor))

    if fixed_cor:
        df_camera = np.inf if use_ranks else G - 2
    else:
        df_camera = min(df_residual, G - 2)

    if contrast is None:
        contrast = p - 1

    # Handle contrast: rearrange the design so the contrast is the last column.
    if isinstance(contrast, (int, np.integer)):
        contrast_idx = int(contrast)
        if not -p <= contrast_idx < p:
            raise ValueError(
                f"contrast index {contrast_idx} is out of range for a "
                f"design with {p} columns")
        contrast_idx %= p
        if contrast_idx < p - 1:
            order = [i for i in range(p) if i != contrast_idx] + [contrast_idx]
            design = design[:, order]
    else:
        contrast_vec = np.asarray(contrast, dtype=np.float64).ravel()
        if contrast_vec.size != p:
            raise ValueError(
                "length of contrast must equal the number of design columns")
        nonzero = np.flatnonzero(contrast_vec)
        if nonzero.size == 0:
            raise ValueError("contrast is all zero")
        if nonzero.size == 1 and contrast_vec[nonzero[0]] == 1:
            contrast_idx = int(nonzero[0])
            if contrast_idx < p - 1:
                order = [i for i in range(p) if i != contrast_idx] + [contrast_idx]
                design = design[:, order]
        else:
            # General contrast: rotate with the FULL p x p orthogonal factor.
            # The reduced factor (p x 1) would collapse the design to one
            # column and break the indexing by p below.
            Q_c, R_c = np.linalg.qr(contrast_vec.reshape(-1, 1), mode='complete')
            design = design @ Q_c
            if R_c[0, 0] < 0:
                design[:, 0] = -design[:, 0]
            design = np.column_stack([design[:, 1:], design[:, 0]])

    # QR decomposition of design
    Q_full, R_full = np.linalg.qr(design, mode='complete')
    effects = Q_full.T @ y.T  # n x G

    unscaledt = effects[p - 1, :]
    # Align the arbitrary sign of the QR basis vector with the coefficient.
    if R_full[p - 1, p - 1] < 0:
        unscaledt = -unscaledt

    # Residual effects
    U = effects[p:, :]  # (n-p) x G
    sigma2 = np.mean(U ** 2, axis=0)

    # Normalize residuals for correlation estimation
    U_norm = (U / np.sqrt(np.maximum(sigma2, 1e-8))).T  # G x (n-p)

    # squeezeVar
    A = np.mean(y, axis=1) if trend_var else None
    sv = squeeze_var(sigma2, np.full(G, float(df_residual)), covariate=A)
    var_post = sv['var_post']
    df_prior_val = sv['df_prior']

    modt = unscaledt / np.sqrt(np.maximum(var_post, 1e-15))

    if use_ranks:
        Stat = modt.copy()
    else:
        # zscoreT: convert moderated t to z-scores using Hill's approximation
        # (matches R's limma: zscoreT(modt, df=df.total, approx=TRUE, method="hill"))
        if np.isscalar(df_prior_val) or (hasattr(df_prior_val, 'size') and df_prior_val.size == 1):
            dp = float(np.ravel(df_prior_val)[0])
        else:
            # A per-gene prior df is summarised by its median.
            dp = float(np.median(df_prior_val))
        df_total = min(df_residual + dp, G * df_residual)
        Stat = _zscore_t_hill(modt, df_total)
        Stat = np.where(np.isfinite(Stat), Stat, 0.0)

    # Convert index format
    if isinstance(index, dict):
        set_names = list(index.keys())
        set_indices = list(index.values())
    elif isinstance(index, list):
        set_names = [f'Set{i+1}' for i in range(len(index))]
        set_indices = index
    else:
        raise ValueError("index must be a dict or list of lists")

    nsets = len(set_names)

    if not use_ranks:
        meanStat = np.mean(Stat)
        varStat = np.var(Stat, ddof=1)

    results = []
    for s_idx in range(nsets):
        idx = np.asarray(set_indices[s_idx], dtype=int)
        StatInSet = Stat[idx]
        m = len(StatInSet)
        m2 = G - m

        if fixed_cor:
            correlation = inter_gene_cor
            vif = 1 + (m - 1) * correlation
        elif m > 1:
            # Estimate the variance inflation factor from the residuals.
            Uset = U_norm[idx, :]
            vif = m * np.mean(np.mean(Uset, axis=0) ** 2)
            correlation = (vif - 1) / (m - 1)
        else:
            vif = 1
            correlation = np.nan

        if use_ranks:
            if not allow_neg_cor:
                correlation = max(0, correlation)
            p_down, p_up = _rank_sum_test_with_correlation(
                idx, Stat, correlation, df_camera)
        else:
            if not allow_neg_cor:
                vif = max(1.0, vif)
            # Two-sample t-test (set vs. rest) with the set variance
            # inflated by vif.
            meanStatInSet = np.mean(StatInSet)
            delta = G / m2 * (meanStatInSet - meanStat)
            varStatPooled = ((G - 1) * varStat - delta ** 2 * m * m2 / G) / (G - 2)
            varStatPooled = max(varStatPooled, 1e-15)
            two_sample_t = delta / np.sqrt(varStatPooled * (vif / m + 1.0 / m2))
            p_down = t_dist.cdf(two_sample_t, df_camera)
            p_up = t_dist.sf(two_sample_t, df_camera)

        p_two = 2 * min(p_down, p_up)
        direction = 'Up' if p_up < p_down else 'Down'

        results.append({
            'NGenes': m,
            'Direction': direction,
            'PValue': p_two
        })

    df = pd.DataFrame(results, index=set_names)
    if nsets > 1:
        _, fdr, _, _ = multipletests(df['PValue'].values, method='fdr_bh')
        df['FDR'] = fdr

    if sort and nsets > 1:
        df = df.sort_values('PValue')

    return df
395
+
396
+
397
def _rank_sum_test_with_correlation(iset, statistics, correlation, df):
    """Port of limma's rankSumTestWithCorrelation.

    Wilcoxon rank-sum test adjusted for inter-gene correlation,
    using the arcsin-based variance formula from limma.

    Parameters
    ----------
    iset : array_like of int
        0-based positions in ``statistics`` that belong to the set.
    statistics : array_like
        Statistic for every gene.
    correlation : float
        Assumed correlation between genes inside the set.
    df : float
        Degrees of freedom of the reference t-distribution; ``np.inf``
        selects the normal distribution.

    Returns
    -------
    tuple of float
        ``(p_down, p_up)`` - one-sided p-values for the set ranking lower /
        higher than the remaining genes.
    """
    n = len(statistics)
    n1 = len(iset)
    n2 = n - n1

    ranks = rankdata(statistics, method='average')
    r1 = ranks[iset]

    # U statistic (R convention: U = n1*n2 + n1*(n1+1)/2 - sum(r1))
    U = n1 * n2 + n1 * (n1 + 1) / 2.0 - np.sum(r1)
    mu = n1 * n2 / 2.0

    # Variance formula using arcsin (matches R's limma exactly)
    if correlation == 0 or n1 == 1:
        sigma2 = n1 * n2 * (n + 1) / 12.0
    else:
        sigma2 = (np.arcsin(1.0) * n1 * n2
                  + np.arcsin(0.5) * n1 * n2 * (n2 - 1)
                  + np.arcsin(correlation / 2.0) * n1 * (n1 - 1) * n2 * (n2 - 1)
                  + np.arcsin((correlation + 1.0) / 2.0) * n1 * (n1 - 1) * n2)
        sigma2 = sigma2 / (2.0 * np.pi)

    # Ties adjustment. Group sizes come from a single np.unique call rather
    # than one full scan of the ranks per distinct value.
    _, nties = np.unique(ranks, return_counts=True)
    if nties.size < ranks.size:
        adjustment = np.sum(nties * (nties + 1) * (nties - 1)) / (n * (n + 1) * (n - 1))
        sigma2 = sigma2 * (1.0 - adjustment)

    # Guard against a zero/negative variance before taking the square root.
    sigma2 = max(sigma2, 1e-15)
    sd = np.sqrt(sigma2)

    # Continuity correction (matching R)
    z_lower = (U + 0.5 - mu) / sd
    z_upper = (U - 0.5 - mu) / sd

    if np.isinf(df):
        p_down = norm_dist.sf(z_upper)   # less = P(T > z_upper)
        p_up = norm_dist.cdf(z_lower)    # greater = P(T < z_lower)
    else:
        p_down = t_dist.sf(z_upper, df)
        p_up = t_dist.cdf(z_lower, df)

    return p_down, p_up
445
+
446
+
447
+ # -----------------------------------------------------------------------
448
+ # Public API
449
+ # -----------------------------------------------------------------------
450
+
451
def camera(y, index, design=None, contrast=None, weights=None,
           use_ranks=False, allow_neg_cor=False, inter_gene_cor=0.01,
           sort=True):
    """Competitive gene set test accounting for inter-gene correlation.

    Port of edgeR's camera (camera.DGEList + camera.DGEGLM + camera.default).

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        A dict with 'coefficients' and 'dispersion' is handled as a DGEGLM,
        a dict with 'counts' but no 'coefficients' as a DGEList; in both
        cases the counts are first turned into NB z-scores under the null
        model. Any other input is used directly as the expression matrix.
    index : dict or list of lists
        Gene set indices (0-based). Dict keys are used as set names.
    design : ndarray, optional
        Design matrix; taken from ``y['design']`` when omitted and ``y`` is
        a dict.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector; defaults to the last
        design column.
    weights : ndarray, optional
        Gene weights; passed on unchanged to ``_camera_default``.
    use_ranks : bool
        Use rank-based test.
    allow_neg_cor : bool
        Allow negative inter-gene correlation.
    inter_gene_cor : float
        Fixed inter-gene correlation to use (default 0.01).
    sort : bool
        Sort results by p-value.

    Returns
    -------
    DataFrame
        The table produced by ``_camera_default``.

    Raises
    ------
    ValueError
        If no design matrix can be determined.
    """
    # Input dispatch (DGEGLM / DGEList / plain matrix) is the same as for the
    # other gene set tests, and every input kind ends in the same call.
    expr, design, contrast = _resolve_input(y, design, contrast)
    return _camera_default(
        expr, index,
        design=design,
        contrast=contrast,
        weights=weights,
        use_ranks=use_ranks,
        allow_neg_cor=allow_neg_cor,
        inter_gene_cor=inter_gene_cor,
        trend_var=False,
        sort=sort,
    )
521
+
522
+
523
def fry(y, index, design=None, contrast=None, sort=True):
    """Fast analytical gene set test (rotation-free).

    Port of edgeR's fry.DGEList → limma's fry.default.

    DGEList/DGEGLM input is converted to NB z-scores first; the effects are
    then used as they are (standardize="none", no squeezeVar step).

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        Expression data.
    index : dict or list of lists
        Gene set indices (0-based).
    design : ndarray, optional
        Design matrix.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector.
    sort : bool
        Sort results by p-value.

    Returns
    -------
    DataFrame with columns NGenes, Direction, PValue, FDR, PValue.Mixed, FDR.Mixed.

    Raises
    ------
    ValueError
        If ``index`` is neither a dict nor a list.
    """
    expr, design, contrast = _resolve_input(y, design, contrast)
    decomposition = _extract_effects(expr, design, contrast)
    contrast_effect = decomposition['unscaledt']   # (G,)
    resid_effects = decomposition['U']             # (df_residual, G)
    df_residual = decomposition['df_residual']

    if isinstance(index, dict):
        set_names = list(index.keys())
        set_indices = list(index.values())
    elif isinstance(index, list):
        set_names = [f'Set{pos+1}' for pos in range(len(index))]
        set_indices = index
    else:
        raise ValueError("index must be a dict or list of lists")

    nsets = len(set_names)
    t_stat = np.zeros(nsets)
    p_mixed = np.zeros(nsets)
    set_sizes = np.zeros(nsets, dtype=int)

    for k in range(nsets):
        rows = np.asarray(set_indices[k], dtype=int)
        size = len(rows)
        set_sizes[k] = size

        # Effects of the genes in the set: column 0 is the contrast effect,
        # the remaining columns are the residual effects.
        set_effects = np.column_stack([
            contrast_effect[rows].reshape(-1, 1),
            resid_effects[:, rows].T,
        ])

        # Directional test: set-averaged contrast effect relative to the
        # root mean square of the set-averaged residual effects.
        col_means = np.mean(set_effects, axis=0)
        resid_ms = np.mean(col_means[1:] ** 2)
        t_stat[k] = col_means[0] / np.sqrt(resid_ms) if resid_ms > 1e-30 else 0.0

        # Mixed test (SVD based). Sets with at most one gene keep 0.0 here;
        # single-gene sets receive the directional p-value further down.
        if size <= 1:
            p_mixed[k] = 0.0
            continue

        p_mixed[k] = 1.0  # value kept whenever the beta approximation is unusable
        eig = np.linalg.svd(set_effects, compute_uv=False) ** 2
        n_eig = len(eig)
        d = n_eig - 1
        if not (d > 0 and eig[0] > eig[-1] + 1e-15):
            continue

        beta_mean = 1.0 / n_eig
        beta_var = d / (n_eig * n_eig * (n_eig / 2.0 + 1.0))

        f_obs = (np.sum(set_effects[:, 0] ** 2) - eig[-1]) / (eig[0] - eig[-1])
        f_mean = (np.sum(eig) * beta_mean - eig[-1]) / (eig[0] - eig[-1])

        cov = np.full((n_eig, n_eig), -beta_var / d)
        np.fill_diagonal(cov, beta_var)
        f_var = float(eig @ cov @ eig) / (eig[0] - eig[-1]) ** 2

        if not (f_var > 1e-30 and f_mean > 0 and f_mean < 1):
            continue

        # Method-of-moments beta distribution matching f_mean / f_var.
        shape_sum = f_mean * (1.0 - f_mean) / f_var - 1.0
        shape_a = shape_sum * f_mean
        shape_b = shape_sum - shape_a
        if shape_a > 0 and shape_b > 0:
            p_mixed[k] = beta_dist.sf(f_obs, shape_a, shape_b)

    # Directional p-values (matching R: 2 * pt(-abs(t.stat), df=df.residual))
    p_dir = 2.0 * t_dist.sf(np.abs(t_stat), df_residual)
    directions = np.where(t_stat >= 0, 'Up', 'Down')

    # For single-gene sets, mixed p-value = directional p-value (matching R)
    singletons = set_sizes == 1
    p_mixed[singletons] = p_dir[singletons]

    rows_out = [
        {
            'NGenes': set_sizes[k],
            'Direction': directions[k],
            'PValue': p_dir[k],
            'PValue.Mixed': p_mixed[k],
        }
        for k in range(nsets)
    ]
    result_df = pd.DataFrame(rows_out, index=set_names)

    # Multiple-testing correction; a single set keeps its raw p-values.
    if nsets > 1:
        _, fdr, _, _ = multipletests(result_df['PValue'].values, method='fdr_bh')
        result_df['FDR'] = fdr
        _, fdr_mixed, _, _ = multipletests(result_df['PValue.Mixed'].values, method='fdr_bh')
        result_df['FDR.Mixed'] = fdr_mixed
    else:
        result_df['FDR'] = result_df['PValue'].values
        result_df['FDR.Mixed'] = result_df['PValue.Mixed'].values

    column_order = ['NGenes', 'Direction', 'PValue', 'FDR', 'PValue.Mixed', 'FDR.Mixed']
    result_df = result_df[column_order]

    if sort and nsets > 1:
        result_df = result_df.sort_values('PValue')

    return result_df
673
+
674
+
675
def roast(y, index, design=None, contrast=None, nrot=999,
          set_statistic='mean', sort=True):
    """Rotation gene set test for a single gene set.

    Port of edgeR's roast.DGEList → limma's roast.default.

    For DGEList/DGEGLM input, counts are first converted to NB z-scores,
    then roast is applied with var.prior=1, df.prior=Inf (since z-scores
    are already standardized), so the contrast effects are used directly
    as gene-level statistics.

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        Expression data.
    index : dict, list of lists, or list of ints
        Gene set indices (0-based). If dict or list of lists, tests first set.
        If list of ints, treats as single gene set.
    design : ndarray, optional
        Design matrix.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector.
    nrot : int
        Number of rotations (default 999).
    set_statistic : str
        Only 'mean' is implemented; any other value triggers a warning and
        the mean statistic is used.
    sort : bool
        Accepted for interface compatibility; the four result rows are
        always returned in the order Down, Up, UpOrDown, Mixed.

    Returns
    -------
    DataFrame
        Rows Down, Up, UpOrDown, Mixed with columns Active.Prop and P.Value.
        The set size is stored in ``attrs['ngenes']``.
    """
    expr, design, contrast = _resolve_input(y, design, contrast)

    if set_statistic != 'mean':
        warnings.warn(
            f"set_statistic={set_statistic!r} is not implemented; "
            "using 'mean' instead",
            stacklevel=2)

    # Handle index format - roast tests a single gene set
    if isinstance(index, dict):
        first_key = next(iter(index))
        idx = np.asarray(index[first_key], dtype=int)
    elif (isinstance(index, list) and len(index) > 0
          and isinstance(index[0], (list, np.ndarray))):
        idx = np.asarray(index[0], dtype=int)
    else:
        idx = np.asarray(index, dtype=int)

    eff = _extract_effects(expr, design, contrast)
    unscaledt = eff['unscaledt']
    U = eff['U']
    df_residual = eff['df_residual']

    # var.prior=1, df.prior=Inf => var_post=1, so the moderated t of a gene
    # is just its contrast effect.
    m = len(idx)
    t_set = unscaledt[idx]

    # Active proportions: share of set genes with two-sided p < 0.05,
    # split by the sign of the statistic.
    p_thresh = 0.05
    significant = 2 * t_dist.sf(np.abs(t_set), df_residual) < p_thresh
    active_down = np.sum(significant & (t_set < 0)) / m
    active_up = np.sum(significant & (t_set > 0)) / m

    # Observed set statistics
    obs_mean_up = np.mean(t_set)
    obs_mean_down = -obs_mean_up
    obs_mean_upordown = max(obs_mean_up, obs_mean_down)
    obs_mean_mixed = np.mean(np.abs(t_set))

    # Effects of the set genes: column 0 is the contrast effect, the other
    # columns are the residual effects.
    effects_set = np.column_stack([t_set, U[:, idx].T])  # m x (df_residual + 1)

    count_up = 0
    count_down = 0
    count_upordown = 0
    count_mixed = 0

    rng = np.random.default_rng()
    for _ in range(nrot):
        # Random unit vector over the WHOLE effects space (contrast plus
        # residuals). With var_post=1 the rotated statistic of a gene is
        # the projection of its effects onto that direction. Rotating the
        # residuals alone would leave the contrast effect out of the null
        # reference set.
        direction = rng.standard_normal(df_residual + 1)
        direction /= np.linalg.norm(direction)
        rot_t_set = effects_set @ direction

        rot_mean_up = np.mean(rot_t_set)
        rot_mean_down = -rot_mean_up
        rot_mean_mixed = np.mean(np.abs(rot_t_set))

        if rot_mean_up >= obs_mean_up:
            count_up += 1
        if rot_mean_down >= obs_mean_down:
            count_down += 1
        if max(rot_mean_up, rot_mean_down) >= obs_mean_upordown:
            count_upordown += 1
        if rot_mean_mixed >= obs_mean_mixed:
            count_mixed += 1

    # P-values: the observed arrangement counts as one of the nrot + 1.
    p_up = (count_up + 1) / (nrot + 1)
    p_down = (count_down + 1) / (nrot + 1)
    p_upordown = (count_upordown + 1) / (nrot + 1)
    p_mixed = (count_mixed + 1) / (nrot + 1)

    result = pd.DataFrame({
        'Active.Prop': [active_down, active_up, max(active_down, active_up), np.nan],
        'P.Value': [p_down, p_up, p_upordown, p_mixed],
    }, index=['Down', 'Up', 'UpOrDown', 'Mixed'])

    # Add ngenes as metadata
    result.attrs['ngenes'] = m

    return result
798
+
799
+
800
def mroast(y, index, design=None, contrast=None, nrot=999,
           set_statistic='mean', adjust_method='BH', midp=True, sort=True):
    """Rotation gene set test for multiple gene sets.

    Port of edgeR's mroast.DGEList → limma's mroast.default.

    All gene sets are evaluated on the same sequence of random rotations.

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        Expression data.
    index : dict or list of lists
        Gene set indices (0-based).
    design : ndarray, optional
        Design matrix.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector.
    nrot : int
        Number of rotations (default 999).
    set_statistic : str
        Only 'mean' is implemented; any other value triggers a warning and
        the mean statistic is used.
    adjust_method : str
        P-value adjustment method (default 'BH'). Recognised names are
        'BH', 'fdr', 'BY', 'bonferroni', 'holm' and 'hochberg'; anything
        else falls back to Benjamini-Hochberg.
    midp : bool
        Use mid-p-values, (count + 0.5) / (nrot + 1), instead of
        (count + 1) / (nrot + 1) (default True).
    sort : bool
        Sort results by p-value.

    Returns
    -------
    DataFrame with columns NGenes, PropDown, PropUp, Direction, PValue, FDR,
    PValue.Mixed, FDR.Mixed.

    Raises
    ------
    ValueError
        If ``index`` is neither a dict nor a list.
    """
    expr, design, contrast = _resolve_input(y, design, contrast)

    if set_statistic != 'mean':
        warnings.warn(
            f"set_statistic={set_statistic!r} is not implemented; "
            "using 'mean' instead",
            stacklevel=2)

    # Convert index format
    if isinstance(index, dict):
        set_names = list(index.keys())
        set_indices = [np.asarray(v, dtype=int) for v in index.values()]
    elif isinstance(index, list):
        set_names = [f'Set{i+1}' for i in range(len(index))]
        set_indices = [np.asarray(v, dtype=int) for v in index]
    else:
        raise ValueError("index must be a dict or list of lists")

    nsets = len(set_names)

    eff = _extract_effects(expr, design, contrast)
    unscaledt = eff['unscaledt']
    U = eff['U']
    df_residual = eff['df_residual']

    # For DGEList z-scores: var.prior=1, df.prior=Inf => var_post=1, so the
    # gene-level statistic is the contrast effect itself.
    modt = unscaledt

    # Observed statistics and active proportions for each set
    p_thresh = 0.05
    gene_pvals = 2 * t_dist.sf(np.abs(modt), df_residual)

    obs_up = np.zeros(nsets)
    obs_down = np.zeros(nsets)
    obs_mixed = np.zeros(nsets)
    prop_down = np.zeros(nsets)
    prop_up = np.zeros(nsets)
    set_sizes = np.zeros(nsets, dtype=int)

    for s, idx in enumerate(set_indices):
        m = len(idx)
        set_sizes[s] = m
        t_set = modt[idx]
        significant = gene_pvals[idx] < p_thresh
        obs_up[s] = np.mean(t_set)
        obs_down[s] = -obs_up[s]
        obs_mixed[s] = np.mean(np.abs(t_set))
        prop_down[s] = np.sum(significant & (t_set < 0)) / m
        prop_up[s] = np.sum(significant & (t_set > 0)) / m

    # Shared rotation loop
    count_up = np.zeros(nsets)
    count_down = np.zeros(nsets)
    count_mixed = np.zeros(nsets)

    rng = np.random.default_rng()
    for _ in range(nrot):
        # Random unit vector over the WHOLE effects space: entry 0 weights
        # the contrast effect, the rest weight the residual effects.
        # Rotating the residuals alone would leave the contrast effect out
        # of the null reference set.
        direction = rng.standard_normal(df_residual + 1)
        direction /= np.linalg.norm(direction)
        rot_t = direction[0] * modt + direction[1:] @ U  # (G,)

        for s, idx in enumerate(set_indices):
            rot_t_set = rot_t[idx]
            rot_mean = np.mean(rot_t_set)

            if rot_mean >= obs_up[s]:
                count_up[s] += 1
            if -rot_mean >= obs_down[s]:
                count_down[s] += 1
            if np.mean(np.abs(rot_t_set)) >= obs_mixed[s]:
                count_mixed[s] += 1

    # P-values
    numerator_shift = 0.5 if midp else 1
    p_up_vals = (count_up + numerator_shift) / (nrot + 1)
    p_down_vals = (count_down + numerator_shift) / (nrot + 1)
    p_mixed_vals = (count_mixed + numerator_shift) / (nrot + 1)

    # Two-sided directional p-value and direction
    p_dir = np.minimum(2 * np.minimum(p_up_vals, p_down_vals), 1.0)
    directions = np.where(p_up_vals < p_down_vals, 'Up', 'Down')

    # Translate R-style adjustment names to statsmodels names.
    method_map = {'BH': 'fdr_bh', 'bonferroni': 'bonferroni',
                  'holm': 'holm', 'hochberg': 'simes-hochberg',
                  'BY': 'fdr_by', 'fdr': 'fdr_bh'}
    sm_method = method_map.get(adjust_method, 'fdr_bh')

    if nsets > 1:
        _, fdr_dir, _, _ = multipletests(p_dir, method=sm_method)
        _, fdr_mixed, _, _ = multipletests(p_mixed_vals, method=sm_method)
    else:
        # Nothing to adjust for a single set.
        fdr_dir = p_dir
        fdr_mixed = p_mixed_vals

    result_df = pd.DataFrame({
        'NGenes': set_sizes,
        'PropDown': prop_down,
        'PropUp': prop_up,
        'Direction': directions,
        'PValue': p_dir,
        'FDR': fdr_dir,
        'PValue.Mixed': p_mixed_vals,
        'FDR.Mixed': fdr_mixed,
    }, index=set_names)

    if sort and nsets > 1:
        result_df = result_df.sort_values('PValue')

    return result_df
945
+
946
+
947
def romer(y, index, design=None, contrast=None, nrot=9999, seed=None):
    """Rank-based rotation gene set enrichment test.

    Port of edgeR's romer.DGEList → limma's romer.default.

    For DGEList/DGEGLM input, counts are first converted to NB z-scores,
    then romer is applied. Unlike roast/mroast/fry, romer lets squeezeVar
    estimate its own variance prior from the z-score data.

    For every gene set the mean rank of its moderated t-statistics is
    compared with the mean ranks obtained from ``nrot`` random rotations
    of the (shrunk) residual effects.

    Parameters
    ----------
    y : ndarray, DGEList-like dict, or DGEGLM-like dict
        Expression data.
    index : dict or list of lists
        Gene set indices (0-based). Dict keys are used as set names;
        list entries are named ``Set1``, ``Set2``, ...
    design : ndarray, optional
        Design matrix.
    contrast : int or ndarray, optional
        Column index (0-based) or contrast vector.
    nrot : int
        Number of rotations (default 9999).
    seed : None, int, or numpy.random.Generator, optional
        Passed to ``numpy.random.default_rng`` so that the rotations (and
        therefore the p-values) can be reproduced. ``None`` (default)
        draws fresh entropy, as before.

    Returns
    -------
    DataFrame with columns NGenes, Up, Down, Mixed (p-values), indexed by
    set name. Sets that contain no genes get NaN p-values.

    Raises
    ------
    ValueError
        If ``index`` is neither a dict nor a list.
    """
    from .limma_port import squeeze_var

    expr, design, contrast = _resolve_input(y, design, contrast)

    # Convert index format
    if isinstance(index, dict):
        set_names = list(index.keys())
        set_indices = [np.asarray(v, dtype=int) for v in index.values()]
    elif isinstance(index, list):
        set_names = [f'Set{i+1}' for i in range(len(index))]
        set_indices = [np.asarray(v, dtype=int) for v in index]
    else:
        raise ValueError("index must be a dict or list of lists")

    nsets = len(set_names)
    set_sizes = np.array([len(idx) for idx in set_indices], dtype=int)

    # A mean rank is undefined for an empty set. Comparing against NaN would
    # leave every counter at zero and yield the *smallest* possible p-value,
    # so empty sets are excluded from testing and reported as NaN instead.
    testable = [(s, idx) for s, idx in enumerate(set_indices) if len(idx) > 0]

    eff = _extract_effects(expr, design, contrast)
    unscaledt = eff['unscaledt']
    U = eff['U']  # presumably (df_residual, G) residual effects -- TODO confirm
    sigma2 = eff['sigma2']
    df_residual = eff['df_residual']
    G = len(unscaledt)

    # squeezeVar to estimate prior (romer does its own variance moderation)
    sv = squeeze_var(sigma2, np.full(G, float(df_residual)))

    # Moderated t-statistics (floor the variance to avoid division by zero)
    sd_post = np.sqrt(np.maximum(sv['var_post'], 1e-15))
    modt = unscaledt / sd_post

    # Prior standard deviation as a single scalar; a gene-wise prior is
    # collapsed to its median.
    s0 = np.sqrt(np.maximum(sv.get('var_prior', 1.0), 1e-15))
    s0 = float(s0) if np.isscalar(s0) else float(np.median(s0))

    # Shrink residuals: U_shrunk = U * s0 / sd_unshrunk
    sd_unshrunk = np.sqrt(np.maximum(sigma2, 1e-15))
    shrink_factor = s0 / np.maximum(sd_unshrunk, 1e-15)
    U_shrunk = U * shrink_factor[np.newaxis, :]

    # Compute ranks for observed data
    # Up:    high t   -> high rank
    # Down:  low t    -> high rank
    # Mixed: high |t| -> high rank
    up_ranks = rankdata(modt)
    down_ranks = rankdata(-modt)
    mixed_ranks = rankdata(np.abs(modt))

    # Observed mean ranks per set (NaN for empty sets)
    obs_up = np.full(nsets, np.nan)
    obs_down = np.full(nsets, np.nan)
    obs_mixed = np.full(nsets, np.nan)
    for s, idx in testable:
        obs_up[s] = np.mean(up_ranks[idx])
        obs_down[s] = np.mean(down_ranks[idx])
        obs_mixed[s] = np.mean(mixed_ranks[idx])

    # Rotation loop: count rotations at least as extreme as the observation
    count_up = np.zeros(nsets)
    count_down = np.zeros(nsets)
    count_mixed = np.zeros(nsets)

    rng = np.random.default_rng(seed)
    for _ in range(nrot):
        # Random unit vector = random rotation in residual space
        rand_vec = rng.standard_normal(df_residual)
        rand_vec = rand_vec / np.linalg.norm(rand_vec)

        # Approximate rotated moderated t
        rot_t = (rand_vec @ U_shrunk) / sd_post  # (G,)

        rot_up_ranks = rankdata(rot_t)
        rot_down_ranks = rankdata(-rot_t)
        rot_mixed_ranks = rankdata(np.abs(rot_t))

        for s, idx in testable:
            if np.mean(rot_up_ranks[idx]) >= obs_up[s]:
                count_up[s] += 1
            if np.mean(rot_down_ranks[idx]) >= obs_down[s]:
                count_down[s] += 1
            if np.mean(rot_mixed_ranks[idx]) >= obs_mixed[s]:
                count_mixed[s] += 1

    # P-values: (b + 1) / (nrot + 1), undefined (NaN) for empty sets
    has_genes = set_sizes > 0
    p_up = np.where(has_genes, (count_up + 1) / (nrot + 1), np.nan)
    p_down = np.where(has_genes, (count_down + 1) / (nrot + 1), np.nan)
    p_mixed = np.where(has_genes, (count_mixed + 1) / (nrot + 1), np.nan)

    result_df = pd.DataFrame({
        'NGenes': set_sizes,
        'Up': p_up,
        'Down': p_down,
        'Mixed': p_mixed,
    }, index=set_names)

    return result_df
1085
+
1086
+
1087
def goana(de, species='Hs', **kwargs):
    """Gene ontology enrichment analysis using g:Profiler.

    Wraps the gprofiler-official Python package for GO enrichment.
    Requires: pip install gprofiler-official

    Parameters
    ----------
    de : dict (DGELRT/DGEExact) or sequence of gene identifiers
        If a dict with a 'table' DataFrame, the row labels of the genes
        with ``PValue < 0.05`` are used (all rows when the table has no
        'PValue' column).
        If a list, tuple, ndarray, pandas Index or pandas Series, its
        elements are used directly as gene identifiers.
    species : str
        Species code. 'Hs' for human, 'Mm' for mouse, etc. Codes that are
        not in the built-in map are passed to g:Profiler unchanged.
    **kwargs
        Additional arguments passed to GProfiler.profile(). ``sources``
        defaults to the three GO namespaces and may be overridden.

    Returns
    -------
    DataFrame with GO enrichment results. An empty DataFrame (with a
    warning) is returned when gprofiler-official is not installed, when
    ``de`` is of an unsupported type, or when there are no genes to test.
    """
    try:
        from gprofiler import GProfiler
    except ImportError:
        # Best-effort: tell the user how to get the dependency, don't crash.
        warnings.warn(
            "goana() requires gprofiler-official. Install with:\n"
            " pip install gprofiler-official\n"
            "Then:\n"
            " from gprofiler import GProfiler\n"
            " gp = GProfiler(return_dataframe=True)\n"
            " result = gp.profile(organism='hsapiens', query=gene_list)")
        return pd.DataFrame()

    # Map species codes
    species_map = {
        'Hs': 'hsapiens', 'Mm': 'mmusculus', 'Rn': 'rnorvegicus',
        'Dm': 'dmelanogaster', 'Sc': 'scerevisiae', 'Ce': 'celegans',
        'Dr': 'drerio',
    }
    organism = species_map.get(species, species)

    # Extract gene list
    if isinstance(de, dict) and 'table' in de:
        table = de['table']
        if isinstance(table, pd.DataFrame):
            sig = table[table['PValue'] < 0.05] if 'PValue' in table.columns else table
            gene_list = list(sig.index)
        else:
            gene_list = []
    elif isinstance(de, (list, tuple, np.ndarray, pd.Index, pd.Series)):
        # Any flat collection of identifiers is accepted, e.g. table.index.
        gene_list = list(de)
    else:
        warnings.warn("goana: cannot extract gene list from input. "
                      "Provide a DGELRT/DGEExact dict or a list of gene IDs.")
        return pd.DataFrame()

    if not gene_list:
        warnings.warn("goana: no genes to test")
        return pd.DataFrame()

    gp = GProfiler(return_dataframe=True)
    # Pop so a caller-supplied ``sources`` replaces the default instead of
    # colliding with the explicit keyword below.
    sources = kwargs.pop('sources', ['GO:BP', 'GO:MF', 'GO:CC'])
    return gp.profile(organism=organism, query=gene_list,
                      sources=sources, **kwargs)
1151
+
1152
+
1153
def kegga(de, species='Hs', **kwargs):
    """KEGG pathway enrichment analysis using g:Profiler.

    Wraps the gprofiler-official Python package for KEGG enrichment.
    Requires: pip install gprofiler-official

    Parameters
    ----------
    de : dict (DGELRT/DGEExact) or sequence of gene identifiers
        If a dict with a 'table' DataFrame, the row labels of the genes
        with ``PValue < 0.05`` are used (all rows when the table has no
        'PValue' column).
        If a list, tuple, ndarray, pandas Index or pandas Series, its
        elements are used directly as gene identifiers.
    species : str
        Species code. 'Hs' for human, 'Mm' for mouse, etc. Codes that are
        not in the built-in map are passed to g:Profiler unchanged.
    **kwargs
        Additional arguments passed to GProfiler.profile(). ``sources``
        defaults to ``['KEGG']`` and may be overridden.

    Returns
    -------
    DataFrame with KEGG enrichment results. An empty DataFrame (with a
    warning) is returned when gprofiler-official is not installed, when
    ``de`` is of an unsupported type, or when there are no genes to test.
    """
    try:
        from gprofiler import GProfiler
    except ImportError:
        # Best-effort: tell the user how to get the dependency, don't crash.
        warnings.warn(
            "kegga() requires gprofiler-official. Install with:\n"
            " pip install gprofiler-official\n"
            "Then:\n"
            " from gprofiler import GProfiler\n"
            " gp = GProfiler(return_dataframe=True)\n"
            " result = gp.profile(organism='hsapiens', query=gene_list, "
            "sources=['KEGG'])")
        return pd.DataFrame()

    species_map = {
        'Hs': 'hsapiens', 'Mm': 'mmusculus', 'Rn': 'rnorvegicus',
        'Dm': 'dmelanogaster', 'Sc': 'scerevisiae', 'Ce': 'celegans',
        'Dr': 'drerio',
    }
    organism = species_map.get(species, species)

    # Extract gene list
    if isinstance(de, dict) and 'table' in de:
        table = de['table']
        if isinstance(table, pd.DataFrame):
            sig = table[table['PValue'] < 0.05] if 'PValue' in table.columns else table
            gene_list = list(sig.index)
        else:
            gene_list = []
    elif isinstance(de, (list, tuple, np.ndarray, pd.Index, pd.Series)):
        # Any flat collection of identifiers is accepted, e.g. table.index.
        gene_list = list(de)
    else:
        warnings.warn("kegga: cannot extract gene list from input. "
                      "Provide a DGELRT/DGEExact dict or a list of gene IDs.")
        return pd.DataFrame()

    if not gene_list:
        warnings.warn("kegga: no genes to test")
        return pd.DataFrame()

    gp = GProfiler(return_dataframe=True)
    # Pop ``sources`` from kwargs: passing it both explicitly and through
    # **kwargs would raise "got multiple values for keyword argument".
    sources = kwargs.pop('sources', ['KEGG'])
    return gp.profile(organism=organism, query=gene_list,
                      sources=sources, **kwargs)