edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Exact tests for differential expression in edgePython.
4
+
5
+ Port of edgeR's exactTest, equalizeLibSizes, q2qnbinom, splitIntoGroups.
6
+ """
7
+
8
+ import math
9
+ import numpy as np
10
+ import pandas as pd
11
+ from scipy.stats import nbinom, norm
12
+ from scipy.special import gammaln, logsumexp
13
+ from numba import njit
14
+
15
+ from .utils import drop_empty_levels, binom_test
16
+
17
+
18
@njit(cache=True)
def _nb_logpmf(k, size, prob):
    """Numba-compatible NB log-PMF using math.lgamma."""
    # Binomial-coefficient part, log C(k + size - 1, k), via log-gamma.
    log_coef = math.lgamma(k + size) - math.lgamma(k + 1.0) - math.lgamma(size)
    # Success/failure probability contributions on the log scale.
    log_prob = size * math.log(prob) + k * math.log(1.0 - prob)
    return log_coef + log_prob
23
+
24
+
25
@njit(cache=True)
def _logsumexp_1d(arr, n):
    """Numba-compatible logsumexp for a buffer of length n."""
    if n == 0:
        return -math.inf
    # Locate the maximum to stabilize the exponentials.
    peak = -math.inf
    for idx in range(n):
        if arr[idx] > peak:
            peak = arr[idx]
    # All entries -inf: the sum is zero, so the log-sum is -inf.
    if peak == -math.inf:
        return -math.inf
    acc = 0.0
    for idx in range(n):
        acc += math.exp(arr[idx] - peak)
    return peak + math.log(acc)
40
+
41
+
42
@njit(cache=True)
def _nb_exact_loop(s1, s2, s, mu1, dispersion, remaining_mask, n1, n2, pvals, work_buf):
    """Numba kernel for the NB exact test per-gene loop.

    Writes two-sided exact NB p-values into ``pvals`` in place for every
    gene flagged in ``remaining_mask``; all other entries are left as-is.

    Parameters
    ----------
    s1, s2 : int arrays
        Rounded per-gene count sums for group 1 / group 2.
        NOTE(review): ``s2`` is accepted but never read here; ``s``
        already carries the total.
    s : int array
        Total sums, ``s1 + s2``.
    mu1 : float array
        Null expectation of the group-1 sum for each gene.
    dispersion : float array
        Per-gene NB dispersions (strictly positive here; the caller
        routes dispersion <= 0 through the Poisson path instead).
    remaining_mask : bool array
        Genes this kernel should process.
    n1, n2 : int
        Number of libraries in each group.
    pvals : float array
        Output buffer, modified in place.
    work_buf : float array
        Scratch space of length >= max(s) + 1 for per-x log-probabilities.
    """
    log2 = math.log(2.0)
    ntags = len(s1)
    n_total = n1 + n2

    for g in range(ntags):
        if not remaining_mask[g]:
            continue
        if s[g] == 0:
            # No counts at all: degenerate case, p-value is 1.
            pvals[g] = 1.0
            continue

        d = dispersion[g]
        # NB parameterization: size = n/d, prob = size / (size + mean).
        size1 = n1 / d
        size2 = n2 / d
        prob1 = size1 / (size1 + mu1[g])
        mu2_g = n2 * s[g] / n_total
        prob2 = size2 / (size2 + mu2_g)
        size_total = n_total / d
        mu_total = s[g]  # mu[g] * n_total = s[g]/n_total * n_total = s[g]
        prob_total = size_total / (size_total + mu_total)

        if s1[g] < mu1[g]:
            # Left tail: x = 0..s1[g]. Sum P(X1 = x) * P(X2 = s - x) in
            # log space, normalize by P(X1 + X2 = s), then double for a
            # two-sided p-value.
            count = s1[g] + 1
            for x in range(count):
                work_buf[x] = (_nb_logpmf(x, size1, prob1)
                               + _nb_logpmf(s[g] - x, size2, prob2))
            log_sum_top = _logsumexp_1d(work_buf, count)
            log_p_bot = _nb_logpmf(s[g], size_total, prob_total)
            log_pval = log2 + log_sum_top - log_p_bot
            # Clamp on the log scale first so exp() cannot overflow.
            p = math.exp(min(log_pval, 0.0))
            pvals[g] = min(p, 1.0)
        elif s1[g] > mu1[g]:
            # Right tail: x = s1[g]..s[g]
            count = s[g] - s1[g] + 1
            for idx in range(count):
                x = s1[g] + idx
                work_buf[idx] = (_nb_logpmf(x, size1, prob1)
                                 + _nb_logpmf(s[g] - x, size2, prob2))
            log_sum_top = _logsumexp_1d(work_buf, count)
            log_p_bot = _nb_logpmf(s[g], size_total, prob_total)
            log_pval = log2 + log_sum_top - log_p_bot
            p = math.exp(min(log_pval, 0.0))
            pvals[g] = min(p, 1.0)
        # else s1[g] == mu1[g]: pvals[g] stays 1.0
90
+
91
+
92
def exact_test(y, pair=None, dispersion='auto', rejection_region='doubletail',
               big_count=900, prior_count=0.125):
    """Exact test for differential expression between two groups.

    Port of edgeR's exactTest.

    Parameters
    ----------
    y : DGEList
        DGEList object.
    pair : list of length 2, optional
        Groups to compare. Default is first two groups.
    dispersion : str, float, or ndarray
        'auto', 'common', 'trended', 'tagwise', or numeric.
    rejection_region : str
        'doubletail', 'deviance', or 'smallp'.
    big_count : int
        Threshold for beta approximation.
    prior_count : float
        Prior count for logFC calculation.

    Returns
    -------
    dict (DGEExact-like) with 'table', 'comparison', 'genes'.

    Raises
    ------
    ValueError
        If ``y`` is not a DGEList-like dict, ``pair`` is not length 2,
        or no usable dispersion can be found.
    """
    if not (isinstance(y, dict) and 'counts' in y):
        raise ValueError("Currently only supports DGEList objects.")

    group = np.asarray(y['samples']['group'].values)
    unique_groups = np.unique(group)

    if pair is None:
        pair = unique_groups[:2].tolist()
    elif len(pair) != 2:
        raise ValueError("pair must be of length 2.")

    # Compare group labels as strings so numeric and categorical labels
    # behave identically.
    pair = [str(p) for p in pair]
    group = np.array([str(g) for g in group])

    # Get dispersion
    if dispersion is None or dispersion == 'auto':
        from .dgelist import get_dispersion
        dispersion = get_dispersion(y)
        if dispersion is None:
            raise ValueError("No dispersion values found in DGEList object.")
    elif isinstance(dispersion, str):
        valid = ('common', 'trended', 'tagwise')
        if dispersion not in valid:
            raise ValueError(f"dispersion must be one of {valid}")
        dispersion = y.get(f'{dispersion}.dispersion')
        if dispersion is None:
            raise ValueError("Specified dispersion not found in object")

    ntags = y['counts'].shape[0]
    dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if len(dispersion) == 1:
        # Broadcast a common dispersion to one value per gene.
        dispersion = np.full(ntags, dispersion[0])

    # Reduce to two groups
    j = np.isin(group, pair)
    counts = y['counts'][:, j]
    lib_size = y['samples']['lib.size'].values[j]
    norm_factors = y['samples']['norm.factors'].values[j]
    group_sub = group[j]

    # Effective library sizes incorporate the normalization factors.
    lib_size = lib_size * norm_factors
    offset = np.log(lib_size)
    lib_size_average = np.exp(np.mean(offset))  # geometric mean

    # logFC with prior counts (prior scaled by relative library size)
    pc = prior_count * lib_size / np.mean(lib_size)
    offset_aug = np.log(lib_size + 2 * pc)

    j1 = group_sub == pair[0]
    n1 = np.sum(j1)
    y1 = counts[:, j1]

    j2 = group_sub == pair[1]
    n2 = np.sum(j2)
    y2 = counts[:, j2]

    from .glm_fit import mglm_one_group
    # Fit each group's abundance on prior-augmented counts; their
    # difference on the log2 scale is the shrunken fold change.
    abundance1 = mglm_one_group(
        y1 + np.tile(pc[j1], (ntags, 1)),
        offset=offset_aug[j1], dispersion=dispersion)
    abundance2 = mglm_one_group(
        y2 + np.tile(pc[j2], (ntags, 1)),
        offset=offset_aug[j2], dispersion=dispersion)
    logFC = (abundance2 - abundance1) / np.log(2)

    # Equalize library sizes: map observed counts to pseudo-counts as if
    # every library had size lib_size_average.
    abundance = mglm_one_group(counts, dispersion=dispersion, offset=offset)
    e = np.exp(abundance)

    input_mean1 = np.outer(e, lib_size[j1])
    output_mean1 = np.outer(e, np.full(n1, lib_size_average))
    y1_eq = q2q_nbinom(y1.astype(float), input_mean1, output_mean1, dispersion)

    input_mean2 = np.outer(e, lib_size[j2])
    output_mean2 = np.outer(e, np.full(n2, lib_size_average))
    y2_eq = q2q_nbinom(y2.astype(float), input_mean2, output_mean2, dispersion)

    # Exact p-values
    if rejection_region == 'doubletail':
        exact_pvals = exact_test_double_tail(y1_eq, y2_eq, dispersion=dispersion,
                                             big_count=big_count)
    elif rejection_region == 'deviance':
        exact_pvals = exact_test_by_deviance(y1_eq, y2_eq, dispersion=dispersion)
    else:
        exact_pvals = exact_test_by_small_p(y1_eq, y2_eq, dispersion=dispersion)

    from .expression import ave_log_cpm
    # Reuse a cached AveLogCPM when present; otherwise compute it.
    alc = y.get('AveLogCPM')
    if alc is None:
        alc = ave_log_cpm(y)

    table = pd.DataFrame({
        'logFC': logFC,
        'logCPM': alc,
        'PValue': exact_pvals
    })
    rn = None
    if hasattr(y['counts'], 'index'):
        rn = y['counts'].index
    if rn is not None:
        table.index = rn

    return {
        'table': table,
        'comparison': pair,
        'genes': y.get('genes')
    }
224
+
225
+
226
def exact_test_double_tail(y1, y2, dispersion=0, big_count=900):
    """Double-tail exact test for NB distribution.

    Port of edgeR's exactTestDoubleTail.

    Parameters
    ----------
    y1, y2 : ndarray
        (Pseudo-)count matrices for the two groups, genes x libraries.
        1-D input is treated as a single-library column.
    dispersion : float or ndarray
        NB dispersion; values <= 0 select the Poisson (binomial) path.
    big_count : int
        Genes whose group sums both exceed this threshold use the beta
        approximation instead of the exact enumeration.

    Returns
    -------
    ndarray of two-sided p-values, clipped to [0, 1].
    """
    y1 = np.asarray(y1, dtype=np.float64)
    y2 = np.asarray(y2, dtype=np.float64)
    if y1.ndim == 1:
        y1 = y1.reshape(-1, 1)
    if y2.ndim == 1:
        y2 = y2.reshape(-1, 1)

    ntags = y1.shape[0]
    n1 = y1.shape[1]
    n2 = y2.shape[1]

    # Round sums to integers: the exact test enumerates integer outcomes.
    s1 = np.round(y1.sum(axis=1)).astype(int)
    s2 = np.round(y2.sum(axis=1)).astype(int)

    dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if len(dispersion) == 1:
        dispersion = np.full(ntags, dispersion[0])

    s = s1 + s2
    mu = s / (n1 + n2)
    mu1 = n1 * mu
    mu2 = n2 * mu  # NOTE(review): mu2 is computed but not used below

    pvals = np.ones(ntags)

    # Poisson case
    pois = dispersion <= 0
    if np.any(pois):
        pvals[pois] = binom_test(s1[pois], s2[pois], p=n1 / (n1 + n2))

    # Beta approximation for large counts
    big = (s1 > big_count) & (s2 > big_count)
    if np.any(big):
        pvals[big] = _exact_test_beta_approx(y1[big], y2[big], dispersion[big])

    # NB exact test for remaining (use log-scale to avoid underflow)
    remaining = ~pois & ~big
    if np.any(remaining):
        # Scratch buffer sized for the largest total among remaining genes.
        max_s = int(np.max(s[remaining])) + 1
        work_buf = np.empty(max(max_s, 1), dtype=np.float64)
        _nb_exact_loop(s1.astype(np.int64), s2.astype(np.int64),
                       s.astype(np.int64), mu1, dispersion,
                       remaining, n1, n2, pvals, work_buf)

    return np.minimum(pvals, 1.0)
276
+
277
+
278
def exact_test_by_deviance(y1, y2, dispersion=0):
    """Exact test with a deviance-based rejection region.

    Simplified port: uses double-tail as fallback.
    """
    # Delegate to the double-tail implementation until a faithful
    # deviance-region port is available.
    return exact_test_double_tail(y1, y2, dispersion=dispersion)
284
+
285
+
286
def exact_test_by_small_p(y1, y2, dispersion=0):
    """Exact test with a small-p rejection region.

    Simplified port: uses double-tail as fallback.
    """
    # Delegate to the double-tail implementation until a faithful
    # small-p port is available.
    return exact_test_double_tail(y1, y2, dispersion=dispersion)
292
+
293
+
294
+ def _exact_test_beta_approx(y1, y2, dispersion):
295
+ """Beta approximation for exact test with large counts.
296
+
297
+ Faithful port of edgeR's exactTestBetaApprox.
298
+ """
299
+ from scipy.stats import beta as beta_dist
300
+
301
+ y1 = np.asarray(y1, dtype=np.float64)
302
+ y2 = np.asarray(y2, dtype=np.float64)
303
+ if y1.ndim == 1:
304
+ y1 = y1.reshape(-1, 1)
305
+ if y2.ndim == 1:
306
+ y2 = y2.reshape(-1, 1)
307
+
308
+ n1 = y1.shape[1]
309
+ n2 = y2.shape[1]
310
+ s1 = y1.sum(axis=1)
311
+ s2 = y2.sum(axis=1)
312
+ y = s1 + s2
313
+
314
+ ntags = len(s1)
315
+ dispersion = np.broadcast_to(np.atleast_1d(np.asarray(dispersion, dtype=np.float64)),
316
+ (ntags,)).copy()
317
+
318
+ mu = y / (n1 + n2)
319
+ pvals = np.ones(ntags)
320
+ all_zero = y <= 0
321
+
322
+ alpha1 = n1 * mu / (1.0 + dispersion * mu)
323
+ alpha2 = (n2 / n1) * alpha1
324
+
325
+ # Median of the beta distribution
326
+ med = np.zeros(ntags)
327
+ nz = ~all_zero
328
+ if np.any(nz):
329
+ med[nz] = beta_dist.median(alpha1[nz], alpha2[nz])
330
+
331
+ # Left tail with continuity correction
332
+ left = ((s1 + 0.5) / y < med) & ~all_zero
333
+ if np.any(left):
334
+ pvals[left] = 2.0 * beta_dist.cdf((s1[left] + 0.5) / y[left],
335
+ alpha1[left], alpha2[left])
336
+
337
+ # Right tail with continuity correction
338
+ right = ((s1 - 0.5) / y > med) & ~all_zero
339
+ if np.any(right):
340
+ pvals[right] = 2.0 * beta_dist.sf((s1[right] - 0.5) / y[right],
341
+ alpha1[right], alpha2[right])
342
+
343
+ return np.minimum(pvals, 1.0)
344
+
345
+
346
def equalize_lib_sizes(y, group=None, dispersion=None, lib_size=None):
    """Equalize library sizes using quantile-to-quantile transformation.

    Port of edgeR's equalizeLibSizes.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    group : array-like, optional
        Group factor.
    dispersion : float, optional
        Dispersion.
    lib_size : ndarray, optional
        Library sizes.

    Returns
    -------
    For matrix input: dict with 'pseudo.counts' and 'pseudo.lib.size'.
    For DGEList input: the DGEList itself with those two fields attached.
    """
    if isinstance(y, dict) and 'counts' in y:
        # DGEList path: pull counts, group labels and effective library
        # sizes from the object, recurse on the raw matrix, then attach
        # the results to the object.
        dge = y
        from .dgelist import valid_dgelist, get_dispersion
        dge = valid_dgelist(dge)
        if dispersion is None:
            dispersion = get_dispersion(dge)
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values
        out = equalize_lib_sizes(dge['counts'], group=dge['samples']['group'].values,
                                 dispersion=dispersion, lib_size=ls)
        dge['pseudo.counts'] = out['pseudo.counts']
        dge['pseudo.lib.size'] = out['pseudo.lib.size']
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if group is None:
        # No grouping: treat all libraries as one group.
        group = np.ones(nlibs, dtype=int)
    group = np.asarray(group)

    if dispersion is None:
        # Default dispersion when none is supplied or stored.
        dispersion = 0.05

    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    # Common (geometric-mean) library size that all libraries map onto.
    common_lib_size = np.exp(np.mean(np.log(lib_size)))
    unique_groups = np.unique(group)

    input_mean = np.zeros_like(y)
    output_mean = np.zeros_like(y)

    from .glm_fit import mglm_one_group
    for grp in unique_groups:
        j = group == grp
        # Per-group fitted abundance gives each gene's expected rate,
        # which scales to expected counts at each library size.
        beta = mglm_one_group(y[:, j], dispersion=dispersion,
                              offset=np.log(lib_size[j]))
        lam = np.exp(beta)
        input_mean[:, j] = np.outer(lam, lib_size[j])
        output_mean[:, j] = np.outer(lam, np.full(np.sum(j), common_lib_size))

    pseudo = q2q_nbinom(y, input_mean, output_mean, dispersion)
    # Quantile mapping can yield small negative values; clamp at zero.
    pseudo = np.maximum(pseudo, 0)

    return {'pseudo.counts': pseudo, 'pseudo.lib.size': common_lib_size}
414
+
415
+
416
+ def q2q_nbinom(x, input_mean, output_mean, dispersion=0):
417
+ """Quantile-to-quantile mapping between negative-binomial distributions.
418
+
419
+ Port of edgeR's q2qnbinom. Uses average of normal and gamma approximations.
420
+ """
421
+ from scipy.stats import norm, gamma as gamma_dist
422
+
423
+ x = np.asarray(x, dtype=np.float64)
424
+ input_mean = np.asarray(input_mean, dtype=np.float64)
425
+ output_mean = np.asarray(output_mean, dtype=np.float64)
426
+ dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
427
+
428
+ if dispersion.size == 1:
429
+ d = dispersion[0]
430
+ else:
431
+ d = dispersion
432
+
433
+ eps = 1e-14
434
+ zero = (input_mean < eps) | (output_mean < eps)
435
+ input_mean = np.where(zero, input_mean + 0.25, input_mean)
436
+ output_mean = np.where(zero, output_mean + 0.25, output_mean)
437
+
438
+ if np.isscalar(d):
439
+ d_arr = d
440
+ else:
441
+ if d.ndim == 1 and x.ndim == 2:
442
+ d_arr = d[:, None]
443
+ else:
444
+ d_arr = d
445
+
446
+ ri = 1 + d_arr * input_mean
447
+ vi = input_mean * ri
448
+ ro = 1 + d_arr * output_mean
449
+ vo = output_mean * ro
450
+
451
+ i = x >= input_mean
452
+ j = ~i
453
+
454
+ p1 = np.zeros_like(x)
455
+ p2 = np.zeros_like(x)
456
+ q1 = np.zeros_like(x)
457
+ q2 = np.zeros_like(x)
458
+
459
+ # Upper tail (x >= input_mean)
460
+ if np.any(i):
461
+ p1[i] = norm.logsf(x[i], loc=input_mean[i], scale=np.sqrt(np.maximum(vi[i], eps)))
462
+ shape_i = input_mean[i] / np.maximum(ri[i], eps)
463
+ scale_i = ri[i]
464
+ p2[i] = gamma_dist.logsf(x[i], a=shape_i, scale=scale_i)
465
+ q1[i] = norm.isf(np.exp(p1[i]), loc=output_mean[i], scale=np.sqrt(np.maximum(vo[i], eps)))
466
+ shape_o = output_mean[i] / np.maximum(ro[i], eps)
467
+ scale_o = ro[i]
468
+ q2[i] = gamma_dist.isf(np.exp(p2[i]), a=shape_o, scale=scale_o)
469
+
470
+ # Lower tail (x < input_mean)
471
+ if np.any(j):
472
+ p1[j] = norm.logcdf(x[j], loc=input_mean[j], scale=np.sqrt(np.maximum(vi[j], eps)))
473
+ shape_i = input_mean[j] / np.maximum(ri[j], eps)
474
+ scale_i = ri[j]
475
+ p2[j] = gamma_dist.logcdf(x[j], a=shape_i, scale=scale_i)
476
+ q1[j] = norm.ppf(np.exp(p1[j]), loc=output_mean[j], scale=np.sqrt(np.maximum(vo[j], eps)))
477
+ shape_o = output_mean[j] / np.maximum(ro[j], eps)
478
+ scale_o = ro[j]
479
+ q2[j] = gamma_dist.ppf(np.exp(p2[j]), a=shape_o, scale=scale_o)
480
+
481
+ return (q1 + q2) / 2
482
+
483
+
484
def split_into_groups(y, group=None):
    """Split a count matrix into a list of matrices by group.

    Port of edgeR's splitIntoGroups.

    Parameters
    ----------
    y : array-like
        Count matrix (genes x libraries). 1-D input is treated as a
        single gene (one row).
    group : array-like, optional
        Group label per library. If None, the whole matrix is returned
        as a single-element list.

    Returns
    -------
    list of ndarray
        One genes x n_g matrix per unique group, in sorted label order.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)

    if group is None:
        return [y]

    group = np.asarray(group)
    # Boolean column selection on a 2-D array always yields a 2-D result,
    # so no per-group reshape is needed (the original ndim==1 branch was
    # unreachable).
    return [y[:, group == g] for g in np.unique(group)]
508
+
509
+
510
def split_into_groups_pseudo(pseudo, group, pair):
    """Extract data for two groups from pseudo-count matrix.

    Port of edgeR's splitIntoGroupsPseudo.
    """
    mat = np.asarray(pseudo, dtype=np.float64)
    labels = np.asarray(group)

    # Select the columns belonging to each half of the pair, keeping the
    # result two-dimensional (one column per library).
    out = {}
    for key, label in (('y1', pair[0]), ('y2', pair[1])):
        cols = mat[:, labels == label]
        if cols.ndim == 1:
            cols = cols.reshape(-1, 1)
        out[key] = cols
    return out