edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edgepython/utils.py ADDED
@@ -0,0 +1,1050 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Utility functions for edgePython.
4
+
5
+ Port of edgeR utility functions: expandAsMatrix, addPriorCount, movingAverageByCol,
6
+ predFC, goodTuring, thinCounts, gini, cutWithMinN, sumTechReps, systematicSubset,
7
+ nearestReftoX, getPriorN, zscoreNBinom, binomTest, dropEmptyLevels, etc.
8
+ """
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from scipy import stats, special
13
+ from .compressed_matrix import CompressedMatrix, compress_offsets, compress_prior
14
+ import warnings
15
+
16
+
17
def expand_as_matrix(x, dim=None, byrow=True):
    """Expand a scalar, vector, or matrix into a matrix of given dimensions.

    Port of edgeR's expandAsMatrix.

    Parameters
    ----------
    x : scalar, 1-D or 2-D array-like, or CompressedMatrix
        Value(s) to expand.
    dim : tuple of (nrow, ncol), optional
        Target dimensions. If None, ``x`` is simply promoted to 2-D.
    byrow : bool
        For a vector whose length equals both dimensions of a square
        target, decides whether it is replicated as rows (True) or
        columns (False).
    """
    if dim is None:
        return np.atleast_2d(np.asarray(x, dtype=np.float64))
    nrow, ncol = int(dim[0]), int(dim[1])

    if isinstance(x, CompressedMatrix):
        return expand_as_matrix(x.as_matrix(), dim=(nrow, ncol), byrow=byrow)

    arr = np.asarray(x, dtype=np.float64)

    # Scalars broadcast to every cell.
    if arr.ndim == 0 or arr.size == 1:
        return np.full((nrow, ncol), arr.ravel()[0])

    if arr.ndim <= 1:
        length = len(arr)
        if length == nrow and length == ncol:
            # Square target is ambiguous: byrow decides the orientation.
            if byrow:
                return np.tile(arr.reshape(1, -1), (nrow, 1))
            return np.tile(arr.reshape(-1, 1), (1, ncol))
        if length == ncol:
            return np.tile(arr.reshape(1, -1), (nrow, 1))
        if length == nrow:
            return np.tile(arr.reshape(-1, 1), (1, ncol))
        raise ValueError("x of unexpected length")

    if arr.ndim == 2:
        if arr.shape == (nrow, ncol):
            return arr.copy()
        raise ValueError("x is matrix of wrong size")

    raise ValueError("x has wrong dimensions")
47
+
48
+
49
def add_prior_count(y, lib_size=None, offset=None, prior_count=1):
    """Add library-size-adjusted prior counts.

    Port of edgeR's addPriorCount.  The prior added to each observation is
    scaled by its (effective) library size relative to the mean library
    size, so the prior does not distort fold-changes between libraries.
    The offsets are enlarged by twice the added prior to keep downstream
    normalization consistent (matching the edgeR C code).

    Parameters
    ----------
    y : array-like
        Count matrix (genes x libraries); 1-D input is treated as one gene.
    lib_size : array-like, optional
        Library sizes; defaults to column sums of ``y``.  Ignored when
        ``offset`` is supplied.
    offset : array-like, optional
        Log effective library sizes, either per library (1-D) or per
        observation (2-D).  Defaults to ``log(lib_size)``.
    prior_count : float or array-like
        Prior count; a vector is interpreted as one value per gene.

    Returns
    -------
    dict with 'y' (counts plus scaled priors) and 'offset' (adjusted
    offsets, same dimensionality as the input offset).
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)

    if offset is None:
        if lib_size is None:
            lib_size = y.sum(axis=0)
        offset = np.log(lib_size)
    offset = np.atleast_1d(np.asarray(offset, dtype=np.float64))

    prior_count = np.atleast_1d(np.asarray(prior_count, dtype=np.float64))

    if offset.ndim == 1:
        # Per-library offsets: keep the returned offset 1-D.
        lib = np.exp(offset)
        mean_lib = np.mean(lib)
        if prior_count.size == 1:
            pc = prior_count.ravel()[0]
            scaled_prior = np.tile(pc * lib / mean_lib, (y.shape[0], 1))
        else:
            # Gene-specific priors: one scaled prior row per gene; the
            # shared 1-D offset uses the average prior.
            pc = np.mean(prior_count)
            scaled_prior = prior_count.reshape(-1, 1) * lib / mean_lib
        y_aug = y + scaled_prior
        # log(lib + 2 * prior * lib / mean(lib)), as in edgeR's C code.
        offset_aug = np.log(lib + 2.0 * pc * lib / mean_lib)
    else:
        # Observation-specific offsets: work entirely in matrix form.
        lib_mat = np.exp(offset)
        mean_lib = np.mean(lib_mat)
        if prior_count.size == 1:
            prior_mat = np.full(y.shape, prior_count.ravel()[0])
        elif prior_count.ndim == 1 and len(prior_count) == y.shape[0]:
            prior_mat = np.tile(prior_count.reshape(-1, 1), (1, y.shape[1]))
        else:
            prior_mat = expand_as_matrix(prior_count, dim=y.shape, byrow=False)
        scaled_prior = prior_mat * lib_mat / mean_lib
        y_aug = y + scaled_prior
        offset_aug = np.log(lib_mat + 2.0 * scaled_prior)

    return {'y': y_aug, 'offset': offset_aug}
109
+
110
+
111
def moving_average_by_col(x, width=5, full_length=True):
    """Moving average smoother for columns of a matrix.

    Port of edgeR's movingAverageByCol.

    Parameters
    ----------
    x : array-like
        Matrix (rows smoothed within each column); 1-D input is treated
        as a single column.
    width : int
        Window width; values <= 1 return ``x`` unchanged, and widths
        larger than the number of rows are clipped.
    full_length : bool
        If True, the result has the same number of rows as ``x``, with
        shrunken windows at the boundaries; if False, only full windows
        are returned.

    Returns
    -------
    ndarray of column-wise moving averages.
    """
    x = np.asarray(x, dtype=np.float64)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    width = int(width)
    if width <= 1:
        return x
    n, m = x.shape
    if width > n:
        width = n

    if full_length:
        # Zero-pad both ends so every output row has a window; the
        # padded zeros are compensated by the shrunken weights below.
        half1 = (width + 1) // 2
        half2 = width // 2
        x_pad = np.vstack([np.zeros((half1, m)), x, np.zeros((half2, m))])
    else:
        if width == n:
            # Single full window: the plain column means.
            return np.tile(x.mean(axis=0), (1, 1))
        x_pad = np.vstack([np.zeros((1, m)), x])

    # Windowed sums via cumulative-sum differences (one pass per column).
    cs = np.cumsum(x_pad, axis=0)
    n2 = cs.shape[0]
    result = cs[width:n2] - cs[:n2 - width]
    n3 = result.shape[0]

    # Effective window sizes: full width in the interior, fewer terms at
    # the boundaries where the window overlaps the zero padding.
    w = np.full(n3, width, dtype=np.float64)
    if full_length:
        if half1 > 1:
            w[:half1 - 1] = width - np.arange(half1 - 1, 0, -1)
        w[n3 - half2:] = width - np.arange(1, half2 + 1)

    return result / w.reshape(-1, 1)
147
+
148
+
149
def pred_fc(y, design, prior_count=0.125, offset=None, dispersion=0, weights=None):
    """Shrunken (predicted) log2-fold-changes.

    Port of edgeR's predFC: augment the counts with a small library-size
    scaled prior, refit the GLM with no further prior, and return the
    coefficients converted to the log2 scale.
    """
    from .glm_fit import glm_fit

    augmented = add_prior_count(y, offset=offset, prior_count=prior_count)
    fit = glm_fit(augmented['y'], np.asarray(design, dtype=np.float64),
                  offset=augmented['offset'], dispersion=dispersion,
                  prior_count=0, weights=weights)
    return fit['coefficients'] / np.log(2)
161
+
162
+
163
def good_turing(x, conf=1.96):
    """Simple Good-Turing frequency estimation.

    Faithful port of edgeR's goodTuring (R wrapper + C code in good_turing.c).

    Parameters
    ----------
    x : array-like of int
        Observed species counts (zeros allowed).
    conf : float
        Confidence factor (number of standard deviations) deciding when to
        switch from the direct Turing estimate to the smoothed estimate.

    Returns
    -------
    dict with keys:
        'count' : sorted distinct positive count values r
        'n' : number of species seen exactly r times
        'n0' : number of species with zero counts
        'proportion' : smoothed proportion estimate per r
        'P0' : estimated total probability of unseen species
    """
    x = np.asarray(x, dtype=int)

    # Tabulate frequencies — matches R's goodTuring R wrapper
    max_x = x.max()
    if max_x < len(x):
        # Dense tabulation is cheap when the max count is small.
        # np.bincount(x): index i = count of value i in x
        bc = np.bincount(x)
        n0 = bc[0] if len(bc) > 0 else 0
        n = bc[1:]  # counts for values 1, 2, ..., max_x
        r = np.arange(1, len(n) + 1)
        mask = n > 0
        r = r[mask]
        n = n[mask]
    else:
        # Sparse tabulation via unique for large count values.
        r_unique, counts = np.unique(x, return_counts=True)
        sort_idx = np.argsort(r_unique)
        r_unique = r_unique[sort_idx]
        counts = counts[sort_idx]
        if r_unique[0] == 0:
            n0 = counts[0]
            r = r_unique[1:]
            n = counts[1:]
        else:
            n0 = 0
            r = r_unique
            n = counts

    if len(r) == 0:
        # No positive counts at all: empty result.
        return {'count': r, 'n': n, 'n0': n0, 'proportion': np.array([]),
                'P0': 0.0}

    r = r.astype(np.int64)
    n = n.astype(np.int64)
    nr = len(r)
    last = nr - 1

    # --- Port of good_turing.c ---
    # Compute bigN, Z values, and linear regression in one pass
    bigN = 0.0
    log_obs = np.log(r.astype(float))
    meanX = 0.0
    meanY = 0.0
    XYs = 0.0
    Xsquares = 0.0

    for i in range(nr):
        bigN += float(r[i]) * float(n[i])

        prev_obs = 0 if i == 0 else r[i - 1]
        logO = log_obs[i]

        # Z_r = 2*n_r / (r_{next} - r_{prev}), doubled spacing at the end.
        xx = (2 * (r[i] - prev_obs)) if i == last else (r[i + 1] - prev_obs)
        logZ = np.log(2.0 * n[i]) - np.log(float(xx))

        meanX += logO
        meanY += logZ
        XYs += logO * logZ
        Xsquares += logO * logO

    # Least-squares slope of log(Z) on log(r).
    meanX /= nr
    meanY /= nr
    XYs -= meanX * meanY * nr
    Xsquares -= meanX * meanX * nr

    slope = XYs / Xsquares if Xsquares != 0 else 0.0

    # P0: only nonzero if first observed count is 1
    P0 = 0.0 if (nr == 0 or r[0] != 1) else float(n[0]) / bigN

    # Compute r* values with indiffValsSeen logic
    out = np.zeros(nr)
    bigNprime = 0.0
    indiff_vals_seen = False

    for i in range(nr):
        next_obs = r[i] + 1
        # Turing estimate (intercept cancels out)
        y = float(next_obs) * np.exp(slope * (np.log(float(next_obs)) - log_obs[i]))

        # Once a gap appears in the count values, the direct estimate is
        # no longer available and the smoothed estimate is used from here on.
        if i == last or r[i + 1] != next_obs:
            indiff_vals_seen = True

        if not indiff_vals_seen:
            # Direct estimate
            x_direct = float(next_obs) * float(n[i + 1]) / float(n[i])
            # Switch to the smoothed estimate once the two agree within
            # conf standard deviations.
            if abs(x_direct - y) <= conf * x_direct * np.sqrt(
                    1.0 / float(n[i + 1]) + 1.0 / float(n[i])):
                indiff_vals_seen = True
            else:
                out[i] = x_direct

        if indiff_vals_seen:
            out[i] = y

        bigNprime += out[i] * float(n[i])

    # Normalize to proportions
    factor = (1.0 - P0) / bigNprime if bigNprime > 0 else 0.0
    proportion = out * factor

    return {
        'count': r,
        'n': n,
        'n0': n0,
        'proportion': proportion,
        'P0': P0
    }
275
+
276
+
277
def good_turing_proportions(counts):
    """Transform counts into Good-Turing proportions, column by column.

    Port of edgeR's goodTuringProportions.  Zero counts receive an equal
    share of the estimated unseen-species probability (P0 / n0); positive
    counts are mapped to their smoothed proportion.
    """
    counts = np.asarray(counts, dtype=int)
    z = counts.astype(float).copy()
    if z.ndim == 1:
        z = z.reshape(-1, 1)
    for j in range(z.shape[1]):
        gt = good_turing(counts[:, j] if counts.ndim == 2 else counts)
        share0 = gt['P0'] / gt['n0'] if gt['n0'] > 0 else 0
        is_zero = z[:, j] == 0
        z[is_zero, j] = share0
        observed = ~is_zero
        if np.any(observed):
            # Each observed count appears in gt['count'], so searchsorted
            # locates its exact position.
            idx = np.searchsorted(gt['count'], z[observed, j].astype(int))
            idx = np.clip(idx, 0, len(gt['proportion']) - 1)
            z[observed, j] = gt['proportion'][idx]
    return z
298
+
299
+
300
def thin_counts(x, prob=None, target_size=None):
    """Thin counts by binomial or multinomial sampling.

    Port of edgeR's thinCounts.  With ``prob``, every count is binomially
    thinned with that retention probability; otherwise each column is
    multinomially down-sampled to ``target_size`` (default: the smallest
    column sum).
    """
    x = np.asarray(x, dtype=int).copy()
    if prob is not None:
        return np.random.binomial(x, prob)

    if x.ndim == 1:
        x = x.reshape(-1, 1)
    if target_size is None:
        target_size = x.sum(axis=0).min()
    target_size = np.atleast_1d(np.asarray(target_size, dtype=int))
    if len(target_size) == 1:
        target_size = np.full(x.shape[1], target_size[0])
    actual_size = x.sum(axis=0)
    if np.any(target_size > actual_size):
        raise ValueError("target_size bigger than actual size")
    for col in range(x.shape[1]):
        excess = actual_size[col] - target_size[col]
        if excess > 0:
            # Remove 'excess' reads, proportionally to current counts.
            weights = x[:, col].astype(float)
            weights /= weights.sum()
            removed = np.random.multinomial(excess, weights)
            x[:, col] -= removed
    return np.maximum(x, 0)
328
+
329
+
330
def gini(x):
    """Gini diversity index for each column of a matrix.

    Port of edgeR's gini.  For sorted column values x_(1) <= ... <= x_(n),
    the Gini coefficient is

        G = (2 * sum(i * x_(i)) / sum(x) - n - 1) / n.

    The previous implementation subtracted an arbitrary constant (0.75*n)
    from the ranks and added it back; the terms cancel algebraically, so
    the constant has been removed for clarity.  Behavior is unchanged;
    columns summing to zero (or negative) return 0.
    """
    x = np.asarray(x, dtype=np.float64)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    n = x.shape[0]
    ranks = np.arange(1, n + 1)
    result = np.zeros(x.shape[1])
    for j in range(x.shape[1]):
        xs = np.sort(x[:, j])
        total = xs.sum()
        if total > 0:
            result[j] = (2.0 * np.sum(ranks * xs) / total - n - 1) / n
    return result
349
+
350
+
351
def cut_with_min_n(x, intervals=2, min_n=1):
    """Cut a numeric vector into intervals, each with at least min_n values.

    Port of edgeR's cutWithMinN.  Tries equally spaced breaks first, then a
    blend of quantile and equal-width breaks, finally falling back to
    splitting the order statistics into near-equal groups.  Returns a dict
    with 'group' (1-based bin labels, NaN where x was NaN) and 'breaks'.
    """
    x = np.asarray(x, dtype=np.float64)
    missing = np.isnan(x)
    if np.any(missing):
        # Cut the non-missing values; leave NaN group labels elsewhere.
        sub = cut_with_min_n(x[~missing], intervals=intervals, min_n=min_n)
        group = np.full(len(x), np.nan)
        group[~missing] = sub['group']
        return {'group': group, 'breaks': sub['breaks']}

    intervals = int(intervals)
    min_n = int(min_n)
    nx = len(x)

    if nx < intervals * min_n:
        raise ValueError("too few observations: length(x) < intervals*min_n")

    if intervals == 1:
        return {'group': np.ones(nx, dtype=int), 'breaks': None}

    # Tiny jitter breaks ties between identical values.
    x_jit = x + 1e-10 * (np.random.uniform(size=nx) - 0.5)

    # Attempt 1: equally spaced breaks.
    breaks = np.linspace(x_jit.min() - 1, x_jit.max() + 1, intervals + 1)
    bins = np.digitize(x_jit, breaks[1:-1])
    counts = np.bincount(bins, minlength=intervals)
    if np.all(counts >= min_n):
        return {'group': bins + 1, 'breaks': breaks}

    # Attempt 2: blend quantile breaks with the equal-width ones, giving
    # the quantiles progressively more weight.
    quantiles = np.quantile(x_jit, np.linspace(0, 1, intervals + 1))
    quantiles[0] -= 1
    quantiles[-1] += 1
    for weight in np.linspace(0.1, 1.0, 10):
        blended = weight * quantiles + (1 - weight) * breaks
        bins = np.digitize(x_jit, blended[1:-1])
        counts = np.bincount(bins, minlength=intervals)
        if np.all(counts >= min_n):
            return {'group': bins + 1, 'breaks': blended}

    # Fallback: split the sorted values into (nearly) equal-sized groups.
    order = np.argsort(x_jit)
    base, extra = divmod(nx, intervals)
    sizes = np.full(intervals, base)
    sizes[:extra] += 1
    group = np.zeros(nx, dtype=int)
    group[order] = np.repeat(np.arange(1, intervals + 1), sizes)
    return {'group': group, 'breaks': quantiles}
406
+
407
+
408
def sum_tech_reps(x, ID=None):
    """Sum counts over technical-replicate columns sharing the same ID.

    Port of edgeR's sumTechReps.  Accepts either a plain count matrix or a
    DGEList-like dict with a 'counts' entry.  For a DGEList, numeric sample
    metadata columns are summed ('norm.factors' is averaged instead) and
    non-numeric columns take the first value per ID.
    """
    if isinstance(x, dict) and 'counts' in x:
        if ID is None:
            raise ValueError("No sample IDs")
        ID = np.asarray(ID)
        unique_ids, inverse = np.unique(ID, return_inverse=True)
        if len(unique_ids) == len(ID):
            # Every column has a distinct ID: nothing to collapse.
            return x

        from copy import deepcopy
        y = deepcopy(x)

        # Collapse counts by summing columns with the same ID.
        summed = np.zeros((x['counts'].shape[0], len(unique_ids)))
        for col, uid in enumerate(unique_ids):
            summed[:, col] = x['counts'][:, ID == uid].sum(axis=1)
        y['counts'] = summed

        if 'samples' in y:
            collapsed = pd.DataFrame(index=unique_ids)
            for col in y['samples'].columns:
                vals = y['samples'][col].values
                is_numeric = (not isinstance(vals[0], str)) and \
                    np.issubdtype(type(vals[0]), np.number)
                if is_numeric:
                    agg = np.array([vals[ID == uid].sum() for uid in unique_ids])
                    if col == 'norm.factors':
                        # Normalization factors are averaged, not summed.
                        reps = np.array([np.sum(ID == uid) for uid in unique_ids])
                        agg = agg / reps
                    collapsed[col] = agg
                else:
                    collapsed[col] = [vals[ID == uid][0] for uid in unique_ids]
            y['samples'] = collapsed
        return y

    # Plain matrix input.
    x = np.asarray(x, dtype=np.float64)
    if ID is None:
        raise ValueError("No sample IDs")
    ID = np.asarray(ID)
    unique_ids = np.unique(ID)
    out = np.zeros((x.shape[0], len(unique_ids)))
    for col, uid in enumerate(unique_ids):
        out[:, col] = x[:, ID == uid].sum(axis=1)
    return out
458
+
459
+
460
def systematic_subset(n, order_by):
    """Take a systematic (evenly spaced) subset of indices, ordered by a covariate.

    Port of edgeR's systematicSubset.  Returns roughly ``n`` indices evenly
    spaced through the ranking of ``order_by``; if the requested size is at
    least half the total, all indices are returned.
    """
    order_by = np.asarray(order_by)
    ntotal = len(order_by)
    step = ntotal // n
    if step <= 1:
        return np.arange(ntotal)
    start = step // 2  # begin mid-stratum rather than at the extreme
    ranked = np.argsort(order_by)
    return ranked[np.arange(start, ntotal, step)]
474
+
475
+
476
def nearest_ref_to_x(x, reference):
    """Index of the nearest sorted-reference value for each element of x.

    Port of edgeR's nearestReftoX: the midpoints between consecutive sorted
    reference values act as bin edges, so each x falls to the reference
    point it lies closest to.
    """
    ref_sorted = np.sort(reference)
    edges = 0.5 * (ref_sorted[:-1] + ref_sorted[1:])
    return np.searchsorted(edges, x)
484
+
485
+
486
def get_prior_n(y, design=None, prior_df=20):
    """Convert prior degrees of freedom into a prior.n value.

    Port of edgeR's getPriorN: prior.n = prior.df / residual.df, where the
    residual df is libraries minus fitted parameters.  When there are no
    residual df, prior_df itself is returned.
    """
    if isinstance(y, dict):
        # DGEList-like input: library count from 'counts', parameters from
        # the design or the number of groups.
        nlibs = y['counts'].shape[1] if 'counts' in y else 0
        if design is not None:
            npar = design.shape[1]
        else:
            npar = len(y['samples']['group'].unique()) if 'samples' in y else 1
    else:
        if design is None:
            raise ValueError("design must be provided for matrix input")
        nlibs = np.asarray(y).shape[1]
        npar = design.shape[1]

    residual = nlibs - npar
    if residual <= 0:
        return prior_df
    return prior_df / residual
507
+
508
+
509
def zscore_nbinom(q, size, mu, method='midp'):
    """Z-score equivalents for negative binomial deviates.

    Port of edgeR's zscoreNBinom using the mid-p correction: half the
    probability mass at the rounded observation (adjusted by the fractional
    part of q) is added to the tail probability, which is then converted to
    a standard normal quantile.  Upper-tail observations get positive
    z-scores.

    Parameters
    ----------
    q : array-like
        Observed NB deviates (need not be integers).
    size : array-like
        NB size parameter(s), broadcast to len(q).
    mu : array-like
        NB mean parameter(s), broadcast to len(q).
    method : str
        Kept for API compatibility; only the mid-p method is implemented.

    Returns
    -------
    ndarray of z-scores (0 where mu or size is non-positive).
    """
    # Hoisted out of the per-element loop: the original re-imported
    # logsumexp on every iteration of two branches.
    from .limma_port import logsumexp

    q = np.asarray(q, dtype=np.float64)
    size = np.atleast_1d(np.asarray(size, dtype=np.float64))
    mu = np.atleast_1d(np.asarray(mu, dtype=np.float64))
    n = len(q)
    size = np.broadcast_to(size, n).copy()
    mu = np.broadcast_to(mu, n).copy()

    z = np.zeros(n)
    qr = np.round(q).astype(int)

    for i in range(n):
        if mu[i] <= 0 or size[i] <= 0:
            z[i] = 0
            continue
        # scipy parameterization: success probability p = size / (size + mu).
        p_nb = size[i] / (size[i] + mu[i])
        logd = stats.nbinom.logpmf(qr[i], size[i], p_nb)
        if qr[i] == 0:
            # Lower tail is just the mid-p share of the mass at zero.
            w = (q[i] - qr[i]) + 0.5
            logp = logd + np.log(max(w, 1e-300))
            z[i] = stats.norm.ppf(np.exp(logp)) if np.exp(logp) < 1 else 0
        elif q[i] >= mu[i]:
            # Upper tail: P(X > qr) plus the mid-p share of P(X == qr).
            logp_tail = stats.nbinom.logsf(qr[i], size[i], p_nb)
            w = 0.5 - (q[i] - qr[i])
            logp = logsumexp(logp_tail, logd + np.log(max(w, 1e-300)))
            z[i] = -stats.norm.ppf(np.exp(logp)) if np.exp(logp) < 1 else 0
        else:
            # Lower tail: P(X <= qr-1) plus the mid-p share of P(X == qr).
            logp_tail = stats.nbinom.logcdf(max(qr[i] - 1, 0), size[i], p_nb)
            w = (q[i] - qr[i]) + 0.5
            logp = logsumexp(logp_tail, logd + np.log(max(w, 1e-300)))
            z[i] = stats.norm.ppf(np.exp(logp)) if np.exp(logp) < 1 else 0

    return z
547
+
548
+
549
def binom_test(y1, y2, n1=None, n2=None, p=None):
    """Exact binomial tests for many genes at once.

    Port of edgeR's binomTest.  For each gene, tests whether the split of
    its counts between the two libraries differs from the expected
    proportion ``p``.  Totals above 10000 use a chi-square approximation
    instead of the exact calculation.
    """
    y1 = np.asarray(y1, dtype=int)
    y2 = np.asarray(y2, dtype=int)
    if len(y1) != len(y2):
        raise ValueError("y1 and y2 must have same length")

    if n1 is None:
        n1 = np.sum(y1)
    if n2 is None:
        n2 = np.sum(y2)
    if p is None:
        p = n1 / (n1 + n2)

    totals = y1 + y2
    ngenes = len(y1)
    pvalue = np.ones(ngenes)

    if p == 0.5:
        # Symmetric case: double the smaller tail probability.
        for g in range(ngenes):
            if totals[g] > 0:
                smaller = min(y1[g], y2[g])
                pvalue[g] = min(2 * stats.binom.cdf(smaller, totals[g], 0.5), 1.0)
        return pvalue

    for g in range(ngenes):
        if totals[g] == 0:
            pvalue[g] = 1.0
        elif totals[g] > 10000:
            # Chi-square approximation for large totals.
            table = np.array([[y1[g], y2[g]], [n1 - y1[g], n2 - y2[g]]])
            pvalue[g] = stats.chi2_contingency(table, correction=False)[1]
        else:
            # Method of small probabilities: sum all outcomes no more
            # probable than the observed one.
            density = stats.binom.pmf(np.arange(totals[g] + 1), totals[g], p)
            observed = stats.binom.pmf(y1[g], totals[g], p)
            pvalue[g] = np.sum(density[density <= observed + 1e-15])

    return np.minimum(pvalue, 1.0)
591
+
592
+
593
def drop_empty_levels(x):
    """Drop unused categories from a categorical variable.

    Port of edgeR's dropEmptyLevels.  Non-categorical input is converted to
    a pandas Categorical (which, built from the data, has no empty levels).
    """
    if isinstance(x, pd.Categorical):
        return x.remove_unused_categories()
    return pd.Categorical(x)
601
+
602
+
603
def design_as_factor(design):
    """Label the unique rows of a design matrix with integer codes.

    Port of edgeR's designAsFactor: rows are hashed via an inner product
    with powers of an irrational base, so distinct rows map to distinct
    values (up to floating-point coincidence), then coded by first
    occurrence in sorted hash order.
    """
    design = np.asarray(design, dtype=np.float64)
    base = (np.e + np.pi) / 5
    hashes = design @ (base ** np.arange(design.shape[1]))
    return np.unique(hashes, return_inverse=True)[1]
614
+
615
+
616
def residual_df(zero_fit, design):
    """Residual degrees of freedom, discounting libraries fitted exactly at zero.

    Port of edgeR's .residDF.  ``zero_fit`` flags observations whose fitted
    value is exactly zero; those libraries carry no information, so the
    design matrix is re-ranked on the remaining libraries.
    """
    zero_fit = np.asarray(zero_fit, dtype=bool)
    nlibs = zero_fit.shape[1] if zero_fit.ndim == 2 else len(zero_fit)
    base_df = nlibs - design.shape[1]

    if zero_fit.ndim == 1:
        return max(base_df - int(np.sum(zero_fit)), 0)

    ngenes = zero_fit.shape[0]
    df = np.full(ngenes, base_df, dtype=np.float64)
    for g in range(ngenes):
        nzero = int(np.sum(zero_fit[g]))
        if nzero == 0:
            continue
        if nzero >= nlibs - 1:
            # At most one informative library left: no residual df.
            df[g] = 0
            continue
        # Drop the zero libraries and recompute the design rank.
        nonzero = ~zero_fit[g]
        df[g] = np.sum(nonzero) - np.linalg.matrix_rank(design[nonzero])
    return df
649
+
650
+
651
def scale_offset(y, offset):
    """Center offsets so they are on the same scale as log library sizes.

    Port of edgeR's scaleOffset.  ``y`` may be a DGEList-like dict (the
    scaled offsets are stored under 'offset'), a 2-D count matrix (library
    sizes are its column sums), or a vector of library sizes.
    """
    if isinstance(y, dict) and 'counts' in y:
        # DGEList: use effective library sizes and store the result.
        eff_lib = y['samples']['lib.size'].values * y['samples']['norm.factors'].values
        y['offset'] = scale_offset(eff_lib, offset)
        return y

    if isinstance(y, np.ndarray) and y.ndim == 2:
        lib_size = y.sum(axis=0)
    else:
        lib_size = np.asarray(y, dtype=np.float64)

    offset = np.asarray(offset, dtype=np.float64)
    # Center per gene for matrix offsets, globally for vector offsets.
    if offset.ndim == 2:
        center = offset.mean(axis=1, keepdims=True)
    else:
        center = np.mean(offset)
    return np.mean(np.log(lib_size)) + offset - center
674
+
675
+
676
+ def _model_matrix_group(group):
677
+ """Create a model matrix from a group factor (model.matrix(~group) equivalent).
678
+
679
+ Returns an intercept + dummy-coded design matrix.
680
+ """
681
+ group = np.asarray(group)
682
+ unique_groups = np.unique(group)
683
+ n = len(group)
684
+ ngroups = len(unique_groups)
685
+
686
+ if ngroups <= 1:
687
+ return np.ones((n, 1))
688
+
689
+ # Intercept + (ngroups - 1) dummy columns
690
+ design = np.zeros((n, ngroups))
691
+ design[:, 0] = 1.0 # intercept
692
+ for i in range(1, ngroups):
693
+ design[group == unique_groups[i], i] = 1.0
694
+
695
+ return design
696
+
697
+
698
def model_matrix(formula, data=None):
    """Create a design matrix from an R-style formula.

    Uses patsy to parse the formula and build the design matrix,
    matching R's ``model.matrix(formula, data)`` behaviour.

    Parameters
    ----------
    formula : str
        R-style formula, e.g. ``'~ group'``, ``'~ batch + condition'``,
        ``'~ 0 + group'`` (no intercept).
    data : DataFrame, dict, ndarray, scipy.sparse, or Series
        Sample-level data. Column names are used as variables in
        the formula.

        - **DataFrame**: used directly.
        - **dict**: converted to DataFrame (keys → column names).
        - **ndarray**: columns named ``x0, x1, …`` automatically.
        - **scipy.sparse**: densified, then treated as ndarray.
        - **Series**: wrapped in a single-column DataFrame whose
          column name is the Series ``.name`` (or ``x0``).

    Returns
    -------
    ndarray
        Design matrix (samples x coefficients), dtype float64.

    Raises
    ------
    ImportError
        If patsy is not installed.
    ValueError
        If ``data`` is None.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'group': ['A','A','B','B'], 'batch': [1,2,1,2]})
    >>> model_matrix('~ group', df)
    array([[1., 0.],
           [1., 0.],
           [1., 1.],
           [1., 1.]])
    >>> model_matrix('~ 0 + group', df)  # no intercept
    array([[1., 0.],
           [1., 0.],
           [0., 1.],
           [0., 1.]])
    """
    # patsy is an optional dependency: only required for the formula API.
    try:
        import patsy
    except ImportError:
        raise ImportError(
            "patsy package required for formula interface. "
            "Install with: pip install patsy"
        )

    if data is None:
        raise ValueError("data must be provided for formula-based design")

    # Convert various types to DataFrame
    if isinstance(data, dict):
        data = pd.DataFrame(data)
    elif isinstance(data, pd.Series):
        # Preserve the Series name as the variable name when available.
        name = data.name if data.name is not None else 'x0'
        data = pd.DataFrame({name: data.values})
    elif isinstance(data, np.ndarray):
        # Auto-name columns x0, x1, ... so they can appear in the formula.
        if data.ndim == 1:
            data = pd.DataFrame({'x0': data})
        else:
            cols = {f'x{i}': data[:, i] for i in range(data.shape[1])}
            data = pd.DataFrame(cols)
    elif not isinstance(data, pd.DataFrame):
        # scipy.sparse or other array-like
        if hasattr(data, 'toarray'):
            data = data.toarray()
        data = np.asarray(data)
        if data.ndim == 1:
            data = pd.DataFrame({'x0': data})
        else:
            cols = {f'x{i}': data[:, i] for i in range(data.shape[1])}
            data = pd.DataFrame(cols)

    # patsy handles intercept/contrast coding to match R's model.matrix.
    design_info = patsy.dmatrix(formula, data=data, return_type='dataframe')
    return np.asarray(design_info, dtype=np.float64)
776
+
777
+
778
+ def _resolve_design(design, y):
779
+ """Resolve design argument: formula string → numpy array.
780
+
781
+ If *design* is a string it is treated as an R-style formula and
782
+ evaluated against the sample metadata in *y* (which must be a
783
+ DGEList with a 'samples' key). Otherwise *design* is returned
784
+ as-is.
785
+ """
786
+ if not isinstance(design, str):
787
+ return design
788
+
789
+ if not (isinstance(y, dict) and 'samples' in y):
790
+ raise ValueError(
791
+ "Formula design requires a DGEList with sample metadata. "
792
+ "Pass a DGEList or use model_matrix() explicitly."
793
+ )
794
+ return model_matrix(design, y['samples'])
795
+
796
+
797
def model_matrix_meth(object, design=None):
    """Expanded design matrix for BS-seq methylation analysis.

    Port of edgeR's ``modelMatrixMeth``.  Given a sample-level treatment
    design (``nsamples x p``), builds the design for a methylation DGEList
    whose columns alternate ``S1-Me, S1-Un, S2-Me, S2-Un, ...``:

    * left block (``nsamples`` columns): per-sample indicator, with a 1 in
      both the Me and Un row of each sample;
    * right block (``p`` columns): the treatment design on Me (even) rows,
      zeros on Un (odd) rows.

    Parameters
    ----------
    object : DGEList or ndarray
        Either the sample-level treatment design itself (ndarray), or a
        DGEList-like dict, in which case ``design`` is used or a
        ``~group`` design is built from the group factor.
    design : ndarray, optional
        Sample-level design matrix (``nsamples x p``); only consulted when
        *object* is a DGEList.

    Returns
    -------
    ndarray of shape ``(2 * nsamples, nsamples + p)``.
    """
    if isinstance(object, np.ndarray):
        treatments = object.copy()
    elif isinstance(object, dict):
        if design is not None:
            treatments = np.asarray(design, dtype=np.float64)
        elif 'samples' in object and 'group' in object['samples'].columns:
            group = object['samples']['group'].values
            # The DGEList has Me/Un column pairs, so only the first half
            # of the group factor corresponds to distinct samples.
            half = object['counts'].shape[1] // 2
            treatments = _model_matrix_group(
                group[:half] if len(group) > half else group)
        else:
            raise ValueError(
                "No design provided and DGEList has no group factor"
            )
    else:
        raise TypeError("object must be a DGEList or a numpy array")

    nsamples, nparam = treatments.shape

    # Sample indicator block: each sample's Me and Un rows share a column.
    sample_block = np.repeat(np.eye(nsamples), 2, axis=0)

    # Treatment block: duplicate each row for Me/Un, then zero the Un rows.
    treatment_block = np.repeat(treatments, 2, axis=0)
    me_rows = (np.arange(2 * nsamples) % 2 == 0).astype(np.float64)
    treatment_block = treatment_block * me_rows[:, np.newaxis]

    return np.hstack([sample_block, treatment_block])
878
+
879
+
880
def nearest_tss(chr, locus, tss_data=None, species="Hs"):
    """Find nearest transcription start site for genomic coordinates.

    Port of edgeR's ``nearestTSS``.

    For each query position ``(chr[i], locus[i])``, finds the nearest
    TSS on the same chromosome and returns information about the
    corresponding gene.

    Parameters
    ----------
    chr : array-like of str
        Chromosome names for query positions.
    locus : array-like of int
        Genomic positions for query positions.
    tss_data : DataFrame, optional
        TSS annotation with columns: ``chr``, ``tss``, ``gene_id``,
        ``gene_name``, ``strand``. If None, attempts to fetch from
        Ensembl BioMart using ``pybiomart`` (requires internet).
    species : str
        Species code for BioMart query (default ``"Hs"`` for human).
        Only used when ``tss_data`` is None.

    Returns
    -------
    DataFrame
        With columns: ``gene_id``, ``gene_name``, ``strand``, ``tss``,
        ``width``, ``distance``. ``distance`` is positive when the
        query locus is downstream of the TSS on the gene's strand.
        Rows whose chromosome was missing/unmatched are NA.
    """
    chr_arr = np.asarray(chr, dtype=str)
    locus_arr = np.asarray(locus, dtype=np.int64)
    n = len(chr_arr)

    # A single locus is recycled across all chromosomes (R-style recycling).
    if len(locus_arr) == 1:
        locus_arr = np.full(n, locus_arr[0], dtype=np.int64)
    elif len(locus_arr) != n:
        raise ValueError("Length of locus doesn't agree with length of chr")

    # Handle NAs: after the str cast, missing values show up as '', 'nan'
    # or 'None' strings.
    na_mask = np.array([(c == '' or c == 'nan' or c == 'None')
                        for c in chr_arr])

    if tss_data is None:
        tss_data = _fetch_tss_biomart(species)

    # Ensure tss_data has required columns
    required = {'chr', 'tss', 'gene_id', 'gene_name', 'strand'}
    missing = required - set(tss_data.columns)
    if missing:
        raise ValueError(f"tss_data missing columns: {missing}")

    # Sort tss_data by chromosome and TSS position
    tss_data = tss_data.sort_values(['chr', 'tss']).reset_index(drop=True)

    # Group by chromosome
    tss_by_chr = {}
    for chrom, grp in tss_data.groupby('chr'):
        tss_by_chr[chrom] = grp

    # Prepare output (NA until a match is found for that query)
    out_gene_id = np.full(n, np.nan, dtype=object)
    out_gene_name = np.full(n, np.nan, dtype=object)
    out_strand = np.full(n, np.nan, dtype=object)
    out_tss = np.full(n, np.nan, dtype=np.float64)
    out_width = np.full(n, np.nan, dtype=np.float64)
    out_distance = np.full(n, np.nan, dtype=np.float64)

    # Check if query chr values start with "chr" but tss_data doesn't (or vice versa)
    # NOTE(review): only the first 10 reference chromosome names are probed
    # here — assumes a consistent naming convention throughout tss_data.
    query_has_chr = any(c.startswith('chr') for c in chr_arr if c)
    tss_has_chr = any(str(c).startswith('chr') for c in tss_data['chr'].values[:10])

    for chrom_name in tss_by_chr:
        grp = tss_by_chr[chrom_name]
        tss_positions = grp['tss'].values.astype(np.float64)

        # Match query chromosomes to this reference chromosome
        if query_has_chr and not tss_has_chr:
            query_chrom = 'chr' + str(chrom_name)
        elif not query_has_chr and tss_has_chr:
            query_chrom = str(chrom_name).replace('chr', '')
        else:
            query_chrom = str(chrom_name)

        iinc = np.where((chr_arr == query_chrom) & ~na_mask)[0]
        if len(iinc) == 0:
            continue

        # Nearest TSS per query via midpoint bins (tss_positions is sorted).
        which = nearest_ref_to_x(locus_arr[iinc].astype(np.float64),
                                 tss_positions)

        for j, qi in enumerate(iinc):
            ref_idx = which[j]
            row = grp.iloc[ref_idx]
            out_gene_id[qi] = row['gene_id']
            out_gene_name[qi] = row['gene_name']
            out_strand[qi] = row['strand']
            out_tss[qi] = row['tss']
            if 'width' in grp.columns:
                out_width[qi] = row['width']
            # distance: signed distance, positive = downstream of TSS
            dist = locus_arr[qi] - int(row['tss'])
            if row['strand'] == '-':
                dist = -dist
            out_distance[qi] = dist

    # Nullable Int64 columns preserve NA for unmatched queries.
    result = pd.DataFrame({
        'gene_id': out_gene_id,
        'gene_name': out_gene_name,
        'strand': out_strand,
        'tss': pd.array(out_tss, dtype=pd.Int64Dtype()),
        'width': pd.array(out_width, dtype=pd.Int64Dtype()),
        'distance': pd.array(out_distance, dtype=pd.Int64Dtype()),
    })
    return result
995
+
996
+
997
def _fetch_tss_biomart(species="Hs"):
    """Fetch TSS data from Ensembl BioMart.

    Requires the ``pybiomart`` package and network access to
    www.ensembl.org.

    Parameters
    ----------
    species : str
        Two-letter species code; see the mapping below.

    Returns
    -------
    DataFrame with columns ``chr``, ``tss``, ``gene_id``, ``gene_name``,
    ``width``, ``strand`` — one row per gene (the smallest TSS per gene
    and chromosome is kept).

    Raises
    ------
    ImportError
        If pybiomart is not installed.
    ValueError
        If the species code is not recognized.
    """
    try:
        from pybiomart import Server
    except ImportError:
        raise ImportError(
            "pybiomart package required to fetch TSS data from Ensembl. "
            "Install with: pip install pybiomart\n"
            "Alternatively, pass tss_data as a DataFrame with columns: "
            "chr, tss, gene_id, gene_name, strand"
        )

    # Species code -> Ensembl BioMart dataset name.
    species_map = {
        'Hs': 'hsapiens_gene_ensembl',
        'Mm': 'mmusculus_gene_ensembl',
        'Rn': 'rnorvegicus_gene_ensembl',
        'Dm': 'dmelanogaster_gene_ensembl',
        'Dr': 'drerio_gene_ensembl',
    }

    dataset_name = species_map.get(species)
    if dataset_name is None:
        raise ValueError(
            f"Unknown species code '{species}'. Known: {list(species_map.keys())}"
        )

    server = Server(host='http://www.ensembl.org')
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets[dataset_name]

    result = dataset.query(
        attributes=[
            'chromosome_name',
            'transcription_start_site',
            'ensembl_gene_id',
            'external_gene_name',
            'strand',
            'transcript_length',
        ]
    )

    # Rename BioMart attribute columns to the schema nearest_tss expects.
    result.columns = ['chr', 'tss', 'gene_id', 'gene_name', 'strand_int',
                      'width']
    # BioMart encodes strand as +1/-1 integers; convert to '+'/'-'.
    result['strand'] = np.where(result['strand_int'] > 0, '+', '-')
    result = result.drop(columns=['strand_int'])

    # Keep one TSS per gene (the one with smallest TSS per chromosome)
    result = result.sort_values(['chr', 'tss']).drop_duplicates(
        subset=['chr', 'gene_id'], keep='first'
    ).reset_index(drop=True)

    return result