edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,546 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Normalization methods for edgePython.
4
+
5
+ Port of edgeR's calcNormFactors/normLibSizes (TMM, TMMwsp, RLE, upperquartile)
6
+ and ChIP-seq normalization (normalizeChIPtoInput, calcNormOffsetsforChIP).
7
+ """
8
+
9
+ import numpy as np
10
+ import warnings
11
+ from scipy import stats
12
+ from statsmodels.stats.multitest import multipletests
13
+
14
+
15
def calc_norm_factors(counts, lib_size=None, method='TMM', ref_column=None,
                      logratio_trim=0.3, sum_trim=0.05, do_weighting=True,
                      a_cutoff=-1e10, p=0.75):
    """Calculate normalization factors for a count matrix.

    Port of edgeR's calcNormFactors / normLibSizes. Accepts either a
    raw genes x samples count matrix or a DGEList-style dict with a
    ``'counts'`` entry and a ``'samples'`` table.

    Parameters
    ----------
    counts : array-like or DGEList
        Count matrix (genes x samples), or DGEList object.
    lib_size : array-like, optional
        Library sizes; defaults to column sums (ignored for DGEList
        input, where ``samples['lib.size']`` is used).
    method : str
        One of 'TMM', 'TMMwsp', 'RLE', 'upperquartile', 'none'.
    ref_column : int, optional
        Reference column for TMM/TMMwsp.
    logratio_trim : float
        Amount of trim for log-ratios (TMM).
    sum_trim : float
        Amount of trim for sums (TMM).
    do_weighting : bool
        Use precision weights in TMM.
    a_cutoff : float
        Abundance cutoff for TMM.
    p : float
        Quantile for upper-quartile method.

    Returns
    -------
    DGEList (if input is DGEList, with ``samples['norm.factors']``
    filled in) or ndarray of normalization factors.
    """
    is_dgelist = isinstance(counts, dict) and 'counts' in counts

    if not is_dgelist:
        # Plain matrix: delegate straight to the core implementation.
        return _calc_norm_factors_default(
            counts, lib_size=lib_size, method=method, ref_column=ref_column,
            logratio_trim=logratio_trim, sum_trim=sum_trim,
            do_weighting=do_weighting, a_cutoff=a_cutoff, p=p)

    dge = counts
    # Offsets, if present, supersede lib sizes / norm factors downstream.
    if dge.get('offset') is not None:
        warnings.warn("object contains offsets, which take precedence over library "
                      "sizes and norm factors (and which will not be recomputed).")
    factors = _calc_norm_factors_default(
        dge['counts'], lib_size=dge['samples']['lib.size'].values,
        method=method, ref_column=ref_column,
        logratio_trim=logratio_trim, sum_trim=sum_trim,
        do_weighting=do_weighting, a_cutoff=a_cutoff, p=p)
    dge['samples']['norm.factors'] = factors
    return dge
65
+
66
+
67
# Alias matching edgeR's newer function name (normLibSizes) for calcNormFactors.
norm_lib_sizes = calc_norm_factors
69
+
70
+
71
+ def _calc_norm_factors_default(x, lib_size=None, method='TMM', ref_column=None,
72
+ logratio_trim=0.3, sum_trim=0.05, do_weighting=True,
73
+ a_cutoff=-1e10, p=0.75):
74
+ """Core normalization factor calculation for count matrices."""
75
+ x = np.asarray(x, dtype=np.float64)
76
+ if np.any(np.isnan(x)):
77
+ raise ValueError("NA counts not permitted")
78
+ nsamples = x.shape[1]
79
+
80
+ if lib_size is None:
81
+ lib_size = x.sum(axis=0)
82
+ else:
83
+ lib_size = np.asarray(lib_size, dtype=np.float64)
84
+ if np.any(np.isnan(lib_size)):
85
+ raise ValueError("NA lib.sizes not permitted")
86
+ if len(lib_size) != nsamples:
87
+ if len(lib_size) > 1:
88
+ warnings.warn("length(lib_size) doesn't match number of samples")
89
+ lib_size = np.full(nsamples, lib_size[0] if len(lib_size) == 1 else lib_size.mean())
90
+
91
+ # Backward compatibility
92
+ if method == 'TMMwzp':
93
+ method = 'TMMwsp'
94
+
95
+ valid_methods = ('TMM', 'TMMwsp', 'RLE', 'upperquartile', 'none')
96
+ if method not in valid_methods:
97
+ raise ValueError(f"method must be one of {valid_methods}")
98
+
99
+ # Remove all-zero rows
100
+ allzero = np.sum(x > 0, axis=1) == 0
101
+ if np.any(allzero):
102
+ x = x[~allzero]
103
+
104
+ # Degenerate cases
105
+ if x.shape[0] == 0 or nsamples == 1:
106
+ method = 'none'
107
+
108
+ if method == 'TMM':
109
+ f = _calc_tmm(x, lib_size, ref_column, logratio_trim, sum_trim, do_weighting, a_cutoff)
110
+ elif method == 'TMMwsp':
111
+ f = _calc_tmmwsp(x, lib_size, ref_column, logratio_trim, sum_trim, do_weighting, a_cutoff)
112
+ elif method == 'RLE':
113
+ f = _calc_factor_rle(x) / lib_size
114
+ elif method == 'upperquartile':
115
+ f = _calc_factor_quantile(x, lib_size, p)
116
+ else:
117
+ f = np.ones(nsamples)
118
+
119
+ # Normalize so factors multiply to one
120
+ f = f / np.exp(np.mean(np.log(f)))
121
+
122
+ return f
123
+
124
+
125
def _calc_tmm(x, lib_size, ref_column, logratio_trim, sum_trim, do_weighting, a_cutoff):
    """Pairwise TMM factor of every sample against a reference column.

    When no reference is given, pick the sample whose upper-quartile
    factor is closest to the mean of those factors; if the quartiles
    are degenerate (median near zero), fall back to the column with
    the largest sum of square-root counts.
    """
    nsamples = x.shape[1]
    if ref_column is None:
        f75 = _calc_factor_quantile(x, lib_size, 0.75)
        with warnings.catch_warnings():
            # The quantile helper may warn about zero quantiles here.
            warnings.simplefilter("ignore")
            if np.median(f75) < 1e-20:
                ref_column = np.argmax(np.sum(np.sqrt(x), axis=0))
            else:
                ref_column = np.argmin(np.abs(f75 - np.mean(f75)))

    ref = x[:, ref_column]
    ref_libsize = lib_size[ref_column]
    factors = np.full(nsamples, np.nan)
    for j in range(nsamples):
        factors[j] = _calc_factor_tmm(
            obs=x[:, j], ref=ref,
            libsize_obs=lib_size[j], libsize_ref=ref_libsize,
            logratio_trim=logratio_trim, sum_trim=sum_trim,
            do_weighting=do_weighting, a_cutoff=a_cutoff)
    return factors
145
+
146
+
147
def _calc_tmmwsp(x, lib_size, ref_column, logratio_trim, sum_trim, do_weighting, a_cutoff):
    """Pairwise TMMwsp factor of every sample against a reference column.

    The default reference is the column with the largest sum of
    square-root counts.
    """
    nsamples = x.shape[1]
    if ref_column is None:
        ref_column = np.argmax(np.sum(np.sqrt(x), axis=0))

    ref = x[:, ref_column]
    ref_libsize = lib_size[ref_column]
    factors = np.full(nsamples, np.nan)
    for j in range(nsamples):
        factors[j] = _calc_factor_tmmwsp(
            obs=x[:, j], ref=ref,
            libsize_obs=lib_size[j], libsize_ref=ref_libsize,
            logratio_trim=logratio_trim, sum_trim=sum_trim,
            do_weighting=do_weighting, a_cutoff=a_cutoff)
    return factors
161
+
162
+
163
+ def _calc_factor_rle(data):
164
+ """Scale factors as in Anders et al (2010)."""
165
+ with np.errstate(divide='ignore'):
166
+ gm = np.exp(np.mean(np.log(data.astype(float)), axis=1))
167
+ pos = gm > 0
168
+ result = np.zeros(data.shape[1])
169
+ for j in range(data.shape[1]):
170
+ ratio = data[pos, j] / gm[pos]
171
+ result[j] = np.median(ratio)
172
+ return result
173
+
174
+
175
+ def _calc_factor_quantile(data, lib_size, p=0.75):
176
+ """Upper-quartile normalization."""
177
+ f = np.zeros(data.shape[1])
178
+ for j in range(data.shape[1]):
179
+ f[j] = np.quantile(data[:, j], p)
180
+ if np.min(f) == 0:
181
+ warnings.warn("One or more quantiles are zero")
182
+ return f / lib_size
183
+
184
+
185
+ def _calc_factor_tmm(obs, ref, libsize_obs=None, libsize_ref=None,
186
+ logratio_trim=0.3, sum_trim=0.05, do_weighting=True,
187
+ a_cutoff=-1e10):
188
+ """TMM between two libraries."""
189
+ obs = np.asarray(obs, dtype=np.float64)
190
+ ref = np.asarray(ref, dtype=np.float64)
191
+
192
+ if libsize_obs is None:
193
+ nO = np.sum(obs)
194
+ else:
195
+ nO = libsize_obs
196
+ if libsize_ref is None:
197
+ nR = np.sum(ref)
198
+ else:
199
+ nR = libsize_ref
200
+
201
+ with np.errstate(divide='ignore', invalid='ignore'):
202
+ logR = np.log2(obs / nO) - np.log2(ref / nR)
203
+ absE = (np.log2(obs / nO) + np.log2(ref / nR)) / 2
204
+ v = (nO - obs) / nO / obs + (nR - ref) / nR / ref
205
+
206
+ # Remove infinite values
207
+ fin = np.isfinite(logR) & np.isfinite(absE) & (absE > a_cutoff)
208
+ logR = logR[fin]
209
+ absE = absE[fin]
210
+ v = v[fin]
211
+
212
+ if len(logR) == 0 or np.max(np.abs(logR)) < 1e-6:
213
+ return 1.0
214
+
215
+ n = len(logR)
216
+ loL = int(np.floor(n * logratio_trim)) + 1
217
+ hiL = n + 1 - loL
218
+ loS = int(np.floor(n * sum_trim)) + 1
219
+ hiS = n + 1 - loS
220
+
221
+ rank_logR = _rank(logR)
222
+ rank_absE = _rank(absE)
223
+ keep = ((rank_logR >= loL) & (rank_logR <= hiL) &
224
+ (rank_absE >= loS) & (rank_absE <= hiS))
225
+
226
+ if do_weighting:
227
+ denom = np.sum(1 / v[keep])
228
+ if denom > 0 and np.isfinite(denom):
229
+ f = np.sum(logR[keep] / v[keep]) / denom
230
+ else:
231
+ f = np.nanmean(logR[keep])
232
+ else:
233
+ f = np.nanmean(logR[keep])
234
+
235
+ if np.isnan(f):
236
+ f = 0.0
237
+
238
+ return 2 ** f
239
+
240
+
241
+ def _calc_factor_tmmwsp(obs, ref, libsize_obs=None, libsize_ref=None,
242
+ logratio_trim=0.3, sum_trim=0.05, do_weighting=True,
243
+ a_cutoff=-1e10):
244
+ """TMM with singleton pairing."""
245
+ obs = np.asarray(obs, dtype=np.float64)
246
+ ref = np.asarray(ref, dtype=np.float64)
247
+ eps = 1e-14
248
+
249
+ pos_obs = obs > eps
250
+ pos_ref = ref > eps
251
+ npos = 2 * pos_obs.astype(int) + pos_ref.astype(int)
252
+
253
+ # Remove double zeros and NAs
254
+ keep = (npos != 0) & ~np.isnan(npos)
255
+ obs = obs[keep]
256
+ ref = ref[keep]
257
+ npos = npos[keep]
258
+
259
+ if libsize_obs is None:
260
+ libsize_obs = np.sum(obs)
261
+ if libsize_ref is None:
262
+ libsize_ref = np.sum(ref)
263
+
264
+ # Pair singleton positives
265
+ zero_obs = npos == 1
266
+ zero_ref = npos == 2
267
+ k = zero_obs | zero_ref
268
+ n_eligible = min(np.sum(zero_obs), np.sum(zero_ref))
269
+
270
+ if n_eligible > 0:
271
+ refk = np.sort(ref[k])[::-1][:n_eligible]
272
+ obsk = np.sort(obs[k])[::-1][:n_eligible]
273
+ obs = np.concatenate([obs[~k], obsk])
274
+ ref = np.concatenate([ref[~k], refk])
275
+ else:
276
+ obs = obs[~k]
277
+ ref = ref[~k]
278
+
279
+ n = len(obs)
280
+ if n == 0:
281
+ return 1.0
282
+
283
+ obs_p = obs / libsize_obs
284
+ ref_p = ref / libsize_ref
285
+ with np.errstate(divide='ignore', invalid='ignore'):
286
+ M = np.log2(obs_p / ref_p)
287
+ A = 0.5 * np.log2(obs_p * ref_p)
288
+
289
+ if np.max(np.abs(M[np.isfinite(M)])) < 1e-6:
290
+ return 1.0
291
+
292
+ # Sort by M with tie-breaking
293
+ obs_p_shrunk = (obs + 0.5) / (libsize_obs + 0.5)
294
+ ref_p_shrunk = (ref + 0.5) / (libsize_ref + 0.5)
295
+ M_shrunk = np.log2(obs_p_shrunk / ref_p_shrunk)
296
+ o_M = np.lexsort((M_shrunk, M))
297
+ o_A = np.argsort(A)
298
+
299
+ loM = int(n * logratio_trim) + 1
300
+ hiM = n - loM
301
+ keep_M = np.zeros(n, dtype=bool)
302
+ keep_M[o_M[loM:hiM]] = True
303
+
304
+ loA = int(n * sum_trim) + 1
305
+ hiA = n - loA
306
+ keep_A = np.zeros(n, dtype=bool)
307
+ keep_A[o_A[loA:hiA]] = True
308
+
309
+ keep = keep_M & keep_A
310
+ M_keep = M[keep]
311
+
312
+ if do_weighting:
313
+ obs_p_k = obs_p[keep]
314
+ ref_p_k = ref_p[keep]
315
+ v = (1 - obs_p_k) / obs_p_k / libsize_obs + (1 - ref_p_k) / ref_p_k / libsize_ref
316
+ w = (1 + 1e-6) / (v + 1e-6)
317
+ TMM = np.sum(w * M_keep) / np.sum(w)
318
+ else:
319
+ TMM = np.mean(M_keep) if len(M_keep) > 0 else 0
320
+
321
+ return 2 ** TMM
322
+
323
+
324
+ def _rank(x):
325
+ """Compute ranks (1-based, average ties)."""
326
+ from scipy.stats import rankdata
327
+ return rankdata(x, method='average')
328
+
329
+
330
+ # =====================================================================
331
+ # ChIP-seq normalization
332
+ # =====================================================================
333
+
334
def normalize_chip_to_input(input_counts, response, dispersion=0.01, niter=6,
                            loss='p', verbose=False):
    """Normalize ChIP-Seq read counts to input and test for enrichment.

    Port of edgeR's normalizeChIPtoInput. For a single sample, aligns
    ChIP-Seq mark counts to input control counts under a negative binomial
    model, iteratively estimating a scaling factor and the proportion of
    enriched features.

    Parameters
    ----------
    input_counts : array-like
        Non-negative input control counts for each genomic feature.
    response : array-like
        Non-negative integer ChIP-Seq mark counts for each feature.
    dispersion : float
        Negative binomial dispersion (must be positive).
    niter : int
        Number of iterations for estimating scaling factor and
        proportion enriched.
    loss : str
        Loss function: ``'p'`` for cumulative probabilities,
        ``'z'`` for z-values.
    verbose : bool
        If True, print working estimates at each iteration.

    Returns
    -------
    dict with keys:
        ``p_value`` : ndarray – upper-tail p-values for enrichment.
        ``pmid_value`` : ndarray – mid-p-values.
        ``scaling_factor`` : float – scaling factor aligning response
        to input for unenriched features.
        ``prop_enriched`` : float – estimated proportion of enriched
        features.
    """
    input_counts = np.asarray(input_counts, dtype=np.float64)
    response = np.asarray(response, dtype=np.float64)

    if len(input_counts) != len(response):
        raise ValueError("input and response must be same length")
    if np.any(input_counts < 0) or np.any(response < 0):
        raise ValueError("negative values not allowed")
    if dispersion <= 0:
        raise ValueError("dispersion must be positive")

    # Remove features where both input and response are zero
    # (they get p-values of 1) and recurse on the remaining features;
    # the scaling factor / proportion come from the recursive call.
    zero = (input_counts <= 0) & (response <= 0)
    if np.any(zero):
        p_value = np.ones(len(zero))
        pmid_value = np.ones(len(zero))
        out = normalize_chip_to_input(
            input_counts[~zero], response[~zero],
            dispersion=dispersion, niter=niter, loss=loss, verbose=verbose,
        )
        p_value[~zero] = out['p_value']
        pmid_value[~zero] = out['pmid_value']
        return {
            'p_value': p_value,
            'pmid_value': pmid_value,
            'scaling_factor': out['scaling_factor'],
            'prop_enriched': out['prop_enriched'],
        }

    n = len(response)

    # Special cases
    if n == 0:
        return {'p_value': np.array([]), 'pmid_value': np.array([]),
                'scaling_factor': np.nan, 'prop_enriched': np.nan}
    if np.all(input_counts == 0):
        # No input signal anywhere: everything is nominally "enriched".
        return {'p_value': np.zeros(n), 'pmid_value': np.zeros(n),
                'scaling_factor': 0.0, 'prop_enriched': 1.0}
    if n == 1:
        # NOTE(review): with a single feature this divides by response[0];
        # response[0] is nonzero here only because double zeros were
        # removed above — a zero response with positive input would
        # yield an infinite scaling factor. Confirm against edgeR.
        return {'p_value': np.array([1.0]), 'pmid_value': np.array([1.0]),
                'scaling_factor': float(input_counts[0] / response[0]),
                'prop_enriched': 0.0}

    # Replace zero inputs with minimum positive value
    inp = input_counts.copy()
    inp[inp == 0] = np.min(inp[inp > 0])

    size = 1.0 / dispersion  # NB size parameter

    if loss not in ('p', 'z'):
        raise ValueError("loss must be 'p' or 'z'")

    def _nb_p_and_d(resp, mu):
        """Upper-tail p and pmf for NB(mu, size)."""
        # scipy's nbinom is parameterized by (n, p) with p = size/(size+mu),
        # which corresponds to mean mu and dispersion 1/size.
        p_val = stats.nbinom.sf(resp.astype(int), size, size / (size + mu))
        d_val = stats.nbinom.pmf(resp.astype(int), size, size / (size + mu))
        return p_val, d_val

    def _objective_p(sf, inp_v, resp_v, prop_enrich):
        # How far the mean mid-p of the (presumed unenriched) features
        # with the smallest mid-p values is from 0.5; at the correct
        # scaling factor unenriched mid-p-values should average 0.5.
        mu = sf * inp_v
        p = stats.nbinom.cdf(resp_v.astype(int), size, size / (size + mu))
        d = stats.nbinom.pmf(resp_v.astype(int), size, size / (size + mu))
        pmid = p - d / 2
        n_not_enriched = max(round(len(resp_v) * (1 - prop_enrich)), 1)
        p_sorted = np.partition(pmid, n_not_enriched - 1)[:n_not_enriched]
        return abs(np.mean(p_sorted) - 0.5)

    def _objective_z(sf, inp_v, resp_v, prop_enrich):
        # Mean |z| of the least-extreme features; minimized when the
        # scaling factor centers the unenriched features.
        from .utils import zscore_nbinom
        mu = sf * inp_v
        z = zscore_nbinom(resp_v, size=size, mu=mu)
        n_not_enriched = max(round(len(resp_v) * (1 - prop_enrich)), 1)
        z_sorted = np.partition(np.abs(z), n_not_enriched - 1)[:n_not_enriched]
        return np.mean(z_sorted)

    objective = _objective_p if loss == 'p' else _objective_z

    # Starting values
    prop_enriched = 0.5
    ratios = response / inp
    # Search interval for the scaling factor: central range of the
    # per-feature response/input ratios. Presumably chosen to bracket
    # the unenriched bulk while excluding enriched outliers — TODO
    # confirm the 10th/80th percentiles against edgeR's implementation.
    sf_interval = (np.percentile(ratios, 10), np.percentile(ratios, 80))

    if sf_interval[0] == sf_interval[1]:
        # Degenerate interval: nothing to optimize; estimate the
        # enriched proportion once at the fixed scaling factor.
        scaling_factor = sf_interval[0]
        p, d = _nb_p_and_d(response, scaling_factor * inp)
        pmid = p - d / 2
        _, adj_p, _, _ = multipletests(pmid, method='holm')
        enriched = adj_p < 0.5
        prop_enriched = np.sum(enriched) / n
        if verbose:
            print(f"prop.enriched: {prop_enriched} scaling.factor: {scaling_factor}")
    else:
        from scipy.optimize import minimize_scalar
        # Alternate between (a) optimizing the scaling factor for the
        # current prop_enriched and (b) re-estimating prop_enriched as
        # the fraction of features with Holm-adjusted mid-p < 0.5.
        for _ in range(niter):
            res = minimize_scalar(
                objective, bounds=sf_interval, method='bounded',
                args=(inp, response, prop_enriched),
            )
            scaling_factor = res.x
            p, d = _nb_p_and_d(response, scaling_factor * inp)
            pmid = p - d / 2
            _, adj_p, _, _ = multipletests(pmid, method='holm')
            enriched = adj_p < 0.5
            prop_enriched = np.sum(enriched) / n
            if verbose:
                print(f"prop.enriched: {prop_enriched} scaling.factor: {scaling_factor}")

    return {
        'p_value': p,
        'pmid_value': pmid,
        'scaling_factor': float(scaling_factor),
        'prop_enriched': float(prop_enriched),
    }
482
+
483
+
484
def calc_norm_offsets_for_chip(input_counts, response, dispersion=0.01,
                               niter=6, loss='p', verbose=False):
    """Compute normalization offsets for ChIP-Seq relative to input.

    Port of edgeR's calcNormOffsetsforChIP: runs
    :func:`normalize_chip_to_input` on every sample and assembles a
    genes x samples matrix of log-scale offsets for the GLM framework.

    Parameters
    ----------
    input_counts : array-like
        Input control count matrix (genes x samples), or a single
        column shared across all samples.
    response : array-like or DGEList
        ChIP-Seq mark count matrix (genes x samples), or a DGEList.
    dispersion : float
        Negative binomial dispersion (must be positive).
    niter : int
        Number of iterations.
    loss : str
        Loss function (``'p'`` or ``'z'``).
    verbose : bool
        If True, print working estimates.

    Returns
    -------
    If *response* is a DGEList, a shallow copy with its ``offset``
    field set; otherwise a numeric matrix of offsets (genes x samples).
    """
    is_dgelist = isinstance(response, dict) and 'counts' in response
    resp_mat = np.asarray(response['counts'] if is_dgelist else response,
                          dtype=np.float64)
    inp_mat = np.asarray(input_counts, dtype=np.float64)

    # Promote 1-D vectors to single-column matrices.
    if inp_mat.ndim == 1:
        inp_mat = inp_mat[:, np.newaxis]
    if resp_mat.ndim == 1:
        resp_mat = resp_mat[:, np.newaxis]

    if inp_mat.shape[0] != resp_mat.shape[0]:
        raise ValueError("nrows of input and response disagree")
    # A single input column is recycled across all response samples.
    if inp_mat.shape[1] == 1 and resp_mat.shape[1] > 1:
        inp_mat = np.broadcast_to(inp_mat, resp_mat.shape).copy()
    if inp_mat.shape[1] != resp_mat.shape[1]:
        raise ValueError("ncols of input and response disagree")

    offset = np.empty_like(resp_mat, dtype=np.float64)
    for col in range(resp_mat.shape[1]):
        fit = normalize_chip_to_input(
            inp_mat[:, col], resp_mat[:, col],
            dispersion=dispersion, niter=niter, loss=loss, verbose=verbose,
        )
        # Offset = log of expected background counts under the fit.
        offset[:, col] = np.log(fit['scaling_factor'] * inp_mat[:, col])

    if is_dgelist:
        result = dict(response)
        result['offset'] = offset
        return result
    return offset