edgepython 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""
|
|
3
|
+
Normalization methods for edgePython.
|
|
4
|
+
|
|
5
|
+
Port of edgeR's calcNormFactors/normLibSizes (TMM, TMMwsp, RLE, upperquartile)
|
|
6
|
+
and ChIP-seq normalization (normalizeChIPtoInput, calcNormOffsetsforChIP).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import warnings
|
|
11
|
+
from scipy import stats
|
|
12
|
+
from statsmodels.stats.multitest import multipletests
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def calc_norm_factors(counts, lib_size=None, method='TMM', ref_column=None,
                      logratio_trim=0.3, sum_trim=0.05, do_weighting=True,
                      a_cutoff=-1e10, p=0.75):
    """Calculate normalization factors for a count matrix.

    Port of edgeR's calcNormFactors / normLibSizes.

    Parameters
    ----------
    counts : array-like or DGEList
        Count matrix (genes x samples), or DGEList object.
    lib_size : array-like, optional
        Library sizes. Defaults to column sums. Ignored for DGEList
        input, where the stored ``samples['lib.size']`` is used.
    method : str
        One of 'TMM', 'TMMwsp', 'RLE', 'upperquartile', 'none'.
    ref_column : int, optional
        Reference column for TMM/TMMwsp.
    logratio_trim : float
        Amount of trim for log-ratios (TMM).
    sum_trim : float
        Amount of trim for sums (TMM).
    do_weighting : bool
        Use precision weights in TMM.
    a_cutoff : float
        Abundance cutoff for TMM.
    p : float
        Quantile for upper-quartile method.

    Returns
    -------
    DGEList (if input is DGEList, updated in place with
    ``samples['norm.factors']`` set) or ndarray of normalization factors.
    """
    is_dgelist = isinstance(counts, dict) and 'counts' in counts

    # Plain matrix: delegate directly.
    if not is_dgelist:
        return _calc_norm_factors_default(
            counts, lib_size=lib_size, method=method, ref_column=ref_column,
            logratio_trim=logratio_trim, sum_trim=sum_trim,
            do_weighting=do_weighting, a_cutoff=a_cutoff, p=p)

    # DGEList: use its stored library sizes and write factors back.
    dge = counts
    if dge.get('offset') is not None:
        warnings.warn("object contains offsets, which take precedence over library "
                      "sizes and norm factors (and which will not be recomputed).")
    factors = _calc_norm_factors_default(
        dge['counts'], lib_size=dge['samples']['lib.size'].values,
        method=method, ref_column=ref_column,
        logratio_trim=logratio_trim, sum_trim=sum_trim,
        do_weighting=do_weighting, a_cutoff=a_cutoff, p=p)
    dge['samples']['norm.factors'] = factors
    return dge
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Alias matching edgeR's newer function name normLibSizes
# (same behavior as calc_norm_factors).
norm_lib_sizes = calc_norm_factors
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _calc_norm_factors_default(x, lib_size=None, method='TMM', ref_column=None,
|
|
72
|
+
logratio_trim=0.3, sum_trim=0.05, do_weighting=True,
|
|
73
|
+
a_cutoff=-1e10, p=0.75):
|
|
74
|
+
"""Core normalization factor calculation for count matrices."""
|
|
75
|
+
x = np.asarray(x, dtype=np.float64)
|
|
76
|
+
if np.any(np.isnan(x)):
|
|
77
|
+
raise ValueError("NA counts not permitted")
|
|
78
|
+
nsamples = x.shape[1]
|
|
79
|
+
|
|
80
|
+
if lib_size is None:
|
|
81
|
+
lib_size = x.sum(axis=0)
|
|
82
|
+
else:
|
|
83
|
+
lib_size = np.asarray(lib_size, dtype=np.float64)
|
|
84
|
+
if np.any(np.isnan(lib_size)):
|
|
85
|
+
raise ValueError("NA lib.sizes not permitted")
|
|
86
|
+
if len(lib_size) != nsamples:
|
|
87
|
+
if len(lib_size) > 1:
|
|
88
|
+
warnings.warn("length(lib_size) doesn't match number of samples")
|
|
89
|
+
lib_size = np.full(nsamples, lib_size[0] if len(lib_size) == 1 else lib_size.mean())
|
|
90
|
+
|
|
91
|
+
# Backward compatibility
|
|
92
|
+
if method == 'TMMwzp':
|
|
93
|
+
method = 'TMMwsp'
|
|
94
|
+
|
|
95
|
+
valid_methods = ('TMM', 'TMMwsp', 'RLE', 'upperquartile', 'none')
|
|
96
|
+
if method not in valid_methods:
|
|
97
|
+
raise ValueError(f"method must be one of {valid_methods}")
|
|
98
|
+
|
|
99
|
+
# Remove all-zero rows
|
|
100
|
+
allzero = np.sum(x > 0, axis=1) == 0
|
|
101
|
+
if np.any(allzero):
|
|
102
|
+
x = x[~allzero]
|
|
103
|
+
|
|
104
|
+
# Degenerate cases
|
|
105
|
+
if x.shape[0] == 0 or nsamples == 1:
|
|
106
|
+
method = 'none'
|
|
107
|
+
|
|
108
|
+
if method == 'TMM':
|
|
109
|
+
f = _calc_tmm(x, lib_size, ref_column, logratio_trim, sum_trim, do_weighting, a_cutoff)
|
|
110
|
+
elif method == 'TMMwsp':
|
|
111
|
+
f = _calc_tmmwsp(x, lib_size, ref_column, logratio_trim, sum_trim, do_weighting, a_cutoff)
|
|
112
|
+
elif method == 'RLE':
|
|
113
|
+
f = _calc_factor_rle(x) / lib_size
|
|
114
|
+
elif method == 'upperquartile':
|
|
115
|
+
f = _calc_factor_quantile(x, lib_size, p)
|
|
116
|
+
else:
|
|
117
|
+
f = np.ones(nsamples)
|
|
118
|
+
|
|
119
|
+
# Normalize so factors multiply to one
|
|
120
|
+
f = f / np.exp(np.mean(np.log(f)))
|
|
121
|
+
|
|
122
|
+
return f
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _calc_tmm(x, lib_size, ref_column, logratio_trim, sum_trim, do_weighting, a_cutoff):
    """TMM factors: every column of *x* is compared to a reference column.

    When no reference is given, the sample whose 0.75-quantile factor is
    closest to the mean factor is chosen; if those factors degenerate to
    ~0, the sample with the largest sum of square-root counts is used.
    """
    nsamples = x.shape[1]
    if ref_column is None:
        f75 = _calc_factor_quantile(x, lib_size, 0.75)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            if np.median(f75) < 1e-20:
                ref_column = np.argmax(np.sum(np.sqrt(x), axis=0))
            else:
                ref_column = np.argmin(np.abs(f75 - np.mean(f75)))

    ref = x[:, ref_column]
    ref_lib = lib_size[ref_column]
    return np.array([
        _calc_factor_tmm(obs=x[:, j], ref=ref,
                         libsize_obs=lib_size[j], libsize_ref=ref_lib,
                         logratio_trim=logratio_trim, sum_trim=sum_trim,
                         do_weighting=do_weighting, a_cutoff=a_cutoff)
        for j in range(nsamples)
    ])
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _calc_tmmwsp(x, lib_size, ref_column, logratio_trim, sum_trim, do_weighting, a_cutoff):
    """TMMwsp factors; the default reference is the sample with the
    largest sum of square-root counts."""
    if ref_column is None:
        ref_column = np.argmax(np.sum(np.sqrt(x), axis=0))

    ref = x[:, ref_column]
    ref_lib = lib_size[ref_column]
    return np.array([
        _calc_factor_tmmwsp(obs=x[:, j], ref=ref,
                            libsize_obs=lib_size[j], libsize_ref=ref_lib,
                            logratio_trim=logratio_trim, sum_trim=sum_trim,
                            do_weighting=do_weighting, a_cutoff=a_cutoff)
        for j in range(x.shape[1])
    ])
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _calc_factor_rle(data):
|
|
164
|
+
"""Scale factors as in Anders et al (2010)."""
|
|
165
|
+
with np.errstate(divide='ignore'):
|
|
166
|
+
gm = np.exp(np.mean(np.log(data.astype(float)), axis=1))
|
|
167
|
+
pos = gm > 0
|
|
168
|
+
result = np.zeros(data.shape[1])
|
|
169
|
+
for j in range(data.shape[1]):
|
|
170
|
+
ratio = data[pos, j] / gm[pos]
|
|
171
|
+
result[j] = np.median(ratio)
|
|
172
|
+
return result
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _calc_factor_quantile(data, lib_size, p=0.75):
|
|
176
|
+
"""Upper-quartile normalization."""
|
|
177
|
+
f = np.zeros(data.shape[1])
|
|
178
|
+
for j in range(data.shape[1]):
|
|
179
|
+
f[j] = np.quantile(data[:, j], p)
|
|
180
|
+
if np.min(f) == 0:
|
|
181
|
+
warnings.warn("One or more quantiles are zero")
|
|
182
|
+
return f / lib_size
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _calc_factor_tmm(obs, ref, libsize_obs=None, libsize_ref=None,
|
|
186
|
+
logratio_trim=0.3, sum_trim=0.05, do_weighting=True,
|
|
187
|
+
a_cutoff=-1e10):
|
|
188
|
+
"""TMM between two libraries."""
|
|
189
|
+
obs = np.asarray(obs, dtype=np.float64)
|
|
190
|
+
ref = np.asarray(ref, dtype=np.float64)
|
|
191
|
+
|
|
192
|
+
if libsize_obs is None:
|
|
193
|
+
nO = np.sum(obs)
|
|
194
|
+
else:
|
|
195
|
+
nO = libsize_obs
|
|
196
|
+
if libsize_ref is None:
|
|
197
|
+
nR = np.sum(ref)
|
|
198
|
+
else:
|
|
199
|
+
nR = libsize_ref
|
|
200
|
+
|
|
201
|
+
with np.errstate(divide='ignore', invalid='ignore'):
|
|
202
|
+
logR = np.log2(obs / nO) - np.log2(ref / nR)
|
|
203
|
+
absE = (np.log2(obs / nO) + np.log2(ref / nR)) / 2
|
|
204
|
+
v = (nO - obs) / nO / obs + (nR - ref) / nR / ref
|
|
205
|
+
|
|
206
|
+
# Remove infinite values
|
|
207
|
+
fin = np.isfinite(logR) & np.isfinite(absE) & (absE > a_cutoff)
|
|
208
|
+
logR = logR[fin]
|
|
209
|
+
absE = absE[fin]
|
|
210
|
+
v = v[fin]
|
|
211
|
+
|
|
212
|
+
if len(logR) == 0 or np.max(np.abs(logR)) < 1e-6:
|
|
213
|
+
return 1.0
|
|
214
|
+
|
|
215
|
+
n = len(logR)
|
|
216
|
+
loL = int(np.floor(n * logratio_trim)) + 1
|
|
217
|
+
hiL = n + 1 - loL
|
|
218
|
+
loS = int(np.floor(n * sum_trim)) + 1
|
|
219
|
+
hiS = n + 1 - loS
|
|
220
|
+
|
|
221
|
+
rank_logR = _rank(logR)
|
|
222
|
+
rank_absE = _rank(absE)
|
|
223
|
+
keep = ((rank_logR >= loL) & (rank_logR <= hiL) &
|
|
224
|
+
(rank_absE >= loS) & (rank_absE <= hiS))
|
|
225
|
+
|
|
226
|
+
if do_weighting:
|
|
227
|
+
denom = np.sum(1 / v[keep])
|
|
228
|
+
if denom > 0 and np.isfinite(denom):
|
|
229
|
+
f = np.sum(logR[keep] / v[keep]) / denom
|
|
230
|
+
else:
|
|
231
|
+
f = np.nanmean(logR[keep])
|
|
232
|
+
else:
|
|
233
|
+
f = np.nanmean(logR[keep])
|
|
234
|
+
|
|
235
|
+
if np.isnan(f):
|
|
236
|
+
f = 0.0
|
|
237
|
+
|
|
238
|
+
return 2 ** f
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _calc_factor_tmmwsp(obs, ref, libsize_obs=None, libsize_ref=None,
|
|
242
|
+
logratio_trim=0.3, sum_trim=0.05, do_weighting=True,
|
|
243
|
+
a_cutoff=-1e10):
|
|
244
|
+
"""TMM with singleton pairing."""
|
|
245
|
+
obs = np.asarray(obs, dtype=np.float64)
|
|
246
|
+
ref = np.asarray(ref, dtype=np.float64)
|
|
247
|
+
eps = 1e-14
|
|
248
|
+
|
|
249
|
+
pos_obs = obs > eps
|
|
250
|
+
pos_ref = ref > eps
|
|
251
|
+
npos = 2 * pos_obs.astype(int) + pos_ref.astype(int)
|
|
252
|
+
|
|
253
|
+
# Remove double zeros and NAs
|
|
254
|
+
keep = (npos != 0) & ~np.isnan(npos)
|
|
255
|
+
obs = obs[keep]
|
|
256
|
+
ref = ref[keep]
|
|
257
|
+
npos = npos[keep]
|
|
258
|
+
|
|
259
|
+
if libsize_obs is None:
|
|
260
|
+
libsize_obs = np.sum(obs)
|
|
261
|
+
if libsize_ref is None:
|
|
262
|
+
libsize_ref = np.sum(ref)
|
|
263
|
+
|
|
264
|
+
# Pair singleton positives
|
|
265
|
+
zero_obs = npos == 1
|
|
266
|
+
zero_ref = npos == 2
|
|
267
|
+
k = zero_obs | zero_ref
|
|
268
|
+
n_eligible = min(np.sum(zero_obs), np.sum(zero_ref))
|
|
269
|
+
|
|
270
|
+
if n_eligible > 0:
|
|
271
|
+
refk = np.sort(ref[k])[::-1][:n_eligible]
|
|
272
|
+
obsk = np.sort(obs[k])[::-1][:n_eligible]
|
|
273
|
+
obs = np.concatenate([obs[~k], obsk])
|
|
274
|
+
ref = np.concatenate([ref[~k], refk])
|
|
275
|
+
else:
|
|
276
|
+
obs = obs[~k]
|
|
277
|
+
ref = ref[~k]
|
|
278
|
+
|
|
279
|
+
n = len(obs)
|
|
280
|
+
if n == 0:
|
|
281
|
+
return 1.0
|
|
282
|
+
|
|
283
|
+
obs_p = obs / libsize_obs
|
|
284
|
+
ref_p = ref / libsize_ref
|
|
285
|
+
with np.errstate(divide='ignore', invalid='ignore'):
|
|
286
|
+
M = np.log2(obs_p / ref_p)
|
|
287
|
+
A = 0.5 * np.log2(obs_p * ref_p)
|
|
288
|
+
|
|
289
|
+
if np.max(np.abs(M[np.isfinite(M)])) < 1e-6:
|
|
290
|
+
return 1.0
|
|
291
|
+
|
|
292
|
+
# Sort by M with tie-breaking
|
|
293
|
+
obs_p_shrunk = (obs + 0.5) / (libsize_obs + 0.5)
|
|
294
|
+
ref_p_shrunk = (ref + 0.5) / (libsize_ref + 0.5)
|
|
295
|
+
M_shrunk = np.log2(obs_p_shrunk / ref_p_shrunk)
|
|
296
|
+
o_M = np.lexsort((M_shrunk, M))
|
|
297
|
+
o_A = np.argsort(A)
|
|
298
|
+
|
|
299
|
+
loM = int(n * logratio_trim) + 1
|
|
300
|
+
hiM = n - loM
|
|
301
|
+
keep_M = np.zeros(n, dtype=bool)
|
|
302
|
+
keep_M[o_M[loM:hiM]] = True
|
|
303
|
+
|
|
304
|
+
loA = int(n * sum_trim) + 1
|
|
305
|
+
hiA = n - loA
|
|
306
|
+
keep_A = np.zeros(n, dtype=bool)
|
|
307
|
+
keep_A[o_A[loA:hiA]] = True
|
|
308
|
+
|
|
309
|
+
keep = keep_M & keep_A
|
|
310
|
+
M_keep = M[keep]
|
|
311
|
+
|
|
312
|
+
if do_weighting:
|
|
313
|
+
obs_p_k = obs_p[keep]
|
|
314
|
+
ref_p_k = ref_p[keep]
|
|
315
|
+
v = (1 - obs_p_k) / obs_p_k / libsize_obs + (1 - ref_p_k) / ref_p_k / libsize_ref
|
|
316
|
+
w = (1 + 1e-6) / (v + 1e-6)
|
|
317
|
+
TMM = np.sum(w * M_keep) / np.sum(w)
|
|
318
|
+
else:
|
|
319
|
+
TMM = np.mean(M_keep) if len(M_keep) > 0 else 0
|
|
320
|
+
|
|
321
|
+
return 2 ** TMM
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _rank(x):
|
|
325
|
+
"""Compute ranks (1-based, average ties)."""
|
|
326
|
+
from scipy.stats import rankdata
|
|
327
|
+
return rankdata(x, method='average')
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
# =====================================================================
|
|
331
|
+
# ChIP-seq normalization
|
|
332
|
+
# =====================================================================
|
|
333
|
+
|
|
334
|
+
def normalize_chip_to_input(input_counts, response, dispersion=0.01, niter=6,
                            loss='p', verbose=False):
    """Normalize ChIP-Seq read counts to input and test for enrichment.

    Port of edgeR's normalizeChIPtoInput. For a single sample, aligns
    ChIP-Seq mark counts to input control counts under a negative binomial
    model, iteratively estimating a scaling factor and the proportion of
    enriched features.

    Parameters
    ----------
    input_counts : array-like
        Non-negative input control counts for each genomic feature.
    response : array-like
        Non-negative integer ChIP-Seq mark counts for each feature.
    dispersion : float
        Negative binomial dispersion (must be positive).
    niter : int
        Number of iterations for estimating scaling factor and
        proportion enriched.
    loss : str
        Loss function: ``'p'`` for cumulative probabilities,
        ``'z'`` for z-values.
    verbose : bool
        If True, print working estimates at each iteration.

    Returns
    -------
    dict with keys:
        ``p_value`` : ndarray – upper-tail p-values for enrichment.
        ``pmid_value`` : ndarray – mid-p-values.
        ``scaling_factor`` : float – factor such that
            ``scaling_factor * input`` approximates the response for
            unenriched features.
        ``prop_enriched`` : float – estimated proportion of enriched
            features.
    """
    input_counts = np.asarray(input_counts, dtype=np.float64)
    response = np.asarray(response, dtype=np.float64)

    if len(input_counts) != len(response):
        raise ValueError("input and response must be same length")
    if np.any(input_counts < 0) or np.any(response < 0):
        raise ValueError("negative values not allowed")
    if dispersion <= 0:
        raise ValueError("dispersion must be positive")

    # Features with zero counts in both input and response carry no
    # information: assign them p-value 1 and fit on the remainder.
    zero = (input_counts <= 0) & (response <= 0)
    if np.any(zero):
        p_value = np.ones(len(zero))
        pmid_value = np.ones(len(zero))
        out = normalize_chip_to_input(
            input_counts[~zero], response[~zero],
            dispersion=dispersion, niter=niter, loss=loss, verbose=verbose,
        )
        p_value[~zero] = out['p_value']
        pmid_value[~zero] = out['pmid_value']
        return {
            'p_value': p_value,
            'pmid_value': pmid_value,
            'scaling_factor': out['scaling_factor'],
            'prop_enriched': out['prop_enriched'],
        }

    n = len(response)

    # Special cases
    if n == 0:
        return {'p_value': np.array([]), 'pmid_value': np.array([]),
                'scaling_factor': np.nan, 'prop_enriched': np.nan}
    if np.all(input_counts == 0):
        return {'p_value': np.zeros(n), 'pmid_value': np.zeros(n),
                'scaling_factor': 0.0, 'prop_enriched': 1.0}
    if n == 1:
        # BUGFIX: the scaling factor multiplies input to approximate the
        # response (mu = sf * input below, and the search interval is built
        # from response/input ratios), so the single-feature estimate is
        # response/input — the previous input/response was inverted.
        return {'p_value': np.array([1.0]), 'pmid_value': np.array([1.0]),
                'scaling_factor': float(response[0] / input_counts[0]),
                'prop_enriched': 0.0}

    # Replace zero inputs with minimum positive value
    inp = input_counts.copy()
    inp[inp == 0] = np.min(inp[inp > 0])

    size = 1.0 / dispersion  # NB size parameter

    if loss not in ('p', 'z'):
        raise ValueError("loss must be 'p' or 'z'")

    def _nb_p_and_d(resp, mu):
        """Inclusive upper-tail p (P(X >= resp)) and pmf for NB(mu, size).

        BUGFIX: ``sf(k)`` is the exclusive tail P(X > k); the enrichment
        p-value needs the inclusive tail P(X >= k) = ``sf(k - 1)``.
        Without the -1, a response of 0 got p-value 1 - P(X = 0) and
        could look significantly enriched, and the mid-p correction
        ``p - d/2`` had the wrong sign.
        """
        k = resp.astype(int)
        prob = size / (size + mu)
        p_val = stats.nbinom.sf(k - 1, size, prob)
        d_val = stats.nbinom.pmf(k, size, prob)
        return p_val, d_val

    def _objective_p(sf, inp_v, resp_v, prop_enrich):
        """Deviation of the mean mid-p over putatively unenriched
        features from 0.5; zero when NB(sf*input) fits that subset."""
        mu = sf * inp_v
        p = stats.nbinom.cdf(resp_v.astype(int), size, size / (size + mu))
        d = stats.nbinom.pmf(resp_v.astype(int), size, size / (size + mu))
        pmid = p - d / 2
        n_not_enriched = max(round(len(resp_v) * (1 - prop_enrich)), 1)
        p_sorted = np.partition(pmid, n_not_enriched - 1)[:n_not_enriched]
        return abs(np.mean(p_sorted) - 0.5)

    def _objective_z(sf, inp_v, resp_v, prop_enrich):
        """Mean absolute z-score over putatively unenriched features."""
        from .utils import zscore_nbinom
        mu = sf * inp_v
        z = zscore_nbinom(resp_v, size=size, mu=mu)
        n_not_enriched = max(round(len(resp_v) * (1 - prop_enrich)), 1)
        z_sorted = np.partition(np.abs(z), n_not_enriched - 1)[:n_not_enriched]
        return np.mean(z_sorted)

    objective = _objective_p if loss == 'p' else _objective_z

    # Starting values: assume half the features are enriched and search
    # the scaling factor between the 10th and 80th percentile of ratios.
    prop_enriched = 0.5
    ratios = response / inp
    sf_interval = (np.percentile(ratios, 10), np.percentile(ratios, 80))

    if sf_interval[0] == sf_interval[1]:
        # Degenerate search interval: nothing to optimize
        scaling_factor = sf_interval[0]
        p, d = _nb_p_and_d(response, scaling_factor * inp)
        pmid = p - d / 2
        _, adj_p, _, _ = multipletests(pmid, method='holm')
        enriched = adj_p < 0.5
        prop_enriched = np.sum(enriched) / n
        if verbose:
            print(f"prop.enriched: {prop_enriched} scaling.factor: {scaling_factor}")
    else:
        from scipy.optimize import minimize_scalar
        # Alternate between fitting the scaling factor (given the current
        # proportion enriched) and re-estimating the proportion enriched.
        for _ in range(niter):
            res = minimize_scalar(
                objective, bounds=sf_interval, method='bounded',
                args=(inp, response, prop_enriched),
            )
            scaling_factor = res.x
            p, d = _nb_p_and_d(response, scaling_factor * inp)
            pmid = p - d / 2
            _, adj_p, _, _ = multipletests(pmid, method='holm')
            enriched = adj_p < 0.5
            prop_enriched = np.sum(enriched) / n
            if verbose:
                print(f"prop.enriched: {prop_enriched} scaling.factor: {scaling_factor}")

    return {
        'p_value': p,
        'pmid_value': pmid,
        'scaling_factor': float(scaling_factor),
        'prop_enriched': float(prop_enriched),
    }
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def calc_norm_offsets_for_chip(input_counts, response, dispersion=0.01,
                               niter=6, loss='p', verbose=False):
    """Compute normalization offsets for ChIP-Seq relative to input.

    Port of edgeR's calcNormOffsetsforChIP. Runs
    :func:`normalize_chip_to_input` on each sample and assembles a
    (genes x samples) matrix of log-scale offsets for the GLM framework.

    Parameters
    ----------
    input_counts : array-like
        Input control count matrix (genes x samples), or a single
        column that is shared across all samples.
    response : array-like or DGEList
        ChIP-Seq mark count matrix (genes x samples), or a DGEList.
    dispersion : float
        Negative binomial dispersion (must be positive).
    niter : int
        Number of iterations.
    loss : str
        Loss function (``'p'`` or ``'z'``).
    verbose : bool
        If True, print working estimates.

    Returns
    -------
    If *response* is a DGEList, a shallow copy of it with the ``offset``
    field set. Otherwise a numeric matrix of offsets (genes x samples).
    """
    is_dgelist = isinstance(response, dict) and 'counts' in response
    resp_mat = np.asarray(response['counts'] if is_dgelist else response,
                          dtype=np.float64)
    inp_mat = np.asarray(input_counts, dtype=np.float64)

    # Promote vectors to single-column matrices
    if inp_mat.ndim == 1:
        inp_mat = inp_mat[:, np.newaxis]
    if resp_mat.ndim == 1:
        resp_mat = resp_mat[:, np.newaxis]

    if inp_mat.shape[0] != resp_mat.shape[0]:
        raise ValueError("nrows of input and response disagree")
    # A single input column is shared across all response samples
    if inp_mat.shape[1] == 1 and resp_mat.shape[1] > 1:
        inp_mat = np.broadcast_to(inp_mat, resp_mat.shape).copy()
    if inp_mat.shape[1] != resp_mat.shape[1]:
        raise ValueError("ncols of input and response disagree")

    offset = np.empty(resp_mat.shape, dtype=np.float64)
    for j in range(resp_mat.shape[1]):
        fit = normalize_chip_to_input(
            inp_mat[:, j], resp_mat[:, j],
            dispersion=dispersion, niter=niter, loss=loss, verbose=verbose,
        )
        # Log expected response under the null: log(sf * input)
        offset[:, j] = np.log(fit['scaling_factor'] * inp_mat[:, j])

    if not is_dgelist:
        return offset
    out = dict(response)
    out['offset'] = offset
    return out
|