edgepython 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
edgepython/utils.py
ADDED
|
@@ -0,0 +1,1050 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""
|
|
3
|
+
Utility functions for edgePython.
|
|
4
|
+
|
|
5
|
+
Port of edgeR utility functions: expandAsMatrix, addPriorCount, movingAverageByCol,
|
|
6
|
+
predFC, goodTuring, thinCounts, gini, cutWithMinN, sumTechReps, systematicSubset,
|
|
7
|
+
nearestReftoX, getPriorN, zscoreNBinom, binomTest, dropEmptyLevels, etc.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from scipy import stats, special
|
|
13
|
+
from .compressed_matrix import CompressedMatrix, compress_offsets, compress_prior
|
|
14
|
+
import warnings
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def expand_as_matrix(x, dim=None, byrow=True):
    """Expand a scalar, vector, or matrix into a full matrix of shape ``dim``.

    Port of edgeR's expandAsMatrix.

    Parameters
    ----------
    x : scalar, 1-D vector, 2-D matrix, or CompressedMatrix
        Value(s) to expand.
    dim : tuple of (nrow, ncol), optional
        Target shape. If None, x is simply promoted to 2-D.
    byrow : bool
        Disambiguates a square target: when the vector length matches both
        dimensions, True replicates x as rows, False as columns.

    Returns
    -------
    ndarray of shape ``dim``.

    Raises
    ------
    ValueError
        If x cannot be conformed to ``dim``.
    """
    if dim is None:
        return np.atleast_2d(np.asarray(x, dtype=np.float64))
    nrow, ncol = int(dim[0]), int(dim[1])
    dim = (nrow, ncol)

    # CompressedMatrix knows how to expand itself; recurse on the dense form.
    if isinstance(x, CompressedMatrix):
        return expand_as_matrix(x.as_matrix(), dim=dim, byrow=byrow)

    arr = np.asarray(x, dtype=np.float64)
    if arr.ndim == 0 or arr.size == 1:
        return np.full(dim, arr.ravel()[0])
    if arr.ndim <= 1:
        m = len(arr)
        if m == nrow and m == ncol:
            # Ambiguous square case: byrow decides the orientation.
            if byrow:
                return np.tile(arr.reshape(1, -1), (nrow, 1))
            return np.tile(arr.reshape(-1, 1), (1, ncol))
        if m == ncol:
            return np.tile(arr.reshape(1, -1), (nrow, 1))
        if m == nrow:
            return np.tile(arr.reshape(-1, 1), (1, ncol))
        raise ValueError("x of unexpected length")
    if arr.ndim == 2:
        if arr.shape == dim:
            return arr.copy()
        raise ValueError("x is matrix of wrong size")
    raise ValueError("x has wrong dimensions")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def add_prior_count(y, lib_size=None, offset=None, prior_count=1):
    """Add library-size-adjusted prior counts.

    Port of edgeR's addPriorCount. The prior added to each library is scaled
    by that library's effective size relative to the average, so augmentation
    does not distort relative expression between libraries; offsets are
    enlarged by twice the added prior so downstream log-CPM stays consistent.

    Parameters
    ----------
    y : ndarray
        Count matrix (genes x libraries); a 1-D vector is treated as one gene.
    lib_size : array-like, optional
        Library sizes; defaults to column sums of y. Ignored when offset given.
    offset : array-like, optional
        Log library sizes, either one per library (1-D) or one per
        observation (2-D, genes x libraries).
    prior_count : float or array-like
        Prior count to add. A 1-D vector is interpreted gene-wise.

    Returns
    -------
    dict with 'y' (augmented counts) and 'offset' (augmented log sizes;
    1-D when the input offset was 1-D, 2-D otherwise).
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)

    if offset is None:
        if lib_size is None:
            lib_size = y.sum(axis=0)
        offset = np.log(lib_size)
    offset = np.atleast_1d(np.asarray(offset, dtype=np.float64))
    prior_count = np.atleast_1d(np.asarray(prior_count, dtype=np.float64))

    if offset.ndim == 1:
        # Common case: one offset per library (matches edgeR's C behaviour):
        # offset_aug = log(lib + 2 * prior_count_scaled).
        lib = np.exp(offset)
        ave_lib = np.mean(lib)
        if prior_count.size == 1:
            pc = prior_count.ravel()[0]
            scaled_prior = np.tile(pc * lib / ave_lib, (y.shape[0], 1))
        else:
            # Gene-wise priors; the offset uses the mean prior across genes.
            pc = np.mean(prior_count)
            scaled_prior = prior_count.reshape(-1, 1) * lib / ave_lib
        y_aug = y + scaled_prior
        offset_aug = np.log(lib + 2.0 * pc * lib / ave_lib)
        return {'y': y_aug, 'offset': offset_aug}

    # Observation-level (2-D) offsets: expand the prior to matrix form.
    if prior_count.size == 1:
        prior_mat = np.full(y.shape, prior_count.ravel()[0])
    elif prior_count.ndim == 1 and len(prior_count) == y.shape[0]:
        prior_mat = np.tile(prior_count.reshape(-1, 1), (1, y.shape[1]))
    else:
        prior_mat = expand_as_matrix(prior_count, dim=y.shape, byrow=False)

    lib_size_eff = np.exp(offset)
    ave_lib = np.mean(lib_size_eff)
    scaled_prior = prior_mat * lib_size_eff / ave_lib
    y_aug = y + scaled_prior
    # Offsets grow by twice the prior actually added (fixes the previous
    # inverted lib-size scaling in this branch, which divided by lib_size_eff).
    offset_aug = np.log(lib_size_eff + 2.0 * scaled_prior)
    return {'y': y_aug, 'offset': offset_aug}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def moving_average_by_col(x, width=5, full_length=True):
    """Moving-average smoother applied to each column of a matrix.

    Port of edgeR's movingAverageByCol. Implemented with a cumulative-sum
    trick: the matrix is zero-padded, cumulatively summed, and differenced
    at lag ``width``; edge positions are divided by the number of real
    (non-padding) observations in their window.

    Parameters
    ----------
    x : ndarray
        Data matrix (rows smoothed within each column); 1-D input is
        treated as a single column.
    width : int
        Window width; values <= 1 return the input unchanged.
    full_length : bool
        If True, output has the same number of rows as the input (edge
        windows are shortened). If False, only complete windows are
        returned (n - width + 1 rows).

    Returns
    -------
    ndarray of smoothed values.
    """
    mat = np.asarray(x, dtype=np.float64)
    if mat.ndim == 1:
        mat = mat.reshape(-1, 1)
    span = int(width)
    if span <= 1:
        return mat
    nrow, ncol = mat.shape
    span = min(span, nrow)

    if full_length:
        top = (span + 1) // 2
        bottom = span // 2
        padded = np.vstack([np.zeros((top, ncol)), mat, np.zeros((bottom, ncol))])
    else:
        if span == nrow:
            # Single complete window: the column means, as a 1-row matrix.
            return np.tile(mat.mean(axis=0), (1, 1))
        padded = np.vstack([np.zeros((1, ncol)), mat])

    csum = np.cumsum(padded, axis=0)
    total = csum.shape[0]
    sums = csum[span:total] - csum[:total - span]
    nout = sums.shape[0]

    # Per-position divisor: the window size, shrunk at the edges.
    counts = np.full(nout, span, dtype=np.float64)
    if full_length:
        if top > 1:
            counts[:top - 1] = span - np.arange(top - 1, 0, -1)
        counts[nout - bottom:] = span - np.arange(1, bottom + 1)

    return sums / counts.reshape(-1, 1)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def pred_fc(y, design, prior_count=0.125, offset=None, dispersion=0, weights=None):
    """Predicted (shrunken) log2-fold-changes.

    Port of edgeR's predFC: counts are augmented with a library-size-scaled
    prior via :func:`add_prior_count`, a GLM is fitted to the augmented data,
    and the coefficients are returned on the log2 scale.

    Parameters
    ----------
    y : ndarray
        Count matrix (genes x libraries).
    design : ndarray
        Design matrix (libraries x coefficients).
    prior_count : float
        Average prior count used for shrinkage.
    offset, dispersion, weights
        Passed through to the GLM fit.

    Returns
    -------
    ndarray
        Coefficient matrix in log2 units.
    """
    from .glm_fit import glm_fit

    augmented = add_prior_count(y, offset=offset, prior_count=prior_count)
    X = np.asarray(design, dtype=np.float64)
    # prior_count=0 here: the prior has already been added above.
    fit = glm_fit(augmented['y'], X, offset=augmented['offset'],
                  dispersion=dispersion, prior_count=0, weights=weights)
    # Natural-log coefficients -> log2 fold changes.
    return fit['coefficients'] / np.log(2)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def good_turing(x, conf=1.96):
    """Simple Good-Turing frequency estimation.

    Faithful port of edgeR's goodTuring (R wrapper + C code in good_turing.c).

    Parameters
    ----------
    x : array-like of int
        Observed counts (e.g. per-tag read counts for one library).
    conf : float
        Confidence factor, in standard deviations, used to decide when to
        switch from the direct Turing estimate to the smoothed
        (log-log regression) estimate. 1.96 corresponds to ~95% confidence.

    Returns
    -------
    dict with keys:
        'count'      : distinct nonzero count values r, sorted ascending.
        'n'          : number of observations having each count r.
        'n0'         : number of zero observations.
        'proportion' : estimated true proportion for each count r.
        'P0'         : estimated total probability of unseen species.
    """
    x = np.asarray(x, dtype=int)

    # Tabulate frequencies — matches R's goodTuring R wrapper.
    # bincount is used when the value range is small relative to len(x);
    # otherwise np.unique is cheaper.
    max_x = x.max()
    if max_x < len(x):
        # np.bincount(x): index i = count of value i in x
        bc = np.bincount(x)
        n0 = bc[0] if len(bc) > 0 else 0
        n = bc[1:]  # counts for values 1, 2, ..., max_x
        r = np.arange(1, len(n) + 1)
        mask = n > 0
        r = r[mask]
        n = n[mask]
    else:
        r_unique, counts = np.unique(x, return_counts=True)
        sort_idx = np.argsort(r_unique)
        r_unique = r_unique[sort_idx]
        counts = counts[sort_idx]
        if r_unique[0] == 0:
            n0 = counts[0]
            r = r_unique[1:]
            n = counts[1:]
        else:
            n0 = 0
            r = r_unique
            n = counts

    # Degenerate input: every observation was zero.
    if len(r) == 0:
        return {'count': r, 'n': n, 'n0': n0, 'proportion': np.array([]),
                'P0': 0.0}

    r = r.astype(np.int64)
    n = n.astype(np.int64)
    nr = len(r)
    last = nr - 1

    # --- Port of good_turing.c ---
    # Compute bigN, Z values, and linear regression in one pass
    bigN = 0.0
    log_obs = np.log(r.astype(float))
    meanX = 0.0
    meanY = 0.0
    XYs = 0.0
    Xsquares = 0.0

    for i in range(nr):
        # bigN = total number of individuals observed.
        bigN += float(r[i]) * float(n[i])

        prev_obs = 0 if i == 0 else r[i - 1]
        logO = log_obs[i]

        # Z-transform: spreads n[i] over the gap to the neighbouring
        # observed counts (doubled gap at the final entry).
        xx = (2 * (r[i] - prev_obs)) if i == last else (r[i + 1] - prev_obs)
        logZ = np.log(2.0 * n[i]) - np.log(float(xx))

        # Accumulate sums for the log(Z) ~ log(r) regression.
        meanX += logO
        meanY += logZ
        XYs += logO * logZ
        Xsquares += logO * logO

    meanX /= nr
    meanY /= nr
    XYs -= meanX * meanY * nr
    Xsquares -= meanX * meanX * nr

    slope = XYs / Xsquares if Xsquares != 0 else 0.0

    # P0: only nonzero if first observed count is 1
    P0 = 0.0 if (nr == 0 or r[0] != 1) else float(n[0]) / bigN

    # Compute r* values with indiffValsSeen logic
    out = np.zeros(nr)
    bigNprime = 0.0
    indiff_vals_seen = False

    for i in range(nr):
        next_obs = r[i] + 1
        # Turing estimate (intercept cancels out)
        y = float(next_obs) * np.exp(slope * (np.log(float(next_obs)) - log_obs[i]))

        # Once a gap appears in the observed counts, or the direct and
        # smoothed estimates become statistically indistinguishable, the
        # smoothed estimate is used from then on.
        if i == last or r[i + 1] != next_obs:
            indiff_vals_seen = True

        if not indiff_vals_seen:
            # Direct estimate
            x_direct = float(next_obs) * float(n[i + 1]) / float(n[i])
            if abs(x_direct - y) <= conf * x_direct * np.sqrt(
                    1.0 / float(n[i + 1]) + 1.0 / float(n[i])):
                indiff_vals_seen = True
            else:
                out[i] = x_direct

        if indiff_vals_seen:
            out[i] = y

        bigNprime += out[i] * float(n[i])

    # Normalize to proportions
    factor = (1.0 - P0) / bigNprime if bigNprime > 0 else 0.0
    proportion = out * factor

    return {
        'count': r,
        'n': n,
        'n0': n0,
        'proportion': proportion,
        'P0': P0
    }
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def good_turing_proportions(counts):
    """Estimate true expression proportions from counts via Good-Turing.

    Port of edgeR's goodTuringProportions. Each library (column) is
    processed independently: zero counts receive the unseen-species
    probability P0 divided evenly among them, and observed counts are
    mapped to their Good-Turing adjusted proportions.

    Parameters
    ----------
    counts : ndarray of int
        Count matrix (genes x libraries); 1-D input is one library.

    Returns
    -------
    ndarray of float
        Proportion matrix, same rows as counts, one column per library.
    """
    counts = np.asarray(counts, dtype=int)
    props = counts.astype(float).copy()
    if props.ndim == 1:
        props = props.reshape(-1, 1)
    for lib in range(props.shape[1]):
        gt = good_turing(counts[:, lib] if counts.ndim == 2 else counts)
        # Share the unseen-species probability equally among the zeros.
        zero_prop = gt['P0'] / gt['n0'] if gt['n0'] > 0 else 0
        is_zero = props[:, lib] == 0
        props[is_zero, lib] = zero_prop
        observed = ~is_zero
        if np.any(observed):
            # Look up each observed count's adjusted proportion.
            idx = np.searchsorted(gt['count'], props[observed, lib].astype(int))
            idx = np.clip(idx, 0, len(gt['proportion']) - 1)
            props[observed, lib] = gt['proportion'][idx]
    return props
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def thin_counts(x, prob=None, target_size=None):
    """Binomial or multinomial thinning of counts.

    Port of edgeR's thinCounts.

    Parameters
    ----------
    x : ndarray of int
        Count vector or matrix (genes x libraries).
    prob : float, optional
        If given, each count is independently binomially thinned with this
        retention probability.
    target_size : int or array-like, optional
        Target column total(s) for multinomial thinning. Defaults to the
        smallest column sum, so all libraries are equalized.

    Returns
    -------
    ndarray of int
        Thinned counts. In the target_size mode every column sums exactly
        to its target and no count goes negative.

    Raises
    ------
    ValueError
        If any target_size exceeds the corresponding column sum.
    """
    x = np.asarray(x, dtype=int).copy()
    if prob is not None:
        return np.random.binomial(x, prob)

    if x.ndim == 1:
        x = x.reshape(-1, 1)
    if target_size is None:
        target_size = x.sum(axis=0).min()
    target_size = np.atleast_1d(np.asarray(target_size, dtype=int))
    if len(target_size) == 1:
        target_size = np.full(x.shape[1], target_size[0])
    actual_size = x.sum(axis=0)
    if np.any(target_size > actual_size):
        raise ValueError("target_size bigger than actual size")
    for j in range(x.shape[1]):
        deficit = int(actual_size[j] - target_size[j])
        # Draw removals repeatedly, never taking more than a cell holds.
        # (The previous single multinomial draw could overshoot a cell and
        # was clamped with maximum(x, 0), silently leaving the column total
        # above target_size.) This loop lands exactly on the target.
        while deficit > 0:
            col = x[:, j].astype(float)
            remove = np.random.multinomial(deficit, col / col.sum())
            remove = np.minimum(remove, x[:, j])
            x[:, j] -= remove
            deficit -= int(remove.sum())
    return x
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def gini(x):
    """Gini diversity index of each column of a matrix.

    Port of edgeR's gini. Uses the rank-weighted formula
    ``(2 * sum(i * x_(i)) / sum(x) - n - 1) / n`` on the sorted values;
    an internal shift constant cancels algebraically and only improves
    numerical stability.

    Parameters
    ----------
    x : ndarray
        Non-negative data; 1-D input is treated as a single column.

    Returns
    -------
    ndarray
        One Gini index per column (0 for all-zero columns).
    """
    mat = np.asarray(x, dtype=np.float64)
    if mat.ndim == 1:
        mat = mat.reshape(-1, 1)
    nobs = mat.shape[0]
    out = np.zeros(mat.shape[1])
    ranks = np.arange(1, nobs + 1)
    shift = 0.75 * nobs  # centring constant; cancels out exactly
    for col in range(mat.shape[1]):
        ordered = np.sort(mat[:, col])
        total = ordered.sum()
        if total > 0:
            weighted = np.sum((ranks - shift) * ordered)
            out[col] = (2 * (weighted / total + shift) - nobs - 1) / nobs
    return out
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def cut_with_min_n(x, intervals=2, min_n=1):
    """Cut numeric x into intervals, guaranteeing a minimum count per bin.

    Port of edgeR's cutWithMinN. Three strategies are tried in order:
    equally spaced breaks, a progressive blend of quantile-based and
    equally spaced breaks, and finally a rank-based split that always
    satisfies the minimum-count constraint.

    Parameters
    ----------
    x : array-like of float
        Values to bin. NaNs are allowed and receive NaN group codes.
    intervals : int
        Number of bins.
    min_n : int
        Minimum number of observations required in every bin.

    Returns
    -------
    dict with 'group' (1-based bin codes; NaN for NaN inputs) and
    'breaks' (bin boundaries, or None when intervals == 1).

    Raises
    ------
    ValueError
        If len(x) < intervals * min_n.
    """
    x = np.asarray(x, dtype=np.float64)
    isna = np.isnan(x)
    if np.any(isna):
        # Bin the non-missing values, then scatter the codes back in place.
        group = np.full(len(x), np.nan)
        out = cut_with_min_n(x[~isna], intervals=intervals, min_n=min_n)
        group[~isna] = out['group']
        return {'group': group, 'breaks': out['breaks']}

    intervals = int(intervals)
    min_n = int(min_n)
    nx = len(x)

    if nx < intervals * min_n:
        raise ValueError("too few observations: length(x) < intervals*min_n")

    if intervals == 1:
        return {'group': np.ones(nx, dtype=int), 'breaks': None}

    # Add jitter
    # (tiny random perturbation so tied values can split across bins)
    x_jit = x + 1e-10 * (np.random.uniform(size=nx) - 0.5)

    # Try equally spaced
    # End breaks are widened by 1 so min/max fall strictly inside.
    breaks = np.linspace(x_jit.min() - 1, x_jit.max() + 1, intervals + 1)
    z = np.digitize(x_jit, breaks[1:-1])  # bin codes 0..intervals-1
    n = np.bincount(z, minlength=intervals)
    if np.all(n >= min_n):
        return {'group': z + 1, 'breaks': breaks}

    # Try quantile-based
    quantiles = np.quantile(x_jit, np.linspace(0, 1, intervals + 1))
    quantiles[0] -= 1
    quantiles[-1] += 1

    # Blend progressively from mostly-equal-spaced toward pure quantiles
    # until every bin meets the minimum count.
    for w in np.linspace(0.1, 1.0, 10):
        brk = w * quantiles + (1 - w) * breaks
        z = np.digitize(x_jit, brk[1:-1])
        n = np.bincount(z, minlength=intervals)
        if np.all(n >= min_n):
            return {'group': z + 1, 'breaks': brk}

    # Fallback: order by x
    # Deal out near-equal-sized groups by rank; always succeeds.
    o = np.argsort(x_jit)
    n_per = nx // intervals
    nresid = nx - intervals * n_per
    sizes = np.full(intervals, n_per)
    if nresid > 0:
        # Spread the remainder over the first bins.
        sizes[:nresid] += 1
    z = np.zeros(nx, dtype=int)
    z[o] = np.repeat(np.arange(1, intervals + 1), sizes)
    return {'group': z, 'breaks': quantiles}
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def sum_tech_reps(x, ID=None):
    """Collapse technical replicates by summing columns that share an ID.

    Port of edgeR's sumTechReps.

    Parameters
    ----------
    x : ndarray or DGEList-like dict
        Count matrix, or a dict with a 'counts' matrix and optionally a
        'samples' DataFrame.
    ID : array-like
        Replicate identifier per column; columns with equal IDs are merged.

    Returns
    -------
    Same type as x, with one column per unique ID. For DGEList input,
    numeric sample columns are summed ('norm.factors' is averaged) and
    non-numeric columns keep the first replicate's value.

    Raises
    ------
    ValueError
        If ID is not supplied.
    """
    if isinstance(x, dict) and 'counts' in x:
        # DGEList-like input.
        if ID is None:
            raise ValueError("No sample IDs")
        ID = np.asarray(ID)
        unique_ids, inverse = np.unique(ID, return_inverse=True)
        if len(unique_ids) == len(ID):
            # Every column is its own sample: nothing to collapse.
            return x

        from copy import deepcopy
        y = deepcopy(x)
        merged = np.zeros((x['counts'].shape[0], len(unique_ids)))
        for k, uid in enumerate(unique_ids):
            merged[:, k] = x['counts'][:, ID == uid].sum(axis=1)
        y['counts'] = merged

        if 'samples' in y:
            collapsed = pd.DataFrame(index=unique_ids)
            for col in y['samples'].columns:
                vals = y['samples'][col].values
                numeric = (not isinstance(vals[0], str)) and \
                    np.issubdtype(type(vals[0]), np.number)
                if numeric:
                    sums = np.array([vals[ID == uid].sum() for uid in unique_ids])
                    if col == 'norm.factors':
                        # Normalization factors are averaged, not summed.
                        reps = np.array([np.sum(ID == uid) for uid in unique_ids])
                        sums = sums / reps
                    collapsed[col] = sums
                else:
                    # Non-numeric metadata: keep the first replicate's value.
                    collapsed[col] = [vals[ID == uid][0] for uid in unique_ids]
            y['samples'] = collapsed
        return y

    # Plain matrix input.
    mat = np.asarray(x, dtype=np.float64)
    if ID is None:
        raise ValueError("No sample IDs")
    ID = np.asarray(ID)
    unique_ids = np.unique(ID)
    summed = np.zeros((mat.shape[0], len(unique_ids)))
    for k, uid in enumerate(unique_ids):
        summed[:, k] = mat[:, ID == uid].sum(axis=1)
    return summed
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def systematic_subset(n, order_by):
    """Take a systematic subset of indices stratified by a ranking variable.

    Port of edgeR's systematicSubset. Indices are positions into
    ``order_by``, taken at evenly spaced ranks so the subset spans the
    variable's whole range.

    Parameters
    ----------
    n : int
        Number of indices requested.
    order_by : array-like
        Variable to stratify by.

    Returns
    -------
    ndarray
        Exactly n indices into order_by, or all indices when the sampling
        ratio floor(len(order_by)/n) is <= 1.
    """
    order_by = np.asarray(order_by)
    ntotal = len(order_by)
    sampling_ratio = ntotal // n
    if sampling_ratio <= 1:
        return np.arange(ntotal)
    i1 = sampling_ratio // 2
    # Take exactly n evenly spaced rank positions, matching edgeR's
    # i1 + (0:(n-1)) * sampling.ratio. The previous arange(i1, ntotal, ratio)
    # could overshoot and return more than n indices.
    indices = i1 + np.arange(n) * sampling_ratio
    o = np.argsort(order_by)
    return o[indices]
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def nearest_ref_to_x(x, reference):
    """Index of the nearest reference value for each element of x.

    Port of edgeR's nearestReftoX. The reference is sorted and each x is
    located relative to the midpoints between consecutive reference values,
    which yields the index (into the sorted reference) of its nearest
    neighbour.

    Parameters
    ----------
    x : array-like
        Query values.
    reference : array-like
        Reference values (need not be sorted).

    Returns
    -------
    ndarray of int
        Index into the sorted reference of each query's nearest element.
    """
    ref_sorted = np.sort(reference)
    # Midpoints act as decision boundaries between neighbouring references.
    boundaries = (ref_sorted[:-1] + ref_sorted[1:]) / 2
    return np.searchsorted(boundaries, x)
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def get_prior_n(y, design=None, prior_df=20):
    """Prior.n that keeps the prior degrees of freedom fixed.

    Port of edgeR's getPriorN: prior_n = prior_df / residual_df, where the
    residual df is (number of libraries) - (number of parameters).

    Parameters
    ----------
    y : ndarray or DGEList-like dict
        Count matrix or dict with 'counts' (and optionally 'samples').
    design : ndarray, optional
        Design matrix. Required for matrix input; for DGEList input the
        number of groups is used when omitted.
    prior_df : float
        Desired prior degrees of freedom.

    Returns
    -------
    float
        prior_df / residual_df, or prior_df itself when residual_df <= 0.

    Raises
    ------
    ValueError
        If y is a matrix and no design is given.
    """
    if isinstance(y, dict):
        nlibs = y['counts'].shape[1] if 'counts' in y else 0
        if design is None:
            # Without a design, parameters = number of distinct groups.
            npar = len(y['samples']['group'].unique()) if 'samples' in y else 1
        else:
            npar = design.shape[1]
    else:
        if design is None:
            raise ValueError("design must be provided for matrix input")
        nlibs = np.asarray(y).shape[1]
        npar = design.shape[1]

    df_resid = nlibs - npar
    return prior_df if df_resid <= 0 else prior_df / df_resid
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def zscore_nbinom(q, size, mu, method='midp'):
    """Z-score equivalents for negative binomial deviates.

    Port of edgeR's zscoreNBinom. Each observation q is converted to the
    standard normal deviate with the same mid-p tail probability under
    NB(size, mu); non-integer q is handled by weighting the density at the
    rounded value by the fractional remainder.

    Parameters
    ----------
    q : array-like
        Observed (possibly non-integer) NB deviates.
    size : array-like
        NB size parameter (1/dispersion); broadcast to len(q).
    mu : array-like
        NB mean; broadcast to len(q).
    method : str
        Kept for edgeR API compatibility; only the mid-p computation is
        implemented here, so this argument is currently ignored.

    Returns
    -------
    ndarray
        Z-scores, zero wherever mu <= 0 or size <= 0.
    """
    q = np.asarray(q, dtype=np.float64)
    size = np.atleast_1d(np.asarray(size, dtype=np.float64))
    mu = np.atleast_1d(np.asarray(mu, dtype=np.float64))
    n = len(q)
    size = np.broadcast_to(size, n).copy()
    mu = np.broadcast_to(mu, n).copy()

    z = np.zeros(n)
    # Nearest integer support point for the density evaluation.
    qr = np.round(q).astype(int)

    for i in range(n):
        if mu[i] <= 0 or size[i] <= 0:
            # Degenerate parameters: return a neutral z-score.
            z[i] = 0
            continue
        # scipy parameterization: p = size / (size + mu).
        logd = stats.nbinom.logpmf(qr[i], size[i], size[i] / (size[i] + mu[i]))
        if qr[i] == 0:
            # Left edge: lower tail is just a fraction of the density at 0.
            w = (q[i] - qr[i]) + 0.5
            logp = logd + np.log(max(w, 1e-300))
            z[i] = stats.norm.ppf(np.exp(logp)) if np.exp(logp) < 1 else 0
        elif q[i] >= mu[i]:
            # Upper tail: P(X > qr) plus a weighted share of P(X == qr).
            logp_tail = stats.nbinom.logsf(qr[i], size[i], size[i] / (size[i] + mu[i]))
            w = 0.5 - (q[i] - qr[i])
            from .limma_port import logsumexp
            logp = logsumexp(logp_tail, logd + np.log(max(w, 1e-300)))
            # Negated: larger q must map to a larger (positive) z.
            z[i] = -stats.norm.ppf(np.exp(logp)) if np.exp(logp) < 1 else 0
        else:
            # Lower tail: P(X <= qr-1) plus a weighted share of P(X == qr).
            logp_tail = stats.nbinom.logcdf(max(qr[i] - 1, 0), size[i], size[i] / (size[i] + mu[i]))
            w = (q[i] - qr[i]) + 0.5
            from .limma_port import logsumexp
            logp = logsumexp(logp_tail, logd + np.log(max(w, 1e-300)))
            z[i] = stats.norm.ppf(np.exp(logp)) if np.exp(logp) < 1 else 0

    return z
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def binom_test(y1, y2, n1=None, n2=None, p=None):
    """Vectorized exact binomial tests.

    Port of edgeR's binomTest. For each gene, tests whether the split of
    its total count between the two libraries is consistent with the null
    proportion p (defaulting to n1 / (n1 + n2)).

    Parameters
    ----------
    y1, y2 : array-like of int
        Counts in library 1 and library 2 (same length).
    n1, n2 : int, optional
        Library totals; default to the sums of y1 and y2.
    p : float, optional
        Null probability that a count falls in library 1.

    Returns
    -------
    ndarray
        Two-sided p-values, one per gene (1.0 where y1 + y2 == 0).

    Raises
    ------
    ValueError
        If y1 and y2 differ in length.
    """
    y1 = np.asarray(y1, dtype=int)
    y2 = np.asarray(y2, dtype=int)
    if len(y1) != len(y2):
        raise ValueError("y1 and y2 must have same length")

    if n1 is None:
        n1 = np.sum(y1)
    if n2 is None:
        n2 = np.sum(y2)
    if p is None:
        p = n1 / (n1 + n2)

    size = y1 + y2
    pvalue = np.ones(len(y1))

    if p == 0.5:
        # Symmetric null: doubled one-sided tail of the smaller count.
        for i, sz in enumerate(size):
            if sz > 0:
                tail = stats.binom.cdf(min(y1[i], y2[i]), sz, 0.5)
                pvalue[i] = min(2 * tail, 1.0)
        return pvalue

    for i, sz in enumerate(size):
        if sz == 0:
            pvalue[i] = 1.0
            continue
        if sz > 10000:
            # Large totals: chi-square approximation on the 2x2 table.
            table = np.array([[y1[i], y2[i]], [n1 - y1[i], n2 - y2[i]]])
            _, pv, _, _ = stats.chi2_contingency(table, correction=False)
            pvalue[i] = pv
        else:
            # Method of small probabilities: sum the probabilities of all
            # outcomes no more likely than the observed one.
            dens = stats.binom.pmf(np.arange(sz + 1), sz, p)
            observed = stats.binom.pmf(y1[i], sz, p)
            pvalue[i] = np.sum(dens[dens <= observed + 1e-15])

    return np.minimum(pvalue, 1.0)
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def drop_empty_levels(x):
    """Drop factor levels that do not occur in the data.

    Port of edgeR's dropEmptyLevels.

    Parameters
    ----------
    x : pd.Categorical or sequence
        Factor-like values.

    Returns
    -------
    pd.Categorical
        With only the categories actually present. Non-Categorical input is
        converted, which by construction keeps only observed levels.
    """
    if not isinstance(x, pd.Categorical):
        return pd.Categorical(x)
    return x.remove_unused_categories()
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def design_as_factor(design):
    """Integer group codes for the distinct rows of a design matrix.

    Port of edgeR's designAsFactor. Each row is hashed to a scalar using an
    irrational base so that distinct rows map to distinct values, then rows
    are coded by their position among the unique hash values.

    Parameters
    ----------
    design : ndarray
        Design matrix (samples x coefficients).

    Returns
    -------
    ndarray of int
        Zero-based group code per row; equal rows share a code.
    """
    design = np.asarray(design, dtype=np.float64)
    # (e + pi)/5 is irrational, so integer-valued rows hash injectively.
    base = (np.e + np.pi) / 5
    hashed = design @ (base ** np.arange(design.shape[1]))
    return np.unique(hashed, return_inverse=True)[1]
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def residual_df(zero_fit, design):
    """Effective residual degrees of freedom, discounting exact-zero fits.

    Port of edgeR's .residDF. Libraries fitted exactly at zero contribute
    no information, so each gene's df is recomputed on its non-zero
    libraries using the rank of the restricted design.

    Parameters
    ----------
    zero_fit : ndarray of bool
        True where the fitted value is exactly zero; either one row per
        gene (2-D, genes x libraries) or a single 1-D pattern.
    design : ndarray
        Design matrix (libraries x coefficients).

    Returns
    -------
    float or ndarray
        Residual df per gene (scalar for 1-D input), floored at zero.
    """
    zero_fit = np.asarray(zero_fit, dtype=bool)
    ncoefs = design.shape[1]

    if zero_fit.ndim == 1:
        nlibs = len(zero_fit)
        return max(nlibs - ncoefs - np.sum(zero_fit), 0)

    nlibs = zero_fit.shape[1]
    default_df = nlibs - ncoefs
    df = np.full(zero_fit.shape[0], default_df, dtype=np.float64)

    for g, zf in enumerate(zero_fit):
        nzero = np.sum(zf)
        if nzero == 0:
            # No zeros: the full design df applies.
            continue
        if nzero >= nlibs - 1:
            # At most one informative library: no residual df at all.
            df[g] = 0
            continue
        nonzero = ~zf
        # df = informative libraries minus rank of the restricted design.
        df[g] = np.sum(nonzero) - np.linalg.matrix_rank(design[nonzero])

    return df
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
def scale_offset(y, offset):
    """Recentre offsets onto the mean log-library-size scale.

    Port of edgeR's scaleOffset. The offsets are shifted so their mean
    equals the mean log effective library size, making them directly
    comparable to log library sizes.

    Parameters
    ----------
    y : DGEList-like dict, 2-D count matrix, or 1-D library-size vector
        Source of library sizes. A DGEList is modified in place (its
        'offset' entry is set) and returned; a 2-D matrix contributes its
        column sums; a 1-D array is used as the library sizes directly.
    offset : ndarray
        Offsets, 1-D (per library) or 2-D (per observation; recentred
        row-wise).

    Returns
    -------
    The DGEList (dict input) or the recentred offset array.
    """
    if isinstance(y, dict) and 'counts' in y:
        # DGEList: use effective library sizes and store the result in place.
        eff_lib = y['samples']['lib.size'].values * y['samples']['norm.factors'].values
        y['offset'] = scale_offset(eff_lib, offset)
        return y

    if isinstance(y, np.ndarray) and y.ndim == 2:
        lib_size = y.sum(axis=0)
    else:
        lib_size = np.asarray(y, dtype=np.float64)

    offset = np.asarray(offset, dtype=np.float64)
    # 2-D offsets are recentred per gene (row); 1-D globally.
    if offset.ndim == 2:
        center = offset.mean(axis=1, keepdims=True)
    else:
        center = np.mean(offset)
    return np.mean(np.log(lib_size)) + offset - center
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def _model_matrix_group(group):
|
|
677
|
+
"""Create a model matrix from a group factor (model.matrix(~group) equivalent).
|
|
678
|
+
|
|
679
|
+
Returns an intercept + dummy-coded design matrix.
|
|
680
|
+
"""
|
|
681
|
+
group = np.asarray(group)
|
|
682
|
+
unique_groups = np.unique(group)
|
|
683
|
+
n = len(group)
|
|
684
|
+
ngroups = len(unique_groups)
|
|
685
|
+
|
|
686
|
+
if ngroups <= 1:
|
|
687
|
+
return np.ones((n, 1))
|
|
688
|
+
|
|
689
|
+
# Intercept + (ngroups - 1) dummy columns
|
|
690
|
+
design = np.zeros((n, ngroups))
|
|
691
|
+
design[:, 0] = 1.0 # intercept
|
|
692
|
+
for i in range(1, ngroups):
|
|
693
|
+
design[group == unique_groups[i], i] = 1.0
|
|
694
|
+
|
|
695
|
+
return design
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
def model_matrix(formula, data=None):
    """Build a design matrix from an R-style formula via patsy.

    Equivalent to R's ``model.matrix(formula, data)``.

    Parameters
    ----------
    formula : str
        R-style formula, e.g. ``'~ group'``, ``'~ batch + condition'``,
        or ``'~ 0 + group'`` (no intercept).
    data : DataFrame, dict, ndarray, scipy.sparse, or Series
        Sample-level data whose column names become formula variables.
        Non-DataFrame inputs are converted: dict keys become columns,
        array columns are auto-named ``x0, x1, ...``, sparse matrices are
        densified first, and a Series becomes a one-column frame named
        after its ``.name`` (or ``x0``).

    Returns
    -------
    ndarray
        Design matrix (samples x coefficients), dtype float64.

    Raises
    ------
    ImportError
        If patsy is not installed.
    ValueError
        If data is None.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'group': ['A', 'A', 'B', 'B']})
    >>> model_matrix('~ group', df)
    array([[1., 0.],
           [1., 0.],
           [1., 1.],
           [1., 1.]])
    """
    try:
        import patsy
    except ImportError:
        raise ImportError(
            "patsy package required for formula interface. "
            "Install with: pip install patsy"
        )

    if data is None:
        raise ValueError("data must be provided for formula-based design")

    def _frame_from_array(arr):
        # Auto-name columns x0, x1, ... so the formula can reference them.
        if arr.ndim == 1:
            return pd.DataFrame({'x0': arr})
        return pd.DataFrame({f'x{i}': arr[:, i] for i in range(arr.shape[1])})

    if isinstance(data, dict):
        data = pd.DataFrame(data)
    elif isinstance(data, pd.Series):
        colname = data.name if data.name is not None else 'x0'
        data = pd.DataFrame({colname: data.values})
    elif isinstance(data, np.ndarray):
        data = _frame_from_array(data)
    elif not isinstance(data, pd.DataFrame):
        # scipy.sparse or other array-like: densify then convert.
        if hasattr(data, 'toarray'):
            data = data.toarray()
        data = _frame_from_array(np.asarray(data))

    dm = patsy.dmatrix(formula, data=data, return_type='dataframe')
    return np.asarray(dm, dtype=np.float64)
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
def _resolve_design(design, y):
|
|
779
|
+
"""Resolve design argument: formula string → numpy array.
|
|
780
|
+
|
|
781
|
+
If *design* is a string it is treated as an R-style formula and
|
|
782
|
+
evaluated against the sample metadata in *y* (which must be a
|
|
783
|
+
DGEList with a 'samples' key). Otherwise *design* is returned
|
|
784
|
+
as-is.
|
|
785
|
+
"""
|
|
786
|
+
if not isinstance(design, str):
|
|
787
|
+
return design
|
|
788
|
+
|
|
789
|
+
if not (isinstance(y, dict) and 'samples' in y):
|
|
790
|
+
raise ValueError(
|
|
791
|
+
"Formula design requires a DGEList with sample metadata. "
|
|
792
|
+
"Pass a DGEList or use model_matrix() explicitly."
|
|
793
|
+
)
|
|
794
|
+
return model_matrix(design, y['samples'])
|
|
795
|
+
|
|
796
|
+
|
|
797
|
+
def model_matrix_meth(object, design=None):
    """Build the expanded design matrix used for BS-seq methylation models.

    Port of edgeR's ``modelMatrixMeth``.

    A DGEList from :func:`read_bismark2dge` holds ``2 * nsamples``
    count columns ordered ``S1-Me, S1-Un, S2-Me, S2-Un, ...``.  Given a
    sample-level design (``nsamples x p``), this produces a matrix with
    ``2 * nsamples`` rows and ``nsamples + p`` columns:

    * the first ``nsamples`` columns indicate which sample each row
      belongs to (both the Me and Un row of a sample carry a 1);
    * the last ``p`` columns repeat the treatment design on the Me rows
      (0-based even rows) and are zero on the Un rows.

    Parameters
    ----------
    object : DGEList or ndarray
        A DGEList (treatment design taken from ``design`` or derived
        from its group factor) or a numpy array used directly as the
        sample-level treatment design matrix.
    design : ndarray, optional
        Sample-level design matrix (``nsamples x p``); consulted only
        when *object* is a DGEList.  When None, a ``~group`` design is
        built from the sample metadata.

    Returns
    -------
    ndarray
        Design matrix of shape ``(2 * nsamples, nsamples + p)``.
    """
    if isinstance(object, np.ndarray):
        treat = object.copy()
    elif isinstance(object, dict):
        # DGEList input: take the explicit design if given, otherwise
        # derive a ~group design from the sample metadata.
        if design is not None:
            treat = np.asarray(design, dtype=np.float64)
        else:
            if 'samples' in object and 'group' in object['samples'].columns:
                group = object['samples']['group'].values
                # The DGEList has Me/Un column pairs, so only the first
                # half of the group factor corresponds to actual samples.
                half = object['counts'].shape[1] // 2
                group_half = group[:half] if len(group) > half else group
                treat = _model_matrix_group(group_half)
            else:
                raise ValueError(
                    "No design provided and DGEList has no group factor"
                )
    else:
        raise TypeError("object must be a DGEList or a numpy array")

    nsamples, nparam = treat.shape

    # Left block: identity indexed by sample, each row duplicated so the
    # Me and Un rows of a sample share one indicator column.
    sample_block = np.repeat(np.eye(nsamples, dtype=np.float64), 2, axis=0)

    # Each treatment row is duplicated for the Me/Un pair, then masked so
    # only the Me rows (even 0-based positions) keep the treatment values.
    expanded = np.repeat(treat, 2, axis=0)
    me_flag = np.tile(np.array([1.0, 0.0]), nsamples)
    right_block = expanded * me_flag[:, np.newaxis]

    return np.hstack([sample_block, right_block])
|
|
878
|
+
|
|
879
|
+
|
|
880
|
+
def nearest_tss(chr, locus, tss_data=None, species="Hs"):
    """Find nearest transcription start site for genomic coordinates.

    Port of edgeR's ``nearestTSS``.

    For each query position ``(chr[i], locus[i])``, finds the nearest
    TSS on the same chromosome and returns information about the
    corresponding gene.

    Parameters
    ----------
    chr : array-like of str
        Chromosome names for query positions.
    locus : array-like of int
        Genomic positions for query positions.  A single value is
        recycled across all chromosomes.
    tss_data : DataFrame, optional
        TSS annotation with columns: ``chr``, ``tss``, ``gene_id``,
        ``gene_name``, ``strand``.  If None, attempts to fetch from
        Ensembl BioMart using ``pybiomart`` (requires internet).
    species : str
        Species code for BioMart query (default ``"Hs"`` for human).
        Only used when ``tss_data`` is None.

    Returns
    -------
    DataFrame
        With columns: ``gene_id``, ``gene_name``, ``strand``, ``tss``,
        ``width``, ``distance``.  ``distance`` is positive when the
        query locus is downstream of the TSS on the gene's strand.
        Rows whose chromosome is missing/unmatched are all-NA.
    """
    chr_arr = np.asarray(chr, dtype=str)
    locus_arr = np.asarray(locus, dtype=np.int64)
    n = len(chr_arr)

    # Recycle a scalar locus across all chromosomes, R-style.
    if len(locus_arr) == 1:
        locus_arr = np.full(n, locus_arr[0], dtype=np.int64)
    elif len(locus_arr) != n:
        raise ValueError("Length of locus doesn't agree with length of chr")

    # Handle NAs: after str conversion, NaN/None become these tokens.
    na_mask = np.array([(c == '' or c == 'nan' or c == 'None')
                        for c in chr_arr])

    if tss_data is None:
        tss_data = _fetch_tss_biomart(species)

    # Ensure tss_data has required columns
    required = {'chr', 'tss', 'gene_id', 'gene_name', 'strand'}
    missing = required - set(tss_data.columns)
    if missing:
        raise ValueError(f"tss_data missing columns: {missing}")

    # Sort tss_data by chromosome and TSS position
    tss_data = tss_data.sort_values(['chr', 'tss']).reset_index(drop=True)

    # Group by chromosome
    tss_by_chr = {}
    for chrom, grp in tss_data.groupby('chr'):
        tss_by_chr[chrom] = grp

    # Prepare output (NA defaults; only matched queries are filled in)
    out_gene_id = np.full(n, np.nan, dtype=object)
    out_gene_name = np.full(n, np.nan, dtype=object)
    out_strand = np.full(n, np.nan, dtype=object)
    out_tss = np.full(n, np.nan, dtype=np.float64)
    out_width = np.full(n, np.nan, dtype=np.float64)
    out_distance = np.full(n, np.nan, dtype=np.float64)

    # Harmonize "chr"-prefixed vs bare chromosome naming between the
    # query and the reference annotation (first 10 rows as a heuristic).
    query_has_chr = any(c.startswith('chr') for c in chr_arr if c)
    tss_has_chr = any(str(c).startswith('chr') for c in tss_data['chr'].values[:10])

    for chrom_name in tss_by_chr:
        grp = tss_by_chr[chrom_name]
        tss_positions = grp['tss'].values.astype(np.float64)

        # Match query chromosomes to this reference chromosome
        if query_has_chr and not tss_has_chr:
            query_chrom = 'chr' + str(chrom_name)
        elif not query_has_chr and tss_has_chr:
            # Strip only a leading 'chr' prefix; a blanket replace()
            # would also mangle internal occurrences (e.g. 'chrUn_chr1').
            cs = str(chrom_name)
            query_chrom = cs[3:] if cs.startswith('chr') else cs
        else:
            query_chrom = str(chrom_name)

        iinc = np.where((chr_arr == query_chrom) & ~na_mask)[0]
        if len(iinc) == 0:
            continue

        # Index into grp of the nearest reference TSS for each query.
        which = nearest_ref_to_x(locus_arr[iinc].astype(np.float64),
                                 tss_positions)

        for j, qi in enumerate(iinc):
            ref_idx = which[j]
            row = grp.iloc[ref_idx]
            out_gene_id[qi] = row['gene_id']
            out_gene_name[qi] = row['gene_name']
            out_strand[qi] = row['strand']
            out_tss[qi] = row['tss']
            if 'width' in grp.columns:
                out_width[qi] = row['width']
            # distance: signed distance, positive = downstream of TSS
            # on the gene's strand (so flip sign for '-' strand genes).
            dist = locus_arr[qi] - int(row['tss'])
            if row['strand'] == '-':
                dist = -dist
            out_distance[qi] = dist

    result = pd.DataFrame({
        'gene_id': out_gene_id,
        'gene_name': out_gene_name,
        'strand': out_strand,
        'tss': pd.array(out_tss, dtype=pd.Int64Dtype()),
        'width': pd.array(out_width, dtype=pd.Int64Dtype()),
        'distance': pd.array(out_distance, dtype=pd.Int64Dtype()),
    })
    return result
|
|
995
|
+
|
|
996
|
+
|
|
997
|
+
def _fetch_tss_biomart(species="Hs"):
    """Download a TSS annotation table for *species* from Ensembl BioMart.

    Needs the optional ``pybiomart`` dependency and network access.
    """
    try:
        from pybiomart import Server
    except ImportError:
        raise ImportError(
            "pybiomart package required to fetch TSS data from Ensembl. "
            "Install with: pip install pybiomart\n"
            "Alternatively, pass tss_data as a DataFrame with columns: "
            "chr, tss, gene_id, gene_name, strand"
        )

    species_map = {
        'Hs': 'hsapiens_gene_ensembl',
        'Mm': 'mmusculus_gene_ensembl',
        'Rn': 'rnorvegicus_gene_ensembl',
        'Dm': 'dmelanogaster_gene_ensembl',
        'Dr': 'drerio_gene_ensembl',
    }
    if species not in species_map:
        raise ValueError(
            f"Unknown species code '{species}'. Known: {list(species_map.keys())}"
        )

    server = Server(host='http://www.ensembl.org')
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets[species_map[species]]

    attrs = [
        'chromosome_name',
        'transcription_start_site',
        'ensembl_gene_id',
        'external_gene_name',
        'strand',
        'transcript_length',
    ]
    tss = dataset.query(attributes=attrs)

    # Rename to edgePython conventions and encode strand as '+'/'-'.
    tss.columns = ['chr', 'tss', 'gene_id', 'gene_name', 'strand_int',
                   'width']
    tss['strand'] = np.where(tss['strand_int'] > 0, '+', '-')
    tss = tss.drop(columns=['strand_int'])

    # Keep one TSS per gene (the one with smallest TSS per chromosome)
    tss = (tss.sort_values(['chr', 'tss'])
              .drop_duplicates(subset=['chr', 'gene_id'], keep='first')
              .reset_index(drop=True))

    return tss
|