edgepython-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
edgepython/compressed_matrix.py
ADDED
@@ -0,0 +1,388 @@
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
"""
CompressedMatrix class for memory-efficient matrix storage.

Port of edgeR's CompressedMatrix (makeCompressedMatrix.R).
Stores scalars, row vectors, or column vectors with flags indicating
which dimensions should be repeated when expanding to a full matrix.
"""

import numpy as np


class CompressedMatrix:
    """Memory-efficient matrix that stores repeated rows/columns compactly.

    A CompressedMatrix stores a scalar, row vector, column vector, or full
    matrix along with flags indicating which dimensions are repeated.
    This avoids materializing large matrices when the same values are
    repeated across rows or columns.

    Parameters
    ----------
    x : scalar, 1-D array, or 2-D array
        The data to store.
    dims : tuple of (int, int), optional
        The logical dimensions (nrow, ncol) of the full matrix.
    byrow : bool
        If True (default), a 1-D vector is treated as a row to be repeated
        down rows. If False, treated as a column to be repeated across columns.
    """

    def __init__(self, x, dims=None, byrow=True):
        x = np.asarray(x, dtype=np.float64)
        self.repeat_row = False
        self.repeat_col = False

        if x.ndim == 2:
            if dims is not None:
                xr, xc = x.shape
                if xr == 1 and xc == 1:
                    self.repeat_row = True
                    self.repeat_col = True
                    self._data = x.reshape(1, 1)
                elif xr == 1 and xc >= 2:
                    if xc != dims[1]:
                        raise ValueError("dims[1] should equal length of row vector x")
                    if byrow:
                        self.repeat_row = True
                        self._data = x.reshape(1, xc)
                    else:
                        self._data = x.reshape(1, xc)
                        dims = (xr, xc)
                elif xr >= 2 and xc == 1:
                    if xr != dims[0]:
                        raise ValueError("dims[0] should equal length of column vector x")
                    if not byrow:
                        self.repeat_col = True
                        self._data = x.reshape(xr, 1)
                    else:
                        self._data = x.reshape(xr, 1)
                        dims = (xr, xc)
                else:
                    self._data = x
                    dims = x.shape
            else:
                self._data = x
                dims = x.shape
        elif x.ndim <= 1:
            x = x.ravel()
            if x.size == 1:
                self.repeat_row = True
                self.repeat_col = True
                self._data = x.reshape(1, 1)
                if dims is None:
                    dims = (1, 1)
            else:
                if dims is None:
                    raise ValueError("dims must be provided for vector input")
                if not byrow:
                    if dims[0] != x.size:
                        raise ValueError("dims[0] should equal length of x")
                    self._data = x.reshape(-1, 1)
                    self.repeat_col = True
                else:
                    if dims[1] != x.size:
                        raise ValueError("dims[1] should equal length of x")
                    self._data = x.reshape(1, -1)
                    self.repeat_row = True
        else:
            raise ValueError("x must be scalar, 1-D, or 2-D")

        self._dims = (int(dims[0]), int(dims[1]))

    @property
    def shape(self):
        """Logical dimensions of the full matrix."""
        return self._dims

    @property
    def nrow(self):
        return self._dims[0]

    @property
    def ncol(self):
        return self._dims[1]

    def __len__(self):
        return self._dims[0] * self._dims[1]

    def as_matrix(self):
        """Expand to a full numpy matrix."""
        nr, nc = self._dims
        if self.repeat_row and self.repeat_col:
            return np.tile(self._data, (nr, nc))[:nr, :nc]
        elif self.repeat_row:
            return np.tile(self._data, (nr, 1))[:nr, :nc]
        elif self.repeat_col:
            return np.tile(self._data, (1, nc))[:nr, :nc]
        else:
            return self._data.copy()

    def __array__(self, dtype=None):
        result = self.as_matrix()
        if dtype is not None:
            result = result.astype(dtype)
        return result

    def __getitem__(self, key):
        if isinstance(key, tuple):
            if len(key) == 2:
                i, j = key
                raw = self._data.copy()

                if not self.repeat_row and i is not None:
                    if isinstance(i, slice):
                        raw = raw[i, :]
                    else:
                        i_idx = np.arange(self._dims[0])[i]
                        i_idx = np.atleast_1d(i_idx)
                        raw = raw[i_idx, :]
                if not self.repeat_col and j is not None:
                    if isinstance(j, slice):
                        raw = raw[:, j]
                    else:
                        j_idx = np.arange(self._dims[1])[j]
                        j_idx = np.atleast_1d(j_idx)
                        raw = raw[:, j_idx]

                # Compute new dims
                nr = self._dims[0]
                if i is not None:
                    ref = np.arange(nr)
                    nr = len(ref[i]) if not np.isscalar(ref[i]) else 1
                nc = self._dims[1]
                if j is not None:
                    ref = np.arange(nc)
                    nc = len(ref[j]) if not np.isscalar(ref[j]) else 1

                result = CompressedMatrix.__new__(CompressedMatrix)
                result._data = raw
                result._dims = (nr, nc)
                result.repeat_row = self.repeat_row
                result.repeat_col = self.repeat_col

                # Drop to vector if single row or column
                i_scalar = (isinstance(i, (int, np.integer)) or
                            (hasattr(i, '__len__') and len(i) == 1))
                j_scalar = (isinstance(j, (int, np.integer)) or
                            (hasattr(j, '__len__') and len(j) == 1))
                if i_scalar or j_scalar:
                    return result.as_matrix().ravel()
                return result
            else:
                return self.as_matrix()[key]
        else:
            return self.as_matrix().ravel()[key]

    def __setitem__(self, key, value):
        full = self.as_matrix()
        if isinstance(value, CompressedMatrix):
            value = value.as_matrix()
        if isinstance(key, tuple):
            full[key] = value
        else:
            full.ravel()[key] = value
        new = CompressedMatrix(full)
        self._data = new._data
        self._dims = new._dims
        self.repeat_row = new.repeat_row
        self.repeat_col = new.repeat_col

    def _binary_op(self, other, op):
        if isinstance(other, CompressedMatrix):
            if self._dims != other._dims:
                raise ValueError("CompressedMatrix dimensions should be equal for binary operations")
            row_rep = self.repeat_row and other.repeat_row
            col_rep = self.repeat_col and other.repeat_col
            if row_rep or col_rep:
                e1 = self._data.ravel()
                e2 = other._data.ravel()
                outcome = op(e1, e2)
                return CompressedMatrix(outcome, self._dims, byrow=row_rep)
            else:
                return CompressedMatrix(op(self.as_matrix(), other.as_matrix()))
        else:
            other_arr = np.asarray(other, dtype=np.float64)
            if other_arr.ndim <= 1 and other_arr.size == 1:
                other_cm = CompressedMatrix(other_arr, self._dims, byrow=False)
            elif other_arr.ndim == 1:
                other_cm = CompressedMatrix(other_arr, self._dims, byrow=False)
            else:
                other_cm = CompressedMatrix(other_arr, self._dims, byrow=False)
            return self._binary_op(other_cm, op)

    def __add__(self, other):
        return self._binary_op(other, np.add)

    def __radd__(self, other):
        return self._binary_op(other, lambda a, b: np.add(b, a))

    def __sub__(self, other):
        return self._binary_op(other, np.subtract)

    def __rsub__(self, other):
        return self._binary_op(other, lambda a, b: np.subtract(b, a))

    def __mul__(self, other):
        return self._binary_op(other, np.multiply)

    def __rmul__(self, other):
        return self._binary_op(other, lambda a, b: np.multiply(b, a))

    def __truediv__(self, other):
        return self._binary_op(other, np.true_divide)

    def __rtruediv__(self, other):
        return self._binary_op(other, lambda a, b: np.true_divide(b, a))

    def __pow__(self, other):
        return self._binary_op(other, np.power)

    def __neg__(self):
        result = CompressedMatrix.__new__(CompressedMatrix)
        result._data = -self._data
        result._dims = self._dims
        result.repeat_row = self.repeat_row
        result.repeat_col = self.repeat_col
        return result

    def __repr__(self):
        return (f"CompressedMatrix(shape={self._dims}, "
                f"repeat_row={self.repeat_row}, repeat_col={self.repeat_col}, "
                f"stored_shape={self._data.shape})")

    @staticmethod
    def rbind(*matrices):
        """Row-bind CompressedMatrix objects."""
        if len(matrices) == 1:
            return matrices[0]
        all_nr = sum(m.nrow for m in matrices)
        col_rep = [m.repeat_col for m in matrices]
        row_rep = [m.repeat_row for m in matrices]

        if all(col_rep):
            all_nc = matrices[0].ncol
            collected = []
            for m in matrices:
                if m.ncol != all_nc:
                    raise ValueError("cannot combine CompressedMatrix objects with different number of columns")
                collected.append(np.tile(m._data.ravel(), max(1, m.nrow // max(1, m._data.shape[0])))[:m.nrow])
            return CompressedMatrix(np.concatenate(collected), dims=(all_nr, all_nc), byrow=False)

        if all(row_rep):
            ref = matrices[0]._data
            ok = all(np.allclose(m._data, ref) for m in matrices[1:])
            if ok:
                result = CompressedMatrix.__new__(CompressedMatrix)
                result._data = matrices[0]._data.copy()
                result._dims = (all_nr, matrices[0].ncol)
                result.repeat_row = True
                result.repeat_col = matrices[0].repeat_col
                return result

        expanded = [m.as_matrix() for m in matrices]
        return CompressedMatrix(np.vstack(expanded))

    @staticmethod
    def cbind(*matrices):
        """Column-bind CompressedMatrix objects."""
        if len(matrices) == 1:
            return matrices[0]
        all_nc = sum(m.ncol for m in matrices)
        col_rep = [m.repeat_col for m in matrices]
        row_rep = [m.repeat_row for m in matrices]

        if all(row_rep):
            all_nr = matrices[0].nrow
            collected = []
            for m in matrices:
                if m.nrow != all_nr:
                    raise ValueError("cannot combine CompressedMatrix objects with different number of rows")
                collected.append(np.tile(m._data.ravel(), max(1, m.ncol // max(1, m._data.shape[1])))[:m.ncol])
            return CompressedMatrix(np.concatenate(collected), dims=(all_nr, all_nc), byrow=True)

        if all(col_rep):
            ref = matrices[0]._data
            ok = all(np.allclose(m._data, ref) for m in matrices[1:])
            if ok:
                result = CompressedMatrix.__new__(CompressedMatrix)
                result._data = matrices[0]._data.copy()
                result._dims = (matrices[0].nrow, all_nc)
                result.repeat_row = matrices[0].repeat_row
                result.repeat_col = True
                return result

        expanded = [m.as_matrix() for m in matrices]
        return CompressedMatrix(np.hstack(expanded))


def compress_offsets(y, offset=None, lib_size=None):
    """Compress offsets into a CompressedMatrix.

    Port of edgeR's .compressOffsets.
    """
    if isinstance(offset, CompressedMatrix):
        return offset
    dims = y.shape if hasattr(y, 'shape') else (y._dims if isinstance(y, CompressedMatrix) else None)
    if offset is None:
        if lib_size is None:
            if isinstance(y, np.ndarray):
                lib_size = y.sum(axis=0)
            else:
                lib_size = np.asarray(y).sum(axis=0)
        offset = np.log(lib_size)
    offset = np.asarray(offset, dtype=np.float64)
    if not np.all(np.isfinite(offset)):
        raise ValueError("offsets must be finite values")
    return CompressedMatrix(offset, dims, byrow=True)


def compress_weights(y, weights=None):
    """Compress weights into a CompressedMatrix.

    Port of edgeR's .compressWeights.
    """
    if isinstance(weights, CompressedMatrix):
        return weights
    dims = y.shape if hasattr(y, 'shape') else (y._dims if isinstance(y, CompressedMatrix) else None)
    if weights is None:
        weights = 1.0
    weights = np.asarray(weights, dtype=np.float64)
    if np.any(np.isnan(weights)):
        raise ValueError("NA weights not allowed")
    if np.any(weights <= 0):
        raise ValueError("Weights must be positive")
    return CompressedMatrix(weights, dims, byrow=True)


def compress_prior(y, prior_count):
    """Compress prior counts into a CompressedMatrix.

    Port of edgeR's .compressPrior.
    """
    if isinstance(prior_count, CompressedMatrix):
        return prior_count
    dims = y.shape if hasattr(y, 'shape') else (y._dims if isinstance(y, CompressedMatrix) else None)
    prior_count = np.asarray(prior_count, dtype=np.float64)
    if np.any(np.isnan(prior_count)):
        raise ValueError("NA prior counts not allowed")
    if np.any(prior_count < 0):
        raise ValueError("Negative prior counts not allowed")
    return CompressedMatrix(prior_count, dims, byrow=False)


def compress_dispersions(y, dispersion):
    """Compress dispersions into a CompressedMatrix.

    Port of edgeR's .compressDispersions.
    """
    if isinstance(dispersion, CompressedMatrix):
        return dispersion
    dims = y.shape if hasattr(y, 'shape') else (y._dims if isinstance(y, CompressedMatrix) else None)
    dispersion = np.asarray(dispersion, dtype=np.float64)
    if np.any(np.isnan(dispersion)):
        raise ValueError("NA dispersions not allowed")
    if np.any(dispersion < 0):
        raise ValueError("Negative dispersions not allowed")
    return CompressedMatrix(dispersion, dims, byrow=False)
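For illustration only (not part of the packaged code), a minimal usage sketch of the CompressedMatrix API defined above, assuming only the behaviour described in the docstrings; the count values are made up:

import numpy as np
from edgepython.compressed_matrix import CompressedMatrix, compress_offsets

counts = np.array([[10., 0., 5.],
                   [ 3., 7., 2.]])      # 2 genes x 3 samples (toy data)

# Per-sample offsets: stored once as a 1 x 3 row but logically 2 x 3.
offsets = compress_offsets(counts)      # log column sums, byrow=True
print(offsets.shape)                    # (2, 3)
print(offsets.as_matrix())              # the same row repeated for each gene

# Per-gene values: stored as a 2 x 1 column, repeated across the 3 samples.
prior = CompressedMatrix(np.array([0.5, 1.0]), dims=(2, 3), byrow=False)
print(prior.as_matrix())

# Arithmetic keeps the compressed form when the repeat flags allow it.
shifted = offsets + 1.0
print(np.asarray(shifted))              # __array__ expands to a dense 2 x 3 array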
edgepython/dgelist.py
ADDED
@@ -0,0 +1,314 @@
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
"""
DGEList construction, validation, and accessors.

Port of edgeR's DGEList.R, validDGEList.R, getCounts.R, getDispersion.R,
getOffset.R, effectiveLibSizes.R.
"""

import numpy as np
import pandas as pd
import warnings
from .classes import DGEList


def _drop_empty_levels(x):
    """Drop unused levels from a categorical/factor variable."""
    if hasattr(x, 'cat'):
        return x.cat.remove_unused_categories()
    return pd.Categorical(x)


def make_dgelist(counts, lib_size=None, norm_factors=None, samples=None,
                 group=None, genes=None, remove_zeros=False,
                 annotation_columns=None):
    """Construct a DGEList object from components.

    Port of edgeR's DGEList().

    Parameters
    ----------
    counts : array-like or DataFrame
        Matrix of counts (genes x samples).
    lib_size : array-like, optional
        Library sizes. Defaults to column sums.
    norm_factors : array-like, optional
        Normalization factors. Defaults to all ones.
    samples : DataFrame, optional
        Sample-level information.
    group : array-like, optional
        Group memberships.
    genes : DataFrame, optional
        Gene-level annotation.
    remove_zeros : bool
        Whether to remove rows with all zero counts.
    annotation_columns : list, optional
        For DataFrame counts, which columns are annotation (not counts).

    Returns
    -------
    DGEList
    """
    # Handle DataFrame input
    if isinstance(counts, pd.DataFrame):
        if annotation_columns is not None:
            if isinstance(annotation_columns, (list, np.ndarray)):
                ann_cols = annotation_columns
            elif isinstance(annotation_columns, str):
                ann_cols = [annotation_columns]
            else:
                ann_cols = list(annotation_columns)

            if genes is None:
                genes = counts[ann_cols].copy()
            else:
                genes = pd.concat([counts[ann_cols], genes], axis=1)
            counts = counts.drop(columns=ann_cols)
        else:
            # Auto-detect non-numeric columns
            numeric_mask = counts.dtypes.apply(lambda dt: np.issubdtype(dt, np.number))
            if not numeric_mask.all():
                non_numeric = counts.columns[~numeric_mask]
                last_non_numeric = non_numeric[-1]
                last_idx = counts.columns.get_loc(last_non_numeric)
                ann_cols = counts.columns[:last_idx + 1].tolist()
                if genes is None:
                    genes = counts[ann_cols].copy()
                else:
                    genes = pd.concat([counts[ann_cols], genes], axis=1)
                counts = counts.iloc[:, last_idx + 1:]

    # Handle scipy sparse matrices
    if hasattr(counts, 'toarray') and hasattr(counts, 'nnz'):
        shape = counts.shape
        nnz = counts.nnz
        density = nnz / (shape[0] * shape[1]) if shape[0] * shape[1] > 0 else 0
        warnings.warn(
            f"Densifying sparse matrix ({shape[0]} x {shape[1]}, "
            f"{100*density:.1f}% non-zero, "
            f"{shape[0] * shape[1] * 8 / 1e6:.0f} MB dense). "
            f"edgePython stores counts as dense arrays.",
            stacklevel=2,
        )
        counts = np.asarray(counts.toarray(), dtype=np.float64)
    else:
        counts = np.asarray(counts, dtype=np.float64)
    if counts.ndim == 1:
        counts = counts.reshape(-1, 1)

    # Validate counts
    if counts.size == 0:
        raise ValueError("'counts' must contain at least one value")
    m = np.nanmin(counts)
    if np.isnan(m):
        raise ValueError("NA counts not allowed")
    if m < 0:
        raise ValueError("Negative counts not allowed")
    if not np.isfinite(np.max(counts)):
        raise ValueError("Infinite counts not allowed")

    nlib = counts.shape[1]
    ntags = counts.shape[0]

    # Column names
    col_names = [f"Sample{i+1}" for i in range(nlib)]
    row_names = [str(i+1) for i in range(ntags)]

    # Library sizes
    if lib_size is None:
        lib_size = counts.sum(axis=0)
        if np.min(lib_size) <= 0:
            warnings.warn("At least one library size is zero")
    else:
        lib_size = np.asarray(lib_size, dtype=np.float64)
        if not np.issubdtype(lib_size.dtype, np.number):
            raise ValueError("'lib_size' must be numeric")
        if len(lib_size) != nlib:
            raise ValueError("length of 'lib_size' must equal number of samples")
        if np.any(np.isnan(lib_size)):
            raise ValueError("NA library sizes not allowed")
        if np.any(lib_size < 0):
            raise ValueError("negative library sizes not allowed")
        if np.any((lib_size == 0) & (counts.sum(axis=0) > 0)):
            raise ValueError("library size set to zero but counts for that sample are nonzero")

    # Normalization factors
    if norm_factors is None:
        norm_factors = np.ones(nlib)
    else:
        norm_factors = np.asarray(norm_factors, dtype=np.float64)
        if len(norm_factors) != nlib:
            raise ValueError("Length of 'norm_factors' must equal number of columns in 'counts'")
        if np.any(np.isnan(norm_factors)):
            raise ValueError("NA norm factors not allowed")
        if np.any(norm_factors <= 0):
            raise ValueError("norm factors must be positive")
        if abs(np.sum(np.log(norm_factors))) > 1e-6:
            warnings.warn("norm factors don't multiply to 1")

    # Samples DataFrame
    if samples is not None:
        samples = pd.DataFrame(samples)
        if nlib != len(samples):
            raise ValueError("Number of rows in 'samples' must equal number of columns in 'counts'")

    # Group
    if group is None and samples is not None and 'group' in samples.columns:
        group = samples['group'].values
        samples = samples.drop(columns=['group'])

    if group is None:
        group = pd.Categorical([1] * nlib)
    else:
        if len(group) != nlib:
            raise ValueError("Length of 'group' must equal number of columns in 'counts'")
        group = _drop_empty_levels(pd.Categorical(group))

    # Build samples DataFrame
    sam = pd.DataFrame({
        'group': group,
        'lib.size': lib_size,
        'norm.factors': norm_factors
    })
    if samples is not None:
        for col in samples.columns:
            sam[col] = samples[col].values
    sam.index = col_names

    # Build DGEList
    x = DGEList()
    x['counts'] = counts
    x['samples'] = sam

    # Gene annotation
    if genes is not None:
        genes = pd.DataFrame(genes)
        if len(genes) != ntags:
            raise ValueError("Counts and genes have different numbers of rows")
        genes.index = row_names
        x['genes'] = genes

    # Remove all-zero rows
    if remove_zeros:
        all_zeros = np.sum(counts > 0, axis=1) == 0
        if np.any(all_zeros):
            keep = ~all_zeros
            x['counts'] = counts[keep]
            if 'genes' in x and x['genes'] is not None:
                x['genes'] = x['genes'].iloc[keep]
            print(f"Removing {np.sum(all_zeros)} rows with all zero counts")

    return x


def valid_dgelist(y):
    """Check and fill standard components of a DGEList.

    Port of edgeR's validDGEList.
    """
    if 'counts' not in y or y['counts'] is None:
        raise ValueError("No count matrix")
    y['counts'] = np.asarray(y['counts'], dtype=np.float64)
    nlib = y['counts'].shape[1]
    if 'samples' not in y:
        y['samples'] = pd.DataFrame()
    if 'group' not in y['samples'].columns:
        y['samples']['group'] = pd.Categorical([1] * nlib)
    if 'lib.size' not in y['samples'].columns:
        y['samples']['lib.size'] = y['counts'].sum(axis=0)
    if 'norm.factors' not in y['samples'].columns:
        y['samples']['norm.factors'] = np.ones(nlib)
    return y


def get_counts(y):
    """Extract count matrix from DGEList.

    Port of edgeR's getCounts.
    """
    return np.asarray(y['counts'])


def get_dispersion(y):
    """Get the most complex dispersion values from a DGEList.

    Port of edgeR's getDispersion.
    Returns tagwise > trended > common > None, with a 'type' attribute.
    """
    if y.get('tagwise.dispersion') is not None:
        disp = np.asarray(y['tagwise.dispersion'])
        disp_type = 'tagwise'
    elif y.get('trended.dispersion') is not None:
        disp = np.asarray(y['trended.dispersion'])
        disp_type = 'trended'
    elif y.get('common.dispersion') is not None:
        disp = np.float64(y['common.dispersion'])
        disp_type = 'common'
    else:
        return None

    # Store type as attribute (Python doesn't have R's attr, use a wrapper)
    result = disp
    # We'll just return the value; callers can check type if needed
    return result


def get_dispersion_type(y):
    """Get the type of the most complex dispersion in a DGEList."""
    if y.get('tagwise.dispersion') is not None:
        return 'tagwise'
    elif y.get('trended.dispersion') is not None:
        return 'trended'
    elif y.get('common.dispersion') is not None:
        return 'common'
    return None


def get_offset(y):
    """Extract offset vector or matrix from a DGEList.

    Port of edgeR's getOffset. Returns log(lib.size * norm.factors) by default.
    """
    if y.get('offset') is not None:
        return y['offset']

    lib_size = y['samples']['lib.size'].values
    if lib_size is None:
        raise ValueError("y is not a valid DGEList object")

    norm_factors = y['samples'].get('norm.factors')
    if norm_factors is not None:
        lib_size = lib_size * norm_factors.values

    return np.log(lib_size)


def get_norm_lib_sizes(y, log=False):
    """Get effective (normalized) library sizes.

    Port of edgeR's getNormLibSizes / effectiveLibSizes.
    """
    if isinstance(y, dict):
        if y.get('offset') is not None:
            # For DGEGLM/DGELRT objects, offset is a matrix
            offset = y['offset']
            if hasattr(offset, 'as_matrix'):
                offset = offset.as_matrix()
            if isinstance(offset, np.ndarray) and offset.ndim == 2:
                els = offset[0, :]
            else:
                els = offset
            if not log:
                els = np.exp(els)
            return els
        elif 'samples' in y:
            els = y['samples']['lib.size'].values * y['samples']['norm.factors'].values
            if log:
                els = np.log(els)
            return els
    # Default for matrices
    y = np.asarray(y)
    els = y.sum(axis=0)
    if log:
        els = np.log(els)
    return els
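As a rough usage sketch (not part of the package diff), the constructor and accessors above can be exercised as follows, assuming the DGEList container imported from edgepython.classes behaves like the dict-style object these functions index into, and using toy counts:

import numpy as np
from edgepython.dgelist import make_dgelist, get_counts, get_offset

counts = np.array([[10, 0, 5, 7],
                   [ 3, 7, 2, 1],
                   [ 0, 0, 0, 0]])      # 3 genes x 4 samples (toy data)

y = make_dgelist(counts, group=["A", "A", "B", "B"], remove_zeros=True)

print(y['samples'])         # group, lib.size (column sums) and norm.factors (all 1.0)
print(get_counts(y).shape)  # (2, 4) after the all-zero gene is dropped
print(get_offset(y))        # log(lib.size * norm.factors), since no offset is stored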