edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,388 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ CompressedMatrix class for memory-efficient matrix storage.
4
+
5
+ Port of edgeR's CompressedMatrix (makeCompressedMatrix.R).
6
+ Stores scalars, row vectors, or column vectors with flags indicating
7
+ which dimensions should be repeated when expanding to a full matrix.
8
+ """
9
+
10
+ import numpy as np
11
+
12
+
13
+ class CompressedMatrix:
14
+ """Memory-efficient matrix that stores repeated rows/columns compactly.
15
+
16
+ A CompressedMatrix stores a scalar, row vector, column vector, or full
17
+ matrix along with flags indicating which dimensions are repeated.
18
+ This avoids materializing large matrices when the same values are
19
+ repeated across rows or columns.
20
+
21
+ Parameters
22
+ ----------
23
+ x : scalar, 1-D array, or 2-D array
24
+ The data to store.
25
+ dims : tuple of (int, int), optional
26
+ The logical dimensions (nrow, ncol) of the full matrix.
27
+ byrow : bool
28
+ If True (default), a 1-D vector is treated as a row to be repeated
29
+ down rows. If False, treated as a column to be repeated across columns.
30
+ """
31
+
32
+ def __init__(self, x, dims=None, byrow=True):
33
+ x = np.asarray(x, dtype=np.float64)
34
+ self.repeat_row = False
35
+ self.repeat_col = False
36
+
37
+ if x.ndim == 2:
38
+ if dims is not None:
39
+ xr, xc = x.shape
40
+ if xr == 1 and xc == 1:
41
+ self.repeat_row = True
42
+ self.repeat_col = True
43
+ self._data = x.reshape(1, 1)
44
+ elif xr == 1 and xc >= 2:
45
+ if xc != dims[1]:
46
+ raise ValueError("dims[1] should equal length of row vector x")
47
+ if byrow:
48
+ self.repeat_row = True
49
+ self._data = x.reshape(1, xc)
50
+ else:
51
+ self._data = x.reshape(1, xc)
52
+ dims = (xr, xc)
53
+ elif xr >= 2 and xc == 1:
54
+ if xr != dims[0]:
55
+ raise ValueError("dims[0] should equal length of column vector x")
56
+ if not byrow:
57
+ self.repeat_col = True
58
+ self._data = x.reshape(xr, 1)
59
+ else:
60
+ self._data = x.reshape(xr, 1)
61
+ dims = (xr, xc)
62
+ else:
63
+ self._data = x
64
+ dims = x.shape
65
+ else:
66
+ self._data = x
67
+ dims = x.shape
68
+ elif x.ndim <= 1:
69
+ x = x.ravel()
70
+ if x.size == 1:
71
+ self.repeat_row = True
72
+ self.repeat_col = True
73
+ self._data = x.reshape(1, 1)
74
+ if dims is None:
75
+ dims = (1, 1)
76
+ else:
77
+ if dims is None:
78
+ raise ValueError("dims must be provided for vector input")
79
+ if not byrow:
80
+ if dims[0] != x.size:
81
+ raise ValueError("dims[0] should equal length of x")
82
+ self._data = x.reshape(-1, 1)
83
+ self.repeat_col = True
84
+ else:
85
+ if dims[1] != x.size:
86
+ raise ValueError("dims[1] should equal length of x")
87
+ self._data = x.reshape(1, -1)
88
+ self.repeat_row = True
89
+ else:
90
+ raise ValueError("x must be scalar, 1-D, or 2-D")
91
+
92
+ self._dims = (int(dims[0]), int(dims[1]))
93
+
94
+ @property
95
+ def shape(self):
96
+ """Logical dimensions of the full matrix."""
97
+ return self._dims
98
+
99
+ @property
100
+ def nrow(self):
101
+ return self._dims[0]
102
+
103
+ @property
104
+ def ncol(self):
105
+ return self._dims[1]
106
+
107
+ def __len__(self):
108
+ return self._dims[0] * self._dims[1]
109
+
110
+ def as_matrix(self):
111
+ """Expand to a full numpy matrix."""
112
+ nr, nc = self._dims
113
+ if self.repeat_row and self.repeat_col:
114
+ return np.tile(self._data, (nr, nc))[:nr, :nc]
115
+ elif self.repeat_row:
116
+ return np.tile(self._data, (nr, 1))[:nr, :nc]
117
+ elif self.repeat_col:
118
+ return np.tile(self._data, (1, nc))[:nr, :nc]
119
+ else:
120
+ return self._data.copy()
121
+
122
+ def __array__(self, dtype=None):
123
+ result = self.as_matrix()
124
+ if dtype is not None:
125
+ result = result.astype(dtype)
126
+ return result
127
+
128
+ def __getitem__(self, key):
129
+ if isinstance(key, tuple):
130
+ if len(key) == 2:
131
+ i, j = key
132
+ raw = self._data.copy()
133
+
134
+ if not self.repeat_row and i is not None:
135
+ if isinstance(i, slice):
136
+ raw = raw[i, :]
137
+ else:
138
+ i_idx = np.arange(self._dims[0])[i]
139
+ i_idx = np.atleast_1d(i_idx)
140
+ raw = raw[i_idx, :]
141
+ if not self.repeat_col and j is not None:
142
+ if isinstance(j, slice):
143
+ raw = raw[:, j]
144
+ else:
145
+ j_idx = np.arange(self._dims[1])[j]
146
+ j_idx = np.atleast_1d(j_idx)
147
+ raw = raw[:, j_idx]
148
+
149
+ # Compute new dims
150
+ nr = self._dims[0]
151
+ if i is not None:
152
+ ref = np.arange(nr)
153
+ nr = len(ref[i]) if not np.isscalar(ref[i]) else 1
154
+ nc = self._dims[1]
155
+ if j is not None:
156
+ ref = np.arange(nc)
157
+ nc = len(ref[j]) if not np.isscalar(ref[j]) else 1
158
+
159
+ result = CompressedMatrix.__new__(CompressedMatrix)
160
+ result._data = raw
161
+ result._dims = (nr, nc)
162
+ result.repeat_row = self.repeat_row
163
+ result.repeat_col = self.repeat_col
164
+
165
+ # Drop to vector if single row or column
166
+ i_scalar = (isinstance(i, (int, np.integer)) or
167
+ (hasattr(i, '__len__') and len(i) == 1))
168
+ j_scalar = (isinstance(j, (int, np.integer)) or
169
+ (hasattr(j, '__len__') and len(j) == 1))
170
+ if i_scalar or j_scalar:
171
+ return result.as_matrix().ravel()
172
+ return result
173
+ else:
174
+ return self.as_matrix()[key]
175
+ else:
176
+ return self.as_matrix().ravel()[key]
177
+
178
+ def __setitem__(self, key, value):
179
+ full = self.as_matrix()
180
+ if isinstance(value, CompressedMatrix):
181
+ value = value.as_matrix()
182
+ if isinstance(key, tuple):
183
+ full[key] = value
184
+ else:
185
+ full.ravel()[key] = value
186
+ new = CompressedMatrix(full)
187
+ self._data = new._data
188
+ self._dims = new._dims
189
+ self.repeat_row = new.repeat_row
190
+ self.repeat_col = new.repeat_col
191
+
192
+ def _binary_op(self, other, op):
193
+ if isinstance(other, CompressedMatrix):
194
+ if self._dims != other._dims:
195
+ raise ValueError("CompressedMatrix dimensions should be equal for binary operations")
196
+ row_rep = self.repeat_row and other.repeat_row
197
+ col_rep = self.repeat_col and other.repeat_col
198
+ if row_rep or col_rep:
199
+ e1 = self._data.ravel()
200
+ e2 = other._data.ravel()
201
+ outcome = op(e1, e2)
202
+ return CompressedMatrix(outcome, self._dims, byrow=row_rep)
203
+ else:
204
+ return CompressedMatrix(op(self.as_matrix(), other.as_matrix()))
205
+ else:
206
+ other_arr = np.asarray(other, dtype=np.float64)
207
+ if other_arr.ndim <= 1 and other_arr.size == 1:
208
+ other_cm = CompressedMatrix(other_arr, self._dims, byrow=False)
209
+ elif other_arr.ndim == 1:
210
+ other_cm = CompressedMatrix(other_arr, self._dims, byrow=False)
211
+ else:
212
+ other_cm = CompressedMatrix(other_arr, self._dims, byrow=False)
213
+ return self._binary_op(other_cm, op)
214
+
215
+ def __add__(self, other):
216
+ return self._binary_op(other, np.add)
217
+
218
+ def __radd__(self, other):
219
+ return self._binary_op(other, lambda a, b: np.add(b, a))
220
+
221
+ def __sub__(self, other):
222
+ return self._binary_op(other, np.subtract)
223
+
224
+ def __rsub__(self, other):
225
+ return self._binary_op(other, lambda a, b: np.subtract(b, a))
226
+
227
+ def __mul__(self, other):
228
+ return self._binary_op(other, np.multiply)
229
+
230
+ def __rmul__(self, other):
231
+ return self._binary_op(other, lambda a, b: np.multiply(b, a))
232
+
233
+ def __truediv__(self, other):
234
+ return self._binary_op(other, np.true_divide)
235
+
236
+ def __rtruediv__(self, other):
237
+ return self._binary_op(other, lambda a, b: np.true_divide(b, a))
238
+
239
+ def __pow__(self, other):
240
+ return self._binary_op(other, np.power)
241
+
242
+ def __neg__(self):
243
+ result = CompressedMatrix.__new__(CompressedMatrix)
244
+ result._data = -self._data
245
+ result._dims = self._dims
246
+ result.repeat_row = self.repeat_row
247
+ result.repeat_col = self.repeat_col
248
+ return result
249
+
250
+ def __repr__(self):
251
+ return (f"CompressedMatrix(shape={self._dims}, "
252
+ f"repeat_row={self.repeat_row}, repeat_col={self.repeat_col}, "
253
+ f"stored_shape={self._data.shape})")
254
+
255
+ @staticmethod
256
+ def rbind(*matrices):
257
+ """Row-bind CompressedMatrix objects."""
258
+ if len(matrices) == 1:
259
+ return matrices[0]
260
+ all_nr = sum(m.nrow for m in matrices)
261
+ col_rep = [m.repeat_col for m in matrices]
262
+ row_rep = [m.repeat_row for m in matrices]
263
+
264
+ if all(col_rep):
265
+ all_nc = matrices[0].ncol
266
+ collected = []
267
+ for m in matrices:
268
+ if m.ncol != all_nc:
269
+ raise ValueError("cannot combine CompressedMatrix objects with different number of columns")
270
+ collected.append(np.tile(m._data.ravel(), max(1, m.nrow // max(1, m._data.shape[0])))[:m.nrow])
271
+ return CompressedMatrix(np.concatenate(collected), dims=(all_nr, all_nc), byrow=False)
272
+
273
+ if all(row_rep):
274
+ ref = matrices[0]._data
275
+ ok = all(np.allclose(m._data, ref) for m in matrices[1:])
276
+ if ok:
277
+ result = CompressedMatrix.__new__(CompressedMatrix)
278
+ result._data = matrices[0]._data.copy()
279
+ result._dims = (all_nr, matrices[0].ncol)
280
+ result.repeat_row = True
281
+ result.repeat_col = matrices[0].repeat_col
282
+ return result
283
+
284
+ expanded = [m.as_matrix() for m in matrices]
285
+ return CompressedMatrix(np.vstack(expanded))
286
+
287
+ @staticmethod
288
+ def cbind(*matrices):
289
+ """Column-bind CompressedMatrix objects."""
290
+ if len(matrices) == 1:
291
+ return matrices[0]
292
+ all_nc = sum(m.ncol for m in matrices)
293
+ col_rep = [m.repeat_col for m in matrices]
294
+ row_rep = [m.repeat_row for m in matrices]
295
+
296
+ if all(row_rep):
297
+ all_nr = matrices[0].nrow
298
+ collected = []
299
+ for m in matrices:
300
+ if m.nrow != all_nr:
301
+ raise ValueError("cannot combine CompressedMatrix objects with different number of rows")
302
+ collected.append(np.tile(m._data.ravel(), max(1, m.ncol // max(1, m._data.shape[1])))[:m.ncol])
303
+ return CompressedMatrix(np.concatenate(collected), dims=(all_nr, all_nc), byrow=True)
304
+
305
+ if all(col_rep):
306
+ ref = matrices[0]._data
307
+ ok = all(np.allclose(m._data, ref) for m in matrices[1:])
308
+ if ok:
309
+ result = CompressedMatrix.__new__(CompressedMatrix)
310
+ result._data = matrices[0]._data.copy()
311
+ result._dims = (matrices[0].nrow, all_nc)
312
+ result.repeat_row = matrices[0].repeat_row
313
+ result.repeat_col = True
314
+ return result
315
+
316
+ expanded = [m.as_matrix() for m in matrices]
317
+ return CompressedMatrix(np.hstack(expanded))
318
+
319
+
320
+ def compress_offsets(y, offset=None, lib_size=None):
321
+ """Compress offsets into a CompressedMatrix.
322
+
323
+ Port of edgeR's .compressOffsets.
324
+ """
325
+ if isinstance(offset, CompressedMatrix):
326
+ return offset
327
+ dims = y.shape if hasattr(y, 'shape') else (y._dims if isinstance(y, CompressedMatrix) else None)
328
+ if offset is None:
329
+ if lib_size is None:
330
+ if isinstance(y, np.ndarray):
331
+ lib_size = y.sum(axis=0)
332
+ else:
333
+ lib_size = np.asarray(y).sum(axis=0)
334
+ offset = np.log(lib_size)
335
+ offset = np.asarray(offset, dtype=np.float64)
336
+ if not np.all(np.isfinite(offset)):
337
+ raise ValueError("offsets must be finite values")
338
+ return CompressedMatrix(offset, dims, byrow=True)
339
+
340
+
341
+ def compress_weights(y, weights=None):
342
+ """Compress weights into a CompressedMatrix.
343
+
344
+ Port of edgeR's .compressWeights.
345
+ """
346
+ if isinstance(weights, CompressedMatrix):
347
+ return weights
348
+ dims = y.shape if hasattr(y, 'shape') else (y._dims if isinstance(y, CompressedMatrix) else None)
349
+ if weights is None:
350
+ weights = 1.0
351
+ weights = np.asarray(weights, dtype=np.float64)
352
+ if np.any(np.isnan(weights)):
353
+ raise ValueError("NA weights not allowed")
354
+ if np.any(weights <= 0):
355
+ raise ValueError("Weights must be positive")
356
+ return CompressedMatrix(weights, dims, byrow=True)
357
+
358
+
359
+ def compress_prior(y, prior_count):
360
+ """Compress prior counts into a CompressedMatrix.
361
+
362
+ Port of edgeR's .compressPrior.
363
+ """
364
+ if isinstance(prior_count, CompressedMatrix):
365
+ return prior_count
366
+ dims = y.shape if hasattr(y, 'shape') else (y._dims if isinstance(y, CompressedMatrix) else None)
367
+ prior_count = np.asarray(prior_count, dtype=np.float64)
368
+ if np.any(np.isnan(prior_count)):
369
+ raise ValueError("NA prior counts not allowed")
370
+ if np.any(prior_count < 0):
371
+ raise ValueError("Negative prior counts not allowed")
372
+ return CompressedMatrix(prior_count, dims, byrow=False)
373
+
374
+
375
+ def compress_dispersions(y, dispersion):
376
+ """Compress dispersions into a CompressedMatrix.
377
+
378
+ Port of edgeR's .compressDispersions.
379
+ """
380
+ if isinstance(dispersion, CompressedMatrix):
381
+ return dispersion
382
+ dims = y.shape if hasattr(y, 'shape') else (y._dims if isinstance(y, CompressedMatrix) else None)
383
+ dispersion = np.asarray(dispersion, dtype=np.float64)
384
+ if np.any(np.isnan(dispersion)):
385
+ raise ValueError("NA dispersions not allowed")
386
+ if np.any(dispersion < 0):
387
+ raise ValueError("Negative dispersions not allowed")
388
+ return CompressedMatrix(dispersion, dims, byrow=False)
edgepython/dgelist.py ADDED
@@ -0,0 +1,314 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ DGEList construction, validation, and accessors.
4
+
5
+ Port of edgeR's DGEList.R, validDGEList.R, getCounts.R, getDispersion.R,
6
+ getOffset.R, effectiveLibSizes.R.
7
+ """
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ import warnings
12
+ from .classes import DGEList
13
+
14
+
15
+ def _drop_empty_levels(x):
16
+ """Drop unused levels from a categorical/factor variable."""
17
+ if hasattr(x, 'cat'):
18
+ return x.cat.remove_unused_categories()
19
+ return pd.Categorical(x)
20
+
21
+
22
+ def make_dgelist(counts, lib_size=None, norm_factors=None, samples=None,
23
+ group=None, genes=None, remove_zeros=False,
24
+ annotation_columns=None):
25
+ """Construct a DGEList object from components.
26
+
27
+ Port of edgeR's DGEList().
28
+
29
+ Parameters
30
+ ----------
31
+ counts : array-like or DataFrame
32
+ Matrix of counts (genes x samples).
33
+ lib_size : array-like, optional
34
+ Library sizes. Defaults to column sums.
35
+ norm_factors : array-like, optional
36
+ Normalization factors. Defaults to all ones.
37
+ samples : DataFrame, optional
38
+ Sample-level information.
39
+ group : array-like, optional
40
+ Group memberships.
41
+ genes : DataFrame, optional
42
+ Gene-level annotation.
43
+ remove_zeros : bool
44
+ Whether to remove rows with all zero counts.
45
+ annotation_columns : list, optional
46
+ For DataFrame counts, which columns are annotation (not counts).
47
+
48
+ Returns
49
+ -------
50
+ DGEList
51
+ """
52
+ # Handle DataFrame input
53
+ if isinstance(counts, pd.DataFrame):
54
+ if annotation_columns is not None:
55
+ if isinstance(annotation_columns, (list, np.ndarray)):
56
+ ann_cols = annotation_columns
57
+ elif isinstance(annotation_columns, str):
58
+ ann_cols = [annotation_columns]
59
+ else:
60
+ ann_cols = list(annotation_columns)
61
+
62
+ if genes is None:
63
+ genes = counts[ann_cols].copy()
64
+ else:
65
+ genes = pd.concat([counts[ann_cols], genes], axis=1)
66
+ counts = counts.drop(columns=ann_cols)
67
+ else:
68
+ # Auto-detect non-numeric columns
69
+ numeric_mask = counts.dtypes.apply(lambda dt: np.issubdtype(dt, np.number))
70
+ if not numeric_mask.all():
71
+ non_numeric = counts.columns[~numeric_mask]
72
+ last_non_numeric = non_numeric[-1]
73
+ last_idx = counts.columns.get_loc(last_non_numeric)
74
+ ann_cols = counts.columns[:last_idx + 1].tolist()
75
+ if genes is None:
76
+ genes = counts[ann_cols].copy()
77
+ else:
78
+ genes = pd.concat([counts[ann_cols], genes], axis=1)
79
+ counts = counts.iloc[:, last_idx + 1:]
80
+
81
+ # Handle scipy sparse matrices
82
+ if hasattr(counts, 'toarray') and hasattr(counts, 'nnz'):
83
+ shape = counts.shape
84
+ nnz = counts.nnz
85
+ density = nnz / (shape[0] * shape[1]) if shape[0] * shape[1] > 0 else 0
86
+ warnings.warn(
87
+ f"Densifying sparse matrix ({shape[0]} x {shape[1]}, "
88
+ f"{100*density:.1f}% non-zero, "
89
+ f"{shape[0] * shape[1] * 8 / 1e6:.0f} MB dense). "
90
+ f"edgePython stores counts as dense arrays.",
91
+ stacklevel=2,
92
+ )
93
+ counts = np.asarray(counts.toarray(), dtype=np.float64)
94
+ else:
95
+ counts = np.asarray(counts, dtype=np.float64)
96
+ if counts.ndim == 1:
97
+ counts = counts.reshape(-1, 1)
98
+
99
+ # Validate counts
100
+ if counts.size == 0:
101
+ raise ValueError("'counts' must contain at least one value")
102
+ m = np.nanmin(counts)
103
+ if np.isnan(m):
104
+ raise ValueError("NA counts not allowed")
105
+ if m < 0:
106
+ raise ValueError("Negative counts not allowed")
107
+ if not np.isfinite(np.max(counts)):
108
+ raise ValueError("Infinite counts not allowed")
109
+
110
+ nlib = counts.shape[1]
111
+ ntags = counts.shape[0]
112
+
113
+ # Column names
114
+ col_names = [f"Sample{i+1}" for i in range(nlib)]
115
+ row_names = [str(i+1) for i in range(ntags)]
116
+
117
+ # Library sizes
118
+ if lib_size is None:
119
+ lib_size = counts.sum(axis=0)
120
+ if np.min(lib_size) <= 0:
121
+ warnings.warn("At least one library size is zero")
122
+ else:
123
+ lib_size = np.asarray(lib_size, dtype=np.float64)
124
+ if not np.issubdtype(lib_size.dtype, np.number):
125
+ raise ValueError("'lib_size' must be numeric")
126
+ if len(lib_size) != nlib:
127
+ raise ValueError("length of 'lib_size' must equal number of samples")
128
+ if np.any(np.isnan(lib_size)):
129
+ raise ValueError("NA library sizes not allowed")
130
+ if np.any(lib_size < 0):
131
+ raise ValueError("negative library sizes not allowed")
132
+ if np.any((lib_size == 0) & (counts.sum(axis=0) > 0)):
133
+ raise ValueError("library size set to zero but counts for that sample are nonzero")
134
+
135
+ # Normalization factors
136
+ if norm_factors is None:
137
+ norm_factors = np.ones(nlib)
138
+ else:
139
+ norm_factors = np.asarray(norm_factors, dtype=np.float64)
140
+ if len(norm_factors) != nlib:
141
+ raise ValueError("Length of 'norm_factors' must equal number of columns in 'counts'")
142
+ if np.any(np.isnan(norm_factors)):
143
+ raise ValueError("NA norm factors not allowed")
144
+ if np.any(norm_factors <= 0):
145
+ raise ValueError("norm factors must be positive")
146
+ if abs(np.sum(np.log(norm_factors))) > 1e-6:
147
+ warnings.warn("norm factors don't multiply to 1")
148
+
149
+ # Samples DataFrame
150
+ if samples is not None:
151
+ samples = pd.DataFrame(samples)
152
+ if nlib != len(samples):
153
+ raise ValueError("Number of rows in 'samples' must equal number of columns in 'counts'")
154
+
155
+ # Group
156
+ if group is None and samples is not None and 'group' in samples.columns:
157
+ group = samples['group'].values
158
+ samples = samples.drop(columns=['group'])
159
+
160
+ if group is None:
161
+ group = pd.Categorical([1] * nlib)
162
+ else:
163
+ if len(group) != nlib:
164
+ raise ValueError("Length of 'group' must equal number of columns in 'counts'")
165
+ group = _drop_empty_levels(pd.Categorical(group))
166
+
167
+ # Build samples DataFrame
168
+ sam = pd.DataFrame({
169
+ 'group': group,
170
+ 'lib.size': lib_size,
171
+ 'norm.factors': norm_factors
172
+ })
173
+ if samples is not None:
174
+ for col in samples.columns:
175
+ sam[col] = samples[col].values
176
+ sam.index = col_names
177
+
178
+ # Build DGEList
179
+ x = DGEList()
180
+ x['counts'] = counts
181
+ x['samples'] = sam
182
+
183
+ # Gene annotation
184
+ if genes is not None:
185
+ genes = pd.DataFrame(genes)
186
+ if len(genes) != ntags:
187
+ raise ValueError("Counts and genes have different numbers of rows")
188
+ genes.index = row_names
189
+ x['genes'] = genes
190
+
191
+ # Remove all-zero rows
192
+ if remove_zeros:
193
+ all_zeros = np.sum(counts > 0, axis=1) == 0
194
+ if np.any(all_zeros):
195
+ keep = ~all_zeros
196
+ x['counts'] = counts[keep]
197
+ if 'genes' in x and x['genes'] is not None:
198
+ x['genes'] = x['genes'].iloc[keep]
199
+ print(f"Removing {np.sum(all_zeros)} rows with all zero counts")
200
+
201
+ return x
202
+
203
+
204
+ def valid_dgelist(y):
205
+ """Check and fill standard components of a DGEList.
206
+
207
+ Port of edgeR's validDGEList.
208
+ """
209
+ if 'counts' not in y or y['counts'] is None:
210
+ raise ValueError("No count matrix")
211
+ y['counts'] = np.asarray(y['counts'], dtype=np.float64)
212
+ nlib = y['counts'].shape[1]
213
+ if 'samples' not in y:
214
+ y['samples'] = pd.DataFrame()
215
+ if 'group' not in y['samples'].columns:
216
+ y['samples']['group'] = pd.Categorical([1] * nlib)
217
+ if 'lib.size' not in y['samples'].columns:
218
+ y['samples']['lib.size'] = y['counts'].sum(axis=0)
219
+ if 'norm.factors' not in y['samples'].columns:
220
+ y['samples']['norm.factors'] = np.ones(nlib)
221
+ return y
222
+
223
+
224
+ def get_counts(y):
225
+ """Extract count matrix from DGEList.
226
+
227
+ Port of edgeR's getCounts.
228
+ """
229
+ return np.asarray(y['counts'])
230
+
231
+
232
+ def get_dispersion(y):
233
+ """Get the most complex dispersion values from a DGEList.
234
+
235
+ Port of edgeR's getDispersion.
236
+ Returns tagwise > trended > common > None, with a 'type' attribute.
237
+ """
238
+ if y.get('tagwise.dispersion') is not None:
239
+ disp = np.asarray(y['tagwise.dispersion'])
240
+ disp_type = 'tagwise'
241
+ elif y.get('trended.dispersion') is not None:
242
+ disp = np.asarray(y['trended.dispersion'])
243
+ disp_type = 'trended'
244
+ elif y.get('common.dispersion') is not None:
245
+ disp = np.float64(y['common.dispersion'])
246
+ disp_type = 'common'
247
+ else:
248
+ return None
249
+
250
+ # Store type as attribute (Python doesn't have R's attr, use a wrapper)
251
+ result = disp
252
+ # We'll just return the value; callers can check type if needed
253
+ return result
254
+
255
+
256
+ def get_dispersion_type(y):
257
+ """Get the type of the most complex dispersion in a DGEList."""
258
+ if y.get('tagwise.dispersion') is not None:
259
+ return 'tagwise'
260
+ elif y.get('trended.dispersion') is not None:
261
+ return 'trended'
262
+ elif y.get('common.dispersion') is not None:
263
+ return 'common'
264
+ return None
265
+
266
+
267
+ def get_offset(y):
268
+ """Extract offset vector or matrix from a DGEList.
269
+
270
+ Port of edgeR's getOffset. Returns log(lib.size * norm.factors) by default.
271
+ """
272
+ if y.get('offset') is not None:
273
+ return y['offset']
274
+
275
+ lib_size = y['samples']['lib.size'].values
276
+ if lib_size is None:
277
+ raise ValueError("y is not a valid DGEList object")
278
+
279
+ norm_factors = y['samples'].get('norm.factors')
280
+ if norm_factors is not None:
281
+ lib_size = lib_size * norm_factors.values
282
+
283
+ return np.log(lib_size)
284
+
285
+
286
+ def get_norm_lib_sizes(y, log=False):
287
+ """Get effective (normalized) library sizes.
288
+
289
+ Port of edgeR's getNormLibSizes / effectiveLibSizes.
290
+ """
291
+ if isinstance(y, dict):
292
+ if y.get('offset') is not None:
293
+ # For DGEGLM/DGELRT objects, offset is a matrix
294
+ offset = y['offset']
295
+ if hasattr(offset, 'as_matrix'):
296
+ offset = offset.as_matrix()
297
+ if isinstance(offset, np.ndarray) and offset.ndim == 2:
298
+ els = offset[0, :]
299
+ else:
300
+ els = offset
301
+ if not log:
302
+ els = np.exp(els)
303
+ return els
304
+ elif 'samples' in y:
305
+ els = y['samples']['lib.size'].values * y['samples']['norm.factors'].values
306
+ if log:
307
+ els = np.log(els)
308
+ return els
309
+ # Default for matrices
310
+ y = np.asarray(y)
311
+ els = y.sum(axis=0)
312
+ if log:
313
+ els = np.log(els)
314
+ return els