edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,323 @@
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
+ """
+ Expression value computation for edgePython.
+
+ Port of edgeR's cpm, rpkm, tpm, aveLogCPM, cpmByGroup, rpkmByGroup.
+ """
+
+ import numpy as np
+ import warnings
+ from .utils import expand_as_matrix, add_prior_count
+
+
+ def cpm(y, lib_size=None, offset=None, log=False, prior_count=2,
+         normalized_lib_sizes=True):
+     """Counts per million.
+
+     Port of edgeR's cpm().
+
+     Parameters
+     ----------
+     y : array-like or DGEList
+         Count matrix or DGEList.
+     lib_size : array-like, optional
+         Library sizes.
+     offset : array-like, optional
+         Log-scale offsets.
+     log : bool
+         Return log2-CPM?
+     prior_count : float
+         Prior count for log transformation.
+     normalized_lib_sizes : bool
+         Use normalized library sizes (for DGEList input).
+
+     Returns
+     -------
+     ndarray of CPM values.
+     """
+     # DGEList input
+     if isinstance(y, dict) and 'counts' in y:
+         dge = y
+         ls = dge['samples']['lib.size'].values
+
+         if dge.get('offset') is not None:
+             ls = None
+             offset = dge['offset']
+         elif normalized_lib_sizes:
+             ls = ls * dge['samples']['norm.factors'].values
+
+         return _cpm_default(dge['counts'], lib_size=ls, offset=offset,
+                             log=log, prior_count=prior_count)
+
+     return _cpm_default(y, lib_size=lib_size, offset=offset,
+                         log=log, prior_count=prior_count)
+
+
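For orientation, a minimal usage sketch of cpm() on a plain count matrix (counts below are made up; the import path is assumed from the package layout and may differ):

import numpy as np
from edgepython.expression import cpm   # assumed import path

counts = np.array([[10., 20.],
                   [ 0.,  5.],
                   [90., 75.]])
cpm_vals = cpm(counts)            # library sizes default to column sums
log_cpm = cpm(counts, log=True)   # log2-CPM with the default prior_count=2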
+ def _cpm_default(y, lib_size=None, offset=None, log=False, prior_count=2):
+     """Core CPM calculation."""
+     y = np.asarray(y, dtype=np.float64)
+     ymin = np.nanmin(y)
+     if np.isnan(ymin):
+         raise ValueError("NA counts not allowed")
+     if ymin < 0:
+         raise ValueError("Negative counts not allowed")
+
+     if y.ndim == 1:
+         y = y.reshape(-1, 1)
+     if y.size == 0:
+         return y.copy()
+
+     if offset is not None:
+         offset = np.asarray(offset, dtype=np.float64)
+         if offset.ndim == 2:
+             if offset.shape != y.shape:
+                 raise ValueError("dimensions not consistent between counts and offset")
+         else:
+             if len(offset) != y.shape[1]:
+                 raise ValueError("Length of offset differs from number of libraries")
+         # Offsets are log library sizes (1-D per sample, or 2-D per observation)
+         lib_size = np.exp(offset)
+     else:
+         if lib_size is None:
+             lib_size = y.sum(axis=0)
+
+     lib_size = np.asarray(lib_size, dtype=np.float64)
+     if lib_size.ndim == 1:
+         if np.any(lib_size <= 0):
+             raise ValueError("library sizes should be greater than zero")
+
+     if log:
+         out = add_prior_count(y, lib_size=lib_size if lib_size.ndim == 1 else None,
+                               offset=np.log(lib_size) if lib_size.ndim == 1 else offset,
+                               prior_count=prior_count)
+         y_aug = out['y']
+         offset_aug = out['offset']
+
+         # Augmented offsets are log effective library sizes
+         lib_size_aug = np.exp(offset_aug)
+
+         if lib_size_aug.ndim == 1:
+             result = np.log2(y_aug / lib_size_aug[np.newaxis, :] * 1e6)
+         else:
+             result = np.log2(y_aug / lib_size_aug * 1e6)
+         return result
+     else:
+         if lib_size.ndim == 1:
+             return y / lib_size[np.newaxis, :] * 1e6
+         else:
+             return y / lib_size * 1e6
+
+
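As a quick illustration of the offset handling above: a 1-D offset is treated as log library sizes, so on the unlogged path passing offset=np.log(lib) should match passing lib directly (counts made up, import path assumed):

import numpy as np
from edgepython.expression import cpm   # assumed import path

counts = np.array([[10., 20.],
                   [ 0.,  5.],
                   [90., 75.]])
lib = counts.sum(axis=0)
a = cpm(counts, lib_size=lib)
b = cpm(counts, offset=np.log(lib))
assert np.allclose(a, b)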
+ def rpkm(y, gene_length, lib_size=None, offset=None, log=False, prior_count=2,
+          normalized_lib_sizes=True):
+     """Reads per kilobase per million.
+
+     Port of edgeR's rpkm().
+
+     Parameters
+     ----------
+     y : array-like or DGEList
+         Count matrix or DGEList.
+     gene_length : array-like or str
+         Gene lengths in bp.
+     lib_size, offset, log, prior_count, normalized_lib_sizes :
+         As for cpm().
+     """
+     # Extract gene_length from the DGEList if given as a column name,
+     # or auto-detect a Length/length column in the gene annotation
+     if isinstance(y, dict) and 'counts' in y:
+         if isinstance(gene_length, str):
+             gene_length = y['genes'][gene_length].values
+         elif gene_length is None:
+             for col in ['Length', 'length']:
+                 if y.get('genes') is not None and col in y['genes'].columns:
+                     gene_length = y['genes'][col].values
+                     break
+             if gene_length is None:
+                 raise ValueError("Gene lengths not found")
+
+     gene_length = np.asarray(gene_length, dtype=np.float64)
+     gene_length_kb = gene_length / 1000
+
+     result = cpm(y, lib_size=lib_size, offset=offset, log=log,
+                  prior_count=prior_count, normalized_lib_sizes=normalized_lib_sizes)
+
+     if log:
+         return result - np.log2(gene_length_kb[:, np.newaxis])
+     else:
+         return result / gene_length_kb[:, np.newaxis]
+
+
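In the unlogged case rpkm() is simply cpm() divided by gene length in kilobases; a small sketch (lengths made up, import path assumed):

import numpy as np
from edgepython.expression import cpm, rpkm   # assumed import path

counts = np.array([[10., 20.],
                   [ 0.,  5.],
                   [90., 75.]])
lengths_bp = np.array([1000., 2500., 500.])
rpkm_vals = rpkm(counts, gene_length=lengths_bp)
# equivalently: cpm(counts) / (lengths_bp[:, np.newaxis] / 1000)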
+ def tpm(y, effective_tx_length, rta_overdispersion=None, shrunk=False):
+     """Transcripts per million from a fitted model.
+
+     Port of edgeR's tpm().
+     """
+     t = cpm(y, log=False)
+     A = np.asarray(effective_tx_length, dtype=np.float64)
+     if rta_overdispersion is not None:
+         A = A / np.asarray(rta_overdispersion)
+     # Scale CPM by effective transcript length (per gene, or per observation if 2-D)
+     if A.ndim == 1:
+         t = t / A[:, np.newaxis]
+     else:
+         t = t / A
+     # Rescale by a common factor so the geometric mean of the column sums equals 1e6
+     col_sums = t.sum(axis=0)
+     avg_col_sum = np.exp(np.mean(np.log(col_sums)))
+     return t / avg_col_sum * 1e6
+
+
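A minimal call sketch for tpm() (effective lengths made up, import path assumed; note the columns are rescaled jointly, so individual column sums are not forced to exactly 1e6):

import numpy as np
from edgepython.expression import tpm   # assumed import path

counts = np.array([[10., 20.],
                   [ 0.,  5.],
                   [90., 75.]])
eff_len = np.array([1500., 800., 2200.])
tpm_vals = tpm(counts, effective_tx_length=eff_len)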
+ def ave_log_cpm(y, lib_size=None, offset=None, prior_count=2, dispersion=None,
+                 weights=None, normalized_lib_sizes=True):
+     """Average log2-CPM for each gene.
+
+     Port of edgeR's aveLogCPM().
+     """
+     # DGEList input
+     if isinstance(y, dict) and 'counts' in y:
+         dge = y
+         # Use stored library sizes if present, otherwise column sums of the counts
+         ls = dge['samples'].get('lib.size')
+         ls = ls.values if ls is not None else dge['counts'].sum(axis=0)
+         if normalized_lib_sizes:
+             nf = dge['samples'].get('norm.factors')
+             if nf is not None:
+                 ls = ls * nf.values
+         if dispersion is None:
+             dispersion = dge.get('common.dispersion')
+         w = dge.get('weights')
+         return _ave_log_cpm_default(dge['counts'], lib_size=ls, offset=offset,
+                                     prior_count=prior_count, dispersion=dispersion,
+                                     weights=w)
+
+     return _ave_log_cpm_default(y, lib_size=lib_size, offset=offset,
+                                 prior_count=prior_count, dispersion=dispersion,
+                                 weights=weights)
+
+
+ def _ave_log_cpm_default(y, lib_size=None, offset=None, prior_count=2,
+                          dispersion=None, weights=None):
+     """Core aveLogCPM calculation.
+
+     Uses mglmOneGroup to fit an intercept-only NB GLM, then converts the
+     fitted coefficient to log2-CPM. This matches R edgeR's aveLogCPM.default,
+     which calls C++ code that does the same thing internally.
+     """
+     from .glm_fit import mglm_one_group
+
+     y = np.asarray(y, dtype=np.float64)
+     if y.ndim == 1:
+         y = y.reshape(-1, 1)
+     if y.shape[0] == 0:
+         return np.array([], dtype=np.float64)
+
+     if dispersion is None:
+         dispersion = 0.05
+     dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
+     if np.all(np.isnan(dispersion)):
+         dispersion = np.array([0.05])
+     dispersion = np.where(np.isnan(dispersion), np.nanmean(dispersion), dispersion)
+
+     if offset is None:
+         if lib_size is None:
+             lib_size = y.sum(axis=0)
+         offset = np.log(lib_size)
+     offset = np.atleast_1d(np.asarray(offset, dtype=np.float64))
+
+     # Add prior counts and adjust offset
+     out = add_prior_count(y, offset=offset, prior_count=prior_count)
+     y_aug = out['y']
+     offset_aug = out['offset']
+
+     # Fit intercept-only NB GLM using mglmOneGroup
+     ab = mglm_one_group(y_aug, dispersion=dispersion, offset=offset_aug,
+                         weights=weights)
+
+     # Convert fitted coefficient (natural-log scale) to log2-CPM: (ab + log(1e6)) / log(2)
+     result = (ab + np.log(1e6)) / np.log(2)
+
+     return result
+
+
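The final conversion above is just a change of base plus the per-million scaling; for a single fitted natural-log rate:

import numpy as np

ab = -10.0                                  # hypothetical fitted log-rate from mglm_one_group
log2_cpm = (ab + np.log(1e6)) / np.log(2)   # ~5.51 log2-CPM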
+ def cpm_by_group(y, group=None, dispersion=0.05, offset=None, weights=None,
+                  log=False, prior_count=2):
+     """Counts per million averaged by group.
+
+     Port of edgeR's cpmByGroup().
+     """
+     # DGEList input
+     if isinstance(y, dict) and 'counts' in y:
+         dge = y
+         if group is None:
+             group = dge['samples']['group'].values
+         if offset is None:
+             from .dgelist import get_offset
+             offset = get_offset(dge)
+         w = dge.get('weights')
+         return _cpm_by_group_default(dge['counts'], group=group, dispersion=dispersion,
+                                      offset=offset, weights=w, log=log,
+                                      prior_count=prior_count)
+
+     return _cpm_by_group_default(y, group=group, dispersion=dispersion,
+                                  offset=offset, weights=weights, log=log,
+                                  prior_count=prior_count)
+
+
+ def _cpm_by_group_default(y, group=None, dispersion=0.05, offset=None,
+                           weights=None, log=False, prior_count=2):
+     """Core cpmByGroup calculation.
+
+     Uses mglmOneWay to fit NB GLM per group, matching R's cpmByGroup.default.
+     """
+     from .glm_fit import mglm_one_way
+
+     y = np.asarray(y, dtype=np.float64)
+     if y.ndim == 1:
+         y = y.reshape(-1, 1)
+
+     if group is None:
+         group = np.ones(y.shape[1], dtype=int)
+     group = np.asarray(group)
+
+     if offset is None:
+         offset = np.log(y.sum(axis=0))
+     offset = np.atleast_1d(np.asarray(offset, dtype=np.float64))
+
+     if log:
+         out = add_prior_count(y, offset=offset, prior_count=prior_count)
+         fit = mglm_one_way(out['y'], group=group, dispersion=dispersion,
+                            offset=out['offset'], weights=weights)
+         return fit['coefficients'] / np.log(2) + np.log2(1e6)
+     else:
+         fit = mglm_one_way(y, group=group, dispersion=dispersion,
+                            offset=offset, weights=weights)
+         return np.exp(fit['coefficients']) * 1e6
+
+
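A minimal call sketch for cpm_by_group() with a made-up two-group factor (import path assumed; this presumes mglm_one_way returns one fitted coefficient per group, as the code above expects):

import numpy as np
from edgepython.expression import cpm_by_group   # assumed import path

counts = np.array([[10., 20., 12., 18.],
                   [ 0.,  5.,  1.,  4.],
                   [90., 75., 88., 70.]])
group = np.array(['A', 'A', 'B', 'B'])
group_cpm = cpm_by_group(counts, group=group)              # genes x groups
group_logcpm = cpm_by_group(counts, group=group, log=True)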
+ def rpkm_by_group(y, group=None, gene_length=None, dispersion=0.05,
+                   offset=None, weights=None, log=False, prior_count=2):
+     """RPKM averaged by group.
+
+     Port of edgeR's rpkmByGroup().
+     """
+     if isinstance(y, dict) and 'counts' in y:
+         dge = y
+         if gene_length is None:
+             for col in ['Length', 'length']:
+                 if dge.get('genes') is not None and col in dge['genes'].columns:
+                     gene_length = dge['genes'][col].values
+                     break
+         elif isinstance(gene_length, str):
+             gene_length = dge['genes'][gene_length].values
+         if gene_length is None:
+             raise ValueError("Gene lengths not found")
+
+     gene_length = np.asarray(gene_length, dtype=np.float64)
+     z = cpm_by_group(y, group=group, dispersion=dispersion, offset=offset,
+                      weights=weights, log=log, prior_count=prior_count)
+
+     if log:
+         return z - np.log2(gene_length[:, np.newaxis] / 1e3)
+     else:
+         return z / (gene_length[:, np.newaxis] / 1e3)
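As with rpkm() versus cpm(), rpkm_by_group() divides the group-level CPM by gene length in kilobases; a short sketch with made-up lengths (import path assumed):

import numpy as np
from edgepython.expression import rpkm_by_group   # assumed import path

counts = np.array([[10., 20., 12., 18.],
                   [90., 75., 88., 70.]])
group = np.array(['A', 'A', 'B', 'B'])
lengths_bp = np.array([1000., 2000.])
group_rpkm = rpkm_by_group(counts, group=group, gene_length=lengths_bp)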
@@ -0,0 +1,96 @@
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
+ """
+ Gene filtering for edgePython.
+
+ Port of edgeR's filterByExpr.
+ """
+
+ import numpy as np
+ import pandas as pd
+ from .expression import cpm
+
+
+ def filter_by_expr(y, design=None, group=None, lib_size=None,
+                    min_count=10, min_total_count=15, large_n=10, min_prop=0.7):
+     """Filter low-expressed genes.
+
+     Port of edgeR's filterByExpr().
+
+     Parameters
+     ----------
+     y : array-like or DGEList
+         Count matrix or DGEList.
+     design : array-like, optional
+         Design matrix.
+     group : array-like, optional
+         Group factor.
+     lib_size : array-like, optional
+         Library sizes.
+     min_count : float
+         Minimum count threshold.
+     min_total_count : float
+         Minimum total count across all samples.
+     large_n : int
+         Large sample size threshold.
+     min_prop : float
+         Minimum proportion for large groups.
+
+     Returns
+     -------
+     ndarray of bool, True for genes to keep.
+     """
+     # DGEList input
+     if isinstance(y, dict) and 'counts' in y:
+         dge = y
+         if design is None and group is None:
+             design = dge.get('design')
+             if design is None:
+                 group = dge['samples']['group'].values
+         if lib_size is None:
+             lib_size = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values
+         counts = dge['counts']
+     else:
+         counts = np.asarray(y, dtype=np.float64)
+
+     if counts.ndim == 1:
+         counts = counts.reshape(-1, 1)
+
+     if lib_size is None:
+         lib_size = counts.sum(axis=0)
+     lib_size = np.asarray(lib_size, dtype=np.float64)
+
+     # Minimum effective sample size
+     if group is None:
+         if design is None:
+             min_sample_size = counts.shape[1]
+         else:
+             design = np.asarray(design, dtype=np.float64)
+             h = _hat_values(design)
+             min_sample_size = 1.0 / np.max(h)
+     else:
+         group = np.asarray(group)
+         _, counts_per_group = np.unique(group, return_counts=True)
+         nonzero_counts = counts_per_group[counts_per_group > 0]
+         min_sample_size = np.min(nonzero_counts)
+
+     if min_sample_size > large_n:
+         min_sample_size = large_n + (min_sample_size - large_n) * min_prop
+
+     # CPM cutoff
+     median_lib_size = np.median(lib_size)
+     cpm_cutoff = min_count / median_lib_size * 1e6
+     cpm_vals = cpm(counts, lib_size=lib_size)
+
+     tol = 1e-14
+     keep_cpm = np.sum(cpm_vals >= cpm_cutoff, axis=1) >= (min_sample_size - tol)
+
+     # Total count cutoff
+     keep_total = np.sum(counts, axis=1) >= (min_total_count - tol)
+
+     return keep_cpm & keep_total
+
+
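A minimal usage sketch of filter_by_expr() on a simulated count matrix (data made up):

import numpy as np

rng = np.random.default_rng(0)
counts = rng.poisson(lam=5.0, size=(200, 6)).astype(float)   # 200 genes x 6 samples
group = np.array(['ctrl', 'ctrl', 'ctrl', 'trt', 'trt', 'trt'])

keep = filter_by_expr(counts, group=group)   # boolean mask, one entry per gene
filtered = counts[keep, :]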
+ def _hat_values(design):
+     """Compute hat/leverage values for a design matrix."""
+     Q, R = np.linalg.qr(design)
+     return np.sum(Q ** 2, axis=1)
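For intuition on the leverage-based effective sample size in filter_by_expr(): for a one-way indicator design, each sample's leverage is one over its group size, so 1.0 / np.max(h) recovers the smallest group size. A small check of the helper above:

import numpy as np

# One-way indicator design: 2 samples in group A, 3 in group B
design = np.array([[1., 0.],
                   [1., 0.],
                   [0., 1.],
                   [0., 1.],
                   [0., 1.]])
h = _hat_values(design)      # leverages: [0.5, 0.5, 1/3, 1/3, 1/3]
min_n = 1.0 / np.max(h)      # 2.0, the smallest group size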