edgepython 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
edgepython/expression.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""
|
|
3
|
+
Expression value computation for edgePython.
|
|
4
|
+
|
|
5
|
+
Port of edgeR's cpm, rpkm, tpm, aveLogCPM, cpmByGroup, rpkmByGroup.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import warnings
|
|
10
|
+
from .utils import expand_as_matrix, add_prior_count
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def cpm(y, lib_size=None, offset=None, log=False, prior_count=2,
        normalized_lib_sizes=True):
    """Compute counts per million, optionally on the log2 scale.

    Python counterpart of edgeR's ``cpm()``. This function only resolves
    the library sizes / offsets from the input and then delegates the
    arithmetic to ``_cpm_default``.

    Parameters
    ----------
    y : array-like or DGEList
        Count matrix, or a dict holding a ``'counts'`` entry (handled as a
        DGEList).
    lib_size : array-like, optional
        Library sizes. Only consulted for plain matrix input; for DGEList
        input the ``'lib.size'`` column of ``y['samples']`` is used.
    offset : array-like, optional
        Log-scale offsets. Overridden by ``y['offset']`` when the DGEList
        carries a non-None offset.
    log : bool
        Return log2-CPM instead of CPM.
    prior_count : float
        Prior count used for the log transformation.
    normalized_lib_sizes : bool
        For DGEList input without a stored offset, multiply the library
        sizes by the ``'norm.factors'`` column.

    Returns
    -------
    The value returned by ``_cpm_default`` for the resolved inputs.
    """
    looks_like_dgelist = isinstance(y, dict) and 'counts' in y
    if not looks_like_dgelist:
        return _cpm_default(y, lib_size=lib_size, offset=offset,
                            log=log, prior_count=prior_count)

    raw_lib_size = y['samples']['lib.size'].values

    if y.get('offset') is not None:
        # A stored offset takes precedence over any library size.
        effective_lib_size = None
        offset = y['offset']
    elif normalized_lib_sizes:
        effective_lib_size = raw_lib_size * y['samples']['norm.factors'].values
    else:
        effective_lib_size = raw_lib_size

    return _cpm_default(y['counts'], lib_size=effective_lib_size,
                        offset=offset, log=log, prior_count=prior_count)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _cpm_default(y, lib_size=None, offset=None, log=False, prior_count=2):
    """Core CPM calculation on a plain count matrix.

    Parameters
    ----------
    y : array-like
        Counts. A 1-D input is treated as a single column.
    lib_size : array-like, optional
        Library sizes. Ignored when ``offset`` is given; defaults to the
        column sums of ``y``.
    offset : array-like, optional
        Log-scale offsets, either one per column of ``y`` or a matrix with
        the same shape as ``y``. Library sizes are taken as ``exp(offset)``.
    log : bool
        Return log2-CPM computed from prior-count-augmented values.
    prior_count : float
        Prior count forwarded to ``add_prior_count`` when ``log`` is true.

    Returns
    -------
    ndarray
        CPM (or log2-CPM) values. An empty input is returned as an empty
        copy without further validation.

    Raises
    ------
    ValueError
        If ``y`` contains NaN or negative values, if ``offset`` does not
        match the dimensions of ``y``, or if a 1-D library size vector
        contains non-positive entries.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    # Handle empty input before any reduction: min/nanmin raise on size 0.
    if y.size == 0:
        return y.copy()

    # np.nanmin would skip NaNs, so test for them explicitly.
    if np.isnan(y).any():
        raise ValueError("NA counts not allowed")
    if y.min() < 0:
        raise ValueError("Negative counts not allowed")

    if offset is not None:
        offset = np.asarray(offset, dtype=np.float64)
        if offset.ndim == 2:
            if offset.shape != y.shape:
                raise ValueError("dimensions not consistent between counts and offset")
        elif len(offset) != y.shape[1]:
            raise ValueError("Length of offset differs from number of libraries")
        lib_size = np.exp(offset)
    elif lib_size is None:
        lib_size = y.sum(axis=0)

    lib_size = np.asarray(lib_size, dtype=np.float64)
    per_library = lib_size.ndim == 1
    if per_library and np.any(lib_size <= 0):
        raise ValueError("library sizes should be greater than zero")

    if not log:
        # Broadcasting covers both per-library vectors and full matrices.
        return y / lib_size * 1e6

    out = add_prior_count(y,
                          lib_size=lib_size if per_library else None,
                          offset=np.log(lib_size) if per_library else offset,
                          prior_count=prior_count)
    # add_prior_count hands back augmented counts and a matching offset.
    lib_size_aug = np.exp(out['offset'])
    return np.log2(out['y'] / lib_size_aug * 1e6)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def rpkm(y, gene_length, lib_size=None, offset=None, log=False, prior_count=2,
         normalized_lib_sizes=True):
    """Reads per kilobase per million.

    Port of edgeR's rpkm(). Computes CPM via ``cpm`` and then scales each
    row by its gene length in kilobases.

    Parameters
    ----------
    y : array-like or DGEList
        Count matrix or DGEList (a dict holding a ``'counts'`` entry).
    gene_length : array-like, str or None
        Gene lengths in bp, one per row. For DGEList input this may be the
        name of a column in ``y['genes']``, or None to look for a
        ``'Length'`` or ``'length'`` column there.
    lib_size, offset, log, prior_count, normalized_lib_sizes :
        As for cpm().

    Returns
    -------
    RPKM values, or log2-RPKM when ``log`` is true.

    Raises
    ------
    ValueError
        If no gene lengths were supplied and none could be found.
    """
    if isinstance(y, dict) and 'counts' in y:
        if isinstance(gene_length, str):
            gene_length = y['genes'][gene_length].values
        elif gene_length is None:
            genes = y.get('genes')
            if genes is not None:
                for col in ('Length', 'length'):
                    if col in genes.columns:
                        gene_length = genes[col].values
                        break

    # Checked for every input type: np.asarray(None) would otherwise become
    # a NaN scalar and fail later with an unrelated indexing error.
    if gene_length is None:
        raise ValueError("Gene lengths not found")

    gene_length = np.asarray(gene_length, dtype=np.float64)
    gene_length_kb = gene_length / 1000

    result = cpm(y, lib_size=lib_size, offset=offset, log=log,
                 prior_count=prior_count, normalized_lib_sizes=normalized_lib_sizes)

    if log:
        return result - np.log2(gene_length_kb[:, np.newaxis])
    return result / gene_length_kb[:, np.newaxis]
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def tpm(y, effective_tx_length, rta_overdispersion=None, shrunk=False):
    """Transcripts per million.

    Port of edgeR's tpm(). CPM values of ``y`` are divided by the
    effective transcript lengths and rescaled by the geometric mean of
    the resulting column sums.

    Parameters
    ----------
    y : array-like or DGEList
        Passed straight to ``cpm``.
    effective_tx_length : array-like
        Effective transcript lengths; a 1-D input is applied per row,
        anything else is divided element-wise.
    rta_overdispersion : array-like, optional
        If given, the effective lengths are divided by these values first.
    shrunk : bool
        Accepted but not used by this implementation.

    Returns
    -------
    Length-scaled CPM divided by the geometric mean of its column sums,
    times 1e6.
    """
    per_million = cpm(y, log=False)

    lengths = np.asarray(effective_tx_length, dtype=np.float64)
    if rta_overdispersion is not None:
        lengths = lengths / np.asarray(rta_overdispersion)

    # A vector of lengths applies per row; a matrix applies element-wise.
    divisor = lengths[:, np.newaxis] if lengths.ndim == 1 else lengths
    scaled = per_million / divisor

    column_totals = scaled.sum(axis=0)
    geometric_mean_total = np.exp(np.mean(np.log(column_totals)))
    return scaled / geometric_mean_total * 1e6
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def ave_log_cpm(y, lib_size=None, offset=None, prior_count=2, dispersion=None,
                weights=None, normalized_lib_sizes=True):
    """Average log2-CPM for each gene.

    Port of edgeR's aveLogCPM(). Resolves library sizes, dispersion and
    weights from the input and delegates to ``_ave_log_cpm_default``.

    Parameters
    ----------
    y : array-like or DGEList
        Count matrix or DGEList (a dict holding a ``'counts'`` entry).
    lib_size : array-like, optional
        Library sizes; only used for plain matrix input.
    offset : array-like, optional
        Log-scale offsets, forwarded unchanged.
    prior_count : float
        Prior count forwarded to the core routine.
    dispersion : float or array-like, optional
        Dispersion(s). For DGEList input, defaults to the stored
        ``'common.dispersion'`` entry when present.
    weights : array-like, optional
        Observation weights; only used for plain matrix input. For DGEList
        input the stored ``'weights'`` entry is used instead.
    normalized_lib_sizes : bool
        For DGEList input, multiply library sizes by ``'norm.factors'``
        when that column is available.

    Returns
    -------
    The value returned by ``_ave_log_cpm_default``.
    """
    if not (isinstance(y, dict) and 'counts' in y):
        return _ave_log_cpm_default(y, lib_size=lib_size, offset=offset,
                                    prior_count=prior_count, dispersion=dispersion,
                                    weights=weights)

    dge = y
    samples = dge['samples']

    # Read the column via .get so the column-sum fallback is reachable when
    # no library sizes are stored (``.values`` itself is never None).
    stored_lib_size = samples.get('lib.size')
    if stored_lib_size is None:
        ls = np.asarray(dge['counts'], dtype=np.float64).sum(axis=0)
    else:
        ls = stored_lib_size.values

    if normalized_lib_sizes:
        nf = samples.get('norm.factors')
        if nf is not None:
            ls = ls * nf.values

    if dispersion is None:
        dispersion = dge.get('common.dispersion')

    return _ave_log_cpm_default(dge['counts'], lib_size=ls, offset=offset,
                                prior_count=prior_count, dispersion=dispersion,
                                weights=dge.get('weights'))
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _ave_log_cpm_default(y, lib_size=None, offset=None, prior_count=2,
                         dispersion=None, weights=None):
    """Core aveLogCPM calculation on a plain count matrix.

    Augments the counts with a prior count, fits an intercept-only NB GLM
    with ``mglm_one_group`` and converts the fitted coefficient from the
    natural-log scale to log2-CPM.

    Parameters
    ----------
    y : array-like
        Counts. A 1-D input is treated as a single column.
    lib_size : array-like, optional
        Library sizes, used only when ``offset`` is None; defaults to the
        column sums of ``y``.
    offset : array-like, optional
        Log-scale offsets; defaults to ``log(lib_size)``.
    prior_count : float
        Prior count forwarded to ``add_prior_count``.
    dispersion : float or array-like, optional
        Dispersion(s). None, or all-NaN, becomes 0.05; isolated NaNs are
        replaced by the mean of the non-NaN values.
    weights : array-like, optional
        Forwarded to ``mglm_one_group``.

    Returns
    -------
    Fitted coefficients converted to log2-CPM, or an empty float array
    when ``y`` has no rows.
    """
    from .glm_fit import mglm_one_group

    counts = np.asarray(y, dtype=np.float64)
    if counts.ndim == 1:
        counts = counts.reshape(-1, 1)
    if counts.shape[0] == 0:
        return np.array([], dtype=np.float64)

    # Normalise the dispersion into a NaN-free array.
    disp = np.atleast_1d(
        np.asarray(0.05 if dispersion is None else dispersion, dtype=np.float64))
    if np.all(np.isnan(disp)):
        disp = np.array([0.05])
    disp = np.where(np.isnan(disp), np.nanmean(disp), disp)

    # An explicit offset wins; otherwise derive it from the library sizes.
    if offset is None:
        sizes = counts.sum(axis=0) if lib_size is None else lib_size
        offset = np.log(sizes)
    offset = np.atleast_1d(np.asarray(offset, dtype=np.float64))

    augmented = add_prior_count(counts, offset=offset, prior_count=prior_count)

    coef = mglm_one_group(augmented['y'], dispersion=disp,
                          offset=augmented['offset'], weights=weights)

    # Natural-log rate -> log2 counts per million.
    return (coef + np.log(1e6)) / np.log(2)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def cpm_by_group(y, group=None, dispersion=0.05, offset=None, weights=None,
                 log=False, prior_count=2):
    """Counts per million averaged by group.

    Port of edgeR's cpmByGroup(). Resolves group, offset and weights from
    the input and delegates to ``_cpm_by_group_default``.

    Parameters
    ----------
    y : array-like or DGEList
        Count matrix or DGEList (a dict holding a ``'counts'`` entry).
    group : array-like, optional
        Group membership per sample. For DGEList input, defaults to the
        ``'group'`` column of ``y['samples']``.
    dispersion : float or array-like
        Dispersion forwarded to the core routine.
    offset : array-like, optional
        Log-scale offsets. For DGEList input, defaults to
        ``get_offset(y)``.
    weights : array-like, optional
        Observation weights; only used for plain matrix input. For DGEList
        input the stored ``'weights'`` entry is used instead.
    log : bool
        Return log2 values.
    prior_count : float
        Prior count used when ``log`` is true.

    Returns
    -------
    The value returned by ``_cpm_by_group_default``.
    """
    if not (isinstance(y, dict) and 'counts' in y):
        return _cpm_by_group_default(y, group=group, dispersion=dispersion,
                                     offset=offset, weights=weights, log=log,
                                     prior_count=prior_count)

    if group is None:
        group = y['samples']['group'].values
    if offset is None:
        from .dgelist import get_offset
        offset = get_offset(y)
    stored_weights = y.get('weights')

    return _cpm_by_group_default(y['counts'], group=group, dispersion=dispersion,
                                 offset=offset, weights=stored_weights, log=log,
                                 prior_count=prior_count)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _cpm_by_group_default(y, group=None, dispersion=0.05, offset=None,
                          weights=None, log=False, prior_count=2):
    """Core cpmByGroup calculation on a plain count matrix.

    Fits a one-way NB GLM with ``mglm_one_way`` and converts the fitted
    coefficients to CPM (or log2-CPM).

    Parameters
    ----------
    y : array-like
        Counts. A 1-D input is treated as a single column.
    group : array-like, optional
        Group membership per column; defaults to a single group.
    dispersion : float or array-like
        Forwarded to ``mglm_one_way``.
    offset : array-like, optional
        Log-scale offsets; defaults to the log of the column sums.
    weights : array-like, optional
        Forwarded to ``mglm_one_way``.
    log : bool
        If true, counts are augmented via ``add_prior_count`` before
        fitting and the result is on the log2 scale.
    prior_count : float
        Prior count used when ``log`` is true.

    Returns
    -------
    Fitted coefficients converted to CPM or log2-CPM.
    """
    from .glm_fit import mglm_one_way

    counts = np.asarray(y, dtype=np.float64)
    if counts.ndim == 1:
        counts = counts.reshape(-1, 1)

    groups = np.asarray(
        np.ones(counts.shape[1], dtype=int) if group is None else group)

    if offset is None:
        offset = np.log(counts.sum(axis=0))
    offset = np.atleast_1d(np.asarray(offset, dtype=np.float64))

    if not log:
        fitted = mglm_one_way(counts, group=groups, dispersion=dispersion,
                              offset=offset, weights=weights)
        return np.exp(fitted['coefficients']) * 1e6

    augmented = add_prior_count(counts, offset=offset, prior_count=prior_count)
    fitted = mglm_one_way(augmented['y'], group=groups, dispersion=dispersion,
                          offset=augmented['offset'], weights=weights)
    # Natural-log coefficients -> log2 counts per million.
    return fitted['coefficients'] / np.log(2) + np.log2(1e6)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def rpkm_by_group(y, group=None, gene_length=None, dispersion=0.05,
                  offset=None, weights=None, log=False, prior_count=2):
    """RPKM averaged by group.

    Port of edgeR's rpkmByGroup(). Computes group-wise CPM via
    ``cpm_by_group`` and scales each row by its gene length in kilobases.

    Parameters
    ----------
    y : array-like or DGEList
        Count matrix or DGEList (a dict holding a ``'counts'`` entry).
    group, dispersion, offset, weights, log, prior_count :
        Forwarded to ``cpm_by_group``.
    gene_length : array-like, str or None
        Gene lengths in bp, one per row. For DGEList input this may be the
        name of a column in ``y['genes']``, or None to look for a
        ``'Length'`` or ``'length'`` column there.

    Returns
    -------
    Group-wise RPKM, or log2-RPKM when ``log`` is true.

    Raises
    ------
    ValueError
        If no gene lengths were supplied and none could be found.
    """
    if isinstance(y, dict) and 'counts' in y:
        if isinstance(gene_length, str):
            gene_length = y['genes'][gene_length].values
        elif gene_length is None:
            genes = y.get('genes')
            if genes is not None:
                for col in ('Length', 'length'):
                    if col in genes.columns:
                        gene_length = genes[col].values
                        break

    # Checked for every input type: with matrix input the default None
    # would otherwise turn into a NaN scalar and fail on indexing below.
    if gene_length is None:
        raise ValueError("Gene lengths not found")

    gene_length = np.asarray(gene_length, dtype=np.float64)
    z = cpm_by_group(y, group=group, dispersion=dispersion, offset=offset,
                     weights=weights, log=log, prior_count=prior_count)

    length_kb = gene_length[:, np.newaxis] / 1e3
    if log:
        return z - np.log2(length_kb)
    return z / length_kb
|
edgepython/filtering.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""
|
|
3
|
+
Gene filtering for edgePython.
|
|
4
|
+
|
|
5
|
+
Port of edgeR's filterByExpr.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from .expression import cpm
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def filter_by_expr(y, design=None, group=None, lib_size=None,
                   min_count=10, min_total_count=15, large_n=10, min_prop=0.7):
    """Filter low-expressed genes.

    Port of edgeR's filterByExpr(). A gene is kept when its CPM reaches a
    cutoff derived from ``min_count`` in at least a minimum number of
    samples, and its total count reaches ``min_total_count``.

    Parameters
    ----------
    y : array-like or DGEList
        Count matrix or DGEList (a dict holding a ``'counts'`` entry).
    design : array-like, optional
        Design matrix. For DGEList input with neither ``design`` nor
        ``group`` given, the stored ``'design'`` entry is used if present.
    group : array-like, optional
        Group factor. For DGEList input without any design, defaults to
        the ``'group'`` column of ``y['samples']``.
    lib_size : array-like, optional
        Library sizes. Defaults to ``lib.size * norm.factors`` for DGEList
        input and to the column sums of the counts otherwise.
    min_count : float
        Minimum count threshold, converted to a CPM cutoff using the
        median library size.
    min_total_count : float
        Minimum total count across all samples.
    large_n : int
        Large sample size threshold.
    min_prop : float
        Proportion applied to the part of the minimum sample size that
        exceeds ``large_n``.

    Returns
    -------
    ndarray of bool, True for genes to keep.
    """
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if design is None and group is None:
            design = dge.get('design')
            if design is None:
                group = dge['samples']['group'].values
        if lib_size is None:
            samples = dge['samples']
            lib_size = samples['lib.size'].values * samples['norm.factors'].values
        counts = dge['counts']
    else:
        counts = y

    # Coerce both input paths identically so the result is a plain ndarray.
    counts = np.asarray(counts, dtype=np.float64)
    if counts.ndim == 1:
        counts = counts.reshape(-1, 1)

    if lib_size is None:
        lib_size = counts.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    # Minimum effective sample size
    if group is not None:
        # np.unique only reports groups that occur, so all sizes are >= 1.
        _, group_sizes = np.unique(np.asarray(group), return_counts=True)
        min_sample_size = np.min(group_sizes)
    elif design is not None:
        design = np.asarray(design, dtype=np.float64)
        min_sample_size = 1.0 / np.max(_hat_values(design))
    else:
        min_sample_size = counts.shape[1]

    if min_sample_size > large_n:
        min_sample_size = large_n + (min_sample_size - large_n) * min_prop

    # CPM cutoff
    cpm_cutoff = min_count / np.median(lib_size) * 1e6
    cpm_vals = cpm(counts, lib_size=lib_size)

    # Small tolerance so thresholds that are met exactly are not lost to
    # floating-point rounding.
    tol = 1e-14
    keep_cpm = np.sum(cpm_vals >= cpm_cutoff, axis=1) >= (min_sample_size - tol)

    # Total count cutoff
    keep_total = np.sum(counts, axis=1) >= (min_total_count - tol)

    return keep_cpm & keep_total
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _hat_values(design):
    """Compute hat/leverage values for a design matrix.

    The leverages are the diagonal of the projection onto the column space
    of ``design``. An SVD with a rank cutoff is used so that redundant
    columns (e.g. an intercept plus a full set of group indicators) do not
    inflate the values, which an unpivoted QR would do.

    Parameters
    ----------
    design : array-like
        Design matrix with one row per sample. A 1-D input is treated as
        a single column.

    Returns
    -------
    ndarray
        One leverage value per row of ``design``.
    """
    design = np.asarray(design, dtype=np.float64)
    if design.ndim == 1:
        design = design.reshape(-1, 1)
    if design.size == 0:
        return np.zeros(design.shape[0], dtype=np.float64)

    u, s, _ = np.linalg.svd(design, full_matrices=False)
    # Same style of cutoff as numpy.linalg.matrix_rank: singular values
    # below it are treated as numerically zero.
    cutoff = s.max() * max(design.shape) * np.finfo(np.float64).eps
    rank = int(np.sum(s > cutoff))
    return np.sum(u[:, :rank] ** 2, axis=1)
|