edgepython 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""Port of limma's weightedLowess C code.
|
|
3
|
+
|
|
4
|
+
Weighted local regression with delta-based binning to approximately npts
|
|
5
|
+
seed points. Matches the behavior of limma's weighted_lowess() C function
|
|
6
|
+
in src/weighted_lowess.c.
|
|
7
|
+
|
|
8
|
+
The algorithm:
|
|
9
|
+
1. Sort data by x
|
|
10
|
+
2. Select ~npts seed points spaced at least delta apart
|
|
11
|
+
3. For each seed, find the span window where cumulative weight >= span * total_weight
|
|
12
|
+
4. Fit local weighted linear regression (tricube kernel) at each seed
|
|
13
|
+
5. Linearly interpolate between seeds
|
|
14
|
+
6. Optionally iterate with bisquare robustness weights
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
from numba import njit
|
|
19
|
+
|
|
20
|
+
# Numerical tolerance (1e-7).
# NOTE(review): the jitted helpers visible in this module (_lowess_fit,
# _lowess_iterations) each define their own local ``threshold = 1e-7`` and do
# not read this constant -- confirm whether anything else references it.
_THRESHOLD = 1e-7
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def weighted_lowess(x, y, weights=None, span=0.3, iterations=4, npts=200, delta=None):
    """Weighted lowess smoothing matching limma's C implementation.

    The data are sorted by ``x``, a subset of seed points at least ``delta``
    apart is chosen, a tricube-weighted local linear fit is evaluated at each
    seed, and the fit is linearly interpolated between seeds. The fit is
    repeated up to ``iterations`` times with bisquare robustness weights.

    Parameters
    ----------
    x : array-like
        Covariate values.
    y : array-like
        Response values. Must have the same length as ``x``.
    weights : array-like, optional
        Prior weights (default: all ones). Must have the same length as ``x``.
        The caller's array is never modified (a copy is taken).
    span : float
        Proportion of total weight to use in each local regression window.
    iterations : int
        Total number of fitting passes (1 = no robustness iterations).
    npts : int
        Approximate number of seed points for binning. Only used when
        ``delta`` is None.
    delta : float, optional
        Minimum distance between seed points. Computed from npts if None.

    Returns
    -------
    dict with keys 'fitted', 'residuals', 'weights' (robustness weights), 'delta'.
    'fitted', 'residuals' and 'weights' are in the original (unsorted) order
    of the inputs.

    Raises
    ------
    ValueError
        If fewer than two points are given, if ``y`` or ``weights`` does not
        have the same length as ``x``, or if ``delta`` is None and ``npts``
        rounds to less than 1.
    """
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    n = len(x)

    if n < 2:
        raise ValueError("Need at least two points")
    # Fail early with a clear message instead of an IndexError (short y) or a
    # late broadcasting error (long y) further down.
    if len(y) != n:
        raise ValueError("x and y must have the same length")

    if weights is None:
        weights = np.ones(n, dtype=np.float64)
    else:
        weights = np.asarray(weights, dtype=np.float64).copy()
        # Over-long weights would otherwise be silently truncated by the
        # fancy-indexing step below.
        if len(weights) != n:
            raise ValueError("weights must have the same length as x")

    # Sort by x (mergesort for stable ordering matching R's order())
    o = np.argsort(x, kind='mergesort')
    xs = x[o].copy()
    ys = y[o].copy()
    ws = weights[o].copy()

    # Compute delta if not provided (matching R wrapper logic)
    if delta is None:
        npts = int(npts + 0.5)
        if npts >= n:
            # At least as many seeds requested as points: no binning.
            delta = 0.0
        elif npts < 1:
            # Previously surfaced as an opaque "zero-size array" error from
            # np.min on an empty candidate list.
            raise ValueError("npts must be at least 1 when delta is not given")
        else:
            # Gaps between consecutive sorted x values, smallest first.
            dx = np.sort(np.diff(xs))
            cumrange = np.cumsum(dx)
            numclusters = np.arange(npts)
            # R 1-based to Python 0-based index conversion.
            # cumrange[indices[k]] is the total of all gaps except the k
            # largest ones; dividing by (npts - k) spreads that range over the
            # remaining bins. The smallest such width is used as delta.
            indices = len(dx) - 1 - numclusters
            delta = float(np.min(cumrange[indices] / (npts - numclusters)))

    delta = float(delta)

    # Compute total weight and span weight
    total_weight = np.sum(ws)
    span_weight = total_weight * span
    # Average spacing of the points; used downstream to guard the
    # interpolation between seeds against near-zero x differences.
    subrange = (xs[-1] - xs[0]) / n

    # Find seed points (binned to ~npts)
    seed_idx, nseeds = _find_seeds(xs, n, delta)

    # Find span limits for each seed
    frame_start, frame_end, max_dist = _find_limits(
        seed_idx, nseeds, xs, ws, n, span_weight)

    # Initialize fitted values and robustness weights
    fitted = np.zeros(n, dtype=np.float64)
    rob_w = np.ones(n, dtype=np.float64)

    # Run iterations in compiled code (fills fitted and rob_w in place)
    _lowess_iterations(xs, ys, ws, fitted, rob_w, seed_idx, nseeds,
                       frame_start, frame_end, max_dist, total_weight,
                       subrange, iterations)

    # Map back to original (unsorted) order
    fitted_orig = np.empty(n, dtype=np.float64)
    fitted_orig[o] = fitted
    rob_orig = np.empty(n, dtype=np.float64)
    rob_orig[o] = rob_w

    return {
        'fitted': fitted_orig,
        'residuals': y - fitted_orig,
        'weights': rob_orig,
        'delta': delta
    }
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _find_seeds(xs, n, delta):
    """Pick the indices of the seed points used for delta-based binning.

    ``xs`` is expected to be sorted ascending. The first and last indices are
    always seeds. An interior index becomes a seed when its x value lies more
    than ``delta`` beyond the x value of the most recently chosen seed.

    When ``delta`` is not positive, or there are at most two points, every
    index is returned as a seed.

    Returns
    -------
    (indices, count) : indices is an ``np.intp`` array, count its length.
    """
    if delta <= 0 or n <= 2:
        every_point = np.arange(n, dtype=np.intp)
        return every_point, n

    chosen = [0]
    anchor_x = xs[0]  # x value of the most recently accepted seed
    for idx in range(1, n - 1):
        if xs[idx] - anchor_x > delta:
            chosen.append(idx)
            anchor_x = xs[idx]
    chosen.append(n - 1)

    seed_array = np.array(chosen, dtype=np.intp)
    return seed_array, len(chosen)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@njit(cache=True)
def _find_limits(seed_idx, nseeds, xs, ws, n, span_weight):
    """Compute the regression window and its radius for every seed.

    Starting from the seed itself, the window grows one point at a time,
    always taking the neighbour (left or right) that is closer in x to the
    seed, until the accumulated prior weight reaches ``span_weight`` or the
    window covers all ``n`` points. The window is then widened to take in any
    points whose x value ties with the current boundary points.

    Returns
    -------
    (frame_start, frame_end, max_dist) : per-seed inclusive window bounds
    (``np.intp``) and the largest seed-to-point distance reached while growing
    the window (``np.float64``).
    """
    frame_start = np.empty(nseeds, dtype=np.intp)
    frame_end = np.empty(nseeds, dtype=np.intp)
    max_dist = np.empty(nseeds, dtype=np.float64)
    last = n - 1

    for k in range(nseeds):
        centre = seed_idx[k]
        x0 = xs[centre]
        lo = centre
        hi = centre
        acc_w = ws[centre]
        hit_left = lo == 0
        hit_right = hi == last
        reach = 0.0

        while acc_w < span_weight and not (hit_left and hit_right):
            # Decide which side to grow.
            if hit_right:
                go_left = True
            elif hit_left:
                go_left = False
            else:
                # Both sides available: take the strictly closer left
                # neighbour, otherwise go right.
                go_left = (x0 - xs[lo - 1]) < (xs[hi + 1] - x0)

            if go_left:
                lo -= 1
                acc_w += ws[lo]
                hit_left = lo == 0
                step = x0 - xs[lo]
            else:
                hi += 1
                acc_w += ws[hi]
                hit_right = hi == last
                step = xs[hi] - x0

            if reach < step:
                reach = step

        # Pull in points tied with the window boundaries.
        while lo > 0 and xs[lo] == xs[lo - 1]:
            lo -= 1
        while hi < last and xs[hi] == xs[hi + 1]:
            hi += 1

        frame_start[k] = lo
        frame_end[k] = hi
        max_dist[k] = reach

    return frame_start, frame_end, max_dist
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@njit(cache=True)
def _lowess_fit(xs, ys, ws, rw, curpt, left, right, dist):
    """Evaluate the local weighted linear fit at ``xs[curpt]``.

    Points ``left..right`` (inclusive) take part. Each point's weight is the
    product of its prior weight ``ws``, its robustness weight ``rw`` and a
    tricube kernel of its distance to ``xs[curpt]`` scaled by ``dist``.

    Special cases:
    - ``dist`` below 1e-7: the kernel is skipped and the plain weighted mean
      of ``ys`` over the window is returned.
    - total weight exactly zero: 0.0 is returned.
    - weighted x-variance below 1e-7: the weighted mean of ``ys`` is returned
      (no slope is estimated).
    """
    eps = 1e-7
    stop = right + 1
    wsum = 0.0

    if dist < eps:
        # Degenerate window width: fall back to a weighted average.
        ysum = 0.0
        for k in range(left, stop):
            wsum += ws[k] * rw[k]
            ysum += ys[k] * ws[k] * rw[k]
        if wsum == 0.0:
            return 0.0
        return ysum / wsum

    x0 = xs[curpt]

    # First pass: weighted means of x and y.
    xbar = 0.0
    ybar = 0.0
    for k in range(left, stop):
        r = abs(x0 - xs[k]) / dist
        base = 1.0 - r * r * r
        kw = base * base * base * ws[k] * rw[k]
        wsum += kw
        xbar += kw * xs[k]
        ybar += kw * ys[k]

    if wsum == 0.0:
        return 0.0

    xbar /= wsum
    ybar /= wsum

    # Second pass: weighted variance of x and covariance of x with y.
    sxx = 0.0
    sxy = 0.0
    for k in range(left, stop):
        r = abs(x0 - xs[k]) / dist
        base = 1.0 - r * r * r
        kw = base * base * base * ws[k] * rw[k]
        dxk = xs[k] - xbar
        sxx += dxk * dxk * kw
        sxy += dxk * (ys[k] - ybar) * kw

    if sxx < eps:
        # x values effectively constant within the window.
        return ybar

    grad = sxy / sxx
    return grad * x0 + ybar - grad * xbar
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
@njit(cache=True)
def _lowess_iterations(xs, ys, ws, fitted, rob_w, seed_idx, nseeds,
                       frame_start, frame_end, max_dist, total_weight,
                       subrange, iterations):
    """Run up to ``iterations`` fitting passes, updating ``fitted`` and ``rob_w`` in place.

    Each pass:
    1. fits the local regression at every seed and fills the points between
       consecutive seeds by linear interpolation (or by the average of the two
       seed fits when the seeds are closer than 1e-7 * ``subrange`` in x);
    2. computes absolute residuals and a scale ``cmad`` equal to six times the
       weighted median absolute residual (weights ``ws``);
    3. stops early if ``cmad`` is at most 1e-7 times the mean absolute
       residual;
    4. otherwise resets ``rob_w`` to bisquare weights ``(1 - (r/cmad)^2)^2``
       for residuals below ``cmad`` and 0 for the rest.
    """
    npoints = len(xs)
    eps = 1e-7
    min_gap = eps * subrange
    half_weight = total_weight / 2.0

    for _pass in range(iterations):
        # --- fit at seeds, interpolate in between -------------------------
        fitted[0] = _lowess_fit(xs, ys, ws, rob_w,
                                0, frame_start[0], frame_end[0], max_dist[0])
        prev = 0
        for s in range(1, nseeds):
            cur = seed_idx[s]
            fitted[cur] = _lowess_fit(xs, ys, ws, rob_w,
                                      cur, frame_start[s], frame_end[s],
                                      max_dist[s])
            if cur - prev > 1:
                gap = xs[cur] - xs[prev]
                if gap > min_gap:
                    grad = (fitted[cur] - fitted[prev]) / gap
                    offset = fitted[cur] - grad * xs[cur]
                    for j in range(prev + 1, cur):
                        fitted[j] = grad * xs[j] + offset
                else:
                    # Seeds (almost) coincide in x: avoid a huge slope.
                    midpoint = 0.5 * (fitted[cur] + fitted[prev])
                    for j in range(prev + 1, cur):
                        fitted[j] = midpoint
            prev = cur

        # --- absolute residuals and their mean ----------------------------
        abs_resid = np.empty(npoints)
        total_abs = 0.0
        for i in range(npoints):
            r = abs(ys[i] - fitted[i])
            abs_resid[i] = r
            total_abs += r
        resid_scale = total_abs / npoints

        # --- weighted median of the absolute residuals, times six ---------
        order = np.argsort(abs_resid)
        ranked = abs_resid[order]

        cmad = 0.0
        running = 0.0
        for i in range(npoints):
            running += ws[order[i]]
            if running > half_weight:
                cmad = 6.0 * ranked[i]
                break
            if running == half_weight and i < npoints - 1:
                # Exactly half the weight: average the two middle residuals.
                cmad = 3.0 * (ranked[i] + ranked[i + 1])
                break

        # Residual scale negligible: further robustness passes change nothing.
        if cmad <= eps * resid_scale:
            break

        # --- bisquare robustness weights -----------------------------------
        rob_w[:] = 0.0
        for i in range(npoints):
            # ranked is ascending, so the first residual not below cmad ends
            # the set of points that keep a non-zero weight.
            if not (ranked[i] < cmad):
                break
            u = ranked[i] / cmad
            t = 1.0 - u * u
            rob_w[order[i]] = t * t
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: edgepython
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python port of the edgeR Bioconductor package for differential expression analysis of digital gene expression data.
|
|
5
|
+
Author: Lior Pachter
|
|
6
|
+
License-Expression: GPL-3.0-or-later
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: numpy>=1.21
|
|
11
|
+
Requires-Dist: scipy>=1.7
|
|
12
|
+
Requires-Dist: pandas>=1.3
|
|
13
|
+
Requires-Dist: matplotlib>=3.4
|
|
14
|
+
Requires-Dist: statsmodels>=0.13
|
|
15
|
+
Requires-Dist: numba>=0.57
|
|
16
|
+
Provides-Extra: h5
|
|
17
|
+
Requires-Dist: h5py>=3.0; extra == "h5"
|
|
18
|
+
Provides-Extra: anndata
|
|
19
|
+
Requires-Dist: anndata>=0.7; extra == "anndata"
|
|
20
|
+
Provides-Extra: parquet
|
|
21
|
+
Requires-Dist: pyarrow>=8.0; extra == "parquet"
|
|
22
|
+
Provides-Extra: formula
|
|
23
|
+
Requires-Dist: patsy>=0.5; extra == "formula"
|
|
24
|
+
Provides-Extra: all
|
|
25
|
+
Requires-Dist: h5py>=3.0; extra == "all"
|
|
26
|
+
Requires-Dist: anndata>=0.7; extra == "all"
|
|
27
|
+
Requires-Dist: pyarrow>=8.0; extra == "all"
|
|
28
|
+
Requires-Dist: patsy>=0.5; extra == "all"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# edgePython
|
|
35
|
+
|
|
36
|
+
`edgePython` is a Python implementation of the Bioconductor `edgeR` package for differential analysis of genomics count data. It also includes a new single-cell differential expression method that extends the NEBULA-LN negative binomial mixed model with edgeR's TMM normalization and empirical Bayes dispersion shrinkage.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
From source:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install .
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
With optional extras:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install .[all]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import numpy as np
|
|
56
|
+
import edgepython as ep
|
|
57
|
+
|
|
58
|
+
# genes x samples count matrix
|
|
59
|
+
counts = np.random.poisson(lam=10, size=(1000, 6))
|
|
60
|
+
group = np.array(["A", "A", "A", "B", "B", "B"])
|
|
61
|
+
|
|
62
|
+
y = ep.make_dgelist(counts=counts, group=group)
|
|
63
|
+
y = ep.calc_norm_factors(y)
|
|
64
|
+
y = ep.estimate_disp(y)
|
|
65
|
+
|
|
66
|
+
design = np.column_stack([np.ones(6), (group == "B").astype(float)])
|
|
67
|
+
fit = ep.glm_ql_fit(y, design)
|
|
68
|
+
res = ep.glm_ql_ftest(fit, coef=1)
|
|
69
|
+
top = ep.top_tags(res, n=10)
|
|
70
|
+
print(top["table"].head())
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Features
|
|
74
|
+
|
|
75
|
+
### Data Structures
|
|
76
|
+
|
|
77
|
+
`DGEList`-style data structures (`make_dgelist`, `cbind_dgelist`, `rbind_dgelist`, `valid_dgelist`) with accessor functions (`get_counts`, `get_dispersion`, `get_norm_lib_sizes`, `get_offset`).
|
|
78
|
+
|
|
79
|
+
### Normalization
|
|
80
|
+
|
|
81
|
+
TMM, TMMwsp, RLE, and upper-quartile normalization via `calc_norm_factors`. Normalized expression values via `cpm`, `rpkm`, `tpm`, `ave_log_cpm`, `cpm_by_group`, and `rpkm_by_group`.
|
|
82
|
+
|
|
83
|
+
### Filtering
|
|
84
|
+
|
|
85
|
+
Gene filtering by expression level via `filter_by_expr`.
|
|
86
|
+
|
|
87
|
+
### Dispersion Estimation
|
|
88
|
+
|
|
89
|
+
Common, trended, and tagwise dispersion estimation (`estimate_disp`, `estimate_common_disp`, `estimate_trended_disp`, `estimate_tagwise_disp`) with GLM variants (`estimate_glm_common_disp`, `estimate_glm_trended_disp`, `estimate_glm_tagwise_disp`). Weighted likelihood empirical Bayes shrinkage via `WLEB`.
|
|
90
|
+
|
|
91
|
+
### Differential Expression Testing
|
|
92
|
+
|
|
93
|
+
- **Exact test**: `exact_test` for two-group comparisons with exact negative binomial tests, plus helpers (`exact_test_double_tail`, `equalize_lib_sizes`, `q2q_nbinom`, `split_into_groups`).
|
|
94
|
+
- **GLM fitting**: `glm_fit`, `glm_ql_fit` for generalized linear model fitting.
|
|
95
|
+
- **GLM testing**: likelihood ratio tests (`glm_lrt`), quasi-likelihood F-tests (`glm_ql_ftest`), and fold-change threshold testing (`glm_treat`).
|
|
96
|
+
- **Results**: `top_tags` for extracting top DE genes with p-value adjustment, `decide_tests` for classifying genes as up/down/unchanged.
|
|
97
|
+
|
|
98
|
+
### Gene Set Testing
|
|
99
|
+
|
|
100
|
+
Competitive and self-contained gene set tests: `camera`, `fry`, `roast`, `mroast`, `romer`. Gene ontology and KEGG pathway enrichment via `goana` and `kegga`.
|
|
101
|
+
|
|
102
|
+
### Differential Splicing
|
|
103
|
+
|
|
104
|
+
Differential exon and transcript usage testing via `diff_splice` (GLM-based with LRT or QL tests), `diff_splice_dge` (exact test for two-group comparisons), and `splice_variants` (chi-squared tests for homogeneity of proportions across exons).
|
|
105
|
+
|
|
106
|
+
### Quantification Uncertainty
|
|
107
|
+
|
|
108
|
+
Reading quantification output with bootstrap or Gibbs sampling uncertainty from Salmon (`catch_salmon`), kallisto (`catch_kallisto`), and RSEM (`catch_rsem`). Overdispersion estimates from quantification uncertainty are used for differential transcript expression following the approach of Baldoni et al. (2024).
|
|
109
|
+
|
|
110
|
+
### I/O
|
|
111
|
+
|
|
112
|
+
- **Universal reader**: `read_data` with auto-detection for kallisto (H5/TSV), Salmon, oarfish, RSEM, 10X CellRanger, CSV/TSV count tables, AnnData (`.h5ad`), and RDS files.
|
|
113
|
+
- **Specialized readers**: `read_dge` (collates per-sample count files), `read_10x` (10X Genomics output), `feature_counts_to_dgelist` (featureCounts output), `read_bismark2dge` (Bismark methylation coverage).
|
|
114
|
+
- **Single-cell aggregation**: `seurat_to_pb` for pseudo-bulk aggregation.
|
|
115
|
+
- **Export**: `to_anndata` for converting DGEList and results to AnnData format.
|
|
116
|
+
|
|
117
|
+
### Visualization
|
|
118
|
+
|
|
119
|
+
`plot_md` (mean-difference plots), `plot_bcv` (biological coefficient of variation), `plot_mds` (multidimensional scaling), `plot_ql_disp` (quasi-likelihood dispersion), `plot_smear` (smear plots), `ma_plot` (MA plots), and `gof` (goodness of fit).
|
|
120
|
+
|
|
121
|
+
### Single-Cell Mixed Model
|
|
122
|
+
|
|
123
|
+
NEBULA-LN-style negative binomial gamma mixed model for multi-subject single-cell data: `glm_sc_fit`, `shrink_sc_disp`, `glm_sc_test`.
|
|
124
|
+
|
|
125
|
+
### ChIP-Seq
|
|
126
|
+
|
|
127
|
+
ChIP-seq normalization to matched input controls via `normalize_chip_to_input` and `calc_norm_offsets_for_chip`.
|
|
128
|
+
|
|
129
|
+
### Methylation/RRBS
|
|
130
|
+
|
|
131
|
+
Bismark coverage file reader (`read_bismark2dge`) and methylation-specific design matrix construction (`model_matrix_meth`).
|
|
132
|
+
|
|
133
|
+
### Utilities
|
|
134
|
+
|
|
135
|
+
Design matrix construction (`model_matrix`), prior count addition (`add_prior_count`), predicted fold changes (`pred_fc`), Good-Turing smoothing (`good_turing`), count thinning/downsampling (`thin_counts`), Gini coefficient (`gini`), sum technical replicates (`sum_tech_reps`), negative binomial z-scores (`zscore_nbinom`), nearest TSS annotation (`nearest_tss`), and variance shrinkage (`squeeze_var`).
|
|
136
|
+
|
|
137
|
+
## Examples
|
|
138
|
+
|
|
139
|
+
The [examples/mammary](examples/mammary) directory contains two notebooks for the GSE60450 mouse mammary dataset ([Fu et al. 2015](https://www.nature.com/articles/ncb3117)):
|
|
140
|
+
|
|
141
|
+
- [mouse_mammary_tutorial.ipynb](examples/mammary/mouse_mammary_tutorial.ipynb) — edgePython-only tutorial (Colab-ready)
|
|
142
|
+
- [mouse_mammary_R_vs_Python.ipynb](examples/mammary/mouse_mammary_R_vs_Python.ipynb) — side-by-side edgeR vs edgePython comparison
|
|
143
|
+
|
|
144
|
+
The [examples/hoxa1](examples/hoxa1) directory contains two notebooks for the GSE37704 HOXA1 knockdown dataset ([Trapnell et al. 2013](https://doi.org/10.1038/nbt.2594)), with transcript-level quantification by kallisto:
|
|
145
|
+
|
|
146
|
+
- [hoxa1_tutorial.ipynb](examples/hoxa1/hoxa1_tutorial.ipynb) — edgePython-only tutorial with scaled analysis using bootstrap overdispersion (Colab-ready)
|
|
147
|
+
- [hoxa1_R_vs_Python.ipynb](examples/hoxa1/hoxa1_R_vs_Python.ipynb) — side-by-side edgeR vs edgePython comparison reproducing Figure 1 panels
|
|
148
|
+
|
|
149
|
+
The [examples/clytia](examples/clytia) directory contains a notebook for the *Clytia hemisphaerica* single-cell RNA-seq dataset ([Chari et al. 2021](https://doi.org/10.1016/j.celrep.2021.109751)), demonstrating the NEBULA-LN mixed model with empirical Bayes dispersion shrinkage:
|
|
150
|
+
|
|
151
|
+
- [clytia_tutorial.ipynb](examples/clytia/clytia_tutorial.ipynb) — single-cell differential expression of fed vs starved gastrodigestive cells across 10 organisms, reproducing Figure 2 panels (Colab-ready)
|
|
152
|
+
|
|
153
|
+
## Development
|
|
154
|
+
|
|
155
|
+
Run tests:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
pytest -q
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Authorship
|
|
162
|
+
|
|
163
|
+
This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
164
|
+
|
|
165
|
+
## edgeR
|
|
166
|
+
|
|
167
|
+
`edgePython` is based on the [edgeR](https://bioconductor.org/packages/edgeR/) Bioconductor package. The edgeR publications are:
|
|
168
|
+
|
|
169
|
+
- Robinson MD, Smyth GK (2007). Moderated statistical tests for assessing differences in tag abundance. *Bioinformatics*, 23(21), 2881-2887. [doi:10.1093/bioinformatics/btm453](https://doi.org/10.1093/bioinformatics/btm453)
|
|
170
|
+
|
|
171
|
+
- Robinson MD, Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with applications to SAGE data. *Biostatistics*, 9(2), 321-332. [doi:10.1093/biostatistics/kxm030](https://doi.org/10.1093/biostatistics/kxm030)
|
|
172
|
+
|
|
173
|
+
- Robinson MD, McCarthy DJ, Smyth GK (2010). edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. *Bioinformatics*, 26(1), 139-140. [doi:10.1093/bioinformatics/btp616](https://doi.org/10.1093/bioinformatics/btp616)
|
|
174
|
+
|
|
175
|
+
- Robinson MD, Oshlack A (2010). A scaling normalization method for differential expression analysis of RNA-seq data. *Genome Biology*, 11(3), R25. [doi:10.1186/gb-2010-11-3-r25](https://doi.org/10.1186/gb-2010-11-3-r25)
|
|
176
|
+
|
|
177
|
+
- McCarthy DJ, Chen Y, Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. *Nucleic Acids Research*, 40(10), 4288-4297. [doi:10.1093/nar/gks042](https://doi.org/10.1093/nar/gks042)
|
|
178
|
+
|
|
179
|
+
- Chen Y, Lun ATL, Smyth GK (2014). Differential expression analysis of complex RNA-seq experiments using edgeR. In *Statistical Analysis of Next Generation Sequencing Data*, Springer, 51-74. [doi:10.1007/978-3-319-07212-8_3](https://doi.org/10.1007/978-3-319-07212-8_3)
|
|
180
|
+
|
|
181
|
+
- Zhou X, Lindsay H, Robinson MD (2014). Robustly detecting differential expression in RNA sequencing data using observation weights. *Nucleic Acids Research*, 42(11), e91. [doi:10.1093/nar/gku310](https://doi.org/10.1093/nar/gku310)
|
|
182
|
+
|
|
183
|
+
- Dai Z, Sheridan JM, Gearing LJ, Moore DL, Su S, Wormald S, Wilcox S, O'Connor L, Dickins RA, Blewitt ME, Ritchie ME (2014). edgeR: a versatile tool for the analysis of shRNA-seq and CRISPR-Cas9 genetic screens. *F1000Research*, 3, 95. [doi:10.12688/f1000research.3928.2](https://doi.org/10.12688/f1000research.3928.2)
|
|
184
|
+
|
|
185
|
+
- Lun ATL, Chen Y, Smyth GK (2016). It's DE-licious: A recipe for differential expression analyses of RNA-seq experiments using quasi-likelihood methods in edgeR. In *Statistical Genomics*, Springer, 391-416. [doi:10.1007/978-1-4939-3578-9_19](https://doi.org/10.1007/978-1-4939-3578-9_19)
|
|
186
|
+
|
|
187
|
+
- Chen Y, Lun ATL, Smyth GK (2016). From reads to genes to pathways: differential expression analysis of RNA-Seq experiments using Rsubread and the edgeR quasi-likelihood pipeline. *F1000Research*, 5, 1438. [doi:10.12688/f1000research.8987.2](https://doi.org/10.12688/f1000research.8987.2)
|
|
188
|
+
|
|
189
|
+
- Chen Y, Pal B, Visvader JE, Smyth GK (2018). Differential methylation analysis of reduced representation bisulfite sequencing experiments using edgeR. *F1000Research*, 6, 2055. [doi:10.12688/f1000research.13196.2](https://doi.org/10.12688/f1000research.13196.2)
|
|
190
|
+
|
|
191
|
+
- Baldoni PL, Chen Y, Hediyeh-zadeh S, Liao Y, Dong X, Ritchie ME, Shi W, Smyth GK (2024). Dividing out quantification uncertainty allows efficient assessment of differential transcript expression with edgeR. *Nucleic Acids Research*, 52(3), e13. [doi:10.1093/nar/gkad1167](https://doi.org/10.1093/nar/gkad1167)
|
|
192
|
+
|
|
193
|
+
- Chen Y, Chen L, Lun ATL, Baldoni PL, Smyth GK (2025). edgeR v4: powerful differential analysis of sequencing data with expanded functionality and improved support for small counts and larger datasets. *Nucleic Acids Research*, 53(2), gkaf018. [doi:10.1093/nar/gkaf018](https://doi.org/10.1093/nar/gkaf018)
|
|
194
|
+
|
|
195
|
+
The single-cell mixed model in `edgePython` is based on NEBULA:
|
|
196
|
+
|
|
197
|
+
- He L, Davila-Velderrain J, Sumida TS, Hafler DA, Kellis M, Kulminski AM (2021). NEBULA is a fast negative binomial mixed model for differential or co-expression analysis of large-scale multi-subject single-cell data. *Communications Biology*, 4, 629. [doi:10.1038/s42003-021-02146-6](https://doi.org/10.1038/s42003-021-02146-6)
|
|
198
|
+
|
|
199
|
+
## License
|
|
200
|
+
|
|
201
|
+
This project is licensed under the GNU General Public License v3.0 or later (GPL-3.0-or-later).
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
edgepython/__init__.py,sha256=SA-OSJITfGIkrgRKLewTCB3eMTN44t3HkqQjIuYt5h8,2489
|
|
2
|
+
edgepython/classes.py,sha256=tUgpQJFeFdR-48aEYOst0aAMo24eL-LLlXDUbWTgmjQ,16399
|
|
3
|
+
edgepython/compressed_matrix.py,sha256=XugtBswZX_Y8KxRBKe9eDM6FevxVStItw9RNYHYBQ5Q,14768
|
|
4
|
+
edgepython/dgelist.py,sha256=YAi8Sdf1eXdh7e5VUOWXvye9gAr1dF8_kQz0JbGacQU,10682
|
|
5
|
+
edgepython/dispersion.py,sha256=ybA-AQlParEb2GvGJv79me1RXnlc1csgl2uDad7EOBw,31664
|
|
6
|
+
edgepython/dispersion_lowlevel.py,sha256=I-21X-3vdOFo2gWXKLcfNJdonUWzMDxNLXNjpvgUd7g,35368
|
|
7
|
+
edgepython/exact_test.py,sha256=GtWbIpgWOHGuEh6dyPrYL1ASjh56cLCqU6vewcUkEwQ,16461
|
|
8
|
+
edgepython/expression.py,sha256=7QXdle37JSoqCi5TNDLQQo7fT2Av3kSzNjaR_eav8SM,11123
|
|
9
|
+
edgepython/filtering.py,sha256=f5f1imjrNzX3a2m-oI605V3J_D1JIEOlHYr2_DRwXRc,2843
|
|
10
|
+
edgepython/gene_sets.py,sha256=EIi64MDQU7uUuoextCMsuVR8YGWhEGrNDH91JKvKtC0,42208
|
|
11
|
+
edgepython/glm_fit.py,sha256=V5mLHUDSGYyKcQTZRnJMwWzYEfi4gpURFym7Y-kMleg,23250
|
|
12
|
+
edgepython/glm_levenberg.py,sha256=jQkmRtnnYZKj6ZiW_iVxFmgBpvbkdYKzgtP9b8aqAdA,12093
|
|
13
|
+
edgepython/glm_test.py,sha256=QVJHjZrcGqIdULdWVwcO4Qj3fycyX7RLHcoJciYh6mo,12134
|
|
14
|
+
edgepython/io.py,sha256=FjePd6ziup23MCpiCU2ApSsVW0hfpqAHGk3rTKxdLgs,64373
|
|
15
|
+
edgepython/limma_port.py,sha256=sMvphlGWCp-pOrGDDWlPjiS0O3w6BjfaSkHI-4JJ0wM,32453
|
|
16
|
+
edgepython/normalization.py,sha256=5ZIBV1t-XfD7qy-3HNYJ2-wdHYEu59YI2nRkLc_2a6w,18852
|
|
17
|
+
edgepython/ql_weights.py,sha256=XpUEwAS_UYvbFNF-g935jQhzcdfe-UXNkAAuDrx-m-A,68132
|
|
18
|
+
edgepython/results.py,sha256=2M_nNq8efUNp512OGeLZE60XB6_atygSJVwFpmS7Lfc,7197
|
|
19
|
+
edgepython/sc_fit.py,sha256=dN_UcHyFF4wCSwx5pHhorst4qxz96TDDxBD-96aWQd8,48333
|
|
20
|
+
edgepython/smoothing.py,sha256=5rEP0dUKrDr8YaEzekRtJ7tX5Ip3rgBc2lxMywrA9ZU,13751
|
|
21
|
+
edgepython/splicing.py,sha256=Do_JOGxOPawLxj0qQObPgytYBTi9N39tcHlPgYDICDc,19047
|
|
22
|
+
edgepython/utils.py,sha256=mP9wcffw_7cRjPypkKvQFw7kiNj4OCABX5Mrigh4CkE,34647
|
|
23
|
+
edgepython/visualization.py,sha256=VeDeWtSCpPWQ5Pn6oDCWg2gDZMwf4F1yWcEMihMPcR8,12227
|
|
24
|
+
edgepython/weighted_lowess.py,sha256=ldUHZJcKFFY9NdAFeq0pWkJS6e-HgCARAJ5NN7kklc0,10251
|
|
25
|
+
edgepython-0.2.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
26
|
+
edgepython-0.2.0.dist-info/METADATA,sha256=pQuv9fwnUvhdwd88yhsHsMymLJLkqQ8XeKxzRlChafo,11674
|
|
27
|
+
edgepython-0.2.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
28
|
+
edgepython-0.2.0.dist-info/top_level.txt,sha256=XDJV3dwIsV8JKQQDZX0qoQT7Z0iXfwZnjA0PLr_srBU,11
|
|
29
|
+
edgepython-0.2.0.dist-info/RECORD,,
|