edgepython 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
edgepython/glm_test.py
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
|
|
2
|
+
"""
|
|
3
|
+
GLM-based tests for differential expression in edgePython.
|
|
4
|
+
|
|
5
|
+
Port of edgeR's glmLRT, glmQLFTest, glmTreat.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from scipy.stats import chi2, f as f_dist
|
|
11
|
+
from scipy.special import gammaln
|
|
12
|
+
|
|
13
|
+
from .expression import ave_log_cpm
|
|
14
|
+
from .limma_port import contrast_as_coef
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _resolve_coef_indices(coef, coef_names, design_labels=None):
    """Translate a coefficient selection into unique 0-based column indices.

    Parameters
    ----------
    coef : int, str, or iterable of int/str
        Requested coefficient(s). Integers may be negative (counted from
        the last design column). Strings are looked up first in
        ``design_labels`` (if given) and then in ``coef_names``.
    coef_names : list of str
        Generic coefficient names, one per design column.
    design_labels : list of str, optional
        Column labels of the design matrix, when it carries any.

    Returns
    -------
    list of int
        Non-negative column indices, de-duplicated in order of first
        appearance.

    Raises
    ------
    ValueError
        If no coefficient is given or a name is unknown.
    TypeError
        If an entry is neither an integer nor a string.
    IndexError
        If an integer index is outside the design columns.
    """
    # A bare string must be wrapped before iteration, otherwise it would be
    # split into single characters.
    if isinstance(coef, (str, int, np.integer)):
        coef = [coef]

    nbeta = len(coef_names)
    resolved = []
    for item in coef:
        if isinstance(item, str):
            if design_labels is not None and item in design_labels:
                idx = design_labels.index(item)
            elif item in coef_names:
                idx = coef_names.index(item)
            else:
                raise ValueError(f"unknown coefficient name: {item!r}")
        elif isinstance(item, (int, np.integer)):
            idx = int(item)
            if not -nbeta <= idx < nbeta:
                raise IndexError(
                    f"coefficient index {idx} is out of range for a design "
                    f"with {nbeta} columns")
            # Normalise negative indices so that the null-design column
            # filter (which compares against non-negative positions) works.
            idx %= nbeta
        else:
            raise TypeError(
                "coef entries must be integers or strings, got "
                f"{type(item).__name__}")
        resolved.append(idx)

    if not resolved:
        raise ValueError("at least one coefficient must be specified")

    # dict.fromkeys de-duplicates while keeping the requested order
    # (a plain set() would not guarantee any order).
    return list(dict.fromkeys(resolved))


def glm_lrt(glmfit, coef=None, contrast=None):
    """Likelihood ratio test for GLM coefficients.

    Port of edgeR's glmLRT.

    The full fit in ``glmfit`` is compared against a null fit obtained by
    refitting the counts with the tested design columns removed; the drop
    in deviance is referred to a chi-squared distribution.

    Parameters
    ----------
    glmfit : dict (DGEGLM-like)
        Fitted GLM object from glm_fit(). The keys read here are 'design',
        'coefficients', 'counts', 'deviance', 'df.residual' and, if
        present, 'AveLogCPM', 'offset', 'weights', 'dispersion' and
        'average.ql.dispersion'. If 'AveLogCPM' is missing it is computed
        with ave_log_cpm() and stored back into ``glmfit``.
    coef : int, list of int, or str, optional
        Coefficient(s) to test, as 0-based design column indices (negative
        values count from the end) or names. Names are matched against the
        design's column labels when the design object exposes ``.columns``,
        otherwise against the generic names 'coef0', 'coef1', ...
        Default is the last column. Ignored when ``contrast`` is given.
    contrast : ndarray, optional
        Contrast vector or matrix with one row per design column.

    Returns
    -------
    dict (DGELRT-like)
        Shallow copy of ``glmfit`` without 'counts', extended with
        'table' (DataFrame with columns logFC, logCPM, LR, PValue),
        'comparison' and 'df.test'. When several coefficients or contrasts
        are tested, only the first log-fold-change column is reported in
        the table.

    Raises
    ------
    ValueError
        If the design has fewer than two columns, the contrast is all
        zero or has the wrong number of rows, or a coefficient name is
        unknown.
    IndexError
        If a coefficient index is out of range.
    """
    if glmfit.get('AveLogCPM') is None:
        glmfit['AveLogCPM'] = ave_log_cpm(glmfit)

    raw_design = glmfit['design']
    design = np.asarray(raw_design, dtype=np.float64)
    nbeta = design.shape[1]

    if nbeta < 2:
        raise ValueError("Need at least two columns for design")

    coef_names = [f'coef{i}' for i in range(nbeta)]

    # Determine coefficients to test
    if contrast is None:
        if coef is None:
            coef = nbeta - 1  # last column (0-indexed)

        columns = getattr(raw_design, 'columns', None)
        design_labels = None if columns is None else [str(c) for c in columns]
        coef = _resolve_coef_indices(coef, coef_names, design_labels)
        coef_name = [coef_names[c] for c in coef]

        # Coefficients are on the natural-log scale; convert to log2.
        logFC = glmfit['coefficients'][:, coef] / np.log(2)
    else:
        contrast = np.asarray(contrast, dtype=np.float64)
        if contrast.ndim == 1:
            contrast = contrast.reshape(-1, 1)
        if contrast.shape[0] != nbeta:
            raise ValueError(
                "contrast must have one row per design column "
                f"({nbeta}), got {contrast.shape[0]}")

        ncontrasts = np.linalg.matrix_rank(contrast)
        if ncontrasts == 0:
            raise ValueError("contrasts are all zero")

        coef = list(range(ncontrasts))
        logFC = (glmfit['coefficients'] @ contrast) / np.log(2)

        # Reform the design so that its leading columns correspond to the
        # contrast(s); the test then becomes a test of those columns.
        # NOTE(review): np.linalg.qr does not pivot, so for a rank-deficient
        # contrast matrix the leading `ncontrasts` columns of Q only span
        # the contrast space if its leading columns are independent.
        Q, _ = np.linalg.qr(contrast, mode='complete')
        design = design @ Q

        if ncontrasts > 1:
            coef_name = f"LR test on {ncontrasts} degrees of freedom"
        else:
            coef_name = "contrast"

    if len(coef) == 1 and logFC.ndim == 2:
        logFC = logFC.ravel()

    # Null design matrix: everything except the tested columns
    keep_cols = [i for i in range(design.shape[1]) if i not in coef]
    design0 = design[:, keep_cols]

    # Null fit (imported here, as in the original, rather than at module level)
    from .glm_fit import glm_fit
    dispersion = glmfit.get('dispersion')
    if glmfit.get('average.ql.dispersion') is not None:
        dispersion = (np.asarray(dispersion, dtype=np.float64)
                      / glmfit['average.ql.dispersion'])

    fit_null = glm_fit(glmfit['counts'], design=design0,
                       offset=glmfit.get('offset'),
                       weights=glmfit.get('weights'),
                       dispersion=dispersion, prior_count=0)

    # Likelihood ratio statistic
    LR = fit_null['deviance'] - glmfit['deviance']
    df_test = np.atleast_1d(np.asarray(fit_null['df.residual'])
                            - np.asarray(glmfit['df.residual']))
    # Collapse to a scalar when every gene has the same test df.
    if df_test.size and np.all(df_test == df_test[0]):
        df_test_val = df_test[0]
    else:
        df_test_val = df_test

    # Negative LR values can only arise from numerical error; clamp at 0.
    LRT_pvalue = chi2.sf(np.maximum(LR, 0), df=df_test_val)

    # Build output table
    table = pd.DataFrame({
        'logFC': logFC if logFC.ndim == 1 else logFC[:, 0],
        'logCPM': glmfit['AveLogCPM'],
        'LR': LR,
        'PValue': LRT_pvalue
    })

    result = dict(glmfit)
    result.pop('counts', None)
    result['table'] = table
    result['comparison'] = coef_name
    result['df.test'] = df_test_val

    return result
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def glm_ql_ftest(glmfit, coef=None, contrast=None, poisson_bound=True):
    """Quasi-likelihood F-test for GLM coefficients.

    Port of edgeR's glmQLFTest.

    The likelihood ratio statistics from glm_lrt() are divided by the test
    degrees of freedom and by the posterior quasi-dispersion ('s2.post'),
    and the resulting F-statistics are referred to an F-distribution.

    Parameters
    ----------
    glmfit : dict (DGEGLM-like)
        Fitted QL GLM from glm_ql_fit(). Must contain 's2.post',
        'df.prior' and 'df.residual'; 'df.residual.zeros' and
        'df.residual.adj' are used when present.
    coef : int or list, optional
        Coefficient(s) to test; forwarded to glm_lrt().
    contrast : ndarray, optional
        Contrast vector; forwarded to glm_lrt().
    poisson_bound : bool
        Apply Poisson bound. Within this function the flag is only reset
        to False when 'df.residual.zeros' is absent; it is not otherwise
        read.

    Returns
    -------
    dict (DGELRT-like)
        The glm_lrt() result whose 'table' has the 'LR' column replaced by
        'F' and 'PValue' recomputed from the F-distribution, plus
        'df.total'.

    Raises
    ------
    ValueError
        If ``glmfit`` has no 's2.post' entry.
    """
    if glmfit.get('s2.post') is None:
        raise ValueError("need to run glm_ql_fit before glm_ql_ftest")

    # The LR statistics come from the ordinary likelihood ratio test.
    out = glm_lrt(glmfit, coef=coef, contrast=contrast)

    # Residual df: prefer the zero-adjusted values when they exist.
    zeros_adjusted = glmfit.get('df.residual.zeros')
    if zeros_adjusted is not None:
        resid_df = zeros_adjusted
    else:
        resid_df = glmfit.get('df.residual.adj', glmfit['df.residual'])
        poisson_bound = False
    resid_df = np.asarray(resid_df, dtype=np.float64)

    # Numerator df may be a single number or one value per gene.
    test_df = out['df.test']
    if np.isscalar(test_df):
        numer_df = float(test_df)
    else:
        numer_df = np.asarray(test_df, dtype=np.float64)

    stats_table = out['table']
    f_values = stats_table['LR'].values / numer_df / glmfit['s2.post']

    # Denominator df = prior + residual, capped at the summed residual df.
    prior_df = np.atleast_1d(np.asarray(glmfit['df.prior'], dtype=np.float64))
    denom_df = prior_df + resid_df
    denom_df = np.minimum(denom_df, np.sum(glmfit['df.residual']))

    p_values = f_dist.sf(np.maximum(f_values, 0), dfn=numer_df, dfd=denom_df)

    # Swap the LR column for the F-statistic and overwrite the p-values.
    if 'LR' in stats_table.columns:
        del stats_table['LR']
    stats_table['F'] = f_values
    stats_table['PValue'] = p_values
    out['df.total'] = denom_df

    return out
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def glm_treat(glmfit, coef=None, contrast=None, lfc=np.log2(1.2),
              null='interval'):
    """Likelihood ratio or quasi-likelihood test with a log-FC threshold.

    Port of edgeR's glmTreat.

    For each gene two pairs of fits are made with the tested coefficient
    fixed (through the offset) at +lfc and at -lfc; the signed roots of the
    deviance differences are combined into a p-value for the hypothesis
    that the absolute log2-fold-change exceeds ``lfc``. When ``glmfit``
    carries 'df.prior' (quasi-likelihood pipeline) the statistics are
    scaled by 's2.post' and converted from t- to z-scores first.

    Parameters
    ----------
    glmfit : dict (DGEGLM-like)
        Fitted GLM from glm_fit() or glm_ql_fit(). If 'AveLogCPM' is
        missing it is computed with ave_log_cpm() and stored back into
        ``glmfit``.
    coef : int, optional
        0-based design column to test (negative values count from the
        end). If a sequence is given only its first entry is used.
        Default is the last column. Ignored when ``contrast`` is given.
    contrast : ndarray, optional
        Contrast vector with one entry per design column.
    lfc : float
        Log2-fold-change threshold. With ``lfc == 0`` the call is
        delegated to glm_lrt() or glm_ql_ftest().
    null : str
        'interval' or 'worst.case'.

    Returns
    -------
    dict (DGELRT-like)
        Shallow copy of ``glmfit`` without 'counts', extended with 'lfc',
        'table' (DataFrame with columns logFC, logCPM, PValue) and
        'comparison' ('coef<i>' for a coefficient test, 'contrast' for a
        contrast test, as in glm_lrt()). For quasi-likelihood fits
        'df.total' is stored as well.

    Raises
    ------
    ValueError
        If ``lfc`` is negative, ``null`` is not one of the two supported
        values, the design has fewer than two columns, or the contrast has
        the wrong length.
    TypeError
        If ``coef`` is not an integer index.
    IndexError
        If ``coef`` is outside the design columns.
    """
    from scipy.stats import norm as norm_dist

    if lfc < 0:
        raise ValueError("lfc has to be non-negative")
    # Reject unknown values instead of silently using the worst-case formula.
    if null not in ('interval', 'worst.case'):
        raise ValueError("null must be 'interval' or 'worst.case'")

    is_lrt = glmfit.get('df.prior') is None

    # If lfc is zero, fall back to standard test
    if lfc == 0:
        if is_lrt:
            return glm_lrt(glmfit, coef=coef, contrast=contrast)
        return glm_ql_ftest(glmfit, coef=coef, contrast=contrast)

    # Imported only once argument validation and the shortcut are done.
    from .glm_fit import glm_fit

    if glmfit.get('AveLogCPM') is None:
        glmfit['AveLogCPM'] = ave_log_cpm(glmfit)
    counts = glmfit['counts']
    ngenes = counts.shape[0]

    design = np.asarray(glmfit['design'], dtype=np.float64)
    nbeta = design.shape[1]

    if nbeta < 2:
        raise ValueError("Need at least two columns for design")

    coefficients = glmfit['coefficients']
    # The table shows the (possibly shrunk) coefficients, while the
    # within/outside-threshold decision uses the unshrunk ones if available.
    shrunk = glmfit.get('prior.count', 0) != 0
    unshrunk = glmfit.get('unshrunk.coefficients') if shrunk else None

    if contrast is None:
        if coef is None:
            coef = nbeta - 1
        coef_idx = coef if isinstance(coef, (int, np.integer)) else coef[0]
        if not isinstance(coef_idx, (int, np.integer)):
            raise TypeError("coef must be an integer column index")
        if not -nbeta <= coef_idx < nbeta:
            raise IndexError(
                f"coefficient index {coef_idx} is out of range for a design "
                f"with {nbeta} columns")
        # Normalise negative indices; otherwise the null design below would
        # keep the tested column.
        coef_idx = int(coef_idx) % nbeta

        logFC = coefficients[:, coef_idx] / np.log(2)
        if unshrunk is not None:
            unshrunk_logFC = unshrunk[:, coef_idx] / np.log(2)
        else:
            unshrunk_logFC = logFC
        comparison = f'coef{coef_idx}'
    else:
        contrast = np.asarray(contrast, dtype=np.float64).ravel()
        if contrast.shape[0] != nbeta:
            raise ValueError(
                "contrast must have one entry per design column "
                f"({nbeta}), got {contrast.shape[0]}")
        # Reform the design so that the contrast becomes its first column.
        reform = contrast_as_coef(design, contrast, first=True)
        coef_idx = 0

        logFC = (coefficients @ contrast) / np.log(2)
        if unshrunk is not None:
            unshrunk_logFC = (unshrunk @ contrast) / np.log(2)
        else:
            unshrunk_logFC = logFC
        design = reform['design']
        comparison = 'contrast'

    # Null design matrix: the design without the tested column
    keep_cols = [i for i in range(design.shape[1]) if i != coef_idx]
    design0 = design[:, keep_cols]

    # Get dispersion
    dispersion = glmfit.get('dispersion')
    avg_ql_dispersion = glmfit.get('average.ql.dispersion')
    if avg_ql_dispersion is not None:
        dispersion = np.asarray(dispersion, dtype=np.float64) / avg_ql_dispersion

    # Offsets as a genes x libraries matrix.
    # NOTE(review): a missing or None offset is replaced by zeros, following
    # the original default for a missing key; confirm this matches what
    # glm_fit() assumes when no offset is supplied.
    offset = glmfit.get('offset')
    if offset is None:
        offset = np.zeros((ngenes, design.shape[0]))
    offset = np.asarray(offset, dtype=np.float64)
    if offset.ndim == 1:
        offset = np.tile(offset, (ngenes, 1))

    # Fixing the tested coefficient at +/- lfc is done by moving its
    # contribution (on the natural-log scale) into the offset.
    offset_adj = lfc * np.log(2) * design[:, coef_idx]
    offset_adj_mat = np.tile(offset_adj, (ngenes, 1))
    weights = glmfit.get('weights')

    def _root_deviance_drop(shifted_offset):
        """Root of the null-minus-full deviance at the given offset."""
        fit0 = glm_fit(counts, design=design0, offset=shifted_offset,
                       weights=weights, dispersion=dispersion,
                       prior_count=0)
        fit1 = glm_fit(counts, design=design, offset=shifted_offset,
                       weights=weights, dispersion=dispersion,
                       prior_count=0)
        return np.sqrt(np.maximum(0, fit0['deviance'] - fit1['deviance']))

    z_plus = _root_deviance_drop(offset + offset_adj_mat)   # beta_0 = +tau
    z_minus = _root_deviance_drop(offset - offset_adj_mat)  # beta_0 = -tau

    # Make sure z_left <= z_right
    z_left = np.minimum(z_plus, z_minus)
    z_right = np.maximum(z_plus, z_minus)

    result = dict(glmfit)
    result.pop('counts', None)

    # Convert t to z under QL pipeline
    if not is_lrt:
        df_residual = glmfit.get('df.residual.zeros')
        if df_residual is None:
            df_residual = glmfit.get('df.residual.adj', glmfit['df.residual'])

        df_total = (np.asarray(glmfit['df.prior'], dtype=np.float64)
                    + np.asarray(df_residual, dtype=np.float64))
        # Same cap as glm_ql_ftest(): keeps df.total finite and bounded by
        # the pooled residual df.
        df_total = np.minimum(df_total, np.sum(glmfit['df.residual']))
        result['df.total'] = df_total

        root_s2 = np.sqrt(np.asarray(glmfit['s2.post']))
        z_left = _zscore_t(z_left / root_s2, df_total)
        z_right = _zscore_t(z_right / root_s2, df_total)

    # The smaller statistic is negated when |logFC| is outside the threshold
    within = np.abs(unshrunk_logFC) <= lfc
    z_left = np.where(within, z_left, -z_left)

    # Compute p-values
    if null == 'interval':
        c = 1.470402
        j = (z_right + z_left) > c
        p_value = np.ones(ngenes)
        p_value[j] = (_integrate_pnorm(-z_right[j], -z_right[j] + c) +
                      _integrate_pnorm(z_left[j] - c, z_left[j]))
        p_value[~j] = 2 * _integrate_pnorm(-z_right[~j], z_left[~j])
    else:
        p_value = norm_dist.cdf(-z_right) + norm_dist.cdf(z_left)

    # Build table — use shrunk logFC for display (matching R)
    table = pd.DataFrame({
        'logFC': logFC,
        'logCPM': glmfit['AveLogCPM'],
        'PValue': p_value
    })

    result['lfc'] = lfc
    result['table'] = table
    result['comparison'] = comparison

    return result
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _zscore_t(x, df):
|
|
348
|
+
"""Convert t-statistics to z-scores."""
|
|
349
|
+
from scipy.stats import t as t_dist, norm as norm_dist
|
|
350
|
+
df = np.asarray(df, dtype=np.float64)
|
|
351
|
+
x = np.asarray(x, dtype=np.float64)
|
|
352
|
+
# Use log p-value for precision
|
|
353
|
+
log_p = t_dist.logsf(np.abs(x), df)
|
|
354
|
+
z = norm_dist.isf(np.exp(log_p))
|
|
355
|
+
z = np.where(x < 0, -z, z)
|
|
356
|
+
return z
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _integrate_pnorm(a, b):
|
|
360
|
+
"""Integrate the standard normal CDF from a to b.
|
|
361
|
+
|
|
362
|
+
Port of edgeR's .integratepnorm.
|
|
363
|
+
"""
|
|
364
|
+
from scipy.stats import norm as norm_dist
|
|
365
|
+
a = np.asarray(a, dtype=np.float64)
|
|
366
|
+
b = np.asarray(b, dtype=np.float64)
|
|
367
|
+
|
|
368
|
+
equal = np.abs(a - b) < 1e-15
|
|
369
|
+
result = np.where(
|
|
370
|
+
equal,
|
|
371
|
+
norm_dist.cdf(a),
|
|
372
|
+
(b * norm_dist.cdf(b) + norm_dist.pdf(b) -
|
|
373
|
+
(a * norm_dist.cdf(a) + norm_dist.pdf(a))) / np.maximum(b - a, 1e-15)
|
|
374
|
+
)
|
|
375
|
+
return result
|