edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edgepython/glm_test.py ADDED
@@ -0,0 +1,375 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ GLM-based tests for differential expression in edgePython.
4
+
5
+ Port of edgeR's glmLRT, glmQLFTest, glmTreat.
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from scipy.stats import chi2, f as f_dist
11
+ from scipy.special import gammaln
12
+
13
+ from .expression import ave_log_cpm
14
+ from .limma_port import contrast_as_coef
15
+
16
+
17
def glm_lrt(glmfit, coef=None, contrast=None):
    """Likelihood ratio test for GLM coefficients.

    Port of edgeR's glmLRT.  Refits the model with the tested
    coefficient(s) removed and refers the drop in deviance to a
    chi-squared distribution.

    Parameters
    ----------
    glmfit : dict (DGEGLM-like)
        Fitted GLM object from glm_fit().
    coef : int, str, or list of int/str, optional
        Coefficient(s) to test, given as 0-based column indices or as
        names of the form ``'coef<i>'``.  Default is the last column.
    contrast : ndarray, optional
        Contrast vector or matrix; takes precedence over `coef`.

    Returns
    -------
    dict (DGELRT-like) with 'table', 'comparison', 'df.test'.

    Raises
    ------
    ValueError
        If the design has fewer than two columns, the contrast is all
        zero, or a coefficient name is not recognised.
    """
    # Cache AveLogCPM on the fit object so repeated tests reuse it.
    if glmfit.get('AveLogCPM') is None:
        glmfit['AveLogCPM'] = ave_log_cpm(glmfit)

    design = np.asarray(glmfit['design'], dtype=np.float64)
    nbeta = design.shape[1]

    if nbeta < 2:
        raise ValueError("Need at least two columns for design")

    coef_names = [f'coef{i}' for i in range(nbeta)]

    # Determine coefficients to test
    if contrast is None:
        if coef is None:
            coef = nbeta - 1  # last column (0-indexed)

        if isinstance(coef, (int, np.integer, str)):
            coef = [coef]
        # Accept coefficient names (as documented), mapping to indices.
        indices = []
        for c in coef:
            if isinstance(c, str):
                if c not in coef_names:
                    raise ValueError(f"coefficient {c!r} not found in design")
                indices.append(coef_names.index(c))
            else:
                indices.append(int(c))
        # Sort the unique indices: set() iteration order is not a
        # guaranteed ordering, and a deterministic order keeps the
        # logFC columns and the comparison label reproducible.
        coef = sorted(set(indices))
        coef_name = [coef_names[c] for c in coef]

        logFC = glmfit['coefficients'][:, coef] / np.log(2)
    else:
        contrast = np.asarray(contrast, dtype=np.float64)
        if contrast.ndim == 1:
            contrast = contrast.reshape(-1, 1)

        ncontrasts = np.linalg.matrix_rank(contrast)
        if ncontrasts == 0:
            raise ValueError("contrasts are all zero")

        coef = list(range(ncontrasts))
        logFC = (glmfit['coefficients'] @ contrast) / np.log(2)

        # Reform design: rotate so the tested contrast directions occupy
        # the leading columns, which are then dropped for the null fit.
        Q, R = np.linalg.qr(contrast, mode='complete')
        design = design @ Q

        if ncontrasts > 1:
            coef_name = f"LR test on {ncontrasts} degrees of freedom"
        else:
            coef_name = "contrast"

    if len(coef) == 1 and logFC.ndim == 2:
        logFC = logFC.ravel()

    # Null design matrix: drop the tested columns.
    keep_cols = [i for i in range(design.shape[1]) if i not in coef]
    design0 = design[:, keep_cols]

    # Null fit under the same dispersion (undoing QL averaging if present).
    from .glm_fit import glm_fit
    dispersion = glmfit.get('dispersion')
    if glmfit.get('average.ql.dispersion') is not None:
        dispersion = np.asarray(dispersion, dtype=np.float64) / glmfit['average.ql.dispersion']

    fit_null = glm_fit(glmfit['counts'], design=design0,
                       offset=glmfit.get('offset'),
                       weights=glmfit.get('weights'),
                       dispersion=dispersion, prior_count=0)

    # Likelihood ratio statistic: deviance drop from null to full fit.
    LR = fit_null['deviance'] - glmfit['deviance']
    df_test = np.asarray(fit_null['df.residual']) - np.asarray(glmfit['df.residual'])
    # Collapse to a scalar when every gene has the same test df.
    df_test_val = df_test[0] if np.all(df_test == df_test[0]) else df_test

    LRT_pvalue = chi2.sf(np.maximum(LR, 0), df=df_test_val)

    # Build output table.
    # NOTE(review): when several coefficients are tested only the first
    # logFC column is reported; edgeR reports one column per tested
    # coefficient — TODO confirm whether that should be matched here.
    table = pd.DataFrame({
        'logFC': logFC if logFC.ndim == 1 else logFC[:, 0],
        'logCPM': glmfit['AveLogCPM'],
        'LR': LR,
        'PValue': LRT_pvalue
    })

    result = dict(glmfit)
    result.pop('counts', None)  # drop the bulky count matrix from the result
    result['table'] = table
    result['comparison'] = coef_name
    result['df.test'] = df_test_val

    return result
119
+
120
+
121
def glm_ql_ftest(glmfit, coef=None, contrast=None, poisson_bound=True):
    """Quasi-likelihood F-test for GLM coefficients.

    Port of edgeR's glmQLFTest.  Runs the likelihood ratio test and
    converts the LR statistics to quasi-F statistics by dividing by the
    test degrees of freedom and the empirical-Bayes moderated dispersion
    ('s2.post'), then computes p-values from the F-distribution with
    (df.test, df.total) degrees of freedom.

    Parameters
    ----------
    glmfit : dict (DGEGLM-like)
        Fitted QL GLM from glm_ql_fit().  Must contain 's2.post',
        'df.prior' and residual-df fields.
    coef : int or list, optional
        Coefficient(s) to test.
    contrast : ndarray, optional
        Contrast vector.
    poisson_bound : bool
        Apply Poisson bound.
        NOTE(review): this flag is toggled below but no Poisson bound on
        the p-values (edgeR's behaviour) is applied afterwards — confirm
        whether the bound is implemented elsewhere or still pending.

    Returns
    -------
    dict (DGELRT-like) with F-statistics and p-values; the 'LR' column
    of the table is replaced by an 'F' column.

    Raises
    ------
    ValueError
        If the fit lacks 's2.post' (i.e. glm_ql_fit was not run).
    """
    if glmfit.get('s2.post') is None:
        raise ValueError("need to run glm_ql_fit before glm_ql_ftest")

    # Run glmLRT to get the LR statistics
    out = glm_lrt(glmfit, coef=coef, contrast=contrast)

    # Get adjusted df: prefer the zero-aware residual df if present,
    # otherwise the (possibly adjusted) plain residual df.
    if glmfit.get('df.residual.zeros') is None:
        df_residual = glmfit.get('df.residual.adj', glmfit['df.residual'])
        poisson_bound = False
    else:
        df_residual = glmfit['df.residual.zeros']

    df_residual = np.asarray(df_residual, dtype=np.float64)

    # Compute F-statistic: F = LR / df.test / s2.post (per gene).
    df_test = out['df.test']
    if np.isscalar(df_test):
        df_test_val = float(df_test)
    else:
        df_test_val = np.asarray(df_test, dtype=np.float64)

    F_stat = out['table']['LR'].values / df_test_val / glmfit['s2.post']

    # Total denominator df = prior df + residual df, per gene.
    df_prior = np.atleast_1d(np.asarray(glmfit['df.prior'], dtype=np.float64))
    df_total = df_prior + df_residual

    # Cap df.total at the summed residual df across all genes.
    df_residual_total = np.sum(glmfit['df.residual'])
    df_total = np.minimum(df_total, df_residual_total)

    # P-values from F-distribution
    F_pvalue = f_dist.sf(np.maximum(F_stat, 0), dfn=df_test_val, dfd=df_total)

    # Update output: the LR column is superseded by the F statistic.
    out['table'].drop(columns=['LR'], inplace=True, errors='ignore')
    out['table']['F'] = F_stat
    out['table']['PValue'] = F_pvalue
    out['df.total'] = df_total

    return out
182
+
183
+
184
def glm_treat(glmfit, coef=None, contrast=None, lfc=np.log2(1.2),
              null='interval'):
    """Likelihood ratio or quasi-likelihood test with a log-FC threshold.

    Port of edgeR's glmTreat.  Tests whether the absolute log2-fold-change
    exceeds `lfc` by refitting the model with the offset shifted by
    +/- lfc*log(2) along the tested design column, converting the deviance
    drops to signed z-statistics, and combining the two one-sided tests.

    Parameters
    ----------
    glmfit : dict (DGEGLM-like)
        Fitted GLM from glm_fit() or glm_ql_fit().
    coef : int, optional
        Coefficient to test (0-based column index; default last column).
        If a list is given, only the first entry is used.
    contrast : ndarray, optional
        Contrast vector; takes precedence over `coef`.
    lfc : float
        Log2-fold-change threshold (must be non-negative).
    null : str
        'interval' (edgeR's default, integrates the rejection probability
        over the null interval) or 'worst.case'.

    Returns
    -------
    dict (DGELRT-like) with table including p-values, plus 'lfc' and
    'comparison' entries.

    Raises
    ------
    ValueError
        If `lfc` is negative or the design has fewer than two columns.
    """
    # NOTE(review): t_dist and CompressedMatrix are imported but unused
    # in this function — confirm whether they can be removed.
    from scipy.stats import norm as norm_dist, t as t_dist
    from .glm_fit import glm_fit
    from .compressed_matrix import CompressedMatrix

    if lfc < 0:
        raise ValueError("lfc has to be non-negative")

    # No df.prior means a plain (non-QL) fit -> use the LRT pipeline.
    is_lrt = glmfit.get('df.prior') is None

    # If lfc is zero, fall back to standard test
    if lfc == 0:
        if is_lrt:
            return glm_lrt(glmfit, coef=coef, contrast=contrast)
        else:
            return glm_ql_ftest(glmfit, coef=coef, contrast=contrast)

    # Cache AveLogCPM on the fit object so repeated tests reuse it.
    if glmfit.get('AveLogCPM') is None:
        glmfit['AveLogCPM'] = ave_log_cpm(glmfit)
    ngenes = glmfit['counts'].shape[0]

    design = np.asarray(glmfit['design'], dtype=np.float64)
    nbeta = design.shape[1]

    if nbeta < 2:
        raise ValueError("Need at least two columns for design")

    # Determine coefficient to test
    if coef is None:
        coef = nbeta - 1

    # A non-zero prior count means coefficients were shrunk toward zero.
    shrunk = glmfit.get('prior.count', 0) != 0

    if contrast is None:
        if isinstance(coef, (int, np.integer)):
            coef_idx = coef
        else:
            coef_idx = coef[0]
        # R: logFC uses shrunk coefficients for display, unshrunk for test
        logFC = glmfit['coefficients'][:, coef_idx] / np.log(2)
        unshrunk_logFC = logFC.copy()
        if shrunk and glmfit.get('unshrunk.coefficients') is not None:
            unshrunk_logFC = glmfit['unshrunk.coefficients'][:, coef_idx] / np.log(2)
    else:
        # Re-parameterise the design so the contrast becomes coefficient 0.
        contrast = np.asarray(contrast, dtype=np.float64).ravel()
        reform = contrast_as_coef(design, contrast, first=True)
        coef_idx = 0
        logFC = (glmfit['coefficients'] @ contrast) / np.log(2)
        unshrunk_logFC = logFC.copy()
        if shrunk and glmfit.get('unshrunk.coefficients') is not None:
            unshrunk_logFC = (glmfit['unshrunk.coefficients'] @ contrast) / np.log(2)
        design = reform['design']

    # Null design matrix: drop the tested column.
    keep_cols = [i for i in range(design.shape[1]) if i != coef_idx]
    design0 = design[:, keep_cols]

    # Get dispersion (undoing QL averaging if present).
    dispersion = glmfit.get('dispersion')
    if glmfit.get('average.ql.dispersion') is not None:
        dispersion = np.asarray(dispersion, dtype=np.float64) / glmfit['average.ql.dispersion']

    # Offset adjustment: ensure a genes x libraries offset matrix.
    offset = np.asarray(glmfit.get('offset', np.zeros((ngenes, design.shape[0]))),
                        dtype=np.float64)
    if offset.ndim == 1:
        offset = np.tile(offset, (ngenes, 1))

    # Shift of lfc (in natural-log units) along the tested design column.
    offset_adj = lfc * np.log(2) * design[:, coef_idx]
    offset_adj_mat = np.tile(offset_adj, (ngenes, 1))

    # Test at beta_0 = +tau: deviance drop gives a signed-root statistic.
    offset_new = offset + offset_adj_mat
    fit0 = glm_fit(glmfit['counts'], design=design0, offset=offset_new,
                   weights=glmfit.get('weights'), dispersion=dispersion,
                   prior_count=0)
    fit1 = glm_fit(glmfit['counts'], design=design, offset=offset_new,
                   weights=glmfit.get('weights'), dispersion=dispersion,
                   prior_count=0)
    z_left = np.sqrt(np.maximum(0, fit0['deviance'] - fit1['deviance']))

    # Test at beta_0 = -tau
    offset_new = offset - offset_adj_mat
    fit0 = glm_fit(glmfit['counts'], design=design0, offset=offset_new,
                   weights=glmfit.get('weights'), dispersion=dispersion,
                   prior_count=0)
    fit1 = glm_fit(glmfit['counts'], design=design, offset=offset_new,
                   weights=glmfit.get('weights'), dispersion=dispersion,
                   prior_count=0)
    z_right = np.sqrt(np.maximum(0, fit0['deviance'] - fit1['deviance']))

    # Make sure z_left <= z_right (element-wise swap via a saved copy).
    swap = z_left > z_right
    z_left_tmp = z_left.copy()
    z_left[swap] = z_right[swap]
    z_right[swap] = z_left_tmp[swap]

    # Convert t to z under QL pipeline: scale by the moderated dispersion
    # and map t-tail probabilities to equivalent normal deviates.
    if not is_lrt:
        if glmfit.get('df.residual.zeros') is None:
            df_residual = glmfit.get('df.residual.adj', glmfit['df.residual'])
        else:
            df_residual = glmfit['df.residual.zeros']

        df_total = np.asarray(glmfit['df.prior']) + np.asarray(df_residual)
        s2_post = np.asarray(glmfit['s2.post'])
        z_left = _zscore_t(z_left / np.sqrt(s2_post), df_total)
        z_right = _zscore_t(z_right / np.sqrt(s2_post), df_total)

    # Apply sign based on whether |logFC| <= lfc: genes inside the null
    # interval keep the sign (+1), genes outside flip it (-1).
    within = np.abs(unshrunk_logFC) <= lfc
    sgn = 2 * within.astype(float) - 1
    z_left = z_left * sgn

    # Compute p-values (edgeR's interval-null construction).
    if null == 'interval':
        c = 1.470402
        j = (z_right + z_left) > c
        p_value = np.ones(ngenes)
        p_value[j] = (_integrate_pnorm(-z_right[j], -z_right[j] + c) +
                      _integrate_pnorm(z_left[j] - c, z_left[j]))
        p_value[~j] = 2 * _integrate_pnorm(-z_right[~j], z_left[~j])
    else:
        # 'worst.case': two-sided normal tail at the interval endpoints.
        p_value = norm_dist.cdf(-z_right) + norm_dist.cdf(z_left)

    # Build table — use shrunk logFC for display (matching R)
    table = pd.DataFrame({
        'logFC': logFC,
        'logCPM': glmfit['AveLogCPM'],
        'PValue': p_value
    })

    result = dict(glmfit)
    result.pop('counts', None)  # drop the bulky count matrix from the result
    result['lfc'] = lfc
    result['table'] = table
    # NOTE(review): with a contrast this label is always 'coef0' (the
    # reformed column) — confirm whether a contrast label is preferable.
    result['comparison'] = f'coef{coef_idx}'

    return result
345
+
346
+
347
+ def _zscore_t(x, df):
348
+ """Convert t-statistics to z-scores."""
349
+ from scipy.stats import t as t_dist, norm as norm_dist
350
+ df = np.asarray(df, dtype=np.float64)
351
+ x = np.asarray(x, dtype=np.float64)
352
+ # Use log p-value for precision
353
+ log_p = t_dist.logsf(np.abs(x), df)
354
+ z = norm_dist.isf(np.exp(log_p))
355
+ z = np.where(x < 0, -z, z)
356
+ return z
357
+
358
+
359
+ def _integrate_pnorm(a, b):
360
+ """Integrate the standard normal CDF from a to b.
361
+
362
+ Port of edgeR's .integratepnorm.
363
+ """
364
+ from scipy.stats import norm as norm_dist
365
+ a = np.asarray(a, dtype=np.float64)
366
+ b = np.asarray(b, dtype=np.float64)
367
+
368
+ equal = np.abs(a - b) < 1e-15
369
+ result = np.where(
370
+ equal,
371
+ norm_dist.cdf(a),
372
+ (b * norm_dist.cdf(b) + norm_dist.pdf(b) -
373
+ (a * norm_dist.cdf(a) + norm_dist.pdf(a))) / np.maximum(b - a, 1e-15)
374
+ )
375
+ return result