edgepython-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
edgepython/glm_fit.py
ADDED
@@ -0,0 +1,653 @@
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
"""
GLM fitting for edgePython.

Port of edgeR's glmFit, glmQLFit, mglmOneGroup, mglmOneWay.
"""

import numpy as np
import warnings
from .compressed_matrix import (CompressedMatrix, compress_offsets,
                                compress_weights, compress_dispersions)
from .glm_levenberg import mglm_levenberg, nbinom_deviance
from .utils import (expand_as_matrix, design_as_factor, pred_fc,
                    add_prior_count, residual_df)
from .limma_port import squeeze_var, non_estimable, is_fullrank, choose_lowess_span


def mglm_one_group(y, dispersion=0, offset=0, weights=None,
                   coef_start=None, maxit=50, tol=1e-10):
    """Fit single-group negative-binomial GLM.

    Port of edgeR's mglmOneGroup (C code reimplemented in Python).

    Parameters
    ----------
    y : ndarray
        Count matrix (genes x samples).
    dispersion : float, ndarray, or CompressedMatrix
        NB dispersions.
    offset : float, ndarray, or CompressedMatrix
        Log-scale offsets.
    weights : ndarray or CompressedMatrix, optional
        Observation weights.
    coef_start : ndarray, optional
        Starting coefficient values (one per gene).
    maxit : int
        Maximum iterations.
    tol : float
        Convergence tolerance.

    Returns
    -------
    ndarray of coefficients (one per gene).
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ngenes, nlibs = y.shape

    # Expand offset, dispersion, weights
    offset_mat = _expand_to_matrix(offset, y.shape)
    disp_mat = _expand_to_matrix(dispersion, y.shape)
    if weights is not None:
        w_mat = _expand_to_matrix(weights, y.shape)
    else:
        w_mat = np.ones_like(y)

    # Ensure 2D for all
    if disp_mat.ndim == 1:
        disp_mat = np.broadcast_to(disp_mat[:, None] if len(disp_mat) == ngenes
                                   else disp_mat[None, :], y.shape).copy()
    elif disp_mat.ndim == 0:
        disp_mat = np.full_like(y, float(disp_mat))

    # Starting values (vectorized)
    if coef_start is not None:
        b = np.asarray(coef_start, dtype=np.float64).ravel()
        if len(b) == 1:
            b = np.full(ngenes, b[0])
        need_init = np.isnan(b)
    else:
        b = np.full(ngenes, np.nan)
        need_init = np.ones(ngenes, dtype=bool)

    if np.any(need_init):
        lib = np.exp(offset_mat[need_init])
        total_y = np.sum(w_mat[need_init] * y[need_init], axis=1)
        total_lib = np.sum(w_mat[need_init] * lib, axis=1)
        valid = (total_y > 0) & (total_lib > 0)
        b_init = np.full(np.sum(need_init), -20.0)
        b_init[valid] = np.log(total_y[valid] / total_lib[valid])
        b[need_init] = b_init

    # Vectorized Fisher scoring iteration (all genes at once)
    active = np.ones(ngenes, dtype=bool)  # genes still iterating
    for _it in range(maxit):
        if not np.any(active):
            break

        # Compute mu for active genes
        eta = b[active, None] + offset_mat[active]  # (n_active, nlibs)
        mu = np.exp(np.clip(eta, -500, 500))
        mu = np.maximum(mu, 1e-300)

        # Working weights
        denom = 1.0 + disp_mat[active] * mu  # (n_active, nlibs)

        # Score and information
        dl = np.sum(w_mat[active] * (y[active] - mu) / denom, axis=1)  # (n_active,)
        info = np.sum(w_mat[active] * mu / denom, axis=1)  # (n_active,)

        # Guard against zero information
        safe = info > 1e-300
        step = np.zeros_like(dl)
        step[safe] = dl[safe] / info[safe]

        b_new = b[active] + step

        # Check convergence
        converged = np.abs(step) < tol * (np.abs(b[active]) + 0.1)
        converged |= ~safe

        b[active] = b_new

        # Mark converged genes as inactive
        active_indices = np.where(active)[0]
        active[active_indices[converged]] = False

    return b
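
# Illustrative usage sketch for mglm_one_group (not part of the released file;
# the counts below are simulated purely for demonstration):
#
#     import numpy as np
#     rng = np.random.default_rng(0)
#     counts = rng.poisson(20, size=(100, 4)).astype(float)  # 100 genes x 4 samples
#     lib_size = counts.sum(axis=0)
#     # Intercept-only NB fit per gene, with log library sizes as offsets.
#     beta = mglm_one_group(counts, dispersion=0.1, offset=np.log(lib_size))
#     # exp(beta[g]) approximates the abundance of gene g per unit of library size.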


def mglm_one_way(y, design=None, group=None, dispersion=0, offset=0,
                 weights=None, coef_start=None, maxit=50, tol=1e-10):
    """Fit multiple NB GLMs with a one-way layout.

    Port of edgeR's mglmOneWay.

    Parameters
    ----------
    y : ndarray
        Count matrix (genes x samples).
    design : ndarray, optional
        Design matrix.
    group : ndarray, optional
        Group factor.
    dispersion : float or ndarray
        NB dispersions.
    offset : float or ndarray
        Offsets.
    weights : ndarray, optional
        Observation weights.
    coef_start : ndarray, optional
        Starting coefficients.
    maxit : int
        Maximum iterations.
    tol : float
        Convergence tolerance.

    Returns
    -------
    dict with 'coefficients' and 'fitted.values'.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ngenes, nlibs = y.shape

    offset_mat = _expand_to_matrix(offset, y.shape)
    disp_mat = _expand_to_matrix(dispersion, y.shape)
    if weights is not None:
        w_mat = _expand_to_matrix(weights, y.shape)
    else:
        w_mat = np.ones_like(y)

    # Get group factor
    if group is None:
        if design is None:
            group = np.zeros(nlibs, dtype=int)
        else:
            design = np.asarray(design, dtype=np.float64)
            if design.ndim == 1:
                design = design.reshape(-1, 1)
            group = design_as_factor(design)
    else:
        group = np.asarray(group)

    unique_groups = np.unique(group)
    ngroups = len(unique_groups)

    # Check if design reduces to indicator matrix
    design_unique = None
    if design is not None:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)
        if design.shape[1] != ngroups:
            raise ValueError("design matrix is not equivalent to a oneway layout")
        # Get representative design rows
        first_of_group = np.array([np.where(group == g)[0][0] for g in unique_groups])
        design_unique = design[first_of_group]
        # Check if it's a simple group indicator
        is_indicator = (np.sum(design_unique == 1) == ngroups and
                        np.sum(design_unique == 0) == (ngroups - 1) * ngroups)
        if is_indicator:
            design_unique = None

    # Convert starting values if needed
    cs = None
    if coef_start is not None:
        coef_start = np.asarray(coef_start, dtype=np.float64)
        if coef_start.ndim == 1:
            coef_start = coef_start.reshape(1, -1)
        if design_unique is not None:
            cs = coef_start @ design_unique.T
        else:
            cs = coef_start

    # Fit each group
    beta = np.zeros((ngenes, ngroups))
    for g_idx, grp in enumerate(unique_groups):
        j = np.where(group == grp)[0]
        cs_g = cs[:, g_idx] if cs is not None else None
        beta[:, g_idx] = mglm_one_group(
            y[:, j], dispersion=disp_mat[:, j] if disp_mat.ndim == 2 else disp_mat,
            offset=offset_mat[:, j] if offset_mat.ndim == 2 else offset_mat,
            weights=w_mat[:, j] if w_mat.ndim == 2 else w_mat,
            coef_start=cs_g, maxit=maxit, tol=tol)

    # Clamp -Inf to large negative
    beta = np.maximum(beta, -1e8)

    # Fitted values from group-wise betas
    mu = np.zeros_like(y)
    for g_idx, grp in enumerate(unique_groups):
        j = np.where(group == grp)[0]
        for jj in j:
            mu[:, jj] = np.exp(np.clip(beta[:, g_idx] + offset_mat[:, jj], -500, 500))

    # If design is not indicator, convert back
    if design_unique is not None:
        beta = np.linalg.solve(design_unique, beta.T).T

    return {
        'coefficients': beta,
        'fitted.values': mu
    }
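
# Illustrative usage sketch for mglm_one_way (not part of the released file;
# reuses the simulated counts and lib_size from the sketch above):
#
#     group = np.array([0, 0, 1, 1])
#     ow = mglm_one_way(counts, group=group, dispersion=0.1,
#                       offset=np.log(lib_size))
#     # ow['coefficients'] has one column of log-means per group;
#     # ow['fitted.values'] has the same shape as counts.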


def glm_fit(y, design=None, dispersion=None, offset=None, lib_size=None,
            weights=None, prior_count=0.125, start=None):
    """Fit negative binomial GLMs for each gene.

    Port of edgeR's glmFit.default.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix (genes x samples), or DGEList.
    design : ndarray or str, optional
        Design matrix, or an R-style formula string (e.g.
        ``'~ group'``, ``'~ batch + condition'``) which is
        evaluated against the DGEList sample metadata via patsy.
    dispersion : float or ndarray
        NB dispersions.
    offset : ndarray, optional
        Log-scale offsets.
    lib_size : ndarray, optional
        Library sizes.
    weights : ndarray, optional
        Observation weights.
    prior_count : float
        Prior count for shrinking log-fold-changes.
    start : ndarray, optional
        Starting coefficient values.

    Returns
    -------
    dict (DGEGLM-like) with coefficients, fitted.values, deviance,
    df.residual, design, offset, dispersion, weights, etc.
    """
    # Resolve formula string to design matrix
    from .utils import _resolve_design
    design = _resolve_design(design, y)

    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if design is None:
            design = dge.get('design')
            if design is None:
                group = dge['samples']['group'].values
                from .utils import drop_empty_levels
                group = drop_empty_levels(group)
                unique_groups = np.unique(group)
                if len(unique_groups) > 1:
                    # model.matrix(~group)
                    from .utils import _model_matrix_group
                    design = _model_matrix_group(group)
        if dispersion is None:
            from .dgelist import get_dispersion
            dispersion = get_dispersion(dge)
            if dispersion is None:
                raise ValueError("No dispersion values found in DGEList object.")
        from .dgelist import get_offset
        offset = get_offset(dge)
        from .expression import ave_log_cpm
        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)

        fit = glm_fit(dge['counts'], design=design, dispersion=dispersion,
                      offset=offset, lib_size=None, weights=dge.get('weights'),
                      prior_count=prior_count, start=start)
        fit['samples'] = dge['samples']
        fit['genes'] = dge.get('genes')
        fit['prior.df'] = dge.get('prior.df')
        fit['AveLogCPM'] = dge.get('AveLogCPM')
        return fit

    # Default method
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntag, nlib = y.shape

    # Check design
    if design is None:
        design = np.ones((nlib, 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)
        if design.shape[0] != nlib:
            raise ValueError("nrow(design) disagrees with ncol(y)")
        ne = non_estimable(design)
        if ne is not None:
            raise ValueError(f"Design matrix not of full rank. Non-estimable: {ne}")

    # Check dispersion
    if dispersion is None:
        raise ValueError("No dispersion values provided.")
    dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if np.any(np.isnan(dispersion)):
        raise ValueError("NA dispersions not allowed")

    # Build offset from lib_size and offset
    if offset is not None:
        offset = np.asarray(offset, dtype=np.float64)
    elif lib_size is not None:
        lib_size = np.asarray(lib_size, dtype=np.float64)
        offset = np.log(lib_size)
    else:
        offset = np.log(y.sum(axis=0))

    offset_mat = expand_as_matrix(offset, (ntag, nlib))
    disp_mat = expand_as_matrix(dispersion, (ntag, nlib))

    if weights is not None:
        w_mat = expand_as_matrix(np.asarray(weights, dtype=np.float64), (ntag, nlib))
    else:
        w_mat = None

    # Fit: use one-way shortcut if design is equivalent to one-way layout
    group = design_as_factor(design)
    unique_groups = np.unique(group)

    if len(unique_groups) == design.shape[1]:
        fit = mglm_one_way(y, design=design, group=group,
                           dispersion=disp_mat, offset=offset_mat,
                           weights=w_mat, coef_start=start)
        fit['deviance'] = nbinom_deviance(y, fit['fitted.values'], dispersion, w_mat)
        fit['method'] = 'oneway'
    else:
        fit = mglm_levenberg(y, design=design, dispersion=disp_mat,
                             offset=offset_mat, weights=w_mat,
                             coef_start=start, maxit=250)
        fit['method'] = 'levenberg'

    # Prepare output
    fit['counts'] = y
    if prior_count > 0:
        fit['unshrunk.coefficients'] = fit['coefficients'].copy()
        fit['coefficients'] = pred_fc(y, design, offset=offset_mat,
                                      dispersion=disp_mat,
                                      prior_count=prior_count,
                                      weights=w_mat) * np.log(2)

    fit['df.residual'] = np.full(ntag, nlib - design.shape[1])
    fit['design'] = design
    fit['offset'] = offset_mat
    fit['dispersion'] = dispersion
    fit['weights'] = weights
    fit['prior.count'] = prior_count

    return fit
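
# Illustrative usage sketch for glm_fit (not part of the released file; the
# design matrix below is built by hand instead of from a formula string, and
# reuses the simulated counts from the sketches above):
#
#     design = np.column_stack([np.ones(4), [0.0, 0.0, 1.0, 1.0]])  # intercept + group
#     fit = glm_fit(counts, design=design, dispersion=0.1)
#     # fit['coefficients'] is genes x 2 on the natural-log scale (shrunk
#     # using prior_count); fit['deviance'] and fit['df.residual'] feed
#     # downstream testing.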


def glm_ql_fit(y, design=None, dispersion=None, offset=None, lib_size=None,
               weights=None, abundance_trend=True, ave_log_cpm=None,
               covariate_trend=None, robust=False, winsor_tail_p=(0.05, 0.1),
               legacy=False, top_proportion=None, keep_unit_mat=False):
    """Fit quasi-likelihood negative binomial GLMs.

    Port of edgeR's glmQLFit.default.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    design : ndarray or str, optional
        Design matrix, or an R-style formula string (e.g.
        ``'~ group'``, ``'~ batch + condition'``) evaluated
        against DGEList sample metadata via patsy.
    dispersion : float or ndarray, optional
        NB dispersions.
    offset : ndarray, optional
        Offsets.
    lib_size : ndarray, optional
        Library sizes.
    weights : ndarray, optional
        Observation weights.
    abundance_trend : bool
        Use abundance trend for QL prior.
    ave_log_cpm : ndarray, optional
        Average log-CPM values.
    covariate_trend : ndarray, optional
        Covariate for trended prior.
    robust : bool
        Robust empirical Bayes.
    winsor_tail_p : tuple
        Winsorization tail proportions.
    legacy : bool
        Use legacy (old-style) QL method.
    top_proportion : float, optional
        Proportion of top-abundance genes for dispersion estimation.
    keep_unit_mat : bool
        Keep unit deviance matrix.

    Returns
    -------
    dict (DGEGLM-like) with added s2.post, df.prior, s2.prior fields.
    """
    from .expression import ave_log_cpm as _ave_log_cpm
    from .utils import _resolve_design
    design = _resolve_design(design, y)

    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if design is None:
            design = dge.get('design')
            if design is None:
                group = dge['samples']['group'].values
                from .utils import drop_empty_levels
                group = drop_empty_levels(group)
                unique_g = np.unique(group)
                if len(unique_g) > 1:
                    from .utils import _model_matrix_group
                    design = _model_matrix_group(group)

        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = _ave_log_cpm(dge)

        if dispersion is None:
            if legacy:
                dispersion = dge.get('trended.dispersion')
                if dispersion is None:
                    dispersion = dge.get('common.dispersion')
                if dispersion is None:
                    raise ValueError("No dispersion values found in DGEList object.")
            else:
                if dge.get('trended.dispersion') is not None:
                    ntop = int(np.ceil(0.1 * dge['counts'].shape[0]))
                    i = np.argsort(dge['AveLogCPM'])[::-1][:ntop]
                    dispersion = np.mean(dge['trended.dispersion'][i])

        from .dgelist import get_offset
        offset = get_offset(dge)

        fit = glm_ql_fit(dge['counts'], design=design, dispersion=dispersion,
                         offset=offset, lib_size=None,
                         abundance_trend=abundance_trend,
                         ave_log_cpm=dge['AveLogCPM'],
                         robust=robust, winsor_tail_p=winsor_tail_p,
                         weights=dge.get('weights'),
                         legacy=legacy, top_proportion=top_proportion,
                         keep_unit_mat=keep_unit_mat)
        fit['samples'] = dge['samples']
        fit['genes'] = dge.get('genes')
        fit['AveLogCPM'] = dge['AveLogCPM']
        return fit

    # Default method
    y_mat = np.asarray(y, dtype=np.float64)
    if y_mat.ndim == 1:
        y_mat = y_mat.reshape(1, -1)
    ngenes = y_mat.shape[0]
    nlibs = y_mat.shape[1]

    # Check design
    if design is None:
        design = np.ones((nlibs, 1))

    design = np.asarray(design, dtype=np.float64)
    if design.ndim == 1:
        design = design.reshape(-1, 1)

    # Check AveLogCPM
    if ave_log_cpm is None:
        ave_log_cpm = _ave_log_cpm(y_mat, offset=offset, lib_size=lib_size,
                                   weights=weights, dispersion=dispersion)

    # Check dispersion
    if dispersion is None:
        if legacy:
            raise ValueError("No dispersion values provided.")
        else:
            if top_proportion is None:
                df_residual = nlibs - design.shape[1]
                top_proportion = choose_lowess_span(
                    ngenes * np.sqrt(df_residual), small_n=20, min_span=0.02)
            else:
                if top_proportion < 0 or top_proportion > 1:
                    raise ValueError("top_proportion should be between 0 and 1.")
            ntop = int(np.ceil(top_proportion * ngenes))
            i = np.argsort(ave_log_cpm)[::-1][:ntop]
            from .dispersion import estimate_glm_common_disp
            if offset is not None:
                off_sub = np.asarray(offset)
                if off_sub.ndim == 2:
                    off_sub = off_sub[i]
            else:
                off_sub = None
            w_sub = None
            if weights is not None:
                w_arr = np.asarray(weights)
                if w_arr.ndim == 2:
                    w_sub = w_arr[i]
            dispersion = estimate_glm_common_disp(
                y_mat[i], design=design, offset=off_sub, weights=w_sub)
    else:
        # Cap dispersion at 4 for non-legacy
        if not legacy:
            dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
            if np.max(dispersion) > 4:
                dispersion = np.minimum(dispersion, 4.0)

    # Fit GLM (prior_count=0.125 matches R's glmFit.default default for logFC shrinkage)
    fit = glm_fit(y_mat, design=design, dispersion=dispersion, offset=offset,
                  lib_size=lib_size, weights=weights, prior_count=0.125)

    # Store AveLogCPM for computation
    ave_log_cpm2 = ave_log_cpm.copy()

    # Covariate for trended prior
    if covariate_trend is None:
        if abundance_trend:
            fit['AveLogCPM'] = ave_log_cpm
        else:
            ave_log_cpm = None
    else:
        ave_log_cpm = covariate_trend

    # Setting residual deviances and df
    if legacy:
        # Old-style: adjust df for fitted values at zero
        zerofit = (fit['fitted.values'] < 1e-4) & (fit['counts'] < 1e-4)
        df_residual = residual_df(zerofit, fit['design'])
        fit['df.residual.zeros'] = df_residual
        s2 = fit['deviance'] / np.maximum(df_residual, 1e-8)
        s2[df_residual == 0] = 0
    else:
        # New-style: adjusted deviance and df using QL weights (matching R's C code)
        from .ql_weights import update_prior, compute_adjust_vec

        # Expand dispersion to matrix form for ql_weights
        disp_arr = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))

        # Compute average quasi-dispersion via iterative lowess + adjusted deviance
        ave_ql_disp = update_prior(y_mat, fit['fitted.values'], design,
                                   disp_arr, weights, ave_log_cpm2)

        # Refit with dispersion scaled by average quasi-dispersion
        fit = glm_fit(y_mat, design=design, dispersion=dispersion / ave_ql_disp,
                      offset=offset, lib_size=lib_size, weights=weights,
                      prior_count=0.125)
        fit['dispersion'] = dispersion

        # Compute adjusted deviance, df, and s2 using QL weights
        out = compute_adjust_vec(y_mat, fit['fitted.values'], design,
                                 disp_arr, ave_ql_disp, weights)
        s2 = out['s2']
        df_residual = out['df']
        fit['df.residual.adj'] = df_residual
        fit['deviance.adj'] = out['deviance']
        fit['average.ql.dispersion'] = ave_ql_disp

    # Empirical Bayes moderation
    s2 = np.maximum(s2, 0)
    s2_fit = squeeze_var(s2, df=df_residual, covariate=ave_log_cpm,
                         robust=robust, winsor_tail_p=winsor_tail_p)

    fit['df.prior'] = s2_fit['df_prior']
    fit['s2.post'] = s2_fit['var_post']
    fit['s2.prior'] = s2_fit['var_prior']
    if not legacy:
        fit['top.proportion'] = top_proportion

    return fit
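
# Illustrative usage sketch for glm_ql_fit (not part of the released file;
# reuses counts and design from the sketches above):
#
#     qlfit = glm_ql_fit(counts, design=design, dispersion=0.1)
#     # qlfit carries the usual glm_fit fields plus the QL quantities
#     # 's2.post', 's2.prior' and 'df.prior' used for quasi-likelihood
#     # F-tests downstream.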


def _compute_ave_ql_disp(s2, df, ave_log_cpm):
    """Compute average quasi-likelihood dispersion.

    Matches R's update_prior in ql_glm.c: iteratively fits a lowess trend
    of s2^(1/4) vs AveLogCPM, takes the 90th percentile of the trend,
    and raises to 4th power.
    """
    from scipy.interpolate import UnivariateSpline
    from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess

    threshold = 1e-8

    # Filter genes with sufficient df
    mask = df > threshold
    x = ave_log_cpm[mask]
    y_vals = np.power(np.maximum(s2[mask], 0), 0.25)  # s2^(1/4)

    if len(x) < 10:
        return 1.0

    # Two iterations of lowess + 90th percentile (matches R's update_prior)
    prior = 1.0
    for _ in range(2):
        # Fit lowess trend (f=0.5, iter=3 matches R defaults)
        fitted = sm_lowess(y_vals, x, frac=0.5, it=3, return_sorted=False)

        # 90th percentile of fitted values (R type=7 quantile)
        p90 = np.percentile(fitted, 90, interpolation='linear')

        # Cap at minimum of 1.0 (on the ^(1/4) scale)
        if p90 < 1.0:
            p90 = 1.0

        prior = p90 ** 4

    return max(prior, 1.0)


def _expand_to_matrix(x, shape):
    """Expand scalar, vector, or CompressedMatrix to full matrix."""
    if isinstance(x, CompressedMatrix):
        return x.as_matrix()
    x = np.asarray(x, dtype=np.float64)
    if x.ndim == 0 or x.size == 1:
        return np.full(shape, x.ravel()[0])
    if x.ndim == 1:
        if len(x) == shape[1]:
            return np.tile(x, (shape[0], 1))
        elif len(x) == shape[0]:
            return np.tile(x.reshape(-1, 1), (1, shape[1]))
    if x.shape == shape:
        return x
    return np.broadcast_to(x, shape).copy()
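
# Illustrative usage sketch for _expand_to_matrix (not part of the released
# file), showing the broadcasting rule for 1-D inputs:
#
#     _expand_to_matrix(2.0, (3, 4))              # 3 x 4 matrix filled with 2.0
#     _expand_to_matrix(np.arange(4.0), (3, 4))   # length matches ncol -> repeated down the rows
#     _expand_to_matrix(np.arange(3.0), (3, 4))   # length matches nrow -> repeated across the columns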