edgepython-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edgepython/__init__.py +114 -0
- edgepython/classes.py +517 -0
- edgepython/compressed_matrix.py +388 -0
- edgepython/dgelist.py +314 -0
- edgepython/dispersion.py +920 -0
- edgepython/dispersion_lowlevel.py +1066 -0
- edgepython/exact_test.py +525 -0
- edgepython/expression.py +323 -0
- edgepython/filtering.py +96 -0
- edgepython/gene_sets.py +1215 -0
- edgepython/glm_fit.py +653 -0
- edgepython/glm_levenberg.py +359 -0
- edgepython/glm_test.py +375 -0
- edgepython/io.py +1887 -0
- edgepython/limma_port.py +987 -0
- edgepython/normalization.py +546 -0
- edgepython/ql_weights.py +765 -0
- edgepython/results.py +236 -0
- edgepython/sc_fit.py +1511 -0
- edgepython/smoothing.py +474 -0
- edgepython/splicing.py +537 -0
- edgepython/utils.py +1050 -0
- edgepython/visualization.py +409 -0
- edgepython/weighted_lowess.py +323 -0
- edgepython-0.2.0.dist-info/METADATA +201 -0
- edgepython-0.2.0.dist-info/RECORD +29 -0
- edgepython-0.2.0.dist-info/WHEEL +5 -0
- edgepython-0.2.0.dist-info/licenses/LICENSE +674 -0
- edgepython-0.2.0.dist-info/top_level.txt +1 -0
edgepython/dispersion.py
ADDED
@@ -0,0 +1,920 @@
# This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
"""
Dispersion estimation for edgePython.

Port of edgeR's estimateDisp, estimateCommonDisp, estimateTagwiseDisp,
estimateTrendedDisp, estimateGLMCommonDisp, estimateGLMTrendedDisp,
estimateGLMTagwiseDisp, and WLEB.
"""

import numpy as np
import warnings
from scipy.optimize import minimize_scalar

from .expression import ave_log_cpm
from .utils import (expand_as_matrix, moving_average_by_col, cut_with_min_n,
                    drop_empty_levels, systematic_subset)
from .smoothing import locfit_by_col, loess_by_col
from .limma_port import squeeze_var, choose_lowess_span
from .dispersion_lowlevel import (
    adjusted_profile_lik, adjusted_profile_lik_grid, maximize_interpolant,
    cond_log_lik_der_delta, common_cond_log_lik_der_delta,
    disp_cox_reid, disp_cox_reid_interpolate_tagwise,
    disp_cox_reid_spline_trend, disp_cox_reid_power_trend,
    disp_bin_trend, disp_pearson, disp_deviance
)


def estimate_disp(y, design=None, group=None, lib_size=None, offset=None,
                  prior_df=None, trend_method='locfit', tagwise=True,
                  span=None, legacy_span=False, min_row_sum=5,
                  grid_length=21, grid_range=(-10, 10), robust=False,
                  winsor_tail_p=(0.05, 0.1), tol=1e-6, weights=None):
    """Estimate common, trended and tagwise dispersions.

    Port of edgeR's estimateDisp.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    design : ndarray or str, optional
        Design matrix, or an R-style formula string (e.g.
        ``'~ group'``, ``'~ batch + condition'``) evaluated
        against DGEList sample metadata via patsy.
        If None, uses classic edgeR approach.
    group : array-like, optional
        Group factor.
    lib_size : ndarray, optional
        Library sizes.
    offset : ndarray, optional
        Log-scale offsets.
    prior_df : float, optional
        Prior degrees of freedom.
    trend_method : str
        'locfit', 'loess', 'movingave', or 'none'.
    tagwise : bool
        Estimate tagwise dispersions.
    span : float, optional
        Span for smoothing.
    legacy_span : bool
        Use legacy span selection.
    min_row_sum : int
        Minimum row sum for a gene.
    grid_length : int
        Number of grid points.
    grid_range : tuple
        Range for dispersion grid.
    robust : bool
        Robust estimation.
    winsor_tail_p : tuple
        Winsorization tail proportions.
    tol : float
        Tolerance.
    weights : ndarray, optional
        Observation weights.

    Returns
    -------
    DGEList (if input is DGEList) or dict with common.dispersion,
    trended.dispersion, tagwise.dispersion, span, prior.df, prior.n.
    """
    # Resolve formula string to design matrix
    from .utils import _resolve_design
    design = _resolve_design(design, y)

    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist, get_offset
        dge = valid_dgelist(dge)
        group_val = dge['samples']['group'].values
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values

        if design is None:
            design = dge.get('design')
        else:
            dge['design'] = design

        d = estimate_disp(
            dge['counts'], design=design, group=group_val, lib_size=ls,
            offset=get_offset(dge), prior_df=prior_df,
            trend_method=trend_method, tagwise=tagwise, span=span,
            legacy_span=legacy_span, min_row_sum=min_row_sum,
            grid_length=grid_length, grid_range=grid_range,
            robust=robust, winsor_tail_p=winsor_tail_p, tol=tol,
            weights=dge.get('weights'))

        dge['common.dispersion'] = d['common.dispersion']
        dge['trended.dispersion'] = d['trended.dispersion']
        if tagwise:
            dge['tagwise.dispersion'] = d.get('tagwise.dispersion')
        dge['AveLogCPM'] = ave_log_cpm(dge)
        dge['trend.method'] = trend_method
        dge['prior.df'] = d.get('prior.df')
        dge['prior.n'] = d.get('prior.n')
        dge['span'] = d.get('span')
        return dge

    # Default method
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if ntags == 0:
        return {'span': span, 'prior.df': prior_df, 'prior.n': None}

    # Check trend_method
    valid_methods = ('none', 'loess', 'locfit', 'movingave', 'locfit.mixed')
    if trend_method not in valid_methods:
        raise ValueError(f"trend_method must be one of {valid_methods}")

    # Check group
    if group is None:
        group = np.ones(nlibs, dtype=int)
    group = drop_empty_levels(np.asarray(group))

    # Check lib_size
    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    # Build offset
    if offset is None:
        offset = np.log(lib_size)
    offset = np.asarray(offset, dtype=np.float64)
    offset_mat = expand_as_matrix(offset, y.shape)

    if weights is not None:
        w_mat = expand_as_matrix(np.asarray(weights, dtype=np.float64), y.shape)
    else:
        w_mat = np.ones_like(y)

    # Filter genes with small counts
    sel = y.sum(axis=1) >= min_row_sum
    sely = y[sel]
    seloffset = offset_mat[sel]
    selweights = w_mat[sel]

    # Spline points
    spline_pts = np.linspace(grid_range[0], grid_range[1], grid_length)
    spline_disp = 0.1 * 2 ** spline_pts
    grid_vals = spline_disp / (1 + spline_disp)
    l0 = np.zeros((np.sum(sel), grid_length))

    if design is None:
        # Classic edgeR approach
        unique_groups = np.unique(group)
        if np.all(np.bincount(group.astype(int) if np.issubdtype(group.dtype, np.integer) else
                              np.searchsorted(unique_groups, group)) <= 1):
            warnings.warn("There is no replication, setting dispersion to NA.")
            return {'common.dispersion': np.nan,
                    'trended.dispersion': np.nan,
                    'tagwise.dispersion': np.nan}

        if len(unique_groups) == 1:
            design_classic = np.ones((nlibs, 1))
        else:
            from .utils import _model_matrix_group
            design_classic = _model_matrix_group(group)

        # Equalize library sizes and estimate common dispersion
        from .exact_test import equalize_lib_sizes, split_into_groups
        eq = equalize_lib_sizes(y, group=group, dispersion=0.01, lib_size=lib_size)
        y_pseudo = eq['pseudo.counts'][sel]
        y_split = split_into_groups(y_pseudo, group=group)

        # Optimize common dispersion
        result = minimize_scalar(
            lambda d: -common_cond_log_lik_der_delta(y_split, d, der=0),
            bounds=(1e-4, 100 / 101), method='bounded')
        delta = result.x
        disp = delta / (1 - delta)

        # Re-equalize
        eq = equalize_lib_sizes(y, group=group, dispersion=disp, lib_size=lib_size)
        y_pseudo = eq['pseudo.counts'][sel]
        y_split = split_into_groups(y_pseudo, group=group)

        # Compute log-likelihoods on grid
        for j in range(grid_length):
            for grp_data in y_split:
                l0[:, j] += cond_log_lik_der_delta(
                    grp_data[sel] if grp_data.shape[0] > np.sum(sel) else grp_data,
                    grid_vals[j], der=0)
    else:
        # GLM edgeR approach
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

        if design.shape[1] >= nlibs:
            warnings.warn("No residual df: setting dispersion to NA")
            return {'common.dispersion': np.nan,
                    'trended.dispersion': np.nan,
                    'tagwise.dispersion': np.nan}

        # Compute APL on grid for all genes (fast batch)
        l0 = adjusted_profile_lik_grid(
            spline_disp, sely, design, seloffset, weights=selweights)

    # Calculate common dispersion
    overall = maximize_interpolant(spline_pts, np.sum(l0, axis=0).reshape(1, -1))
    common_dispersion = 0.1 * 2 ** overall[0]

    # Allow dispersion trend
    if trend_method != 'none':
        ave_lcpm = ave_log_cpm(y, lib_size=lib_size, dispersion=common_dispersion,
                               weights=weights)
        out_1 = WLEB(theta=spline_pts, loglik=l0, covariate=ave_lcpm[sel],
                     trend_method=trend_method, span=span, legacy_span=legacy_span,
                     overall=False, individual=False, m0_out=True)
        span = out_1['span']
        m0 = out_1['shared.loglik']
        disp_trend = 0.1 * 2 ** out_1['trend']
        trended_dispersion = np.full(ntags, disp_trend[np.argmin(ave_lcpm[sel])])
        trended_dispersion[sel] = disp_trend
    else:
        ave_lcpm = None
        m0 = np.tile(np.mean(l0, axis=0), (np.sum(sel), 1))
        disp_trend = common_dispersion
        trended_dispersion = None

    # Are tagwise dispersions required?
    if not tagwise:
        return {'common.dispersion': common_dispersion,
                'trended.dispersion': trended_dispersion}

    # Calculate prior_df
    if prior_df is None:
        from .glm_fit import glm_fit
        if design is None:
            design_fit = np.ones((nlibs, 1))
        else:
            design_fit = design
        glmfit = glm_fit(sely, offset=seloffset, weights=selweights,
                         design=design_fit, dispersion=disp_trend, prior_count=0)

        df_residual = glmfit['df.residual'].astype(float)

        # Adjust for zeros
        from .utils import residual_df
        zerofit = (glmfit['counts'] < 1e-4) & (glmfit['fitted.values'] < 1e-4)
        df_residual = residual_df(zerofit, design_fit)

        s2 = glmfit['deviance'] / np.maximum(df_residual, 1e-8)
        s2[df_residual == 0] = 0
        s2 = np.maximum(s2, 0)
        covariate = ave_lcpm[sel] if ave_lcpm is not None else None
        s2_fit = squeeze_var(s2, df=df_residual, covariate=covariate,
                             robust=robust, winsor_tail_p=winsor_tail_p)
        prior_df = s2_fit.get('df.prior', s2_fit.get('df_prior'))

    ncoefs = design.shape[1] if design is not None else 1
    prior_n = prior_df / (nlibs - ncoefs)

    # Initiate tagwise dispersions
    if trend_method != 'none':
        tagwise_dispersion = trended_dispersion.copy()
    else:
        tagwise_dispersion = np.full(ntags, common_dispersion)

    # Estimate tagwise dispersions via WLEB
    too_large = np.atleast_1d(prior_n > 1e6)
    if not np.all(too_large):
        temp_n = np.atleast_1d(prior_n).copy()
        if np.any(too_large):
            temp_n[too_large] = 1e6

        out_2 = WLEB(theta=spline_pts, loglik=l0, prior_n=temp_n,
                     covariate=ave_lcpm[sel] if ave_lcpm is not None else None,
                     trend_method=trend_method, span=span, legacy_span=False,
                     overall=False, trend=False, m0=m0)
        tagwise_dispersion[sel] = 0.1 * 2 ** out_2['individual']

    if robust:
        temp_df = prior_df
        temp_n = prior_n
        prior_df = np.full(ntags, np.inf)
        prior_n = np.full(ntags, np.inf)
        prior_df[sel] = temp_df
        prior_n[sel] = temp_n

    return {
        'common.dispersion': common_dispersion,
        'trended.dispersion': trended_dispersion,
        'tagwise.dispersion': tagwise_dispersion,
        'span': span,
        'prior.df': prior_df,
        'prior.n': prior_n
    }
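For array input, `estimate_disp` returns the dispersion components in a plain dict. A minimal usage sketch on synthetic negative-binomial counts (the data, seed, and design below are illustrative inventions; the import path follows the `edgepython/dispersion.py` module listed above):

```python
import numpy as np
from edgepython.dispersion import estimate_disp

rng = np.random.default_rng(0)
counts = rng.negative_binomial(n=10, p=0.3, size=(500, 6))  # genes x samples
group = np.array([1, 1, 1, 2, 2, 2])
design = np.column_stack([np.ones(6), group == 2])  # intercept + group effect

out = estimate_disp(counts, design=design, group=group)
print(out['common.dispersion'])       # scalar common dispersion
print(out['tagwise.dispersion'][:5])  # per-gene (tagwise) values
```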
def WLEB(theta, loglik, prior_n=5, covariate=None, trend_method='locfit',
         span=None, legacy_span=False, overall=True, trend=True,
         individual=True, m0=None, m0_out=False):
    """Weighted likelihood empirical Bayes.

    Port of edgeR's WLEB.

    Parameters
    ----------
    theta : ndarray
        Grid of theta values.
    loglik : ndarray
        Log-likelihood matrix (genes x grid points).
    prior_n : float or ndarray
        Prior sample size.
    covariate : ndarray, optional
        Covariate for trend.
    trend_method : str
        Smoothing method.
    span : float, optional
        Smoothing span.
    legacy_span : bool
        Use legacy span selection.
    overall : bool
        Compute overall estimate.
    trend : bool
        Compute trended estimate.
    individual : bool
        Compute individual estimates.
    m0 : ndarray, optional
        Pre-computed shared loglik.
    m0_out : bool
        Return shared loglik.

    Returns
    -------
    dict with 'overall', 'trend', 'individual', 'span', 'shared.loglik'.
    """
    loglik = np.asarray(loglik, dtype=np.float64)
    if loglik.ndim == 1:
        loglik = loglik.reshape(1, -1)
    ntags = loglik.shape[0]
    theta = np.asarray(theta, dtype=np.float64)

    # Check covariate and trend
    if covariate is None:
        trend_method = 'none'

    # Set span matching R's WLEB formula exactly
    if span is None:
        if ntags <= 50:
            span = 1.0
        else:
            span = 0.25 + 0.75 * (50 / ntags) ** 0.5

    out = {'span': span}

    # Overall prior
    if overall:
        out['overall'] = maximize_interpolant(
            theta, np.sum(loglik, axis=0).reshape(1, -1))[0]

    # Trended prior
    if m0 is None:
        if trend_method == 'movingave':
            o = np.argsort(covariate)
            oo = np.argsort(o)
            width = int(np.floor(span * ntags))
            width = max(width, 1)
            m0 = moving_average_by_col(loglik[o], width=width)[oo]
        elif trend_method == 'loess':
            result = loess_by_col(loglik, x=covariate, span=span)
            m0 = result['fitted_values']
        elif trend_method == 'locfit':
            m0 = locfit_by_col(loglik, x=covariate, span=span, degree=0)
        elif trend_method == 'locfit.mixed':
            deg0 = locfit_by_col(loglik, x=covariate, span=span, degree=0)
            deg1 = locfit_by_col(loglik, x=covariate, span=span, degree=1)
            from scipy.stats import beta as beta_dist
            r = np.array([np.min(covariate), np.max(covariate)])
            if r[1] - r[0] > 0:
                w = beta_dist.cdf((covariate - r[0]) / (r[1] - r[0]), 2, 2)
            else:
                w = np.full(len(covariate), 0.5)
            m0 = w[:, None] * deg0 + (1 - w[:, None]) * deg1
        else:
            # 'none'
            m0 = np.tile(np.mean(loglik, axis=0), (ntags, 1))

    if trend:
        out['trend'] = maximize_interpolant(theta, m0)

    # Weighted empirical Bayes posterior estimates
    if individual:
        prior_n = np.atleast_1d(np.asarray(prior_n, dtype=np.float64))
        if len(prior_n) == 1:
            l0a = loglik + prior_n[0] * m0
        else:
            l0a = loglik + prior_n[:, None] * m0
        out['individual'] = maximize_interpolant(theta, l0a)

    if m0_out:
        out['shared.loglik'] = m0

    return out
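The core arithmetic in `WLEB` is the posterior augmentation `loglik + prior_n * m0` followed by interpolated maximization. A toy sketch of that shrinkage (the parabolic curves below are fabricated stand-ins for real conditional log-likelihoods, chosen so the effect is visible):

```python
import numpy as np
from edgepython.dispersion import WLEB

theta = np.linspace(-3, 3, 7)  # grid, e.g. on a log2(dispersion / 0.1) scale
# two fake per-gene log-likelihood curves peaking at theta = +1 and -1
loglik = np.vstack([-(theta - 1.0) ** 2, -(theta + 1.0) ** 2])

out = WLEB(theta=theta, loglik=loglik, prior_n=5, covariate=None)
# covariate=None forces trend_method='none', so m0 is the grid-wise mean
# curve (peak at 0); both individual maxima are pulled toward that shared peak
print(out['overall'], out['individual'])
```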
def estimate_common_disp(y, group=None, lib_size=None, tol=1e-6,
                         rowsum_filter=5, verbose=False):
    """Estimate common dispersion using exact conditional likelihood.

    Port of edgeR's estimateCommonDisp.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    group : array-like, optional
        Group factor.
    lib_size : ndarray, optional
        Library sizes.
    tol : float
        Optimization tolerance.
    rowsum_filter : int
        Minimum row sum.
    verbose : bool
        Print progress.

    Returns
    -------
    DGEList (if input is DGEList) or float.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist
        dge = valid_dgelist(dge)
        group = dge['samples']['group'].values
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values

        d = estimate_common_disp(dge['counts'], group=group, lib_size=ls,
                                 tol=tol, rowsum_filter=rowsum_filter, verbose=verbose)
        dge['common.dispersion'] = d
        dge['AveLogCPM'] = ave_log_cpm(dge, dispersion=d)
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if group is None:
        group = np.ones(nlibs, dtype=int)
    group = np.asarray(group)

    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    # Filter
    keep = y.sum(axis=1) >= rowsum_filter
    y_filt = y[keep]

    if y_filt.shape[0] == 0:
        warnings.warn("No genes pass rowsum filter")
        return 0.1

    # Equalize library sizes and split into groups
    from .exact_test import equalize_lib_sizes, split_into_groups

    # First pass with rough dispersion
    eq = equalize_lib_sizes(y_filt, group=group, dispersion=0.01, lib_size=lib_size)
    y_pseudo = eq['pseudo.counts']
    y_split = split_into_groups(y_pseudo, group=group)

    # Optimize
    result = minimize_scalar(
        lambda d: -common_cond_log_lik_der_delta(y_split, d, der=0),
        bounds=(1e-4, 100 / 101), method='bounded',
        options={'xatol': tol})
    delta = result.x
    disp = delta / (1 - delta)

    if verbose:
        print(f"Disp = {disp:.5f}, BCV = {np.sqrt(disp):.4f}")

    return disp
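`estimate_common_disp` optimizes over the bounded parameter delta = phi / (1 + phi) and maps back to the dispersion phi. A sketch on synthetic counts (illustrative data only):

```python
import numpy as np
from edgepython.dispersion import estimate_common_disp

rng = np.random.default_rng(1)
counts = rng.poisson(lam=20.0, size=(300, 4)).astype(float)
group = np.array([1, 1, 2, 2])

phi = estimate_common_disp(counts, group=group, verbose=True)
delta = phi / (1 + phi)  # the bounded quantity actually optimized
assert np.isclose(delta / (1 - delta), phi)
```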
def estimate_tagwise_disp(y, group=None, lib_size=None, dispersion=None,
                          prior_df=10, trend='movingave', span=None,
                          method='grid', grid_length=11, grid_range=(-6, 6),
                          tol=1e-6, verbose=False):
    """Estimate tagwise dispersions using exact conditional likelihood.

    Port of edgeR's estimateTagwiseDisp.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    group : array-like, optional
        Group factor.
    lib_size : ndarray, optional
        Library sizes.
    dispersion : float or ndarray, optional
        Starting dispersion.
    prior_df : float
        Prior degrees of freedom.
    trend : str
        'movingave', 'loess', or 'none'.
    span : float, optional
        Smoothing span.
    method : str
        'grid' or 'optimize'.
    grid_length : int
        Number of grid points.
    grid_range : tuple
        Grid range.
    tol : float
        Tolerance.

    Returns
    -------
    DGEList (if input is DGEList) or ndarray of tagwise dispersions.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist
        dge = valid_dgelist(dge)
        group = dge['samples']['group'].values
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values

        if dispersion is None:
            dispersion = dge.get('common.dispersion')
        if dispersion is None:
            raise ValueError("No common.dispersion found. Run estimate_common_disp first.")

        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)

        td = estimate_tagwise_disp(
            dge['counts'], group=group, lib_size=ls, dispersion=dispersion,
            prior_df=prior_df, trend=trend, span=span, method=method,
            grid_length=grid_length, grid_range=grid_range, tol=tol)
        dge['tagwise.dispersion'] = td
        dge['prior.df'] = prior_df
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if group is None:
        group = np.ones(nlibs, dtype=int)
    group = np.asarray(group)

    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    if dispersion is None:
        dispersion = 0.1

    if span is None:
        span = (10 / ntags) ** 0.23 if ntags > 10 else 1.0

    # Equalize library sizes
    from .exact_test import equalize_lib_sizes, split_into_groups
    eq = equalize_lib_sizes(y, group=group, dispersion=dispersion, lib_size=lib_size)
    y_pseudo = eq['pseudo.counts']
    y_split = split_into_groups(y_pseudo, group=group)

    # Compute log-likelihoods on grid
    spline_pts = np.linspace(grid_range[0], grid_range[1], grid_length)

    if np.isscalar(dispersion):
        disp_base = dispersion
    else:
        disp_base = np.median(dispersion)

    grid_disp = disp_base * 2 ** spline_pts
    grid_delta = grid_disp / (1 + grid_disp)

    l0 = np.zeros((ntags, grid_length))
    for j in range(grid_length):
        for grp_data in y_split:
            l0[:, j] += cond_log_lik_der_delta(grp_data, grid_delta[j], der=0)

    # Compute AveLogCPM for smoothing
    alc = ave_log_cpm(y, lib_size=lib_size)

    # Use WLEB
    prior_n = prior_df / (nlibs - len(np.unique(group)))

    out = WLEB(theta=spline_pts, loglik=l0, prior_n=prior_n,
               covariate=alc,
               trend_method='movingave' if trend == 'movingave' else
               ('loess' if trend == 'loess' else 'none'),
               span=span)

    tagwise_dispersion = disp_base * 2 ** out['individual']
    return tagwise_dispersion
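`prior_df` sets how strongly per-gene estimates are squeezed toward the shared curve (via `prior_n = prior_df / (nlibs - number of groups)`). A sketch comparing two prior strengths (synthetic counts; the spread comparison is the typical outcome, not a guarantee for any single random draw):

```python
import numpy as np
from edgepython.dispersion import estimate_common_disp, estimate_tagwise_disp

rng = np.random.default_rng(2)
counts = rng.negative_binomial(n=5, p=0.2, size=(400, 6)).astype(float)
group = np.array([1, 1, 1, 2, 2, 2])

common = estimate_common_disp(counts, group=group)
tw_weak = estimate_tagwise_disp(counts, group=group, dispersion=common, prior_df=10)
tw_strong = estimate_tagwise_disp(counts, group=group, dispersion=common, prior_df=50)
# a stronger prior typically leaves less gene-to-gene spread
print(np.std(np.log(tw_weak)), np.std(np.log(tw_strong)))
```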
def estimate_trended_disp(y, group=None, lib_size=None, ave_log_cpm_vals=None,
                          method='bin.spline', df=5, span=2/3):
    """Estimate trended dispersions using exact conditional likelihood.

    Port of edgeR's estimateTrendedDisp.

    Returns
    -------
    DGEList (if input is DGEList) or ndarray of trended dispersions.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist
        dge = valid_dgelist(dge)
        group_val = dge['samples']['group'].values
        ls = dge['samples']['lib.size'].values * dge['samples']['norm.factors'].values
        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)
        out = estimate_trended_disp(dge['counts'], group=group_val, lib_size=ls,
                                    ave_log_cpm_vals=dge['AveLogCPM'],
                                    method=method, df=df, span=span)
        dge['trended.dispersion'] = out
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if group is None:
        group = np.ones(nlibs, dtype=int)
    group = drop_empty_levels(np.asarray(group))

    if lib_size is None:
        lib_size = y.sum(axis=0)
    lib_size = np.asarray(lib_size, dtype=np.float64)

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, lib_size=lib_size)

    # Bin genes by abundance and estimate dispersion in each bin
    nbins = 50
    if nbins > ntags:
        nbins = max(1, ntags // 2)

    bins = cut_with_min_n(ave_log_cpm_vals, intervals=nbins,
                          min_n=max(1, ntags // nbins))
    disp_bins = np.zeros(nbins)
    ave_bins = np.zeros(nbins)

    for i in range(1, nbins + 1):
        mask = bins['group'] == i
        if np.sum(mask) == 0:
            continue
        disp_bins[i - 1] = estimate_common_disp(y[mask], group=group,
                                                lib_size=lib_size,
                                                rowsum_filter=0)
        ave_bins[i - 1] = np.mean(ave_log_cpm_vals[mask])

    # Fit trend
    if method == 'bin.spline':
        from scipy.interpolate import UnivariateSpline
        order = np.argsort(ave_bins)
        try:
            spl = UnivariateSpline(ave_bins[order],
                                   np.sqrt(np.maximum(disp_bins[order], 0)),
                                   k=min(3, len(ave_bins) - 1),
                                   s=len(ave_bins) * 0.1)
            trended = spl(ave_log_cpm_vals) ** 2
        except Exception:
            trended = np.full(ntags, np.mean(disp_bins))
    else:
        # bin.loess
        from scipy.interpolate import interp1d
        try:
            f = interp1d(ave_bins, np.sqrt(np.maximum(disp_bins, 0)),
                         fill_value='extrapolate')
            trended = f(ave_log_cpm_vals) ** 2
        except Exception:
            trended = np.full(ntags, np.mean(disp_bins))

    return np.maximum(trended, 0)
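`estimate_trended_disp` bins genes by AveLogCPM, estimates a common dispersion within each bin, and smooths sqrt-dispersion against abundance. A minimal usage sketch (synthetic counts, illustrative only):

```python
import numpy as np
from edgepython.dispersion import estimate_trended_disp

rng = np.random.default_rng(3)
counts = rng.negative_binomial(n=8, p=0.3, size=(600, 6)).astype(float)
group = np.array([1, 1, 1, 2, 2, 2])

trended = estimate_trended_disp(counts, group=group, method='bin.spline')
print(trended.shape)  # one non-negative dispersion per gene
```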
def estimate_glm_common_disp(y, design=None, offset=None, method='CoxReid',
                             subset=10000, ave_log_cpm_vals=None, verbose=False,
                             weights=None):
    """Estimate common dispersion using GLM approach.

    Port of edgeR's estimateGLMCommonDisp.

    Returns
    -------
    DGEList (if input is DGEList) or float.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        from .dgelist import valid_dgelist, get_offset
        dge = valid_dgelist(dge)
        alc = ave_log_cpm(dge, dispersion=0.05)
        offset_val = get_offset(dge)
        d = estimate_glm_common_disp(
            dge['counts'], design=design, offset=offset_val,
            method=method, subset=subset, ave_log_cpm_vals=alc,
            verbose=verbose, weights=dge.get('weights'))
        dge['common.dispersion'] = d
        dge['AveLogCPM'] = ave_log_cpm(dge, dispersion=d)
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)

    if design is None:
        design = np.ones((y.shape[1], 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

    if design.shape[1] >= y.shape[1]:
        warnings.warn("No residual df: setting dispersion to NA")
        return np.nan

    if offset is None:
        offset = np.log(y.sum(axis=0))

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)

    valid_methods = ('CoxReid', 'Pearson', 'deviance')
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}")

    if method != 'CoxReid' and weights is not None:
        warnings.warn("weights only supported by CoxReid method")

    if method == 'CoxReid':
        d = disp_cox_reid(y, design=design, offset=offset, subset=subset,
                          ave_log_cpm_vals=ave_log_cpm_vals, weights=weights)
    elif method == 'Pearson':
        d = disp_pearson(y, design=design, offset=offset, subset=subset,
                         ave_log_cpm_vals=ave_log_cpm_vals)
    else:
        d = disp_deviance(y, design=design, offset=offset, subset=subset,
                          ave_log_cpm_vals=ave_log_cpm_vals)

    if verbose:
        print(f"Disp = {d:.5f}, BCV = {np.sqrt(d):.4f}")

    return d
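`estimate_glm_common_disp` works from an explicit design matrix, so multi-factor layouts are straightforward. A sketch with a hypothetical two-factor design (synthetic data; the factor coding is illustrative):

```python
import numpy as np
from edgepython.dispersion import estimate_glm_common_disp

rng = np.random.default_rng(4)
counts = rng.negative_binomial(n=10, p=0.3, size=(500, 6)).astype(float)
batch = np.array([0, 1, 0, 1, 0, 1])
condition = np.array([0, 0, 0, 1, 1, 1])
design = np.column_stack([np.ones(6), batch, condition])

d = estimate_glm_common_disp(counts, design=design, method='CoxReid', verbose=True)
```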
def estimate_glm_trended_disp(y, design=None, offset=None,
                              ave_log_cpm_vals=None, method='auto',
                              weights=None):
    """Estimate trended dispersion using GLM approach.

    Port of edgeR's estimateGLMTrendedDisp.

    Returns
    -------
    DGEList (if input is DGEList) or ndarray.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)
        from .dgelist import get_offset
        d = estimate_glm_trended_disp(
            dge['counts'], design=design, offset=get_offset(dge),
            ave_log_cpm_vals=dge['AveLogCPM'], method=method,
            weights=dge.get('weights'))
        dge['trended.dispersion'] = d
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags = y.shape[0]
    nlibs = y.shape[1]

    if ntags == 0:
        return np.array([], dtype=np.float64)

    if design is None:
        design = np.ones((nlibs, 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

    if design.shape[1] >= nlibs:
        warnings.warn("No residual df: cannot estimate dispersion")
        return np.full(ntags, np.nan)

    if offset is None:
        offset = np.log(y.sum(axis=0))

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)

    if method == 'auto':
        method = 'power' if ntags < 200 else 'bin.spline'

    valid_methods = ('bin.spline', 'bin.loess', 'power', 'spline')
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}")

    if method in ('bin.spline', 'bin.loess'):
        mt = 'spline' if method == 'bin.spline' else 'loess'
        result = disp_bin_trend(y, design, offset=offset, method_trend=mt,
                                ave_log_cpm_vals=ave_log_cpm_vals, weights=weights)
    elif method == 'power':
        result = disp_cox_reid_power_trend(y, design, offset=offset,
                                           ave_log_cpm_vals=ave_log_cpm_vals)
    else:
        result = disp_cox_reid_spline_trend(y, design, offset=offset,
                                            ave_log_cpm_vals=ave_log_cpm_vals)

    return result['dispersion']
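Per the dispatch above, method='auto' resolves to 'power' below 200 genes and 'bin.spline' otherwise. A sketch exercising both branches (synthetic counts):

```python
import numpy as np
from edgepython.dispersion import estimate_glm_trended_disp

rng = np.random.default_rng(5)
design = np.column_stack([np.ones(6), np.repeat([0, 1], 3)])

small = rng.negative_binomial(10, 0.3, size=(150, 6)).astype(float)   # -> 'power'
large = rng.negative_binomial(10, 0.3, size=(2000, 6)).astype(float)  # -> 'bin.spline'
t_small = estimate_glm_trended_disp(small, design=design, method='auto')
t_large = estimate_glm_trended_disp(large, design=design, method='auto')
```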
def estimate_glm_tagwise_disp(y, design=None, offset=None, dispersion=None,
                              prior_df=10, trend=True, span=None,
                              ave_log_cpm_vals=None, weights=None):
    """Estimate tagwise dispersions using GLM approach.

    Port of edgeR's estimateGLMTagwiseDisp.

    Returns
    -------
    DGEList (if input is DGEList) or ndarray.
    """
    # DGEList input
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if trend:
            dispersion = dge.get('trended.dispersion')
            if dispersion is None:
                raise ValueError("No trended.dispersion found. Run estimate_glm_trended_disp first.")
        else:
            if dispersion is None:
                dispersion = dge.get('common.dispersion')
            if dispersion is None:
                raise ValueError("No common.dispersion found. Run estimate_glm_common_disp first.")

        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)

        ntags = dge['counts'].shape[0]
        if span is None:
            span = (10 / ntags) ** 0.23 if ntags > 10 else 1.0
        dge['span'] = span

        from .dgelist import get_offset
        d = estimate_glm_tagwise_disp(
            dge['counts'], design=design, offset=get_offset(dge),
            dispersion=dispersion, prior_df=prior_df, trend=trend,
            span=span, ave_log_cpm_vals=dge['AveLogCPM'],
            weights=dge.get('weights'))
        dge['prior.df'] = prior_df
        dge['tagwise.dispersion'] = d
        return dge

    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntags, nlibs = y.shape

    if ntags == 0:
        return np.array([], dtype=np.float64)

    if design is None:
        design = np.ones((nlibs, 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)

    if design.shape[1] >= nlibs:
        warnings.warn("No residual df: setting dispersion to NA")
        return np.full(ntags, np.nan)

    if offset is None:
        offset = np.log(y.sum(axis=0))

    if span is None:
        span = (10 / ntags) ** 0.23 if ntags > 10 else 1.0

    if ave_log_cpm_vals is None:
        ave_log_cpm_vals = ave_log_cpm(y, offset=offset, weights=weights)

    tagwise = disp_cox_reid_interpolate_tagwise(
        y, design, offset=offset, dispersion=dispersion,
        trend=trend, prior_df=prior_df, span=span,
        ave_log_cpm_vals=ave_log_cpm_vals, weights=weights)

    return tagwise
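Putting the three GLM estimators together in the usual common -> trended -> tagwise order. A sketch on synthetic data (for DGEList input, `estimate_disp` above is the one-call alternative):

```python
import numpy as np
from edgepython.dispersion import (estimate_glm_common_disp,
                                   estimate_glm_trended_disp,
                                   estimate_glm_tagwise_disp)

rng = np.random.default_rng(6)
counts = rng.negative_binomial(10, 0.3, size=(800, 6)).astype(float)
design = np.column_stack([np.ones(6), np.repeat([0, 1], 3)])

common = estimate_glm_common_disp(counts, design=design)
trended = estimate_glm_trended_disp(counts, design=design)
tagwise = estimate_glm_tagwise_disp(counts, design=design,
                                    dispersion=trended, trend=True)
```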