edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edgepython/glm_fit.py ADDED
@@ -0,0 +1,653 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ GLM fitting for edgePython.
4
+
5
+ Port of edgeR's glmFit, glmQLFit, mglmOneGroup, mglmOneWay.
6
+ """
7
+
8
+ import numpy as np
9
+ import warnings
10
+ from .compressed_matrix import (CompressedMatrix, compress_offsets,
11
+ compress_weights, compress_dispersions)
12
+ from .glm_levenberg import mglm_levenberg, nbinom_deviance
13
+ from .utils import (expand_as_matrix, design_as_factor, pred_fc,
14
+ add_prior_count, residual_df)
15
+ from .limma_port import squeeze_var, non_estimable, is_fullrank, choose_lowess_span
16
+
17
+
18
def mglm_one_group(y, dispersion=0, offset=0, weights=None,
                   coef_start=None, maxit=50, tol=1e-10):
    """Fit single-group negative-binomial GLM.

    Port of edgeR's mglmOneGroup (C code reimplemented in Python).
    Fits one intercept-only NB GLM per gene via vectorized Fisher
    scoring across all genes simultaneously.

    Parameters
    ----------
    y : ndarray
        Count matrix (genes x samples).
    dispersion : float, ndarray, or CompressedMatrix
        NB dispersions.
    offset : float, ndarray, or CompressedMatrix
        Log-scale offsets.
    weights : ndarray or CompressedMatrix, optional
        Observation weights.
    coef_start : ndarray, optional
        Starting coefficient values (one per gene).  NaN entries are
        re-initialized from the data.
    maxit : int
        Maximum iterations.
    tol : float
        Convergence tolerance.

    Returns
    -------
    ndarray of coefficients (one per gene), on the natural-log scale.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ngenes, nlibs = y.shape

    # Expand offset, dispersion, weights to full (ngenes, nlibs) matrices.
    # _expand_to_matrix always returns an array of shape y.shape, so no
    # further ndim normalization is required (a dead branch that re-handled
    # 0-d/1-d disp_mat after expansion was removed).
    offset_mat = _expand_to_matrix(offset, y.shape)
    disp_mat = _expand_to_matrix(dispersion, y.shape)
    if weights is not None:
        w_mat = _expand_to_matrix(weights, y.shape)
    else:
        w_mat = np.ones_like(y)

    # Starting values (vectorized).  Genes without a usable start
    # (no coef_start given, or NaN entries) are initialized from the
    # weighted mean count over effective library size; all-zero genes
    # start at -20 (effectively zero on the exp scale).
    if coef_start is not None:
        b = np.asarray(coef_start, dtype=np.float64).ravel()
        if len(b) == 1:
            b = np.full(ngenes, b[0])
        need_init = np.isnan(b)
    else:
        b = np.full(ngenes, np.nan)
        need_init = np.ones(ngenes, dtype=bool)

    if np.any(need_init):
        lib = np.exp(offset_mat[need_init])
        total_y = np.sum(w_mat[need_init] * y[need_init], axis=1)
        total_lib = np.sum(w_mat[need_init] * lib, axis=1)
        valid = (total_y > 0) & (total_lib > 0)
        b_init = np.full(np.sum(need_init), -20.0)
        b_init[valid] = np.log(total_y[valid] / total_lib[valid])
        b[need_init] = b_init

    # Vectorized Fisher scoring iteration (all genes at once); each gene
    # drops out of the active set once its step is below tolerance.
    active = np.ones(ngenes, dtype=bool)  # genes still iterating
    for _it in range(maxit):
        if not np.any(active):
            break

        # Compute mu for active genes; clip eta to avoid overflow in exp.
        eta = b[active, None] + offset_mat[active]  # (n_active, nlibs)
        mu = np.exp(np.clip(eta, -500, 500))
        mu = np.maximum(mu, 1e-300)

        # NB working weights: Var(y) = mu * (1 + dispersion * mu).
        denom = 1.0 + disp_mat[active] * mu  # (n_active, nlibs)

        # Score (dl) and Fisher information (info) for the intercept.
        dl = np.sum(w_mat[active] * (y[active] - mu) / denom, axis=1)  # (n_active,)
        info = np.sum(w_mat[active] * mu / denom, axis=1)  # (n_active,)

        # Guard against zero information (e.g. all-zero mu).
        safe = info > 1e-300
        step = np.zeros_like(dl)
        step[safe] = dl[safe] / info[safe]

        b_new = b[active] + step

        # Relative convergence check; genes with degenerate information
        # are also marked converged (their step is zero anyway).
        converged = np.abs(step) < tol * (np.abs(b[active]) + 0.1)
        converged |= ~safe

        b[active] = b_new

        # Mark converged genes as inactive.
        active_indices = np.where(active)[0]
        active[active_indices[converged]] = False

    return b
120
+
121
+
122
def mglm_one_way(y, design=None, group=None, dispersion=0, offset=0,
                 weights=None, coef_start=None, maxit=50, tol=1e-10):
    """Fit multiple NB GLMs with a one-way layout.

    Port of edgeR's mglmOneWay.  Each group of samples is fitted as a
    separate single-group problem via ``mglm_one_group``; if the design
    is not a plain 0/1 group-indicator matrix, the group-wise means are
    transformed back into the design's coefficient parameterization.

    Parameters
    ----------
    y : ndarray
        Count matrix (genes x samples).
    design : ndarray, optional
        Design matrix.  Must be equivalent to a one-way layout (one
        column per group), otherwise a ValueError is raised.
    group : ndarray, optional
        Group factor.  Derived from ``design`` if not given.
    dispersion : float or ndarray
        NB dispersions.
    offset : float or ndarray
        Offsets.
    weights : ndarray, optional
        Observation weights.
    coef_start : ndarray, optional
        Starting coefficients (in the design parameterization).
    maxit : int
        Maximum iterations.
    tol : float
        Convergence tolerance.

    Returns
    -------
    dict with 'coefficients' and 'fitted.values'.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ngenes, nlibs = y.shape

    # Expand all per-observation quantities to full (ngenes, nlibs) matrices.
    offset_mat = _expand_to_matrix(offset, y.shape)
    disp_mat = _expand_to_matrix(dispersion, y.shape)
    if weights is not None:
        w_mat = _expand_to_matrix(weights, y.shape)
    else:
        w_mat = np.ones_like(y)

    # Get group factor: explicit > derived from design > single group.
    if group is None:
        if design is None:
            group = np.zeros(nlibs, dtype=int)
        else:
            design = np.asarray(design, dtype=np.float64)
            if design.ndim == 1:
                design = design.reshape(-1, 1)
            group = design_as_factor(design)
    else:
        group = np.asarray(group)

    unique_groups = np.unique(group)
    ngroups = len(unique_groups)

    # Check if design reduces to indicator matrix.  design_unique holds one
    # representative design row per group; it stays None when the design is
    # already a plain 0/1 indicator (no back-transform needed at the end).
    design_unique = None
    if design is not None:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)
        if design.shape[1] != ngroups:
            raise ValueError("design matrix is not equivalent to a oneway layout")
        # Get representative design rows (first sample of each group).
        first_of_group = np.array([np.where(group == g)[0][0] for g in unique_groups])
        design_unique = design[first_of_group]
        # Check if it's a simple group indicator: exactly one 1 per row and
        # zeros elsewhere across the ngroups x ngroups representative matrix.
        is_indicator = (np.sum(design_unique == 1) == ngroups and
                        np.sum(design_unique == 0) == (ngroups - 1) * ngroups)
        if is_indicator:
            design_unique = None

    # Convert starting values from the design parameterization to the
    # group-mean parameterization if needed (cs = coef_start %*% t(design)).
    cs = None
    if coef_start is not None:
        coef_start = np.asarray(coef_start, dtype=np.float64)
        if coef_start.ndim == 1:
            coef_start = coef_start.reshape(1, -1)
        if design_unique is not None:
            cs = coef_start @ design_unique.T
        else:
            cs = coef_start

    # Fit each group independently with a single-group NB GLM.
    beta = np.zeros((ngenes, ngroups))
    for g_idx, grp in enumerate(unique_groups):
        j = np.where(group == grp)[0]
        cs_g = cs[:, g_idx] if cs is not None else None
        beta[:, g_idx] = mglm_one_group(
            y[:, j], dispersion=disp_mat[:, j] if disp_mat.ndim == 2 else disp_mat,
            offset=offset_mat[:, j] if offset_mat.ndim == 2 else offset_mat,
            weights=w_mat[:, j] if w_mat.ndim == 2 else w_mat,
            coef_start=cs_g, maxit=maxit, tol=tol)

    # Clamp -Inf to large negative so downstream arithmetic stays finite.
    beta = np.maximum(beta, -1e8)

    # Fitted values from group-wise betas (computed BEFORE any back-transform,
    # since fitted values are parameterization-invariant).
    mu = np.zeros_like(y)
    for g_idx, grp in enumerate(unique_groups):
        j = np.where(group == grp)[0]
        for jj in j:
            mu[:, jj] = np.exp(np.clip(beta[:, g_idx] + offset_mat[:, jj], -500, 500))

    # If design is not indicator, convert group means back to design
    # coefficients: beta = t(solve(design_unique, t(beta))).
    if design_unique is not None:
        beta = np.linalg.solve(design_unique, beta.T).T

    return {
        'coefficients': beta,
        'fitted.values': mu
    }
237
+
238
+
239
def glm_fit(y, design=None, dispersion=None, offset=None, lib_size=None,
            weights=None, prior_count=0.125, start=None):
    """Fit negative binomial GLMs for each gene.

    Port of edgeR's glmFit.default.  Accepts either a raw count matrix
    or a DGEList-like dict; the DGEList branch fills in design,
    dispersion and offsets from the object and then recurses on the
    count matrix.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix (genes x samples), or DGEList.
    design : ndarray or str, optional
        Design matrix, or an R-style formula string (e.g.
        ``'~ group'``, ``'~ batch + condition'``) which is
        evaluated against the DGEList sample metadata via patsy.
    dispersion : float or ndarray
        NB dispersions.
    offset : ndarray, optional
        Log-scale offsets.
    lib_size : ndarray, optional
        Library sizes (used only when ``offset`` is None).
    weights : ndarray, optional
        Observation weights.
    prior_count : float
        Prior count for shrinking log-fold-changes.
    start : ndarray, optional
        Starting coefficient values.

    Returns
    -------
    dict (DGEGLM-like) with coefficients, fitted.values, deviance,
    df.residual, design, offset, dispersion, weights, etc.

    Raises
    ------
    ValueError
        If the design is rank-deficient, no dispersion is available,
        dispersions contain NaN, or design/count dimensions disagree.
    """
    # Resolve formula string to design matrix (no-op for arrays/None).
    from .utils import _resolve_design
    design = _resolve_design(design, y)

    # DGEList input: pull design/dispersion/offset from the object,
    # recurse on the raw counts, then copy metadata onto the fit.
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if design is None:
            design = dge.get('design')
            if design is None:
                group = dge['samples']['group'].values
                from .utils import drop_empty_levels
                group = drop_empty_levels(group)
                unique_groups = np.unique(group)
                if len(unique_groups) > 1:
                    # model.matrix(~group)
                    from .utils import _model_matrix_group
                    design = _model_matrix_group(group)
        if dispersion is None:
            from .dgelist import get_dispersion
            dispersion = get_dispersion(dge)
            if dispersion is None:
                raise ValueError("No dispersion values found in DGEList object.")
        from .dgelist import get_offset
        offset = get_offset(dge)
        from .expression import ave_log_cpm
        # Cache AveLogCPM on the DGEList so repeated fits reuse it.
        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = ave_log_cpm(dge)

        fit = glm_fit(dge['counts'], design=design, dispersion=dispersion,
                      offset=offset, lib_size=None, weights=dge.get('weights'),
                      prior_count=prior_count, start=start)
        fit['samples'] = dge['samples']
        fit['genes'] = dge.get('genes')
        fit['prior.df'] = dge.get('prior.df')
        fit['AveLogCPM'] = dge.get('AveLogCPM')
        return fit

    # Default method: y is a plain count matrix.
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    ntag, nlib = y.shape

    # Check design: default to intercept-only; otherwise validate shape
    # and full rank.
    if design is None:
        design = np.ones((nlib, 1))
    else:
        design = np.asarray(design, dtype=np.float64)
        if design.ndim == 1:
            design = design.reshape(-1, 1)
        if design.shape[0] != nlib:
            raise ValueError("nrow(design) disagrees with ncol(y)")
        ne = non_estimable(design)
        if ne is not None:
            raise ValueError(f"Design matrix not of full rank. Non-estimable: {ne}")

    # Check dispersion.
    if dispersion is None:
        raise ValueError("No dispersion values provided.")
    dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
    if np.any(np.isnan(dispersion)):
        raise ValueError("NA dispersions not allowed")

    # Build offset: explicit offset > log(lib_size) > log(column sums).
    if offset is not None:
        offset = np.asarray(offset, dtype=np.float64)
    elif lib_size is not None:
        lib_size = np.asarray(lib_size, dtype=np.float64)
        offset = np.log(lib_size)
    else:
        offset = np.log(y.sum(axis=0))

    offset_mat = expand_as_matrix(offset, (ntag, nlib))
    disp_mat = expand_as_matrix(dispersion, (ntag, nlib))

    if weights is not None:
        w_mat = expand_as_matrix(np.asarray(weights, dtype=np.float64), (ntag, nlib))
    else:
        w_mat = None

    # Fit: use one-way shortcut if design is equivalent to one-way layout
    # (group count equals column count); otherwise fall back to the
    # general Levenberg-damped scoring algorithm.
    group = design_as_factor(design)
    unique_groups = np.unique(group)

    if len(unique_groups) == design.shape[1]:
        fit = mglm_one_way(y, design=design, group=group,
                           dispersion=disp_mat, offset=offset_mat,
                           weights=w_mat, coef_start=start)
        # mglm_one_way does not compute deviance itself; add it here.
        fit['deviance'] = nbinom_deviance(y, fit['fitted.values'], dispersion, w_mat)
        fit['method'] = 'oneway'
    else:
        fit = mglm_levenberg(y, design=design, dispersion=disp_mat,
                             offset=offset_mat, weights=w_mat,
                             coef_start=start, maxit=250)
        fit['method'] = 'levenberg'

    # Prepare output.  With prior_count > 0 the reported coefficients are
    # shrunken log2-fold-changes (pred_fc), rescaled by log(2) to the
    # natural-log scale used by the raw coefficients.
    fit['counts'] = y
    if prior_count > 0:
        fit['unshrunk.coefficients'] = fit['coefficients'].copy()
        fit['coefficients'] = pred_fc(y, design, offset=offset_mat,
                                      dispersion=disp_mat,
                                      prior_count=prior_count,
                                      weights=w_mat) * np.log(2)

    fit['df.residual'] = np.full(ntag, nlib - design.shape[1])
    fit['design'] = design
    fit['offset'] = offset_mat
    fit['dispersion'] = dispersion
    fit['weights'] = weights
    fit['prior.count'] = prior_count

    return fit
385
+
386
+
387
def glm_ql_fit(y, design=None, dispersion=None, offset=None, lib_size=None,
               weights=None, abundance_trend=True, ave_log_cpm=None,
               covariate_trend=None, robust=False, winsor_tail_p=(0.05, 0.1),
               legacy=False, top_proportion=None, keep_unit_mat=False):
    """Fit quasi-likelihood negative binomial GLMs.

    Port of edgeR's glmQLFit.default.  Fits gene-wise NB GLMs via
    ``glm_fit`` and then applies empirical-Bayes moderation of the
    quasi-dispersions (``squeeze_var``), using either the legacy
    deviance-based residuals or the new-style adjusted deviances.

    Parameters
    ----------
    y : ndarray or DGEList
        Count matrix or DGEList.
    design : ndarray or str, optional
        Design matrix, or an R-style formula string (e.g.
        ``'~ group'``, ``'~ batch + condition'``) evaluated
        against DGEList sample metadata via patsy.
    dispersion : float or ndarray, optional
        NB dispersions.  If None, estimated from top-abundance genes
        (non-legacy) or taken from the DGEList (legacy).
    offset : ndarray, optional
        Offsets.
    lib_size : ndarray, optional
        Library sizes.
    weights : ndarray, optional
        Observation weights.
    abundance_trend : bool
        Use abundance trend for QL prior.
    ave_log_cpm : ndarray, optional
        Average log-CPM values (computed from counts if omitted).
    covariate_trend : ndarray, optional
        Covariate for trended prior (overrides abundance trend).
    robust : bool
        Robust empirical Bayes.
    winsor_tail_p : tuple
        Winsorization tail proportions.
    legacy : bool
        Use legacy (old-style) QL method.
    top_proportion : float, optional
        Proportion of top-abundance genes for dispersion estimation.
    keep_unit_mat : bool
        Keep unit deviance matrix.  NOTE(review): only forwarded in the
        DGEList recursion; the default method below does not appear to
        use it — confirm against downstream helpers.

    Returns
    -------
    dict (DGEGLM-like) with added s2.post, df.prior, s2.prior fields.
    """
    from .expression import ave_log_cpm as _ave_log_cpm
    from .utils import _resolve_design
    design = _resolve_design(design, y)

    # DGEList input: resolve design/dispersion/offset from the object,
    # recurse on raw counts, then attach metadata.
    if isinstance(y, dict) and 'counts' in y:
        dge = y
        if design is None:
            design = dge.get('design')
            if design is None:
                group = dge['samples']['group'].values
                from .utils import drop_empty_levels
                group = drop_empty_levels(group)
                unique_g = np.unique(group)
                if len(unique_g) > 1:
                    from .utils import _model_matrix_group
                    design = _model_matrix_group(group)

        # Cache AveLogCPM on the DGEList for reuse.
        if dge.get('AveLogCPM') is None:
            dge['AveLogCPM'] = _ave_log_cpm(dge)

        if dispersion is None:
            if legacy:
                # Legacy path requires a stored dispersion: trended first,
                # then common; error if neither exists.
                dispersion = dge.get('trended.dispersion')
                if dispersion is None:
                    dispersion = dge.get('common.dispersion')
                if dispersion is None:
                    raise ValueError("No dispersion values found in DGEList object.")
            else:
                # New-style: average the trended dispersion over the top
                # 10% most abundant genes, when available; otherwise leave
                # None and let the default method estimate it.
                if dge.get('trended.dispersion') is not None:
                    ntop = int(np.ceil(0.1 * dge['counts'].shape[0]))
                    i = np.argsort(dge['AveLogCPM'])[::-1][:ntop]
                    dispersion = np.mean(dge['trended.dispersion'][i])

        from .dgelist import get_offset
        offset = get_offset(dge)

        fit = glm_ql_fit(dge['counts'], design=design, dispersion=dispersion,
                         offset=offset, lib_size=None,
                         abundance_trend=abundance_trend,
                         ave_log_cpm=dge['AveLogCPM'],
                         robust=robust, winsor_tail_p=winsor_tail_p,
                         weights=dge.get('weights'),
                         legacy=legacy, top_proportion=top_proportion,
                         keep_unit_mat=keep_unit_mat)
        fit['samples'] = dge['samples']
        fit['genes'] = dge.get('genes')
        fit['AveLogCPM'] = dge['AveLogCPM']
        return fit

    # Default method: y is a plain count matrix.
    y_mat = np.asarray(y, dtype=np.float64)
    if y_mat.ndim == 1:
        y_mat = y_mat.reshape(1, -1)
    ngenes = y_mat.shape[0]
    nlibs = y_mat.shape[1]

    # Check design (default: intercept only).
    if design is None:
        design = np.ones((nlibs, 1))

    design = np.asarray(design, dtype=np.float64)
    if design.ndim == 1:
        design = design.reshape(-1, 1)

    # Check AveLogCPM: compute from counts if not supplied.
    if ave_log_cpm is None:
        ave_log_cpm = _ave_log_cpm(y_mat, offset=offset, lib_size=lib_size,
                                   weights=weights, dispersion=dispersion)

    # Check dispersion.
    if dispersion is None:
        if legacy:
            raise ValueError("No dispersion values provided.")
        else:
            # New-style: estimate a common dispersion from the most
            # abundant genes only (top_proportion of genes by AveLogCPM).
            if top_proportion is None:
                df_residual = nlibs - design.shape[1]
                top_proportion = choose_lowess_span(
                    ngenes * np.sqrt(df_residual), small_n=20, min_span=0.02)
            else:
                if top_proportion < 0 or top_proportion > 1:
                    raise ValueError("top_proportion should be between 0 and 1.")
            ntop = int(np.ceil(top_proportion * ngenes))
            i = np.argsort(ave_log_cpm)[::-1][:ntop]
            from .dispersion import estimate_glm_common_disp
            # Subset offsets/weights to the selected genes only when they
            # are per-gene matrices; vectors apply to all genes unchanged.
            if offset is not None:
                off_sub = np.asarray(offset)
                if off_sub.ndim == 2:
                    off_sub = off_sub[i]
            else:
                off_sub = None
            w_sub = None
            if weights is not None:
                w_arr = np.asarray(weights)
                if w_arr.ndim == 2:
                    w_sub = w_arr[i]
            dispersion = estimate_glm_common_disp(
                y_mat[i], design=design, offset=off_sub, weights=w_sub)
    else:
        # Cap dispersion at 4 for non-legacy.
        if not legacy:
            dispersion = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))
            if np.max(dispersion) > 4:
                dispersion = np.minimum(dispersion, 4.0)

    # Fit GLM (prior_count=0.125 matches R's glmFit.default default for logFC shrinkage)
    fit = glm_fit(y_mat, design=design, dispersion=dispersion, offset=offset,
                  lib_size=lib_size, weights=weights, prior_count=0.125)

    # Keep a pristine copy of AveLogCPM for the QL-weight computation,
    # since `ave_log_cpm` itself may be replaced/cleared just below.
    ave_log_cpm2 = ave_log_cpm.copy()

    # Covariate for trended prior: explicit covariate_trend wins; otherwise
    # use abundance (AveLogCPM) or no covariate at all.
    if covariate_trend is None:
        if abundance_trend:
            fit['AveLogCPM'] = ave_log_cpm
        else:
            ave_log_cpm = None
    else:
        ave_log_cpm = covariate_trend

    # Setting residual deviances and df.
    if legacy:
        # Old-style: adjust df for fitted values at zero (cells where both
        # the fit and the count are essentially zero carry no information).
        zerofit = (fit['fitted.values'] < 1e-4) & (fit['counts'] < 1e-4)
        df_residual = residual_df(zerofit, fit['design'])
        fit['df.residual.zeros'] = df_residual
        s2 = fit['deviance'] / np.maximum(df_residual, 1e-8)
        s2[df_residual == 0] = 0
    else:
        # New-style: adjusted deviance and df using QL weights (matching R's C code)
        from .ql_weights import update_prior, compute_adjust_vec

        # Expand dispersion to array form for ql_weights.
        disp_arr = np.atleast_1d(np.asarray(dispersion, dtype=np.float64))

        # Compute average quasi-dispersion via iterative lowess + adjusted deviance.
        ave_ql_disp = update_prior(y_mat, fit['fitted.values'], design,
                                   disp_arr, weights, ave_log_cpm2)

        # Refit with dispersion scaled by average quasi-dispersion, but
        # report the original (unscaled) dispersion on the fit object.
        fit = glm_fit(y_mat, design=design, dispersion=dispersion / ave_ql_disp,
                      offset=offset, lib_size=lib_size, weights=weights,
                      prior_count=0.125)
        fit['dispersion'] = dispersion

        # Compute adjusted deviance, df, and s2 using QL weights.
        out = compute_adjust_vec(y_mat, fit['fitted.values'], design,
                                 disp_arr, ave_ql_disp, weights)
        s2 = out['s2']
        df_residual = out['df']
        fit['df.residual.adj'] = df_residual
        fit['deviance.adj'] = out['deviance']
        fit['average.ql.dispersion'] = ave_ql_disp

    # Empirical Bayes moderation of the quasi-dispersions.
    s2 = np.maximum(s2, 0)
    s2_fit = squeeze_var(s2, df=df_residual, covariate=ave_log_cpm,
                         robust=robust, winsor_tail_p=winsor_tail_p)

    fit['df.prior'] = s2_fit['df_prior']
    fit['s2.post'] = s2_fit['var_post']
    fit['s2.prior'] = s2_fit['var_prior']
    if not legacy:
        fit['top.proportion'] = top_proportion

    return fit
599
+
600
+
601
def _compute_ave_ql_disp(s2, df, ave_log_cpm):
    """Compute average quasi-likelihood dispersion.

    Matches R's update_prior in ql_glm.c: fits a lowess trend of
    s2^(1/4) vs AveLogCPM, takes the 90th percentile of the trend,
    and raises it to the 4th power (floored at 1.0).

    Parameters
    ----------
    s2 : ndarray
        Gene-wise quasi-dispersions.
    df : ndarray
        Gene-wise residual degrees of freedom.
    ave_log_cpm : ndarray
        Average log-CPM per gene (trend covariate).

    Returns
    -------
    float >= 1.0, the average QL dispersion.
    """
    from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess

    threshold = 1e-8

    # Only genes with positive residual df contribute to the trend.
    mask = df > threshold
    x = ave_log_cpm[mask]
    y_vals = np.power(np.maximum(s2[mask], 0), 0.25)  # s2^(1/4)

    # Too few points for a meaningful trend: fall back to the neutral prior.
    if len(x) < 10:
        return 1.0

    # Lowess trend (f=0.5, iter=3 matches R defaults).  The original ran
    # this twice in a loop, but neither x nor y_vals changed between
    # iterations, so the second pass recomputed the identical value; a
    # single pass is equivalent.
    fitted = sm_lowess(y_vals, x, frac=0.5, it=3, return_sorted=False)

    # 90th percentile of the fitted trend.  np.quantile's default 'linear'
    # method matches R's type=7 quantile; np.percentile's `interpolation=`
    # keyword (used previously) was deprecated in NumPy 1.22 and removed
    # in NumPy 2.0.
    p90 = np.quantile(fitted, 0.9)

    # Cap at a minimum of 1.0 (on the ^(1/4) scale) before raising to the
    # 4th power; the final floor keeps the prior at least 1.0.
    if p90 < 1.0:
        p90 = 1.0

    return max(p90 ** 4, 1.0)
637
+
638
+
639
def _expand_to_matrix(x, shape):
    """Expand a scalar, vector, or CompressedMatrix to a full matrix of `shape`."""
    # CompressedMatrix carries its own expansion logic.
    if isinstance(x, CompressedMatrix):
        return x.as_matrix()

    arr = np.asarray(x, dtype=np.float64)
    nrow, ncol = shape

    # Single value: fill the whole matrix with it.
    if arr.ndim == 0 or arr.size == 1:
        return np.full(shape, arr.ravel()[0])

    # Vector input: column-length vectors repeat across rows (per-sample
    # values), row-length vectors repeat across columns (per-gene values).
    if arr.ndim == 1:
        if arr.shape[0] == ncol:
            return np.tile(arr, (nrow, 1))
        if arr.shape[0] == nrow:
            return np.tile(arr.reshape(-1, 1), (1, ncol))

    # Already the right shape: return as-is.
    if arr.shape == shape:
        return arr

    # Last resort: let NumPy broadcasting decide (raises on mismatch).
    return np.broadcast_to(arr, shape).copy()