edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edgepython/splicing.py ADDED
@@ -0,0 +1,537 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Splicing analysis for edgePython.
4
+
5
+ Port of edgeR's diffSplice, diffSpliceDGE, spliceVariants.
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from scipy.stats import f as f_dist, chi2
11
+ from statsmodels.stats.multitest import multipletests
12
+
13
+
14
def _gene_level_columns(exon_genes, first_exon, last_exon):
    """Return the annotation columns whose value is constant within every gene.

    ``first_exon`` and ``last_exon`` hold the inclusive row positions of each
    gene's first and last exon in ``exon_genes``; the rows of one gene must be
    contiguous.
    """
    gene_level = []
    for col in exon_genes.columns:
        vals = exon_genes[col].values
        constant = True
        for start, last in zip(first_exon, last_exon):
            first_val = vals[start]
            # Compare every later exon of the gene against its first exon.
            if any(vals[ei] != first_val for ei in range(start + 1, last + 1)):
                constant = False
                break
        if constant:
            gene_level.append(col)
    return gene_level


def diff_splice(glmfit, coef=None, contrast=None, geneid=None, exonid=None,
                prior_count=0.125, robust=None, verbose=True):
    """Test for differential exon/transcript usage.

    Port of edgeR's diffSpliceDGE. Tests whether the log-fold-change for each
    exon differs from the overall gene-level log-fold-change, i.e. tests for
    differential *usage* rather than differential expression.

    Parameters
    ----------
    glmfit : dict (DGEGLM-like)
        Fitted GLM from glm_fit() or glm_ql_fit(). The keys read here are
        'counts', 'design', 'coefficients', 'deviance', 'df.residual' and
        'offset', plus the optional 'genes', 'weights', 'dispersion',
        'df.prior', 'average.ql.dispersion', 'df.residual.zeros',
        'df.residual.adj' and 'deviance.adj'. A missing 'df.prior' selects
        the likelihood-ratio path; otherwise quasi-likelihood F-tests are used.
    coef : int, optional
        Coefficient to test (0-indexed). Default is the last column. Negative
        values count from the end, as in normal Python indexing.
    contrast : ndarray, optional
        Contrast vector. Takes precedence over ``coef``. For a 2-D array only
        the first column is used.
    geneid : ndarray or str, optional
        Gene IDs for each exon/transcript, or the name of the column of
        ``glmfit['genes']`` holding them. If omitted, the columns 'GeneID',
        'geneid', 'gene_id' and 'Gene' are tried in that order.
    exonid : ndarray or str, optional
        Exon/transcript IDs, or the name of the column holding them. Used
        only as a secondary sort key within genes.
    prior_count : float
        Prior count for gene-level GLM fit.
    robust : bool or None
        Use robust empirical Bayes for squeezeVar. None = auto-detect (True
        when 'df.prior' has more than one element).
    verbose : bool
        Print summary counts of exons and genes.

    Returns
    -------
    dict
        Exon-level entries are ordered by gene ID (then exon ID) and cover
        only genes with more than one exon. Keys:

        - 'comparison': name of the tested coefficient or contrast
        - 'design': design matrix used (reformed when a contrast is given)
        - 'coefficients': exon coefficients relative to the gene coefficient
        - 'genes': exon annotation DataFrame
        - 'genecolname', 'exoncolname': annotation column names
        - 'exon.df.test', 'exon.p.value'
        - 'exon.LR' (LRT path) or 'exon.F' (QL path)
        - 'gene.df.test', 'gene.p.value', 'gene.Simes.p.value'
        - 'gene.LR' (LRT path) or 'gene.df.prior', 'gene.df.residual',
          'gene.F' (QL path)
        - 'gene.genes': one row per gene with the gene-level annotation
          columns and 'NExons'

    Raises
    ------
    ValueError
        If no gene IDs can be determined, the design has fewer than two
        columns, or no gene has more than one exon.
    IndexError
        If ``coef`` is outside the range of design columns.
    """
    from .glm_fit import glm_fit
    from .limma_port import squeeze_var, contrast_as_coef
    from .utils import expand_as_matrix

    # --- Detect LRT vs QL ---
    isLRT = glmfit.get('df.prior') is None
    if robust is None and not isLRT:
        df_prior = glmfit['df.prior']
        robust = hasattr(df_prior, '__len__') and len(np.atleast_1d(df_prior)) > 1

    # --- Get gene and exon IDs ---
    exon_genes = glmfit.get('genes')
    nexons = glmfit['counts'].shape[0]
    design = np.asarray(glmfit['design'], dtype=np.float64)

    if exon_genes is None:
        exon_genes = pd.DataFrame({'ExonID': np.arange(nexons)})
    else:
        # Work on a copy so ID columns added below do not leak into glmfit.
        exon_genes = exon_genes.copy()

    genecolname = 'GeneID'
    if geneid is None:
        if isinstance(exon_genes, pd.DataFrame):
            for col in ['GeneID', 'geneid', 'gene_id', 'Gene']:
                if col in exon_genes.columns:
                    geneid = exon_genes[col].values
                    genecolname = col
                    break
        if geneid is None:
            raise ValueError("geneid must be provided")
    elif isinstance(geneid, str):
        genecolname = geneid
        geneid = exon_genes[geneid].values
    else:
        exon_genes['GeneID'] = geneid
        genecolname = 'GeneID'

    exoncolname = None
    if exonid is not None:
        if isinstance(exonid, str):
            exoncolname = exonid
            exonid = exon_genes[exonid].values
        else:
            exon_genes['ExonID'] = exonid
            exoncolname = 'ExonID'

    # --- Sort by geneid (+exonid) ---
    # After this step the exons of each gene are contiguous, which the
    # cumulative-sum indexing further down relies on.
    geneid = np.asarray(geneid)
    if exonid is not None:
        exonid = np.asarray(exonid)
        o = np.lexsort((exonid, geneid))
    else:
        o = np.argsort(geneid, kind='stable')

    geneid = geneid[o]
    exon_genes = exon_genes.iloc[o].reset_index(drop=True)

    # Subset glmfit arrays by o
    counts = glmfit['counts'][o]
    coefficients = glmfit['coefficients'][o]
    deviance = glmfit['deviance'][o]
    df_residual_orig = glmfit['df.residual'][o]

    # Handle offset: could be matrix or vector
    offset_full = glmfit['offset']
    if offset_full.ndim == 2:
        offset_full = offset_full[o]
    else:
        offset_full = expand_as_matrix(offset_full, (nexons, counts.shape[1]))
        offset_full = offset_full[o]

    weights = glmfit.get('weights')
    if weights is not None:
        if np.ndim(weights) == 2:
            weights = weights[o]
        else:
            weights = expand_as_matrix(weights, (nexons, counts.shape[1]))
            weights = weights[o]

    dispersion_orig = glmfit.get('dispersion')
    if dispersion_orig is not None:
        dispersion_orig = np.atleast_1d(dispersion_orig)
        # Only a per-exon dispersion vector needs reordering.
        if len(dispersion_orig) == nexons:
            dispersion_orig = dispersion_orig[o]

    nbeta = design.shape[1]
    if nbeta < 2:
        raise ValueError("Need at least two columns for design")
    coef_names = [f"x{i}" for i in range(nbeta)]

    # --- Handle contrast or coef ---
    if contrast is not None:
        contrast = np.asarray(contrast, dtype=np.float64)
        if contrast.ndim == 2:
            contrast = contrast[:, 0]
        # first=True is passed and column 0 of the returned design is then
        # tested — presumably the contrast becomes the first coefficient.
        reform = contrast_as_coef(design, contrast, first=True)
        coef_idx = 0
        beta = coefficients @ contrast
        coef_name = ' '.join(f"{contrast[j]}*{coef_names[j]}" for j in range(len(contrast)) if contrast[j] != 0)
        design = reform['design']
    else:
        if coef is None:
            coef_idx = nbeta - 1
        else:
            if not -nbeta <= coef < nbeta:
                raise IndexError(
                    f"coef {coef} is out of range for a design with {nbeta} columns")
            # Normalise negative indices: the slices coef_idx:coef_idx+1 used
            # below would otherwise be empty for e.g. coef=-1.
            coef_idx = coef % nbeta
        coef_name = coef_names[coef_idx]
        beta = coefficients[:, coef_idx]

    design0 = np.delete(design, coef_idx, axis=1)

    # --- Count exons per gene ---
    unique_genes, gene_idx = np.unique(geneid, return_inverse=True)
    # But we need reorder=False behavior (preserve order of first appearance)
    # R's rowsum(reorder=FALSE) preserves the order of geneid as encountered
    _, first_idx = np.unique(geneid, return_index=True)
    order_by_first = np.argsort(first_idx)
    unique_genes_ordered = unique_genes[order_by_first]
    # Remap gene_idx to match this ordering
    remap = np.empty(len(unique_genes), dtype=int)
    remap[order_by_first] = np.arange(len(unique_genes))
    g = remap[gene_idx]  # gene index for each exon (0-based, ordered by first appearance)

    gene_nexons = np.bincount(g)
    ngenes_total = len(unique_genes_ordered)

    if verbose:
        print(f"Total number of exons: {nexons}")
        print(f"Total number of genes: {ngenes_total}")
        print(f"Number of genes with 1 exon: {np.sum(gene_nexons == 1)}")
        print(f"Mean number of exons in a gene: {np.round(np.mean(gene_nexons)):.0f}")
        print(f"Max number of exons in a gene: {np.max(gene_nexons)}")

    # --- Filter to genes with >1 exon ---
    gene_keep = gene_nexons > 1
    ngenes = int(np.sum(gene_keep))
    if ngenes == 0:
        raise ValueError("No genes with more than one exon")

    exon_keep = gene_keep[g]
    geneid = geneid[exon_keep]
    exon_genes = exon_genes.iloc[exon_keep].reset_index(drop=True)
    beta = beta[exon_keep]
    counts = counts[exon_keep]
    offset_full = offset_full[exon_keep]
    deviance = deviance[exon_keep]
    df_residual_orig = df_residual_orig[exon_keep]
    if weights is not None:
        weights = weights[exon_keep]
    if dispersion_orig is not None and len(dispersion_orig) > 1:
        dispersion_orig = dispersion_orig[exon_keep]

    gene_nexons = gene_nexons[gene_keep]
    unique_genes_ordered = unique_genes_ordered[gene_keep]
    # Rebuild g for kept exons (valid because each gene's exons are contiguous)
    g = np.repeat(np.arange(ngenes), gene_nexons)

    nlib = counts.shape[1]
    nexons_kept = counts.shape[0]

    # --- Gene-level counts and GLM fit ---
    gene_counts = np.zeros((ngenes, nlib), dtype=np.float64)
    np.add.at(gene_counts, g, counts)

    # Gene-level offset: use first row's offset (R uses offset[1,])
    gene_offset = offset_full[0, :]

    fit_gene = glm_fit(gene_counts, design, dispersion=0.05,
                       offset=gene_offset, prior_count=prior_count)

    # --- Gene-level betabar, expand to exon level ---
    gene_betabar = fit_gene['coefficients'][:, coef_idx:coef_idx+1]  # (ngenes, 1)
    gene_betabar_exon = gene_betabar[g]  # (nexons_kept, 1)

    # New offset = original offset + gene_betabar @ design[:,coef].T
    design_coef_col = design[:, coef_idx:coef_idx+1]  # (nlib, 1)
    offset_new = offset_full + gene_betabar_exon @ design_coef_col.T  # (nexons_kept, nlib)

    # --- Relative coefficients ---
    coefficients_rel = beta - gene_betabar_exon.ravel()

    # --- Dispersion for reduced model fit ---
    if glmfit.get('average.ql.dispersion') is not None:
        ave_ql_disp = glmfit['average.ql.dispersion']
        if dispersion_orig is not None:
            dispersion = dispersion_orig / ave_ql_disp
        else:
            dispersion = 0.05
    else:
        dispersion = dispersion_orig if dispersion_orig is not None else 0.05

    # --- Fit reduced model ---
    fit0 = glm_fit(counts, design=design0, offset=offset_new,
                   dispersion=dispersion, weights=weights, prior_count=0)

    # --- Deviance differences ---
    exon_LR = fit0['deviance'] - deviance
    gene_LR = np.zeros(ngenes)
    np.add.at(gene_LR, g, exon_LR)

    exon_df_test = fit0['df.residual'] - df_residual_orig
    gene_df_test = np.zeros(ngenes)
    np.add.at(gene_df_test, g, exon_df_test)

    # --- Get adjusted df/deviance for QL path ---
    if not isLRT:
        if glmfit.get('df.residual.zeros') is not None:
            exon_df_residual = glmfit['df.residual.zeros'][o][exon_keep]
            exon_deviance = glmfit['deviance'][o][exon_keep]
        elif glmfit.get('df.residual.adj') is not None:
            exon_df_residual = glmfit['df.residual.adj'][o][exon_keep]
            exon_deviance = glmfit['deviance.adj'][o][exon_keep]
        else:
            exon_df_residual = df_residual_orig
            exon_deviance = deviance

    # --- Statistical tests ---
    if isLRT:
        # Chi-squared tests
        exon_p_value = chi2.sf(exon_LR, df=exon_df_test)
        gene_p_value = chi2.sf(gene_LR, df=gene_df_test)
    else:
        # QL F-tests
        gene_df_residual = np.zeros(ngenes)
        np.add.at(gene_df_residual, g, exon_df_residual)

        gene_s2_num = np.zeros(ngenes)
        np.add.at(gene_s2_num, g, exon_deviance)
        gene_s2 = gene_s2_num / gene_df_residual

        squeeze = squeeze_var(gene_s2, gene_df_residual, robust=robust)
        gene_df_total = gene_df_residual + squeeze['df_prior']
        # Cap the total df at the pooled residual df.
        gene_df_total = np.minimum(gene_df_total, np.sum(gene_df_residual))
        gene_s2_post = squeeze['var_post']

        # Exon-level F and p-values
        exon_F = exon_LR / exon_df_test / gene_s2_post[g]
        gene_F = gene_LR / gene_df_test / gene_s2_post

        exon_p_value = f_dist.sf(exon_F, dfn=exon_df_test, dfd=gene_df_total[g])
        gene_p_value = f_dist.sf(gene_F, dfn=gene_df_test, dfd=gene_df_total)

        # Clamp exon p-values when s2.post < 1 and df.residual.zeros available
        if glmfit.get('df.residual.zeros') is not None:
            low_var = gene_s2_post[g] < 1
            if np.any(low_var):
                chisq_pvalue = chi2.sf(exon_LR[low_var], df=exon_df_test[low_var])
                exon_p_value[low_var] = np.maximum(exon_p_value[low_var], chisq_pvalue)

    # --- Simes aggregation for gene-level p-values ---
    # R code: sort exon p-values within each gene, compute Simes statistic
    # o <- order(g, exon.p.value)
    simes_order = np.lexsort((exon_p_value, g))
    p_sorted = exon_p_value[simes_order]

    # 1-based rank of each exon's p-value within its gene
    gene_boundaries = np.cumsum(gene_nexons)
    gene_starts = gene_boundaries - gene_nexons
    r = np.arange(1, nexons_kept + 1) - np.repeat(gene_starts, gene_nexons)

    # pp = p * nexons_per_gene / rank
    pp = p_sorted * np.repeat(gene_nexons, gene_nexons) / r

    # oo <- order(-g, pp, decreasing=TRUE)
    # Reversing the (-g, pp) ordering gives genes ascending with pp descending
    # inside each gene, so the last entry of every gene block is its minimum.
    oo = np.lexsort((pp, -g))[::-1]
    gene_Simes_p_value = pp[oo][gene_boundaries - 1]

    # --- Build output ---
    result = {}
    result['comparison'] = coef_name
    result['design'] = design
    result['coefficients'] = coefficients_rel
    result['genes'] = exon_genes
    result['genecolname'] = genecolname
    result['exoncolname'] = exoncolname
    result['exon.df.test'] = exon_df_test

    if isLRT:
        result['exon.LR'] = exon_LR
    else:
        result['exon.F'] = exon_F

    result['exon.p.value'] = exon_p_value
    result['gene.df.test'] = gene_df_test

    if isLRT:
        result['gene.LR'] = gene_LR
    else:
        result['gene.df.prior'] = squeeze['df_prior']
        result['gene.df.residual'] = gene_df_residual
        result['gene.F'] = gene_F

    result['gene.p.value'] = gene_p_value
    result['gene.Simes.p.value'] = gene_Simes_p_value

    # --- Gene-level genes table ---
    # Keep only annotation columns that are constant within every gene and
    # take one row (the last exon) per gene.
    exon_lastexon = gene_boundaries - 1
    exon_firstexon = exon_lastexon - gene_nexons + 1
    gene_level_cols = _gene_level_columns(exon_genes, exon_firstexon, exon_lastexon)
    gene_genes = exon_genes[gene_level_cols].iloc[exon_lastexon].copy().reset_index(drop=True)
    gene_genes['NExons'] = gene_nexons
    result['gene.genes'] = gene_genes

    return result
407
+
408
+
409
def diff_splice_dge(y, geneid=None, exonid=None, group=None,
                    dispersion='auto', prior_count=0.125):
    """Test for differential exon usage between groups using exact test.

    Port of edgeR's diffSpliceDGE.

    Runs ``exact_test`` on the exon-level data for the two groups and combines
    the exon p-values of each multi-exon gene with Simes' method. Gene-level
    p-values are then adjusted with Benjamini-Hochberg.

    Parameters
    ----------
    y : DGEList-like dict
        DGEList with exon-level counts. ``y['samples']['group']`` is read when
        ``group`` is not given, and ``y['genes']`` when ``geneid`` is a column
        name or omitted.
    geneid : ndarray or str, optional
        Gene IDs for each exon, or the name of the column of ``y['genes']``
        holding them. If omitted, the columns 'GeneID', 'geneid', 'gene_id'
        and 'Gene' of ``y['genes']`` are tried in that order.
    exonid : ndarray or str, optional
        Exon IDs. Accepted for interface compatibility; not used here.
    group : ndarray, optional
        Group factor. Must contain exactly two distinct values.
    dispersion : str or ndarray
        Dispersion, passed through to ``exact_test``.
    prior_count : float
        Prior count, passed through to ``exact_test``.

    Returns
    -------
    dict
        - 'gene.table': DataFrame with GeneID, NExons, PValue (Simes; 1.0 for
          single-exon genes) and FDR, one row per gene in sorted ID order
        - 'exon.table': the table returned by ``exact_test``
        - 'comparison': the 'comparison' entry of the ``exact_test`` result

    Raises
    ------
    ValueError
        If the number of groups is not two, or gene IDs cannot be determined.
    IndexError
        If the number of gene IDs differs from the number of tested exons.
    """
    from .exact_test import exact_test

    if group is None and isinstance(y, dict):
        group = y['samples']['group'].values

    unique_groups = np.unique(group)
    if len(unique_groups) != 2:
        raise ValueError("Exactly 2 groups required for diffSpliceDGE")

    # Run exact test
    result = exact_test(y, pair=unique_groups[:2].tolist(), dispersion=dispersion,
                        prior_count=prior_count)

    # Resolve gene IDs. A missing ID vector or annotation table is an error:
    # np.asarray(None) / np.asarray('col') would otherwise be treated as a
    # single bogus gene and produce a meaningless one-row table.
    if geneid is None or isinstance(geneid, str):
        genes = y.get('genes')
        if geneid is None:
            if isinstance(genes, pd.DataFrame):
                for col in ('GeneID', 'geneid', 'gene_id', 'Gene'):
                    if col in genes.columns:
                        geneid = genes[col].values
                        break
            if geneid is None:
                raise ValueError("geneid must be provided")
        else:
            if genes is None:
                raise ValueError(
                    f"geneid column {geneid!r} requested but y has no 'genes' table")
            geneid = genes[geneid].values
    geneid = np.asarray(geneid)

    p_exon = result['table']['PValue'].values
    if len(geneid) != len(p_exon):
        raise IndexError(
            f"geneid has {len(geneid)} entries but {len(p_exon)} exons were tested")

    # Group exons by gene with one stable sort instead of one mask per gene.
    unique_genes, inverse = np.unique(geneid, return_inverse=True)
    inverse = np.ravel(inverse)
    ngenes = len(unique_genes)
    gene_nexons = np.bincount(inverse, minlength=ngenes)
    order = np.argsort(inverse, kind='stable')
    ends = np.cumsum(gene_nexons)
    starts = ends - gene_nexons

    # Simes aggregation: min over ranks k of p_(k) * n / k, capped at 1.
    gene_pvalue = np.ones(ngenes)
    for g_idx in range(ngenes):
        n_exons = gene_nexons[g_idx]
        if n_exons <= 1:
            continue
        p_sorted = np.sort(p_exon[order[starts[g_idx]:ends[g_idx]]])
        gene_pvalue[g_idx] = min(np.min(p_sorted * n_exons / np.arange(1, n_exons + 1)), 1.0)

    _, gene_fdr, _, _ = multipletests(gene_pvalue, method='fdr_bh')

    return {
        'gene.table': pd.DataFrame({
            'GeneID': unique_genes,
            'NExons': gene_nexons,
            'PValue': gene_pvalue,
            'FDR': gene_fdr
        }),
        'exon.table': result['table'],
        'comparison': result.get('comparison')
    }
480
+
481
+
482
def splice_variants(y, geneids, dispersion=None):
    """Identify genes with splice variants.

    Port of edgeR's spliceVariants.

    For every gene with more than one exon, a Pearson chi-squared test for
    homogeneity of exon proportions across libraries is computed on the
    exons-by-libraries count table of that gene.

    Parameters
    ----------
    y : DGEList-like dict or array-like
        Exon-level count data: either a dict with a 'counts' entry or the
        count matrix itself (exons in rows, libraries in columns). Lists and
        DataFrames are accepted; the data is converted to a float array.
    geneids : ndarray
        Gene IDs for each exon (one per row of the count matrix).
    dispersion : float or ndarray, optional
        Dispersion values. Accepted for interface compatibility; this
        implementation does not use it.

    Returns
    -------
    DataFrame
        One row per gene, in sorted gene-ID order, with columns GeneID,
        NExons, Chisq and PValue. Genes with a single exon or with no counts
        at all get Chisq 0 and PValue 1. An empty input gives an empty frame
        with the same columns.

    Raises
    ------
    IndexError
        If the number of gene IDs differs from the number of count rows.
    """
    columns = ['GeneID', 'NExons', 'Chisq', 'PValue']

    raw_counts = y['counts'] if isinstance(y, dict) and 'counts' in y else y
    # Always work on a float ndarray so lists and DataFrames behave alike.
    counts = np.asarray(raw_counts, dtype=np.float64)
    geneids = np.asarray(geneids)

    if counts.shape[0] != geneids.shape[0]:
        raise IndexError(
            f"geneids has {geneids.shape[0]} entries but counts has "
            f"{counts.shape[0]} rows")

    unique_genes, inverse = np.unique(geneids, return_inverse=True)
    if unique_genes.size == 0:
        return pd.DataFrame(columns=columns)

    # Group rows by gene with one stable sort; each gene then owns the
    # contiguous slice order[start:end], with its rows in input order.
    inverse = np.ravel(inverse)
    order = np.argsort(inverse, kind='stable')
    ends = np.cumsum(np.bincount(inverse, minlength=len(unique_genes)))
    starts = np.concatenate(([0], ends[:-1]))

    results = []
    for gene, start, end in zip(unique_genes, starts, ends):
        n_exons = end - start
        chi_sq, p_value = 0.0, 1.0

        if n_exons > 1:
            gene_counts = counts[order[start:end]]
            grand_total = gene_counts.sum()

            # A gene without any reads cannot be tested; keep the defaults.
            if grand_total != 0:
                # Chi-squared test for homogeneity of proportions
                col_totals = gene_counts.sum(axis=0)
                row_totals = gene_counts.sum(axis=1)
                expected = np.outer(row_totals, col_totals) / grand_total
                # Floor the expected counts to avoid division by zero.
                expected = np.maximum(expected, 1e-10)
                chi_sq = np.sum((gene_counts - expected) ** 2 / expected)
                df = (n_exons - 1) * (gene_counts.shape[1] - 1)
                p_value = chi2.sf(chi_sq, df) if df > 0 else 1.0

        results.append({'GeneID': gene, 'NExons': n_exons,
                        'Chisq': chi_sq, 'PValue': p_value})

    return pd.DataFrame(results, columns=columns)