oncosplice 3.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. oncosplice-3.2.0/LICENSE +21 -0
  2. oncosplice-3.2.0/PKG-INFO +313 -0
  3. oncosplice-3.2.0/README.md +251 -0
  4. oncosplice-3.2.0/oncosplice/__init__.py +82 -0
  5. oncosplice-3.2.0/oncosplice/_geney_compat.py +138 -0
  6. oncosplice-3.2.0/oncosplice/engine.py +1200 -0
  7. oncosplice-3.2.0/oncosplice/engines/__init__.py +129 -0
  8. oncosplice-3.2.0/oncosplice/engines/_pangolin_arch.py +140 -0
  9. oncosplice-3.2.0/oncosplice/engines/_vendor/__init__.py +9 -0
  10. oncosplice-3.2.0/oncosplice/engines/_vendor/spliceformer/LICENSE +21 -0
  11. oncosplice-3.2.0/oncosplice/engines/_vendor/spliceformer/__init__.py +16 -0
  12. oncosplice-3.2.0/oncosplice/engines/_vendor/spliceformer/model.py +350 -0
  13. oncosplice-3.2.0/oncosplice/engines/_vendor/spliceformer/weight_init.py +20 -0
  14. oncosplice-3.2.0/oncosplice/engines/base.py +110 -0
  15. oncosplice-3.2.0/oncosplice/engines/ensemble.py +53 -0
  16. oncosplice-3.2.0/oncosplice/engines/openspliceai.py +141 -0
  17. oncosplice-3.2.0/oncosplice/engines/pangolin.py +192 -0
  18. oncosplice-3.2.0/oncosplice/engines/spliceai_keras.py +98 -0
  19. oncosplice-3.2.0/oncosplice/engines/spliceai_pytorch.py +242 -0
  20. oncosplice-3.2.0/oncosplice/engines/spliceformer.py +189 -0
  21. oncosplice-3.2.0/oncosplice/results.py +440 -0
  22. oncosplice-3.2.0/oncosplice/scoring/__init__.py +31 -0
  23. oncosplice-3.2.0/oncosplice/scoring/epistasis.py +625 -0
  24. oncosplice-3.2.0/oncosplice/scoring/fingerprint.py +169 -0
  25. oncosplice-3.2.0/oncosplice/scoring/oncosplice.py +112 -0
  26. oncosplice-3.2.0/oncosplice/scoring/site_query.py +225 -0
  27. oncosplice-3.2.0/oncosplice/scoring/splicing.py +201 -0
  28. oncosplice-3.2.0/oncosplice/variants.py +137 -0
  29. oncosplice-3.2.0/oncosplice/viz.py +846 -0
  30. oncosplice-3.2.0/oncosplice/weights/__init__.py +239 -0
  31. oncosplice-3.2.0/oncosplice/weights/__main__.py +8 -0
  32. oncosplice-3.2.0/oncosplice/weights/manifest.json +45 -0
  33. oncosplice-3.2.0/oncosplice.egg-info/PKG-INFO +313 -0
  34. oncosplice-3.2.0/oncosplice.egg-info/SOURCES.txt +42 -0
  35. oncosplice-3.2.0/oncosplice.egg-info/dependency_links.txt +1 -0
  36. oncosplice-3.2.0/oncosplice.egg-info/entry_points.txt +2 -0
  37. oncosplice-3.2.0/oncosplice.egg-info/requires.txt +42 -0
  38. oncosplice-3.2.0/oncosplice.egg-info/top_level.txt +1 -0
  39. oncosplice-3.2.0/pyproject.toml +104 -0
  40. oncosplice-3.2.0/setup.cfg +4 -0
  41. oncosplice-3.2.0/tests/test_engines_selfcontained.py +63 -0
  42. oncosplice-3.2.0/tests/test_scoring.py +129 -0
  43. oncosplice-3.2.0/tests/test_spliceai_equivalence.py +146 -0
  44. oncosplice-3.2.0/tests/test_variants.py +43 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nicolas Lynn
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,313 @@
1
+ Metadata-Version: 2.4
2
+ Name: oncosplice
3
+ Version: 3.2.0
4
+ Summary: Sequence-level pipeline for splicing-epistasis analysis of single, double, and N-variant constructs
5
+ Author-email: Nicolas Lynn <nicolasalynn@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/nicolasalynn/oncosplice
8
+ Project-URL: Documentation, https://nicolasalynn.github.io/oncosplice
9
+ Project-URL: Repository, https://github.com/nicolasalynn/oncosplice
10
+ Project-URL: Issues, https://github.com/nicolasalynn/oncosplice/issues
11
+ Project-URL: Changelog, https://github.com/nicolasalynn/oncosplice/blob/main/CHANGELOG.md
12
+ Keywords: splicing,epistasis,oncosplice,spliceai,openspliceai,pangolin,spliceformer,cancer-genomics,bioinformatics,rna-splicing
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Intended Audience :: Healthcare Industry
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
23
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.10
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: numpy>=1.23
29
+ Requires-Dist: pandas>=2.2.2
30
+ Requires-Dist: matplotlib>=3.9
31
+ Requires-Dist: biopython>=1.84
32
+ Requires-Dist: seqmat>=1.3
33
+ Requires-Dist: huggingface_hub>=0.20
34
+ Provides-Extra: openspliceai
35
+ Requires-Dist: torch>=2.0; extra == "openspliceai"
36
+ Requires-Dist: openspliceai; extra == "openspliceai"
37
+ Provides-Extra: spliceai-pytorch
38
+ Requires-Dist: torch>=2.0; extra == "spliceai-pytorch"
39
+ Provides-Extra: pangolin
40
+ Requires-Dist: torch>=2.0; extra == "pangolin"
41
+ Provides-Extra: spliceformer
42
+ Requires-Dist: torch>=2.0; extra == "spliceformer"
43
+ Requires-Dist: einops; extra == "spliceformer"
44
+ Provides-Extra: spliceai-keras
45
+ Requires-Dist: tensorflow>=2.8; extra == "spliceai-keras"
46
+ Requires-Dist: spliceai; extra == "spliceai-keras"
47
+ Provides-Extra: all
48
+ Requires-Dist: torch>=2.0; extra == "all"
49
+ Requires-Dist: openspliceai; extra == "all"
50
+ Requires-Dist: einops; extra == "all"
51
+ Provides-Extra: protein
52
+ Requires-Dist: geney>=2.0; extra == "protein"
53
+ Provides-Extra: dev
54
+ Requires-Dist: pytest>=7; extra == "dev"
55
+ Requires-Dist: pytest-cov; extra == "dev"
56
+ Requires-Dist: ruff; extra == "dev"
57
+ Provides-Extra: docs
58
+ Requires-Dist: mkdocs-material>=9.5; extra == "docs"
59
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
60
+ Requires-Dist: pymdown-extensions>=10; extra == "docs"
61
+ Dynamic: license-file
62
+
63
+ # oncosplice
64
+
65
+ [![PyPI](https://img.shields.io/pypi/v/oncosplice.svg)](https://pypi.org/project/oncosplice/)
66
+ [![CI](https://github.com/nicolasalynn/oncosplice/actions/workflows/ci.yml/badge.svg)](https://github.com/nicolasalynn/oncosplice/actions/workflows/ci.yml)
67
+ [![codecov](https://codecov.io/gh/nicolasalynn/oncosplice/branch/main/graph/badge.svg)](https://codecov.io/gh/nicolasalynn/oncosplice)
68
+ [![Docs](https://img.shields.io/badge/docs-mkdocs-blue)](https://nicolasalynn.github.io/oncosplice)
69
+ [![Python](https://img.shields.io/pypi/pyversions/oncosplice.svg)](https://pypi.org/project/oncosplice/)
70
+ [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
71
+
72
+ > Given two (or more) mutations in the same gene, classify how their joint
73
+ > effect on splicing differs from the additive prediction — into one of four
74
+ > mutually-exclusive mechanism classes: **rescue**, **cryptic rescue**,
75
+ > **deletion synergy**, or **cryptic synergy**.
76
+
77
+ **oncosplice** is a sequence-level pipeline for splicing-epistasis analysis of
78
+ single-, double-, and N-variant constructs. It runs a splice-site predictor
79
+ (SpliceAI, OpenSpliceAI, Pangolin, or Spliceformer) under each variant context,
80
+ computes per-site residuals against the additive expectation, and applies a
81
+ crisp 4-class mechanistic classifier.
82
+
83
+ Implements the algorithms from:
84
+
85
+ 1. *Detecting and understanding meaningful cancerous mutations based on computational models of mRNA splicing* — Lynn & Tuller, *npj Systems Biology and Applications* 2024.
86
+ 2. *Large-scale insight into missplicing, intra-gene epistasis and its relevance to human cancer* — in preparation.
87
+
88
+ ```bash
89
+ pip install oncosplice[spliceai_pytorch]
90
+ ```
91
+
92
+ ## What it does
93
+
94
+ Given two (or more) genomic variants in the same gene, oncosplice answers:
95
+
96
+ - **Single-variant impact.** For each mutation alone, how much does it perturb every splice site in the gene? `analyze_single()`.
97
+ - **Joint behavior.** What does splicing look like when both mutations co-occur, and how does that compare to the additive prediction? `analyze_pair()` / `analyze_multi()`.
98
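+ - **Mechanism.** Is the joint effect a *synergy* (the joint event deletes an annotated site, or creates a cryptic one, beyond the additive prediction), a *rescue* (a single disrupts and the joint restores WT, or a single creates a cryptic site and the joint silences it), or just non-epistatic dominance / noise? Per-site and pair-level classification.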
99
+ - **Bulk classification.** Run the same analysis over a DataFrame of hundreds of thousands of pairs with per-gene scheduling, batched inference, and resumable checkpointing. `scan()` / `classify_dataframe()`.
100
+
101
+ ## Install
102
+
103
+ ```bash
104
+ # Recommended — original SpliceAI weights, PyTorch backbone (no TF dependency)
105
+ pip install oncosplice[spliceai_pytorch]
106
+
107
+ # Or pick another engine
108
+ pip install oncosplice[openspliceai] # OpenSpliceAI (MANE-trained, retrained)
109
+ pip install oncosplice[pangolin] # Pangolin (40-model multi-tissue)
110
+ pip install oncosplice[spliceformer] # Spliceformer (40k transformer)
111
+ pip install oncosplice[all] # all 4 production engines
112
+
113
+ # Optional add-ons
114
+ pip install oncosplice[protein] # protein-divergence score (Lynn & Tuller 2024)
115
+ ```
116
+
117
+ Core requires `numpy`, `pandas`, `matplotlib`, `biopython`, `seqmat`. The
118
+ classification core (`analyze_pair`, `scan`, `classify_dataframe`) has **no
119
+ `geney` dependency** — `geney` is only needed for the protein-divergence score
120
+ path (`[protein]` extra).
121
+
122
+ **Model weights download automatically** from the [Hugging Face Hub](https://huggingface.co/nicolynnvila/oncosplice-weights)
123
+ on first use and are cached in `~/.oncosplice/weights/` — no manual step. Set
124
+ `ONCOSPLICE_AUTO_DOWNLOAD=0` to require an explicit `oncosplice-download-weights`
125
+ instead (useful offline / in CI).
126
+
127
+ ## Highlights
128
+
129
+ - **Four production engines under one interface** — SpliceAI (PyTorch port,
130
+ numerically identical to Keras), OpenSpliceAI, Pangolin, Spliceformer. Swap
131
+ with one string. Cross-engine ensembling via `ensemble:a,b,c`.
132
+ - **Four-class mechanistic classifier** — rescue / cryptic rescue / deletion
133
+ synergy / cryptic synergy, defined on probability bands with a hard
134
+ WT-vs-annotation prerequisite that filters predictor noise.
135
+ - **TCGA-scale runner** — `classify_dataframe()` does per-gene grouping +
136
+ batched inference + resumable checkpointing. ~23× faster than per-pair after
137
+ the 3.2.0 vectorization; 800k pairs in ~22 hours on an L40S.
138
+ - **Numerical parity tests** between Keras SpliceAI and the PyTorch port so
139
+ the migration is auditable.
140
+ - **Pure-Python scoring core** (`oncosplice.scoring`) with no model
141
+ dependencies — usable as a library in other splicing-prediction stacks.
142
+
143
+ ## Quickstart
144
+
145
+ ```python
146
+ from oncosplice import OncospliceEngine
147
+
148
+ eng = OncospliceEngine(splicing_engine="spliceai_pytorch")
149
+
150
+ # Single variant — does this mutation cause missplicing?
151
+ single = eng.analyze_single("KRAS:12:25227344:A:T")
152
+ print(single.summary())
153
+ print(single.missplicing.to_dataframe()) # missed + discovered sites
154
+ single.plot_missplicing()
155
+
156
+ # Pair — what happens when both mutations co-occur?
157
+ pair = eng.analyze_pair("KRAS:12:25227343:G:T", "KRAS:12:25227344:A:T")
158
+ print(pair.pair_classification) # → "rescue"
159
+ print(pair.epistatic_sites()) # only the synergy / rescue sites (non-epistatic sites dropped)
160
+ pair.plot_case_study() # the bar figure
161
+
162
+ # N-variant (higher-order)
163
+ multi = eng.analyze_multi([
164
+ "KRAS:12:25227343:G:T",
165
+ "KRAS:12:25227344:A:T",
166
+ "KRAS:12:25227345:G:C",
167
+ ])
168
+ ```
169
+
170
+ ### Bulk classification of a DataFrame
171
+
172
+ ```python
173
+ import pandas as pd
174
+ df = pd.read_csv("pairs.csv") # column: epistasis_id (e.g. "GENE:CHR:POS:REF:ALT|GENE:CHR:POS:REF:ALT")
175
+
176
+ out = eng.classify_dataframe(
177
+ df, epistasis_id_col="epistasis_id",
178
+ checkpoint_path="results.csv",
179
+ )
180
+ # adds: pair_classification, max_abs_residual, max_abs_event_delta,
181
+ # n_del_syn, n_cryp_syn, n_rescue, n_cryp_rescue, engine, error
182
+ ```
183
+
184
+ Per-gene grouping + batched `scan()` underneath — typically 10–40× faster than the per-pair path on TCGA-shaped datasets. The runner is resume-safe (re-running with the same checkpoint path skips already-done pairs) and emits both per-pair and per-single CSVs.
185
+
186
+ ### Engine-only API (no geney needed)
187
+
188
+ ```python
189
+ from oncosplice.engines import get_predictor, list_available_engines
190
+ print(list_available_engines())
191
+
192
+ p = get_predictor("spliceai_pytorch")
193
+ pred = p.predict(padded_sequence) # → SplicingPrediction(acceptor, donor)
194
+ ```
195
+
196
+ ## The classifier — 4 mechanism classes
197
+
198
+ At every splice site, given four predicted probabilities `ref`, `mut1`,
199
+ `mut2`, `event` (all in [0, 1]) and the annotation flag, we test four
200
+ mutually-exclusive rules. The residual `expected − event` (or `event − expected`,
201
+ depending on direction) plus the band-membership of `ref`, `mut1`, `mut2`,
202
+ `event` decide the class. `expected = mut1 + mut2 − ref` is the additive null.
203
+
204
+ **Thresholds (one set, used everywhere):**
205
+
206
+ | Symbol | Value | Meaning |
207
+ |---|---|---|
208
+ | `HIGH` | 0.50 | "site present" (includes alt-spliced sites) |
209
+ | `LOW` | 0.05 | "site absent" |
210
+ | `RES` | 0.10 | minimum residual magnitude |
211
+ | `NEAR_WT` | 0.20 | `|event − ref|` tolerance for rescue |
212
+
213
+ **Hard prerequisite — WT prediction must agree with annotation.** Every rule
214
+ first checks that the engine's wild-type prediction is consistent with the
215
+ annotation: `annotated == True ⇒ ref ≥ HIGH`, `annotated == False ⇒ ref ≤ LOW`.
216
+ Sites where the engine disagrees with the annotation are dropped as
217
+ non-epistatic without consulting the mutations. This is the noise filter.
218
+
219
+ ### The four rules
220
+
221
+ | Class | When the site is annotated (`ref ≥ HIGH`) | Rule | Residual |
222
+ |---|---|---|---|
223
+ | **rescue** | one single deletes, joint restores | `min(mut1, mut2) ≤ ref − HIGH` ∧ `|event − ref| ≤ NEAR_WT` ∧ `event − min(mut1, mut2) ≥ RES` | `rescue_residual = event − min(mut1, mut2)` |
224
+ | **deletion synergy** | both singles preserve, joint destroys | `min(mut1, mut2) ≥ HIGH` ∧ `ref − event ≥ RES` ∧ `expected − event ≥ RES` | `synergy_residual = expected − event` |
225
+
226
+ | Class | When the site is not annotated (`ref ≤ LOW`) | Rule | Residual |
227
+ |---|---|---|---|
228
+ | **cryptic rescue** | one single creates, joint silences | `max(mut1, mut2) ≥ HIGH` ∧ `event ≤ LOW` ∧ `max(mut1, mut2) − event ≥ RES` | `rescue_residual = max(mut1, mut2) − event` |
229
+ | **cryptic synergy** | both silent, joint creates | `max(mut1, mut2) ≤ LOW` ∧ `event ≥ HIGH` ∧ `event − expected ≥ RES` | `synergy_residual = event − expected` |
230
+
231
+ Anything else → **non-epistatic**.
232
+
233
+ ### Numeric example
234
+
235
+ ```
236
+ # annotated acceptor in INPP5J — spliceai_pytorch
237
+ ref = 0.972 annotated = True
238
+ m1 = 0.658 (m1 alone preserves: 0.658 ≥ 0.50)
239
+ m2 = 0.841 (m2 alone preserves: 0.841 ≥ 0.50)
240
+ event = 0.339
241
+ expected = m1 + m2 - ref = 0.527
242
+
243
+ # ref ≥ HIGH ✓ and annotated ✓ → annotated branch
244
+ # min(m1, m2) = 0.658 ≥ HIGH ✓ → not rescue (singles preserve)
245
+ # ref - event = 0.633 ≥ RES (0.10) ✓
246
+ # expected - event = 0.188 ≥ RES (0.10) ✓ → deletion_synergy
247
+ # synergy_residual = 0.188
248
+ ```
249
+
250
+ ### Pair-level aggregation
251
+
252
+ A pair's overall label is the class of the splice site with the *largest*
253
+ mechanism residual (rescue or synergy). Ties break by class priority:
254
+ `deletion_synergy > cryptic_synergy > rescue > cryptic_rescue > non-epistatic`.
255
+ The full per-site breakdown is always retained in `pair.site_residuals`.
256
+
257
+ ## Available splicing engines
258
+
259
+ | Name | Architecture | Notes |
260
+ |---|---|---|
261
+ | `spliceai_pytorch` (default for production) | Original SpliceAI weights (Jaganathan 2019), plain-ReLU PyTorch architecture | Numerically identical to Keras SpliceAI, ~2.5× faster, no TF dependency |
262
+ | `openspliceai` | OpenSpliceAI PyTorch port, MANE-trained 5-model ensemble | Independent retrain; differs from Keras SpliceAI in fine numerics |
263
+ | `pangolin` | 40-model multi-tissue PyTorch ensemble (Zeng & Li 2022) | Tissue-specific splice usage |
264
+ | `spliceformer` | 40k-context transformer ensemble (Jónsson 2024) | Long-range context; model code is vendored in `oncosplice/engines/_vendor/spliceformer` (needs `einops`) |
265
+ | `spliceai_keras` | Original Illumina `.h5` weights | **Reference only** — prefer `spliceai_pytorch` |
266
+ | `ensemble:a,b,c` / `average` | Mean probabilities across N constituent engines | Cross-engine consensus |
267
+
268
+ ## Package layout
269
+
270
+ ```
271
+ oncosplice/
272
+ ├── engine.py # OncospliceEngine — orchestrator (analyze_single/pair/multi, scan, classify_dataframe)
273
+ ├── results.py # typed dataclasses: SingleVariantResult, DoubleVariantResult, MultiVariantResult
274
+ ├── variants.py # Variant + VariantPair (no geney dependency)
275
+ ├── viz.py # plot_case_study + supporting bar figures
276
+ ├── engines/ # standalone splice-site predictor adapters (uniform interface)
277
+ │ ├── base.py
278
+ │ ├── spliceai_pytorch.py
279
+ │ ├── openspliceai.py
280
+ │ ├── pangolin.py
281
+ │ ├── spliceformer.py
282
+ │ ├── spliceai_keras.py
283
+ │ └── ensemble.py
284
+ ├── scoring/ # pure-Python scoring primitives
285
+ │ ├── splicing.py
286
+ │ ├── epistasis.py # the 4-class classifier + vectorized residual computation
287
+ │ ├── oncosplice.py # protein-divergence Oncosplice score
288
+ │ └── fingerprint.py # splicing-outcome hashing
289
+ └── weights/ # weight-resolution + downloader CLI
290
+ ```
291
+
292
+ ## Examples
293
+
294
+ See `examples/`:
295
+
296
+ - `KRAS_rescue.ipynb` — a canonical KRAS donor disrupted by mut1 alone, restored by the joint event. The mechanism the classifier surfaces as **rescue**.
297
+ - `CREBBP_synergistic.ipynb` — the joint event activates one cryptic acceptor (synergy) while rescuing another from each single's activation (rescue). The classifier reports the dominant **synergistic** call with the rescue site preserved in the per-site table.
298
+
299
+ ## Testing
300
+
301
+ ```bash
302
+ pytest tests/ # full suite
303
+ pytest tests/test_scoring.py # classifier + residual rules
304
+ pytest tests/test_spliceai_equivalence.py # Keras ↔ PyTorch numerical parity
305
+ ```
306
+
307
+ ## Citing
308
+
309
+ If you use this code in a published analysis, please cite the two papers above.
310
+
311
+ ## License
312
+
313
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,251 @@
1
+ # oncosplice
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/oncosplice.svg)](https://pypi.org/project/oncosplice/)
4
+ [![CI](https://github.com/nicolasalynn/oncosplice/actions/workflows/ci.yml/badge.svg)](https://github.com/nicolasalynn/oncosplice/actions/workflows/ci.yml)
5
+ [![codecov](https://codecov.io/gh/nicolasalynn/oncosplice/branch/main/graph/badge.svg)](https://codecov.io/gh/nicolasalynn/oncosplice)
6
+ [![Docs](https://img.shields.io/badge/docs-mkdocs-blue)](https://nicolasalynn.github.io/oncosplice)
7
+ [![Python](https://img.shields.io/pypi/pyversions/oncosplice.svg)](https://pypi.org/project/oncosplice/)
8
+ [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
9
+
10
+ > Given two (or more) mutations in the same gene, classify how their joint
11
+ > effect on splicing differs from the additive prediction — into one of four
12
+ > mutually-exclusive mechanism classes: **rescue**, **cryptic rescue**,
13
+ > **deletion synergy**, or **cryptic synergy**.
14
+
15
+ **oncosplice** is a sequence-level pipeline for splicing-epistasis analysis of
16
+ single-, double-, and N-variant constructs. It runs a splice-site predictor
17
+ (SpliceAI, OpenSpliceAI, Pangolin, or Spliceformer) under each variant context,
18
+ computes per-site residuals against the additive expectation, and applies a
19
+ crisp 4-class mechanistic classifier.
20
+
21
+ Implements the algorithms from:
22
+
23
+ 1. *Detecting and understanding meaningful cancerous mutations based on computational models of mRNA splicing* — Lynn & Tuller, *npj Systems Biology and Applications* 2024.
24
+ 2. *Large-scale insight into missplicing, intra-gene epistasis and its relevance to human cancer* — in preparation.
25
+
26
+ ```bash
27
+ pip install oncosplice[spliceai_pytorch]
28
+ ```
29
+
30
+ ## What it does
31
+
32
+ Given two (or more) genomic variants in the same gene, oncosplice answers:
33
+
34
+ - **Single-variant impact.** For each mutation alone, how much does it perturb every splice site in the gene? `analyze_single()`.
35
+ - **Joint behavior.** What does splicing look like when both mutations co-occur, and how does that compare to the additive prediction? `analyze_pair()` / `analyze_multi()`.
36
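+ - **Mechanism.** Is the joint effect a *synergy* (the joint event deletes an annotated site, or creates a cryptic one, beyond the additive prediction), a *rescue* (a single disrupts and the joint restores WT, or a single creates a cryptic site and the joint silences it), or just non-epistatic dominance / noise? Per-site and pair-level classification.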
37
+ - **Bulk classification.** Run the same analysis over a DataFrame of hundreds of thousands of pairs with per-gene scheduling, batched inference, and resumable checkpointing. `scan()` / `classify_dataframe()`.
38
+
39
+ ## Install
40
+
41
+ ```bash
42
+ # Recommended — original SpliceAI weights, PyTorch backbone (no TF dependency)
43
+ pip install oncosplice[spliceai_pytorch]
44
+
45
+ # Or pick another engine
46
+ pip install oncosplice[openspliceai] # OpenSpliceAI (MANE-trained, retrained)
47
+ pip install oncosplice[pangolin] # Pangolin (40-model multi-tissue)
48
+ pip install oncosplice[spliceformer] # Spliceformer (40k transformer)
49
+ pip install oncosplice[all] # all 4 production engines
50
+
51
+ # Optional add-ons
52
+ pip install oncosplice[protein] # protein-divergence score (Lynn & Tuller 2024)
53
+ ```
54
+
55
+ Core requires `numpy`, `pandas`, `matplotlib`, `biopython`, `seqmat`. The
56
+ classification core (`analyze_pair`, `scan`, `classify_dataframe`) has **no
57
+ `geney` dependency** — `geney` is only needed for the protein-divergence score
58
+ path (`[protein]` extra).
59
+
60
+ **Model weights download automatically** from the [Hugging Face Hub](https://huggingface.co/nicolynnvila/oncosplice-weights)
61
+ on first use and are cached in `~/.oncosplice/weights/` — no manual step. Set
62
+ `ONCOSPLICE_AUTO_DOWNLOAD=0` to require an explicit `oncosplice-download-weights`
63
+ instead (useful offline / in CI).
64
+
65
+ ## Highlights
66
+
67
+ - **Four production engines under one interface** — SpliceAI (PyTorch port,
68
+ numerically identical to Keras), OpenSpliceAI, Pangolin, Spliceformer. Swap
69
+ with one string. Cross-engine ensembling via `ensemble:a,b,c`.
70
+ - **Four-class mechanistic classifier** — rescue / cryptic rescue / deletion
71
+ synergy / cryptic synergy, defined on probability bands with a hard
72
+ WT-vs-annotation prerequisite that filters predictor noise.
73
+ - **TCGA-scale runner** — `classify_dataframe()` does per-gene grouping +
74
+ batched inference + resumable checkpointing. ~23× faster than per-pair after
75
+ the 3.2.0 vectorization; 800k pairs in ~22 hours on an L40S.
76
+ - **Numerical parity tests** between Keras SpliceAI and the PyTorch port so
77
+ the migration is auditable.
78
+ - **Pure-Python scoring core** (`oncosplice.scoring`) with no model
79
+ dependencies — usable as a library in other splicing-prediction stacks.
80
+
81
+ ## Quickstart
82
+
83
+ ```python
84
+ from oncosplice import OncospliceEngine
85
+
86
+ eng = OncospliceEngine(splicing_engine="spliceai_pytorch")
87
+
88
+ # Single variant — does this mutation cause missplicing?
89
+ single = eng.analyze_single("KRAS:12:25227344:A:T")
90
+ print(single.summary())
91
+ print(single.missplicing.to_dataframe()) # missed + discovered sites
92
+ single.plot_missplicing()
93
+
94
+ # Pair — what happens when both mutations co-occur?
95
+ pair = eng.analyze_pair("KRAS:12:25227343:G:T", "KRAS:12:25227344:A:T")
96
+ print(pair.pair_classification) # → "rescue"
97
+ print(pair.epistatic_sites()) # only the synergy / rescue sites (non-epistatic sites dropped)
98
+ pair.plot_case_study() # the bar figure
99
+
100
+ # N-variant (higher-order)
101
+ multi = eng.analyze_multi([
102
+ "KRAS:12:25227343:G:T",
103
+ "KRAS:12:25227344:A:T",
104
+ "KRAS:12:25227345:G:C",
105
+ ])
106
+ ```
107
+
108
+ ### Bulk classification of a DataFrame
109
+
110
+ ```python
111
+ import pandas as pd
112
+ df = pd.read_csv("pairs.csv") # column: epistasis_id (e.g. "GENE:CHR:POS:REF:ALT|GENE:CHR:POS:REF:ALT")
113
+
114
+ out = eng.classify_dataframe(
115
+ df, epistasis_id_col="epistasis_id",
116
+ checkpoint_path="results.csv",
117
+ )
118
+ # adds: pair_classification, max_abs_residual, max_abs_event_delta,
119
+ # n_del_syn, n_cryp_syn, n_rescue, n_cryp_rescue, engine, error
120
+ ```
121
+
122
+ Per-gene grouping + batched `scan()` underneath — typically 10–40× faster than the per-pair path on TCGA-shaped datasets. The runner is resume-safe (re-running with the same checkpoint path skips already-done pairs) and emits both per-pair and per-single CSVs.
123
+
124
+ ### Engine-only API (no geney needed)
125
+
126
+ ```python
127
+ from oncosplice.engines import get_predictor, list_available_engines
128
+ print(list_available_engines())
129
+
130
+ p = get_predictor("spliceai_pytorch")
131
+ pred = p.predict(padded_sequence) # → SplicingPrediction(acceptor, donor)
132
+ ```
133
+
134
+ ## The classifier — 4 mechanism classes
135
+
136
+ At every splice site, given four predicted probabilities `ref`, `mut1`,
137
+ `mut2`, `event` (all in [0, 1]) and the annotation flag, we test four
138
+ mutually-exclusive rules. The residual `expected − event` (or `event − expected`,
139
+ depending on direction) plus the band-membership of `ref`, `mut1`, `mut2`,
140
+ `event` decide the class. `expected = mut1 + mut2 − ref` is the additive null.
141
+
142
+ **Thresholds (one set, used everywhere):**
143
+
144
+ | Symbol | Value | Meaning |
145
+ |---|---|---|
146
+ | `HIGH` | 0.50 | "site present" (includes alt-spliced sites) |
147
+ | `LOW` | 0.05 | "site absent" |
148
+ | `RES` | 0.10 | minimum residual magnitude |
149
+ | `NEAR_WT` | 0.20 | `|event − ref|` tolerance for rescue |
150
+
151
+ **Hard prerequisite — WT prediction must agree with annotation.** Every rule
152
+ first checks that the engine's wild-type prediction is consistent with the
153
+ annotation: `annotated == True ⇒ ref ≥ HIGH`, `annotated == False ⇒ ref ≤ LOW`.
154
+ Sites where the engine disagrees with the annotation are dropped as
155
+ non-epistatic without consulting the mutations. This is the noise filter.
156
+
157
+ ### The four rules
158
+
159
+ | Class | When the site is annotated (`ref ≥ HIGH`) | Rule | Residual |
160
+ |---|---|---|---|
161
+ | **rescue** | one single deletes, joint restores | `min(mut1, mut2) ≤ ref − HIGH` ∧ `|event − ref| ≤ NEAR_WT` ∧ `event − min(mut1, mut2) ≥ RES` | `rescue_residual = event − min(mut1, mut2)` |
162
+ | **deletion synergy** | both singles preserve, joint destroys | `min(mut1, mut2) ≥ HIGH` ∧ `ref − event ≥ RES` ∧ `expected − event ≥ RES` | `synergy_residual = expected − event` |
163
+
164
+ | Class | When the site is not annotated (`ref ≤ LOW`) | Rule | Residual |
165
+ |---|---|---|---|
166
+ | **cryptic rescue** | one single creates, joint silences | `max(mut1, mut2) ≥ HIGH` ∧ `event ≤ LOW` ∧ `max(mut1, mut2) − event ≥ RES` | `rescue_residual = max(mut1, mut2) − event` |
167
+ | **cryptic synergy** | both silent, joint creates | `max(mut1, mut2) ≤ LOW` ∧ `event ≥ HIGH` ∧ `event − expected ≥ RES` | `synergy_residual = event − expected` |
168
+
169
+ Anything else → **non-epistatic**.
170
+
171
+ ### Numeric example
172
+
173
+ ```
174
+ # annotated acceptor in INPP5J — spliceai_pytorch
175
+ ref = 0.972 annotated = True
176
+ m1 = 0.658 (m1 alone preserves: 0.658 ≥ 0.50)
177
+ m2 = 0.841 (m2 alone preserves: 0.841 ≥ 0.50)
178
+ event = 0.339
179
+ expected = m1 + m2 - ref = 0.527
180
+
181
+ # ref ≥ HIGH ✓ and annotated ✓ → annotated branch
182
+ # min(m1, m2) = 0.658 ≥ HIGH ✓ → not rescue (singles preserve)
183
+ # ref - event = 0.633 ≥ RES (0.10) ✓
184
+ # expected - event = 0.188 ≥ RES (0.10) ✓ → deletion_synergy
185
+ # synergy_residual = 0.188
186
+ ```
187
+
188
+ ### Pair-level aggregation
189
+
190
+ A pair's overall label is the class of the splice site with the *largest*
191
+ mechanism residual (rescue or synergy). Ties break by class priority:
192
+ `deletion_synergy > cryptic_synergy > rescue > cryptic_rescue > non-epistatic`.
193
+ The full per-site breakdown is always retained in `pair.site_residuals`.
194
+
195
+ ## Available splicing engines
196
+
197
+ | Name | Architecture | Notes |
198
+ |---|---|---|
199
+ | `spliceai_pytorch` (default for production) | Original SpliceAI weights (Jaganathan 2019), plain-ReLU PyTorch architecture | Numerically identical to Keras SpliceAI, ~2.5× faster, no TF dependency |
200
+ | `openspliceai` | OpenSpliceAI PyTorch port, MANE-trained 5-model ensemble | Independent retrain; differs from Keras SpliceAI in fine numerics |
201
+ | `pangolin` | 40-model multi-tissue PyTorch ensemble (Zeng & Li 2022) | Tissue-specific splice usage |
202
+ | `spliceformer` | 40k-context transformer ensemble (Jónsson 2024) | Long-range context; model code is vendored in `oncosplice/engines/_vendor/spliceformer` (needs `einops`) |
203
+ | `spliceai_keras` | Original Illumina `.h5` weights | **Reference only** — prefer `spliceai_pytorch` |
204
+ | `ensemble:a,b,c` / `average` | Mean probabilities across N constituent engines | Cross-engine consensus |
205
+
206
+ ## Package layout
207
+
208
+ ```
209
+ oncosplice/
210
+ ├── engine.py # OncospliceEngine — orchestrator (analyze_single/pair/multi, scan, classify_dataframe)
211
+ ├── results.py # typed dataclasses: SingleVariantResult, DoubleVariantResult, MultiVariantResult
212
+ ├── variants.py # Variant + VariantPair (no geney dependency)
213
+ ├── viz.py # plot_case_study + supporting bar figures
214
+ ├── engines/ # standalone splice-site predictor adapters (uniform interface)
215
+ │ ├── base.py
216
+ │ ├── spliceai_pytorch.py
217
+ │ ├── openspliceai.py
218
+ │ ├── pangolin.py
219
+ │ ├── spliceformer.py
220
+ │ ├── spliceai_keras.py
221
+ │ └── ensemble.py
222
+ ├── scoring/ # pure-Python scoring primitives
223
+ │ ├── splicing.py
224
+ │ ├── epistasis.py # the 4-class classifier + vectorized residual computation
225
+ │ ├── oncosplice.py # protein-divergence Oncosplice score
226
+ │ └── fingerprint.py # splicing-outcome hashing
227
+ └── weights/ # weight-resolution + downloader CLI
228
+ ```
229
+
230
+ ## Examples
231
+
232
+ See `examples/`:
233
+
234
+ - `KRAS_rescue.ipynb` — a canonical KRAS donor disrupted by mut1 alone, restored by the joint event. The mechanism the classifier surfaces as **rescue**.
235
+ - `CREBBP_synergistic.ipynb` — the joint event activates one cryptic acceptor (synergy) while rescuing another from each single's activation (rescue). The classifier reports the dominant **synergistic** call with the rescue site preserved in the per-site table.
236
+
237
+ ## Testing
238
+
239
+ ```bash
240
+ pytest tests/ # full suite
241
+ pytest tests/test_scoring.py # classifier + residual rules
242
+ pytest tests/test_spliceai_equivalence.py # Keras ↔ PyTorch numerical parity
243
+ ```
244
+
245
+ ## Citing
246
+
247
+ If you use this code in a published analysis, please cite the two papers above.
248
+
249
+ ## License
250
+
251
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,82 @@
1
+ """
2
+ oncosplice — sequence-level splicing-epistasis pipeline.
3
+
4
+ Top-level layout:
5
+
6
+ - :class:`OncospliceEngine` — orchestrator. Requires ``geney`` + ``seqmat``.
7
+ - :mod:`oncosplice.engines` — standalone splice-site predictor adapters
8
+ (``OpenSpliceAI``, ``SpliceAIPyTorch``, ``Pangolin``, ``Spliceformer``,
9
+ ``EnsemblePredictor``). Importable without ``geney``. ``SpliceAIKeras`` is
10
+ also available as the *reference implementation* for verification only —
11
+ prefer ``SpliceAIPyTorch`` (identical output, ~2.5× faster, no TF dep).
12
+ - :mod:`oncosplice.scoring` — splicing / epistasis / Oncosplice scoring
13
+ primitives. Pure-python; no model dependencies.
14
+ - :mod:`oncosplice.results` — typed dataclasses for results / protein library.
15
+ - :mod:`oncosplice.weights` — model-weight resolver + downloader CLI.
16
+
17
+ Quick start:
18
+
19
+ >>> from oncosplice import OncospliceEngine
20
+ >>> eng = OncospliceEngine(splicing_engine="openspliceai")
21
+ >>> pair = eng.analyze_pair("KRAS:12:25227343:G:T", "KRAS:12:25227344:A:T")
22
+ >>> print(pair.summary())
23
+
24
+ Predictor-only use (no geney / seqmat needed):
25
+
26
+ >>> from oncosplice.engines import get_predictor
27
+ >>> p = get_predictor("openspliceai")
28
+ >>> pred = p.predict(sequence)
29
+ """
30
+ from __future__ import annotations
31
+
32
+ # Engines, results, scoring and variants are dependency-light and safe to import eagerly.
33
+ from .engines import (
34
+ EnsemblePredictor,
35
+ OpenSpliceAI,
36
+ Pangolin,
37
+ SpliceAIKeras,
38
+ SpliceAIPyTorch,
39
+ Spliceformer,
40
+ SplicingPrediction,
41
+ SplicingPredictor,
42
+ get_predictor,
43
+ list_available_engines,
44
+ )
45
+ from .results import (
46
+ DoubleVariantResult,
47
+ MissplicingProfile,
48
+ MultiVariantResult,
49
+ ProteinLibrary,
50
+ SingleVariantResult,
51
+ SiteEpistasis,
52
+ )
53
+ from .scoring.fingerprint import splicing_outcome_fingerprint, splicing_outcome_hash
54
+ from .variants import Variant, VariantPair
55
+
56
+ __version__ = "3.2.0" # 3-bucket classifier: synergistic / rescue / compounding (+ non-epistatic fallback)
57
+
58
+ __all__ = [
59
+ "OncospliceEngine", # lazy attribute below
60
+ "SingleVariantResult", "DoubleVariantResult", "MultiVariantResult",
61
+ "SiteEpistasis", "MissplicingProfile", "ProteinLibrary",
62
+ "splicing_outcome_fingerprint", "splicing_outcome_hash",
63
+ "Variant", "VariantPair",
64
+ "SplicingPredictor", "SplicingPrediction",
65
+ "OpenSpliceAI", "SpliceAIKeras", "SpliceAIPyTorch", "Pangolin", "Spliceformer",
66
+ "EnsemblePredictor",
67
+ "get_predictor", "list_available_engines",
68
+ ]
69
+
70
+
71
+ def __getattr__(name):
72
+ """Lazy-load OncospliceEngine so that the rest of the package (engines,
73
+ weights, scoring) is importable without geney/seqmat installed.
74
+ """
75
+ if name == "OncospliceEngine":
76
+ from .engine import OncospliceEngine
77
+ # Cache on the module so subsequent ``from oncosplice import X`` resolves
78
+ # (PEP 562 __getattr__ + ``from … import …`` interact oddly in some
79
+ # CPython 3.13 builds; binding here is a robust workaround).
80
+ globals()["OncospliceEngine"] = OncospliceEngine
81
+ return OncospliceEngine
82
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")