geney 1.4.40__tar.gz → 1.4.45__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geney-1.4.40 → geney-1.4.45}/PKG-INFO +2 -3
- {geney-1.4.40 → geney-1.4.45}/geney/__init__.py +18 -5
- geney-1.4.45/geney/engines.py +354 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs10.pt +0 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs11.pt +0 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs12.pt +0 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs13.pt +0 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs14.pt +0 -0
- {geney-1.4.40 → geney-1.4.45}/geney/oncosplice.py +2 -1
- geney-1.4.45/geney/pipelines.py +139 -0
- {geney-1.4.40 → geney-1.4.45}/geney/splice_graph.py +278 -12
- geney-1.4.45/geney/splice_graph_archive.py +948 -0
- {geney-1.4.40 → geney-1.4.45}/geney/transcripts.py +1 -1
- {geney-1.4.40 → geney-1.4.45}/geney.egg-info/PKG-INFO +2 -3
- geney-1.4.45/geney.egg-info/SOURCES.txt +22 -0
- {geney-1.4.40 → geney-1.4.45}/geney.egg-info/requires.txt +1 -2
- {geney-1.4.40 → geney-1.4.45}/setup.py +4 -1
- geney-1.4.40/geney/engines.py +0 -307
- geney-1.4.40/geney/pipelines.py +0 -97
- geney-1.4.40/geney/samples.py +0 -3
- geney-1.4.40/geney/splicing_table.py +0 -142
- geney-1.4.40/geney/utils.py +0 -254
- geney-1.4.40/geney.egg-info/SOURCES.txt +0 -19
- {geney-1.4.40 → geney-1.4.45}/MANIFEST.in +0 -0
- {geney-1.4.40 → geney-1.4.45}/README.md +0 -0
- {geney-1.4.40 → geney-1.4.45}/geney/variants.py +0 -0
- {geney-1.4.40 → geney-1.4.45}/geney.egg-info/dependency_links.txt +0 -0
- {geney-1.4.40 → geney-1.4.45}/geney.egg-info/top_level.txt +0 -0
- {geney-1.4.40 → geney-1.4.45}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: geney
|
|
3
|
-
Version: 1.4.40
|
|
3
|
+
Version: 1.4.45
|
|
4
4
|
Summary: A Python package for gene expression modeling.
|
|
5
5
|
Home-page: https://github.com/nicolaslynn/geney
|
|
6
6
|
Author: Nicolas Lynn
|
|
@@ -18,9 +18,8 @@ Requires-Dist: pandas==2.1.4
|
|
|
18
18
|
Requires-Dist: biopython>=1.81
|
|
19
19
|
Requires-Dist: matplotlib
|
|
20
20
|
Requires-Dist: seaborn
|
|
21
|
-
Requires-Dist: tensorflow>=2.8.0
|
|
22
|
-
Requires-Dist: keras>=2.8.0
|
|
23
21
|
Requires-Dist: torch
|
|
22
|
+
Requires-Dist: openspliceai
|
|
24
23
|
Requires-Dist: seqmat
|
|
25
24
|
Dynamic: author
|
|
26
25
|
Dynamic: author-email
|
|
@@ -4,12 +4,17 @@ from .engines import (
|
|
|
4
4
|
sai_predict_probs,
|
|
5
5
|
run_spliceai_seq,
|
|
6
6
|
run_splicing_engine,
|
|
7
|
+
predict_splicing,
|
|
8
|
+
adjoin_splicing_outcomes,
|
|
7
9
|
)
|
|
8
10
|
from .transcripts import TranscriptLibrary
|
|
9
|
-
from .splicing_table import adjoin_splicing_outcomes
|
|
10
11
|
from .splice_graph import SpliceSimulator
|
|
11
|
-
from .pipelines import
|
|
12
|
-
|
|
12
|
+
from .pipelines import (
|
|
13
|
+
oncosplice_pipeline,
|
|
14
|
+
oncosplice_top_isoform,
|
|
15
|
+
max_splicing_delta,
|
|
16
|
+
oncosplice_pipeline_single_transcript, # backwards compat
|
|
17
|
+
)
|
|
13
18
|
|
|
14
19
|
__all__ = [
|
|
15
20
|
"Mutation",
|
|
@@ -18,8 +23,16 @@ __all__ = [
|
|
|
18
23
|
"sai_predict_probs",
|
|
19
24
|
"run_spliceai_seq",
|
|
20
25
|
"run_splicing_engine",
|
|
21
|
-
"
|
|
26
|
+
"predict_splicing",
|
|
22
27
|
"adjoin_splicing_outcomes",
|
|
28
|
+
"TranscriptLibrary",
|
|
23
29
|
"SpliceSimulator",
|
|
30
|
+
"oncosplice_pipeline",
|
|
31
|
+
"oncosplice_top_isoform",
|
|
32
|
+
"max_splicing_delta",
|
|
24
33
|
"oncosplice_pipeline_single_transcript",
|
|
25
|
-
]
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
mut_id = 'KRAS:12:25227343:G:T'
|
|
38
|
+
epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
# oncosplice/engines.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Dict, List, Tuple, Optional, Union
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
# Lazy-loaded model containers (loaded automatically on first use)
|
|
8
|
+
_pang_models = None
|
|
9
|
+
_pang_device = None
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _get_torch_device():
    """Pick the best available torch device: MPS on macOS, else CUDA, else CPU."""
    import sys
    import torch

    # On macOS, MPS can report available yet fail at allocation time, so
    # verify it with a tiny tensor before committing to it.
    on_mac_with_mps = sys.platform == 'darwin' and torch.backends.mps.is_available()
    if on_mac_with_mps:
        try:
            torch.tensor([1.0], device="mps")
        except RuntimeError:
            return torch.device("cpu")
        return torch.device("mps")

    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _load_pangolin_models():
    """Load the Pangolin model ensemble once and cache it in module globals.

    Iterates over the 8 published model indices x 5 random seeds; models
    that fail to load are skipped with a warning rather than aborting.
    """
    global _pang_models, _pang_device

    # Already loaded — return the cached ensemble.
    if _pang_models is not None:
        return _pang_models

    import torch
    from pkg_resources import resource_filename
    from pangolin.model import Pangolin, L, W, AR

    _pang_device = _get_torch_device()
    print(f"Pangolin loading to {_pang_device}...")

    _pang_models = []
    for i in (0, 1, 2, 3, 4, 5, 6, 7):
        for j in range(1, 6):
            try:
                net = Pangolin(L, W, AR).to(_pang_device)
                weight_path = resource_filename("pangolin", f"models/final.{j}.{i}.3")
                state = torch.load(weight_path, weights_only=True, map_location=_pang_device)
                net.load_state_dict(state)
                net.eval()
                _pang_models.append(net)
            except Exception as e:
                print(f"Warning: Failed to load Pangolin model {j}.{i}: {e}")

    print(f"Pangolin loaded ({len(_pang_models)} models).")
    return _pang_models
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
_OPENSPLICEAI_MODEL_DIR = None

def _get_openspliceai_model_dir() -> str:
    """Return the path to the OpenSpliceAI MANE 10000nt model directory.

    Resolution order: cached value, then the OPENSPLICEAI_MODEL_DIR
    environment variable, then the copy bundled inside the package.
    """
    global _OPENSPLICEAI_MODEL_DIR
    if _OPENSPLICEAI_MODEL_DIR is None:
        import os

        # Models ship inside the package at geney/models/openspliceai-mane/10000nt
        package_root = os.path.dirname(os.path.abspath(__file__))
        bundled = os.path.normpath(
            os.path.join(package_root, 'models', 'openspliceai-mane', '10000nt')
        )
        _OPENSPLICEAI_MODEL_DIR = os.environ.get('OPENSPLICEAI_MODEL_DIR', bundled)
    return _OPENSPLICEAI_MODEL_DIR
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def pang_one_hot_encode(seq: str) -> np.ndarray:
    """One-hot encode a DNA sequence for the Pangolin model.

    Mapping: A/C/G/T -> unit rows of a 4-wide identity, N -> all zeros.
    Lowercase input is accepted and uppercased first.

    Args:
        seq: DNA string over the alphabet A, C, G, T, N (case-insensitive).

    Returns:
        Array of shape (len(seq), 4).

    Raises:
        TypeError: if seq is not a string.
        ValueError: if seq contains characters outside ACGTN.
    """
    if not isinstance(seq, str):
        raise TypeError(f"Expected string, got {type(seq).__name__}")

    code_for = {'N': 0, 'A': 1, 'C': 2, 'G': 3, 'T': 4}
    upper = seq.upper()
    if any(ch not in code_for for ch in upper):
        raise ValueError("Sequence contains invalid characters")

    # Row 0 is the all-zero N row; rows 1-4 are the identity for A/C/G/T.
    table = np.zeros((5, 4), dtype=int)
    table[1:] = np.eye(4, dtype=int)

    codes = np.array([code_for[ch] for ch in upper], dtype='int8')
    return table[codes]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def pangolin_predict_probs(seq: str, models: list = None) -> Tuple[List[float], List[float]]:
    """Predict splice-site probabilities for *seq* with the Pangolin ensemble.

    Pangolin emits shape (1, 12, seq_len): 4 tissues x 3 channels each.
    Channels 1/4/7/10 carry acceptor scores and 2/5/8/11 donor scores;
    the ensemble is averaged, then the per-position max is taken across
    the four tissues.

    Args:
        seq: DNA string to score.
        models: preloaded Pangolin models; loaded lazily when None.

    Returns:
        (donor_probs, acceptor_probs) as lists of floats.

    Raises:
        ValueError: if the model list is empty.
    """
    import torch

    if models is None:
        models = _load_pangolin_models()
    if not models:
        raise ValueError("No Pangolin models loaded")

    encoded = pang_one_hot_encode(seq)
    batch = torch.tensor(encoded.T[None, :, :], dtype=torch.float32, device=_pang_device)

    outputs = []
    with torch.no_grad():
        for net in models:
            outputs.append(net(batch).cpu().numpy())

    mean_pred = np.mean(outputs, axis=0)  # (1, 12, seq_len)

    # Max across the 4 tissues for each score type.
    acceptor_probs = np.max(mean_pred[0, [1, 4, 7, 10], :], axis=0).tolist()
    donor_probs = np.max(mean_pred[0, [2, 5, 8, 11], :], axis=0).tolist()

    return donor_probs, acceptor_probs
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def sai_predict_probs(seq: str) -> Tuple[np.ndarray, np.ndarray]:
    """Predict acceptor and donor probabilities using OpenSpliceAI.

    Delegates to OpenSpliceAI's predict(), which handles encoding,
    windowing, ensemble averaging, and softmax internally.

    Returns:
        (acceptor_probs, donor_probs) as numpy arrays matching the full
        input sequence length.
    """
    import io
    from contextlib import redirect_stdout

    from openspliceai.predict.predict import predict

    model_dir = _get_openspliceai_model_dir()

    # OpenSpliceAI prints verbose progress; swallow it for the call only.
    with redirect_stdout(io.StringIO()):
        scores = predict(seq, model_dir, flanking_size=10000)  # (seq_len, 3)

    scores = scores.numpy()
    return scores[:, 1], scores[:, 2]  # acceptor, donor
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def run_spliceai_seq(
    seq: str,
    indices: Union[List[int], np.ndarray],
    threshold: float = 0.0,
) -> Tuple[Dict[int, float], Dict[int, float]]:
    """Run SpliceAI on *seq*; keep donor/acceptor sites scoring >= threshold.

    Args:
        seq: DNA string to score.
        indices: genomic coordinate for each base of seq (same length).
        threshold: minimum probability for a site to be reported.

    Returns:
        (donor, acceptor) dicts mapping position -> probability.

    Raises:
        ValueError: when indices and seq lengths differ.
    """
    if len(indices) != len(seq):
        raise ValueError(f"indices length ({len(indices)}) must match sequence length ({len(seq)})")

    acceptor_scores, donor_scores = sai_predict_probs(seq)

    donor: Dict[int, float] = {}
    acceptor: Dict[int, float] = {}
    for pos, acc_p, don_p in zip(indices, acceptor_scores, donor_scores):
        if acc_p >= threshold:
            acceptor[pos] = acc_p
        if don_p >= threshold:
            donor[pos] = don_p
    return donor, acceptor
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _generate_random_sequence(length: int) -> str:
    """Return a random DNA string of *length* bases drawn from A/C/G/T."""
    import random

    alphabet = 'ACGT'
    return ''.join(random.choices(alphabet, k=length))
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def run_splicing_engine(
    seq: Optional[str] = None,
    engine: str = "spliceai",
) -> Tuple[List[float], List[float]]:
    """Dispatch to the requested splicing engine and return its predictions.

    Args:
        seq: DNA sequence over A/C/G/T/N; a random 15,001nt sequence is
            generated when omitted.
        engine: 'spliceai' or 'pangolin'.

    Returns:
        (donor_probs, acceptor_probs) as lists of floats.

    Raises:
        ValueError: empty/invalid sequence, or unknown engine name.
    """
    if seq is None:
        seq = _generate_random_sequence(15_001)

    # Guard clauses: validate before touching any model.
    if not isinstance(seq, str) or not seq:
        raise ValueError("Sequence must be a non-empty string")
    if any(base.upper() not in "ACGTN" for base in seq):
        raise ValueError("Sequence contains invalid nucleotides")

    if engine == "spliceai":
        acceptors, donors = sai_predict_probs(seq)
        return donors.tolist(), acceptors.tolist()
    if engine == "pangolin":
        return pangolin_predict_probs(seq)
    raise ValueError(f"Engine '{engine}' not implemented. Available: 'spliceai', 'pangolin'")
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# ------------------------------------------------------------------------------
|
|
222
|
+
# Higher-level prediction utilities (formerly in splicing_table.py)
|
|
223
|
+
# ------------------------------------------------------------------------------
|
|
224
|
+
|
|
225
|
+
def predict_splicing(s, position: int, engine: str = 'spliceai', context: int = 7500):
    """
    Predict splicing probabilities at a given position using the specified engine.

    Args:
        s: Sequence object with .seq, .index, .clone(), .rev attributes
        position: The genomic position to predict splicing probabilities for.
        engine: The prediction engine to use. Supported: 'spliceai', 'pangolin'.
        context: The length of the target central region (default: 7500).

    Returns:
        pd.DataFrame with position index and columns: donor_prob, acceptor_prob, nucleotides

    Raises:
        ValueError: position outside the sequence, or an empty window.
    """
    import pandas as pd

    if position < s.index.min() or position > s.index.max():
        raise ValueError(f"Position {position} is outside sequence bounds [{s.index.min()}, {s.index.max()}]")

    # Extract a window of +/- context bases around the query position.
    target = s.clone(position - context, position + context)

    if len(target.seq) == 0:
        raise ValueError(f"No sequence data found around position {position} with context {context}")

    seq, indices = target.seq, target.index

    if len(indices) == 0:
        raise ValueError(f"No indices found in sequence around position {position}")

    # Offset of the query position inside the window, and how many bases
    # the window is short on each side (clipped at sequence boundaries).
    rel_pos = np.abs(indices - position).argmin()
    left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))

    if left_missing > 0 or right_missing > 0:
        # Pad with 'N' bases plus extrapolated coordinates so the engine
        # always sees a full window; step follows strand orientation.
        step = -1 if s.rev else 1

        if left_missing > 0:
            left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
        else:
            left_pad = np.array([], dtype=indices.dtype)

        if right_missing > 0:
            right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
        else:
            right_pad = np.array([], dtype=indices.dtype)

        seq = 'N' * left_missing + seq + 'N' * right_missing
        indices = np.concatenate([left_pad, indices, right_pad])

    donor_probs, acceptor_probs = run_splicing_engine(seq=seq, engine=engine)

    # Drop 5000nt flanks, keeping the central region. NOTE(review): the
    # flank size is hard-coded; with the default context=7500 this leaves
    # a 5001nt center — confirm behavior for non-default context values.
    seq = seq[5000:-5000]
    indices = indices[5000:-5000]
    expected_len = len(seq)

    # Reconcile engine output length with the trimmed window: center-crop
    # when the engine returned more positions, zero-pad on the right when
    # it returned fewer.
    if len(donor_probs) != expected_len:
        if len(donor_probs) > expected_len:
            offset = (len(donor_probs) - expected_len) // 2
            donor_probs = donor_probs[offset:offset + expected_len]
            acceptor_probs = acceptor_probs[offset:offset + expected_len]
        else:
            pad_len = expected_len - len(donor_probs)
            donor_probs = donor_probs + [0.0] * pad_len
            acceptor_probs = acceptor_probs + [0.0] * pad_len

    df = pd.DataFrame({
        'position': indices,
        'donor_prob': donor_probs,
        'acceptor_prob': acceptor_probs,
        'nucleotides': list(seq)
    }).set_index('position').round(3)

    # Carry the source sequence name along as DataFrame metadata.
    df.attrs['name'] = s.name
    return df
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def adjoin_splicing_outcomes(
    splicing_predictions: Dict[str, 'pd.DataFrame'],
    transcript: Optional[object] = None,
) -> 'pd.DataFrame':
    """
    Combine splicing predictions for multiple mutations into a multi-index DataFrame.

    Args:
        splicing_predictions: {label -> DF with 'donor_prob','acceptor_prob','nucleotides'}
        transcript: optional transcript (must have .acceptors, .donors, .rev)

    Returns:
        DataFrame with MultiIndex columns ('donors'|'acceptors'|'nts', <label>),
        plus ('acceptors'|'donors', 'annotated') booleans when a transcript is
        given; rows are sorted high-to-low for antisense transcripts.

    Raises:
        ValueError: empty input, missing required columns, or concat failure.
        TypeError: a value in splicing_predictions is not a DataFrame.
    """
    import pandas as pd

    if not splicing_predictions:
        raise ValueError("splicing_predictions cannot be empty")

    required_cols = ["donor_prob", "acceptor_prob", "nucleotides"]
    dfs = []
    for label, df in splicing_predictions.items():
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"Expected DataFrame for '{label}', got {type(df).__name__}")

        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            raise ValueError(f"DataFrame for '{label}' missing required columns: {missing}")

        # Rename columns to (group, label) tuples so concatenation yields
        # MultiIndex-shaped columns.
        dfs.append(df.rename(
            columns={
                "donor_prob": ("donors", f"{label}_prob"),
                "acceptor_prob": ("acceptors", f"{label}_prob"),
                "nucleotides": ("nts", f"{label}"),
            }
        ))

    try:
        full_df = pd.concat(dfs, axis=1)
    except Exception as e:
        raise ValueError(f"Failed to concatenate DataFrames: {e}") from e

    if not isinstance(full_df.columns, pd.MultiIndex):
        full_df.columns = pd.MultiIndex.from_tuples(full_df.columns)

    if transcript is not None:
        # Vectorized membership check replaces the previous per-row .apply,
        # avoiding Python-level row iteration with identical results.
        full_df[("acceptors", "annotated")] = full_df.index.isin(list(transcript.acceptors))
        full_df[("donors", "annotated")] = full_df.index.isin(list(transcript.donors))

    # Group columns by their first level in both branches.
    full_df.sort_index(axis=1, level=0, inplace=True)
    if transcript is not None:
        # Antisense transcripts are presented in descending coordinate order.
        full_df.sort_index(ascending=not transcript.rev, inplace=True)

    return full_df
|
|
@@ -388,9 +388,10 @@ class Oncosplice:
|
|
|
388
388
|
analysis_dict = {
|
|
389
389
|
'reference_protein': self.reference_protein,
|
|
390
390
|
'variant_protein': self.variant_protein,
|
|
391
|
+
'aligned_reference_protein': self.alignment.seqA,
|
|
392
|
+
'aligned_variant_protein': self.alignment.seqB,
|
|
391
393
|
'reference_length': len(self.reference_protein),
|
|
392
394
|
'variant_length': len(self.variant_protein),
|
|
393
|
-
# 'alignment_length': len(self.alignment.seqA),
|
|
394
395
|
'oncosplice_score': self.score,
|
|
395
396
|
'percentile': self.percentile,
|
|
396
397
|
'number_of_deletions': len(self.deletions),
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# oncosplice/pipelines.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from seqmat import Gene
|
|
8
|
+
|
|
9
|
+
from .splice_graph import SpliceSimulator
|
|
10
|
+
from .transcripts import TranscriptLibrary
|
|
11
|
+
from .variants import MutationalEvent
|
|
12
|
+
from .Oncosplice import Oncosplice
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def oncosplice_pipeline(
    mut_id: str,
    transcript_id: str | None = None,
    splicing_engine: str = "spliceai",
    organism: str = "hg38",
) -> pd.DataFrame:
    """
    Run the full oncosplice pipeline for a mutation.

    Args:
        mut_id: Mutation identifier, e.g. 'KRAS:12:25227343:G:T'
            (presumably multiple '|'-joined mutations form an epistatic
            event — TODO confirm against MutationalEvent).
        transcript_id: Transcript to analyze; None defers the choice to
            the Gene object's .transcript().
        splicing_engine: 'spliceai' or 'pangolin'.
        organism: Genome build label passed to Gene.from_file.

    Returns:
        DataFrame with all viable isoforms and their oncosplice scores:
        one row per isoform combining run metadata, the splice-site
        report, isoform metadata, mRNA sequences, and the Oncosplice
        analysis series.
    """
    m = MutationalEvent(mut_id)
    # NOTE(review): assert is stripped under `python -O`; raising ValueError
    # would make this validation unconditional.
    assert m.compatible(), "Mutations in event are incompatible"

    # Fully realize the reference transcript: pre-mRNA -> mature mRNA -> protein.
    reference_transcript = (
        Gene.from_file(m.gene, organism=organism)
        .transcript(transcript_id)
        .generate_pre_mrna()
        .generate_mature_mrna()
        .generate_protein()
    )

    tl = TranscriptLibrary(reference_transcript, m)
    central_pos = m.central_position

    # Predict splicing around the event in-place, then pull the event columns.
    tl.predict_splicing(central_pos, engine=splicing_engine, inplace=True)
    splicing_results = tl.get_event_columns("event")

    ss = SpliceSimulator(
        splicing_results, tl.event, feature="event", max_distance=100_000_000
    )

    # Run-level metadata repeated on every isoform row.
    base_report = pd.Series({
        "mut_id": mut_id,
        "gene": m.gene,
        "transcript_id": reference_transcript.transcript_id,
        "primary_transcript": reference_transcript.primary_transcript,
        "splicing_engine": splicing_engine,
        "central_position": central_pos,
        "mutation_count": len(m.positions),
        "time_of_execution": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    })

    ss_metadata = ss.report(central_pos)
    rows = []
    # One output row per viable variant isoform produced by the simulator.
    for variant_transcript, isoform_metadata in ss.get_viable_transcripts(metadata=True):
        # Score the variant protein against the reference protein and its
        # conservation vector.
        onco = Oncosplice(
            reference_transcript.protein,
            variant_transcript.protein,
            reference_transcript.cons_vector,
        )
        rows.append(
            pd.concat([
                base_report,
                ss_metadata,
                isoform_metadata,
                pd.Series({
                    "reference_mrna": reference_transcript.mature_mrna.seq,
                    "variant_mrna": variant_transcript.mature_mrna.seq,
                }),
                onco.get_analysis_series(),
            ])
        )

    return pd.DataFrame(rows)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def oncosplice_top_isoform(
    mut_id: str,
    transcript_id: str | None = None,
    splicing_engine: str = "spliceai",
    organism: str = "hg38",
) -> pd.Series | None:
    """
    Get the most likely non-reference isoform for a mutation.

    Runs the full pipeline, drops rows whose missplicing summary is the
    '-' sentinel, and returns the first remaining isoform.

    Returns:
        Series with full oncosplice analysis, or None if no missplicing detected.
    """
    results = oncosplice_pipeline(mut_id, transcript_id, splicing_engine, organism)
    if results.empty:
        return None

    missplicing = results.loc[results["summary"] != "-"]
    if missplicing.empty:
        return None
    return missplicing.iloc[0]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def max_splicing_delta(
    mut_id: str,
    transcript_id: str | None = None,
    splicing_engine: str = "spliceai",
    organism: str = "hg38",
) -> float:
    """
    Get the maximum splice site probability change for a mutation.

    Args:
        mut_id: Mutation identifier, e.g. 'KRAS:12:25227343:G:T'.
        transcript_id: Transcript to analyze; None defers to the Gene object.
        splicing_engine: 'spliceai' or 'pangolin'.
        organism: Genome build label passed to Gene.from_file.

    Returns:
        Value of SpliceSimulator.max_splicing_delta("event_prob") —
        presumably the largest per-site probability change for the event;
        exact semantics live in SpliceSimulator.
    """
    m = MutationalEvent(mut_id)
    # NOTE(review): assert is stripped under `python -O`; raising ValueError
    # would make this validation unconditional.
    assert m.compatible(), "Mutations in event are incompatible"

    # Fully realize the reference transcript: pre-mRNA -> mature mRNA -> protein.
    reference_transcript = (
        Gene.from_file(m.gene, organism=organism)
        .transcript(transcript_id)
        .generate_pre_mrna()
        .generate_mature_mrna()
        .generate_protein()
    )

    tl = TranscriptLibrary(reference_transcript, m)
    # Predict splicing around the event and pull the event columns in one chain.
    splicing_results = tl.predict_splicing(
        m.central_position, engine=splicing_engine, inplace=True
    ).get_event_columns("event")

    ss = SpliceSimulator(
        splicing_results, tl.event, feature="event", max_distance=100_000_000
    )

    return ss.max_splicing_delta("event_prob")
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# Backwards-compatible alias: earlier releases exposed the pipeline under
# this name, so keep it importable for existing callers.
oncosplice_pipeline_single_transcript = oncosplice_pipeline
|