geney 1.4.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/__init__.py +25 -0
- geney/engines.py +307 -0
- geney/oncosplice.py +411 -0
- geney/pipelines.py +97 -0
- geney/samples.py +3 -0
- geney/splice_graph.py +371 -0
- geney/splicing_table.py +142 -0
- geney/transcripts.py +68 -0
- geney/utils.py +254 -0
- geney/variants.py +389 -0
- geney-1.4.40.dist-info/METADATA +32 -0
- geney-1.4.40.dist-info/RECORD +14 -0
- geney-1.4.40.dist-info/WHEEL +5 -0
- geney-1.4.40.dist-info/top_level.txt +1 -0
geney/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# oncosplice/__init__.py
# Package interface: re-export the public classes and functions from the
# submodules so callers can import them directly from the package root.
from .variants import Mutation, MutationalEvent, MutationLibrary
from .engines import (
    sai_predict_probs,
    run_spliceai_seq,
    run_splicing_engine,
)
from .transcripts import TranscriptLibrary
from .splicing_table import adjoin_splicing_outcomes
from .splice_graph import SpliceSimulator
from .pipelines import oncosplice_pipeline_single_transcript
# NOTE(review): star-import pulls in whatever `samples` defines, but none of
# those names appear in __all__ below, so they are hidden from
# `from geney import *` — confirm that is intended.
from .samples import *

# Explicit public API for `from geney import *`.
__all__ = [
    "Mutation",
    "MutationalEvent",
    "MutationLibrary",
    "sai_predict_probs",
    "run_spliceai_seq",
    "run_splicing_engine",
    "TranscriptLibrary",
    "adjoin_splicing_outcomes",
    "SpliceSimulator",
    "oncosplice_pipeline_single_transcript",
]
|
geney/engines.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
# oncosplice/engines.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Dict, List, Tuple, Optional, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
# These are your existing helpers; keep them in separate modules if you want.
|
|
9
|
+
# from ._spliceai_utils import one_hot_encode, sai_models # type: ignore
|
|
10
|
+
# from ._pangolin_utils import pangolin_predict_probs, pang_models # type: ignore
|
|
11
|
+
import torch
|
|
12
|
+
from pkg_resources import resource_filename
|
|
13
|
+
from pangolin.model import *
|
|
14
|
+
import numpy as np
|
|
15
|
+
import sys
|
|
16
|
+
|
|
17
|
+
# Pangolin ensemble bookkeeping: the model numbers to load, and the list the
# loader below fills with the successfully loaded torch models.
pang_model_nums = [0, 1, 2, 3, 4, 5, 6, 7]
pang_models = []


def get_best_device():
    """Pick the best available torch device: MPS on macOS, else CUDA, else CPU."""
    if sys.platform == 'darwin' and torch.backends.mps.is_available():
        try:
            # Smoke-test MPS: the availability flag can be true while an
            # actual allocation still fails.
            torch.tensor([1.0], device="mps")
        except RuntimeError:
            print("Warning: MPS not available, falling back to CPU")
            return torch.device("cpu")
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
|
|
34
|
+
|
|
35
|
+
device = get_best_device()
print(f"Pangolin loaded to {device}.")

# Initialize models with improved error handling
# Pangolin, L, W, and AR come from the star-import of pangolin.model above.
# Each (model number i, repeat j) pair is a separately trained checkpoint;
# all successfully loaded models are appended to pang_models.
try:
    for i in pang_model_nums:
        for j in range(1, 6):
            try:
                model = Pangolin(L, W, AR).to(device)

                # Load weights with proper device mapping
                model_path = resource_filename("pangolin", f"models/final.{j}.{i}.3")
                weights = torch.load(model_path, weights_only=True, map_location=device)

                model.load_state_dict(weights)
                model.eval()  # inference mode
                pang_models.append(model)

            except Exception as e:
                # A failed checkpoint is skipped, so the ensemble may be partial.
                print(f"Warning: Failed to load Pangolin model {j}.{i}: {e}")
                continue

except Exception as e:
    print(f"Error initializing Pangolin models: {e}")
    pang_models = []

# NOTE(review): `device` is rebound to a TensorFlow device string later in
# this module, so code importing engines.device after module import sees the
# TF value, not this torch.device — confirm that is intended.
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def pang_one_hot_encode(seq: str) -> np.ndarray:
    """One-hot encode a DNA sequence for the Pangolin model.

    Args:
        seq: DNA sequence (A/C/G/T/N, case-insensitive).

    Returns:
        One-hot encoded array of shape (len(seq), 4); N maps to an all-zero row.

    Raises:
        TypeError: If seq is not a string.
        ValueError: If the sequence contains characters outside A/C/G/T/N.
    """
    if not isinstance(seq, str):
        raise TypeError(f"Expected string, got {type(seq).__name__}")

    # Row 0 is the all-zero row for N/unknown; rows 1-4 encode A, C, G, T.
    lookup = np.asarray([
        [0, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
    ])

    if any(base.upper() not in 'ACGTN' for base in seq):
        raise ValueError("Sequence contains invalid characters (only A, C, G, T, N allowed)")

    # Map each base to its row index in the lookup table.
    code_of = {'A': 1, 'C': 2, 'G': 3, 'T': 4, 'N': 0}
    try:
        codes = np.asarray([code_of[base] for base in seq.upper()])
        return lookup[codes.astype('int8')]
    except (ValueError, IndexError) as e:
        raise ValueError(f"Failed to encode sequence: {e}") from e
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
import os
# Must be set BEFORE TensorFlow is imported to take effect
# (3 = log errors only, silencing TF's C++ startup spam).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import absl.logging
# Quiet absl logging (used internally by TF/Keras) as well.
absl.logging.set_verbosity(absl.logging.ERROR)

# NOTE(review): os, sys, and numpy are re-imported here — harmless duplicates
# of imports earlier in this module.
import os
import sys
import tensorflow as tf
import numpy as np
from keras.models import load_model
from importlib import resources
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Force device selection with error handling
|
|
116
|
+
def get_best_tensorflow_device():
    """Return the TF device string to place models on (GPU if present, else CPU)."""
    try:
        if tf.config.list_physical_devices('GPU'):
            return '/GPU:0'
        # NOTE(review): 'MPS' is not a standard TF physical-device type; on
        # macOS the tensorflow-metal plugin registers as 'GPU', so this branch
        # may be unreachable — confirm.
        if sys.platform == 'darwin' and tf.config.list_physical_devices('MPS'):
            return '/device:GPU:0'
        return '/CPU:0'
    except Exception as e:
        print(f"Warning: Device selection failed, using CPU: {e}")
        return '/CPU:0'
|
|
130
|
+
|
|
131
|
+
# Rebinds the module-level `device` (previously a torch.device) to a TF
# device string; everything below this point uses the TF value.
device = get_best_tensorflow_device()

# Model loading paths with error handling
def load_spliceai_models():
    """Load the five-model SpliceAI ensemble onto the selected TF device.

    On macOS the bundled models inside the installed ``spliceai`` package are
    used; elsewhere the model directory defaults to a cluster path but can be
    overridden with the ``SPLICEAI_MODEL_DIR`` environment variable
    (fix: the path was previously hard-coded with no override).

    Returns:
        List of loaded Keras models (possibly fewer than 5 if some fail to
        load); an empty list if no model could be loaded at all.
    """
    try:
        if sys.platform == 'darwin':
            model_filenames = [f"models/spliceai{i}.h5" for i in range(1, 6)]
            model_paths = [resources.files('spliceai').joinpath(f) for f in model_filenames]
        else:
            # Default preserves the original hard-coded behavior.
            model_dir = os.environ.get(
                "SPLICEAI_MODEL_DIR",
                "/tamir2/nicolaslynn/tools/SpliceAI/spliceai/models",
            )
            model_paths = [os.path.join(model_dir, f"spliceai{i}.h5")
                           for i in range(1, 6)]

        # Load models onto correct device
        models = []
        with tf.device(device):
            for i, model_path in enumerate(model_paths):
                try:
                    models.append(load_model(str(model_path)))
                except Exception as e:
                    # Skip failed models; the ensemble may end up partial.
                    print(f"Warning: Failed to load SpliceAI model {i+1}: {e}")
                    continue

        if not models:
            raise RuntimeError("No SpliceAI models could be loaded")

        return models

    except Exception as e:
        print(f"Error loading SpliceAI models: {e}")
        return []

sai_models = load_spliceai_models()


print(f"SpliceAI loaded to {device}.")
|
|
168
|
+
|
|
169
|
+
def one_hot_encode(seq: str) -> np.ndarray:
    """One-hot encode a DNA sequence for the SpliceAI ensemble.

    Args:
        seq: DNA sequence (A/C/G/T/N, case-insensitive).

    Returns:
        One-hot encoded array of shape (len(seq), 4); N maps to an all-zero row.

    Raises:
        TypeError: If seq is not a string.
        ValueError: If the sequence contains characters outside A/C/G/T/N.
    """
    if not isinstance(seq, str):
        raise TypeError(f"Expected string, got {type(seq).__name__}")

    if any(base.upper() not in 'ACGTN' for base in seq):
        raise ValueError("Sequence contains invalid characters (only A, C, G, T, N allowed)")

    # Row 0 is the all-zero row for N/unknown; rows 1-4 encode A, C, G, T.
    lut = np.asarray([
        [0, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
    ])

    # Translate each base to a small control-byte code, then view the bytes
    # as int8 row indices into the lookup table (one C-level pass).
    table = str.maketrans({'A': '\x01', 'C': '\x02', 'G': '\x03',
                           'T': '\x04', 'N': '\x00'})
    encoded = seq.upper().translate(table)
    try:
        codes = np.frombuffer(encoded.encode('latin1'), np.int8) % 5
        return lut[codes]
    except Exception as e:
        raise ValueError(f"Failed to encode sequence: {e}") from e


def sai_predict_probs(seq: str, models: list) -> tuple[np.ndarray, np.ndarray]:
    """
    Predict donor and acceptor probabilities for each nt in seq using SpliceAI.
    Returns (acceptor_probs, donor_probs) as np.ndarray of shape (L,).
    """
    if not models:
        raise ValueError("No SpliceAI models loaded")

    if not isinstance(seq, str):
        raise TypeError(f"Expected string, got {type(seq).__name__}")

    if len(seq) < 1000:
        raise ValueError(f"Sequence too short: {len(seq)} (expected >= 1000)")

    try:
        batch = one_hot_encode(seq)[None, :]
        outputs = []
        for idx, model in enumerate(models):
            try:
                outputs.append(model.predict(batch, verbose=0))
            except Exception as e:
                # A single failed model does not abort the ensemble.
                print(f"Warning: SpliceAI model {idx+1} failed: {e}")
        if not outputs:
            raise RuntimeError("All SpliceAI model predictions failed")

        # Ensemble-average, then split channels 1 (acceptor) and 2 (donor).
        mean_pred = np.mean(outputs, axis=0)   # (1, L, 3)
        channels = mean_pred[0, :, 1:].T       # (2, L)
        return channels[0, :], channels[1, :]

    except Exception as e:
        raise RuntimeError(f"SpliceAI prediction failed: {e}") from e
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def run_spliceai_seq(
    seq: str,
    indices: Union[List[int], np.ndarray],
    threshold: float = 0.0,
) -> tuple[Dict[int, float], Dict[int, float]]:
    """
    Run SpliceAI on seq and return donor / acceptor sites above threshold.
    Returns (donor_indices, acceptor_indices) as dict[pos -> prob]
    """
    if not isinstance(seq, str):
        raise TypeError(f"Expected string sequence, got {type(seq).__name__}")

    if not isinstance(indices, (list, np.ndarray)):
        raise TypeError(f"Expected list or array for indices, got {type(indices).__name__}")

    if len(indices) != len(seq):
        raise ValueError(f"indices length ({len(indices)}) must match sequence length ({len(seq)})")

    if not isinstance(threshold, (int, float)):
        raise TypeError(f"Threshold must be numeric, got {type(threshold).__name__}")

    try:
        acceptor_probs, donor_probs = sai_predict_probs(seq, models=sai_models)
        # Pair each position with its probability, keeping sites at/above
        # the threshold (zip truncates to the shorter of the two).
        donor_sites: Dict[int, float] = {}
        acceptor_sites: Dict[int, float] = {}
        for pos, acc_p, don_p in zip(indices, acceptor_probs, donor_probs):
            if acc_p >= threshold:
                acceptor_sites[pos] = acc_p
            if don_p >= threshold:
                donor_sites[pos] = don_p
        return donor_sites, acceptor_sites
    except Exception as e:
        raise RuntimeError(f"SpliceAI sequence analysis failed: {e}") from e
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def run_splicing_engine(
|
|
270
|
+
seq: Optional[str] = None,
|
|
271
|
+
engine: str = "spliceai",
|
|
272
|
+
) -> Tuple[List[float], List[float]]:
|
|
273
|
+
"""
|
|
274
|
+
Run specified splicing engine to predict splice site probabilities.
|
|
275
|
+
|
|
276
|
+
Returns:
|
|
277
|
+
(donor_probs, acceptor_probs) as lists
|
|
278
|
+
"""
|
|
279
|
+
from .utils import generate_random_sequence # type: ignore
|
|
280
|
+
|
|
281
|
+
if seq is None:
|
|
282
|
+
seq = generate_random_sequence(15_001)
|
|
283
|
+
|
|
284
|
+
if not isinstance(seq, str):
|
|
285
|
+
raise TypeError(f"Sequence must be string, got {type(seq).__name__}")
|
|
286
|
+
if not seq:
|
|
287
|
+
raise ValueError("Sequence cannot be empty")
|
|
288
|
+
|
|
289
|
+
valid_chars = set("ACGTN")
|
|
290
|
+
if not all(c.upper() in valid_chars for c in seq):
|
|
291
|
+
raise ValueError("Sequence contains invalid nucleotides (only A, C, G, T, N allowed)")
|
|
292
|
+
|
|
293
|
+
try:
|
|
294
|
+
match engine:
|
|
295
|
+
case "spliceai":
|
|
296
|
+
acc, don = sai_predict_probs(seq, models=sai_models)
|
|
297
|
+
donor_probs, acceptor_probs = don.tolist(), acc.tolist()
|
|
298
|
+
case "spliceai-pytorch":
|
|
299
|
+
raise ValueError("spliceai-pytorch engine has been removed. Use 'spliceai' instead.")
|
|
300
|
+
case "pangolin":
|
|
301
|
+
donor_probs, acceptor_probs = pangolin_predict_probs(seq, models=pang_models)
|
|
302
|
+
case _:
|
|
303
|
+
raise ValueError(f"Engine '{engine}' not implemented. Available: 'spliceai', 'pangolin'")
|
|
304
|
+
except ImportError as e:
|
|
305
|
+
raise ImportError(f"Failed to import engine '{engine}': {e}") from e
|
|
306
|
+
|
|
307
|
+
return donor_probs, acceptor_probs
|