geney 1.4.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/__init__.py ADDED
@@ -0,0 +1,25 @@
1
# oncosplice/__init__.py
# Public package surface: re-exports the main entry points so callers can
# write `from geney import Mutation, run_splicing_engine, ...`.
from .variants import Mutation, MutationalEvent, MutationLibrary
from .engines import (
    sai_predict_probs,
    run_spliceai_seq,
    run_splicing_engine,
)
from .transcripts import TranscriptLibrary
from .splicing_table import adjoin_splicing_outcomes
from .splice_graph import SpliceSimulator
from .pipelines import oncosplice_pipeline_single_transcript
# NOTE(review): this star-import pulls whatever .samples exposes into the
# package namespace, yet none of those names appear in __all__ below —
# confirm that asymmetry is intended.
from .samples import *

# Explicit public API for `from geney import *`.
__all__ = [
    "Mutation",
    "MutationalEvent",
    "MutationLibrary",
    "sai_predict_probs",
    "run_spliceai_seq",
    "run_splicing_engine",
    "TranscriptLibrary",
    "adjoin_splicing_outcomes",
    "SpliceSimulator",
    "oncosplice_pipeline_single_transcript",
]
geney/engines.py ADDED
@@ -0,0 +1,307 @@
1
+ # oncosplice/engines.py
2
+ from __future__ import annotations
3
+
4
+ from typing import Dict, List, Tuple, Optional, Union
5
+
6
+ import numpy as np
7
+
8
+ # These are your existing helpers; keep them in separate modules if you want.
9
+ # from ._spliceai_utils import one_hot_encode, sai_models # type: ignore
10
+ # from ._pangolin_utils import pangolin_predict_probs, pang_models # type: ignore
11
+ import torch
12
+ from pkg_resources import resource_filename
13
+ from pangolin.model import *
14
+ import numpy as np
15
+ import sys
16
+
17
# NOTE(review): indices 0-7 follow the pangolin checkpoint naming scheme
# (final.{fold}.{head}.3) used in the loading loop below — confirm against
# the installed pangolin package version.
pang_model_nums = [0, 1, 2, 3, 4, 5, 6, 7]
# Populated at import time with one loaded model per (fold, head) pair.
pang_models = []
19
+
20
def get_best_device():
    """Get the best available device for computation.

    Preference order: MPS (macOS only, if functional) > CUDA > CPU.
    """
    # On macOS, prefer Metal Performance Shaders — but allocate a tiny
    # tensor first, since a half-configured MPS backend can report
    # available and still fail at use time.
    if sys.platform == 'darwin' and torch.backends.mps.is_available():
        try:
            torch.tensor([1.0], device="mps")
        except RuntimeError:
            print("Warning: MPS not available, falling back to CPU")
            return torch.device("cpu")
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
34
+
35
# Resolve the torch device once at import time; all Pangolin models live there.
device = get_best_device()
print(f"Pangolin loaded to {device}.")

# Initialize models with improved error handling.
# Loads every (fold j in 1..5, head i in pang_model_nums) Pangolin
# checkpoint; a single failed checkpoint is logged and skipped so a partial
# ensemble still works.
try:
    for i in pang_model_nums:
        for j in range(1, 6):
            try:
                # L, W, AR come from `from pangolin.model import *` above.
                model = Pangolin(L, W, AR).to(device)

                # Load weights with proper device mapping
                model_path = resource_filename("pangolin", f"models/final.{j}.{i}.3")
                weights = torch.load(model_path, weights_only=True, map_location=device)

                model.load_state_dict(weights)
                model.eval()  # inference-only: disable dropout/batchnorm updates
                pang_models.append(model)

            except Exception as e:
                print(f"Warning: Failed to load Pangolin model {j}.{i}: {e}")
                continue

except Exception as e:
    # Catastrophic failure (e.g. pangolin package missing entirely):
    # leave an empty model list rather than crashing at import.
    print(f"Error initializing Pangolin models: {e}")
    pang_models = []
60
+
61
+
62
def pang_one_hot_encode(seq: str) -> np.ndarray:
    """One-hot encode a DNA sequence for the Pangolin model.

    Args:
        seq: DNA sequence string (case-insensitive; only A, C, G, T, N).

    Returns:
        One-hot encoded array of shape (len(seq), 4); 'N' maps to all zeros.

    Raises:
        TypeError: If seq is not a string.
        ValueError: If sequence contains invalid characters.
    """
    if not isinstance(seq, str):
        raise TypeError(f"Expected string, got {type(seq).__name__}")

    # Row 0 is N/unknown; rows 1-4 are A, C, G, T.
    IN_MAP = np.asarray([[0, 0, 0, 0],   # N or unknown
                         [1, 0, 0, 0],   # A
                         [0, 1, 0, 0],   # C
                         [0, 0, 1, 0],   # G
                         [0, 0, 0, 1]])  # T

    seq = seq.upper()
    # Validate sequence
    if not set(seq) <= set('ACGTN'):
        raise ValueError("Sequence contains invalid characters (only A, C, G, T, N allowed)")

    # Map each base to its row index (0-4) in a single C-level pass, then
    # gather rows. Replaces the old chained str.replace + int() round-trip,
    # which allocated several intermediate strings and a Python-level list.
    table = str.maketrans('ACGTN', '\x01\x02\x03\x04\x00')
    indices = np.frombuffer(seq.translate(table).encode('latin1'), dtype=np.int8)
    return IN_MAP[indices]
97
+
98
+
99
+
100
+
101
# Silence TensorFlow C++ logging before TF is imported (3 = errors only).
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

# NOTE(review): os, sys and numpy are already imported earlier in this
# module — these re-imports are harmless no-ops and could be pruned.
import os
import sys
import tensorflow as tf
import numpy as np
from keras.models import load_model
from importlib import resources
113
+
114
+
115
+ # Force device selection with error handling
116
# Force device selection with error handling
def get_best_tensorflow_device():
    """Return the preferred TensorFlow device string (GPU > MPS > CPU)."""
    try:
        if tf.config.list_physical_devices('GPU'):
            return '/GPU:0'
        # NOTE(review): assumes the macOS Metal backend registers a
        # physical device of type 'MPS' — confirm with the installed
        # tensorflow-metal version (it often registers as 'GPU' instead).
        if sys.platform == 'darwin' and tf.config.list_physical_devices('MPS'):
            return '/device:GPU:0'
        return '/CPU:0'
    except Exception as e:
        print(f"Warning: Device selection failed, using CPU: {e}")
        return '/CPU:0'
130
+
131
# Resolved once at import time; used as the placement context for loading.
device = get_best_tensorflow_device()

# Model loading paths with error handling
def load_spliceai_models():
    """Load the five SpliceAI ensemble models with proper error handling.

    Returns:
        list: successfully loaded Keras models (possibly fewer than five);
        an empty list if none could be loaded.
    """
    try:
        if sys.platform == 'darwin':
            # Use the model files bundled with the installed `spliceai` package.
            model_filenames = [f"models/spliceai{i}.h5" for i in range(1, 6)]
            model_paths = [resources.files('spliceai').joinpath(f) for f in model_filenames]
        else:
            # HACK: hard-coded cluster path — only valid on the original
            # author's infrastructure; non-darwin machines elsewhere will
            # end up with an empty model list. TODO: make configurable.
            model_paths = [f"/tamir2/nicolaslynn/tools/SpliceAI/spliceai/models/spliceai{i}.h5"
                           for i in range(1, 6)]

        # Load models onto correct device
        models = []
        with tf.device(device):
            for i, model_path in enumerate(model_paths):
                try:
                    model = load_model(str(model_path))
                    models.append(model)
                except Exception as e:
                    # Skip the broken checkpoint; a partial ensemble still works.
                    print(f"Warning: Failed to load SpliceAI model {i+1}: {e}")
                    continue

        if not models:
            raise RuntimeError("No SpliceAI models could be loaded")

        return models

    except Exception as e:
        # Degrade to an empty list rather than failing at import time;
        # sai_predict_probs raises ValueError if this list is empty.
        print(f"Error loading SpliceAI models: {e}")
        return []

# Module-level ensemble used by sai_predict_probs / run_spliceai_seq.
sai_models = load_spliceai_models()


print(f"SpliceAI loaded to {device}.")
168
+
169
def one_hot_encode(seq: str) -> np.ndarray:
    """One-hot encode a DNA sequence for the SpliceAI model.

    Args:
        seq: DNA sequence string (case-insensitive; only A, C, G, T, N).

    Returns:
        One-hot encoded array of shape (len(seq), 4); 'N' maps to all zeros.

    Raises:
        TypeError: If seq is not a string.
        ValueError: If sequence contains invalid characters.
    """
    if not isinstance(seq, str):
        raise TypeError(f"Expected string, got {type(seq).__name__}")

    seq = seq.upper()
    # Validate sequence
    if not set(seq) <= set('ACGTN'):
        raise ValueError("Sequence contains invalid characters (only A, C, G, T, N allowed)")

    # Row 0 is N/unknown; rows 1-4 are A, C, G, T.
    encoding_map = np.asarray([[0, 0, 0, 0],   # N or unknown
                               [1, 0, 0, 0],   # A
                               [0, 1, 0, 0],   # C
                               [0, 0, 1, 0],   # G
                               [0, 0, 0, 1]])  # T

    # Map each base to its row index (0-4) in one pass, then gather rows.
    # Replaces the chained str.replace + `% 5` trick: the modulo was a
    # no-op on byte values already in 0-4 and only obscured the intent,
    # and the broad try/except around it was unreachable after validation.
    table = str.maketrans('ACGTN', '\x01\x02\x03\x04\x00')
    indices = np.frombuffer(seq.translate(table).encode('latin1'), dtype=np.int8)
    return encoding_map[indices]
203
+
204
+
205
def sai_predict_probs(seq: str, models: list) -> tuple[np.ndarray, np.ndarray]:
    """
    Predict donor and acceptor probabilities for each nt in seq using SpliceAI.
    Returns (acceptor_probs, donor_probs) as np.ndarray of shape (L,).
    """
    # Guard clauses before any expensive work.
    if not models:
        raise ValueError("No SpliceAI models loaded")
    if not isinstance(seq, str):
        raise TypeError(f"Expected string, got {type(seq).__name__}")
    if len(seq) < 1000:
        raise ValueError(f"Sequence too short: {len(seq)} (expected >= 1000)")

    try:
        encoded = one_hot_encode(seq)[None, :]

        predictions = []
        for idx, mdl in enumerate(models):
            try:
                predictions.append(mdl.predict(encoded, verbose=0))
            except Exception as e:
                # Best-effort ensemble: a single failing model is skipped.
                print(f"Warning: SpliceAI model {idx+1} failed: {e}")
        if not predictions:
            raise RuntimeError("All SpliceAI model predictions failed")

        # Ensemble mean over models gives (1, L, 3); drop channel 0
        # ("neither") and transpose to (2, L) = [acceptor, donor].
        mean_pred = np.mean(predictions, axis=0)
        channels = mean_pred[0, :, 1:].T
        return channels[0, :], channels[1, :]

    except Exception as e:
        raise RuntimeError(f"SpliceAI prediction failed: {e}") from e
237
+
238
+
239
def run_spliceai_seq(
    seq: str,
    indices: Union[List[int], np.ndarray],
    threshold: float = 0.0,
) -> tuple[Dict[int, float], Dict[int, float]]:
    """
    Run SpliceAI on seq and return donor / acceptor sites above threshold.
    Returns (donor_indices, acceptor_indices) as dict[pos -> prob]
    """
    # Validate types and the 1:1 pairing of positions to nucleotides
    # before touching the models.
    if not isinstance(seq, str):
        raise TypeError(f"Expected string sequence, got {type(seq).__name__}")
    if not isinstance(indices, (list, np.ndarray)):
        raise TypeError(f"Expected list or array for indices, got {type(indices).__name__}")
    if len(indices) != len(seq):
        raise ValueError(f"indices length ({len(indices)}) must match sequence length ({len(seq)})")
    if not isinstance(threshold, (int, float)):
        raise TypeError(f"Threshold must be numeric, got {type(threshold).__name__}")

    try:
        acc_probs, don_probs = sai_predict_probs(seq, models=sai_models)
        # Keep only positions whose probability clears the threshold.
        donor = {pos: prob for pos, prob in zip(indices, don_probs) if prob >= threshold}
        acceptor = {pos: prob for pos, prob in zip(indices, acc_probs) if prob >= threshold}
    except Exception as e:
        raise RuntimeError(f"SpliceAI sequence analysis failed: {e}") from e
    return donor, acceptor
267
+
268
+
269
+ def run_splicing_engine(
270
+ seq: Optional[str] = None,
271
+ engine: str = "spliceai",
272
+ ) -> Tuple[List[float], List[float]]:
273
+ """
274
+ Run specified splicing engine to predict splice site probabilities.
275
+
276
+ Returns:
277
+ (donor_probs, acceptor_probs) as lists
278
+ """
279
+ from .utils import generate_random_sequence # type: ignore
280
+
281
+ if seq is None:
282
+ seq = generate_random_sequence(15_001)
283
+
284
+ if not isinstance(seq, str):
285
+ raise TypeError(f"Sequence must be string, got {type(seq).__name__}")
286
+ if not seq:
287
+ raise ValueError("Sequence cannot be empty")
288
+
289
+ valid_chars = set("ACGTN")
290
+ if not all(c.upper() in valid_chars for c in seq):
291
+ raise ValueError("Sequence contains invalid nucleotides (only A, C, G, T, N allowed)")
292
+
293
+ try:
294
+ match engine:
295
+ case "spliceai":
296
+ acc, don = sai_predict_probs(seq, models=sai_models)
297
+ donor_probs, acceptor_probs = don.tolist(), acc.tolist()
298
+ case "spliceai-pytorch":
299
+ raise ValueError("spliceai-pytorch engine has been removed. Use 'spliceai' instead.")
300
+ case "pangolin":
301
+ donor_probs, acceptor_probs = pangolin_predict_probs(seq, models=pang_models)
302
+ case _:
303
+ raise ValueError(f"Engine '{engine}' not implemented. Available: 'spliceai', 'pangolin'")
304
+ except ImportError as e:
305
+ raise ImportError(f"Failed to import engine '{engine}': {e}") from e
306
+
307
+ return donor_probs, acceptor_probs