geney 1.3.78__py2.py3-none-any.whl → 1.4.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney has been flagged as potentially problematic; consult the package registry's advisory page for details.
- geney/Gene.py +9 -10
- geney/Oncosplice.py +400 -0
- geney/SpliceSimulator.py +407 -0
- geney/Transcript.py +55 -57
- geney/__init__.py +47 -19
- geney/_config_setup.py +16 -0
- geney/_graphic_utils.py +269 -0
- geney/_gtex_utils.py +68 -0
- geney/_immune_utils.py +125 -0
- geney/{oncosplice.py → _oncosplice.py} +199 -156
- geney/_splicing_utils.py +693 -0
- geney/_survival_utils.py +143 -0
- geney/_tcga_utils.py +405 -0
- geney/_tis_utils.py +172 -0
- geney/immune_utils.py +1 -1
- geney/pipelines.py +66 -0
- geney/power_utils.py +1 -1
- geney/spliceai_utils.py +17 -17
- geney/utils/Fasta_segment.py +260 -0
- geney/utils/SeqMats.py +423 -0
- geney/utils/TranscriptLibrary.py +55 -0
- geney/utils/__init__.py +20 -0
- geney/utils/mutation_utils.py +104 -0
- geney/utils/pangolin_utils.py +173 -0
- geney/utils/spliceai_utils.py +123 -0
- geney/utils/splicing_utils.py +525 -0
- geney/utils/utils.py +89 -0
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/METADATA +1 -1
- geney-1.4.0.dist-info/RECORD +51 -0
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/WHEEL +1 -1
- geney-1.3.78.dist-info/RECORD +0 -31
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/top_level.txt +0 -0
geney/SpliceSimulator.py
ADDED
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import List, Tuple, Dict, Generator, Any
|
|
5
|
+
from pandas import Series
|
|
6
|
+
from .utils.utils import short_hash_of_list
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SpliceSimulator:
|
|
10
|
+
def __init__(self, splicing_df: pd.DataFrame, transcript, max_distance: int, feature='event'):
|
|
11
|
+
"""
|
|
12
|
+
Initializes the SpliceSimulator.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
splicing_df (pd.DataFrame): DataFrame containing splicing information.
|
|
16
|
+
Expected to have columns 'donors' and 'acceptors', each providing
|
|
17
|
+
a list of tuples (position, probability).
|
|
18
|
+
event_map: Additional event mapping information.
|
|
19
|
+
rev (bool): Indicates whether the orientation is reversed.
|
|
20
|
+
transcript_start (int): Start position of the transcript.
|
|
21
|
+
transcript_end (int): End position of the transcript.
|
|
22
|
+
max_distance (int): Maximum allowable distance for connecting splice sites.
|
|
23
|
+
"""
|
|
24
|
+
self.full_df = splicing_df
|
|
25
|
+
self.feature = feature
|
|
26
|
+
self.rev = transcript.rev
|
|
27
|
+
self.transcript_start = transcript.transcript_start
|
|
28
|
+
self.transcript_end = transcript.transcript_end
|
|
29
|
+
self.donors = transcript.donors
|
|
30
|
+
self.acceptors = transcript.acceptors
|
|
31
|
+
self.transcript = transcript
|
|
32
|
+
self.max_distance = max_distance
|
|
33
|
+
|
|
34
|
+
# Build sorted node lists from DataFrame columns.
|
|
35
|
+
self.set_donor_nodes()
|
|
36
|
+
self.set_acceptor_nodes()
|
|
37
|
+
|
|
38
|
+
def _compute_splice_df(self, site_type: str) -> pd.DataFrame:
|
|
39
|
+
"""
|
|
40
|
+
Generic method to compute donor or acceptor DataFrame with delta calculations and priority scores.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
site_type (str): 'donor' or 'acceptor'
|
|
44
|
+
feature (str): prefix of the feature column (e.g., 'mut1' → 'mut1_prob')
|
|
45
|
+
|
|
46
|
+
Returns:
|
|
47
|
+
pd.DataFrame: Annotated and scored splice site DataFrame
|
|
48
|
+
"""
|
|
49
|
+
feature_col = f'{self.feature}_prob'
|
|
50
|
+
df = getattr(self.full_df, site_type + 's').copy()
|
|
51
|
+
site_set = getattr(self, site_type + 's')
|
|
52
|
+
|
|
53
|
+
# Ensure all known sites are included
|
|
54
|
+
missing = set(site_set) - set(df.index)
|
|
55
|
+
if missing:
|
|
56
|
+
df = pd.concat([df, pd.DataFrame(index=list(missing))], axis=0)
|
|
57
|
+
df.loc[list(missing), ['annotated', 'ref_prob', feature_col]] = [True, 1, 1]
|
|
58
|
+
|
|
59
|
+
# Ensure 'annotated' column exists and is boolean
|
|
60
|
+
if 'annotated' not in df.columns:
|
|
61
|
+
df['annotated'] = False
|
|
62
|
+
else:
|
|
63
|
+
df['annotated'] = df['annotated'].where(df['annotated'].notna(), False).astype(bool)
|
|
64
|
+
|
|
65
|
+
# Sort by genomic position (respect strand orientation)
|
|
66
|
+
df.sort_index(ascending=not self.rev, inplace=True)
|
|
67
|
+
|
|
68
|
+
# === DELTA COMPUTATIONS ===
|
|
69
|
+
MIN_INCREASE_RATIO = 0.2
|
|
70
|
+
|
|
71
|
+
df['discovered_delta'] = np.where(
|
|
72
|
+
~df['annotated'],
|
|
73
|
+
(df[feature_col] - df['ref_prob']),
|
|
74
|
+
np.nan
|
|
75
|
+
)
|
|
76
|
+
df['discovered_delta'] = df['discovered_delta'].where(df['discovered_delta'] >= MIN_INCREASE_RATIO, 0)
|
|
77
|
+
|
|
78
|
+
with np.errstate(divide='ignore', invalid='ignore'):
|
|
79
|
+
df['deleted_delta'] = np.where(
|
|
80
|
+
(df['ref_prob'] > 0) & df['annotated'],
|
|
81
|
+
(df[feature_col] - df['ref_prob']) / df['ref_prob'],
|
|
82
|
+
0
|
|
83
|
+
)
|
|
84
|
+
df['deleted_delta'] = df['deleted_delta'].clip(upper=0)
|
|
85
|
+
|
|
86
|
+
df['P'] = df['annotated'].astype(float) + df['discovered_delta'] + df['deleted_delta']
|
|
87
|
+
return df
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def donor_df(self) -> pd.DataFrame:
|
|
91
|
+
return self._compute_splice_df('donor')
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def acceptor_df(self) -> pd.DataFrame:
|
|
95
|
+
return self._compute_splice_df('acceptor')
|
|
96
|
+
|
|
97
|
+
def report(self, pos):
|
|
98
|
+
metadata = self.find_splice_site_proximity(pos)
|
|
99
|
+
metadata['donor_events'] = self.donor_df[
|
|
100
|
+
(self.donor_df.deleted_delta.abs() > 0.2) | (
|
|
101
|
+
self.donor_df.discovered_delta.abs() > 0.2)].reset_index().to_json()
|
|
102
|
+
metadata['acceptor_events'] = self.acceptor_df[(self.acceptor_df.deleted_delta.abs() > 0.2) | (
|
|
103
|
+
self.acceptor_df.discovered_delta.abs() > 0.2)].reset_index().to_json()
|
|
104
|
+
metadata['missplicing'] = self.max_splicing_delta()
|
|
105
|
+
return metadata
|
|
106
|
+
|
|
107
|
+
def max_splicing_delta(self, event) -> pd.Series:
|
|
108
|
+
"""
|
|
109
|
+
Computes the maximum missplicing delta for both donor and acceptor sites.
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
event: The event column to compare against the reference.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
|
|
116
|
+
pd.Series: A series with keys 'donor' and 'acceptor' containing the maximum differences.
|
|
117
|
+
"""
|
|
118
|
+
max_missplicing = {}
|
|
119
|
+
for site_type in ['donors', 'acceptors']:
|
|
120
|
+
df = self.full_df[site_type]
|
|
121
|
+
max_missplicing[site_type] = max(abs(df[event] - df['ref_prob']))
|
|
122
|
+
return pd.Series(max_missplicing)
|
|
123
|
+
|
|
124
|
+
def set_donor_nodes(self) -> None:
|
|
125
|
+
"""
|
|
126
|
+
Builds a sorted list of donor nodes.
|
|
127
|
+
A working copy is made from the donors property; then the transcript_end is appended as
|
|
128
|
+
a candidate with a full (1) probability. The list is sorted based on the position and probability.
|
|
129
|
+
"""
|
|
130
|
+
donors = self.donor_df.P
|
|
131
|
+
donor_list = list(donors[donors > 0].round(2).items()) # Each tuple is (position, P)
|
|
132
|
+
donor_list.append((self.transcript_end, 1))
|
|
133
|
+
self.donor_nodes = sorted(
|
|
134
|
+
donor_list,
|
|
135
|
+
key=lambda x: int(x[0]),
|
|
136
|
+
reverse=bool(self.rev)
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
def set_acceptor_nodes(self) -> None:
|
|
140
|
+
"""
|
|
141
|
+
Builds a sorted list of acceptor nodes.
|
|
142
|
+
"""
|
|
143
|
+
acceptors = self.acceptor_df.P
|
|
144
|
+
acceptor_list = list(acceptors[acceptors > 0].round(2).items()) # Each tuple is (position, P)
|
|
145
|
+
acceptor_list.insert(0, (self.transcript_start, 1.0)) # starting point
|
|
146
|
+
self.acceptor_nodes = sorted(
|
|
147
|
+
acceptor_list,
|
|
148
|
+
key=lambda x: int(x[0]),
|
|
149
|
+
reverse=bool(self.rev)
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
def generate_graph(self) -> Dict[Tuple[int, str], List[Tuple[int, str, float]]]:
|
|
153
|
+
"""
|
|
154
|
+
Builds a directed graph (as an adjacency list) where keys are nodes (position, type)
|
|
155
|
+
and values are lists of downstream connections as tuples:
|
|
156
|
+
(next_position, next_type, adjusted_probability)
|
|
157
|
+
|
|
158
|
+
The construction is done in three steps:
|
|
159
|
+
1. Connect each donor node to acceptor nodes within max_distance.
|
|
160
|
+
2. Connect each acceptor node to donor nodes within max_distance.
|
|
161
|
+
3. Connect the transcript_start to donor nodes within max_distance.
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
Dict: The adjacency list representing possible splice site transitions.
|
|
165
|
+
"""
|
|
166
|
+
adjacency_list = defaultdict(list)
|
|
167
|
+
|
|
168
|
+
# 1. Connect each donor node to nearby acceptor nodes.
|
|
169
|
+
for d_pos, d_prob in self.donor_nodes:
|
|
170
|
+
running_prob = 1
|
|
171
|
+
for a_pos, a_prob in self.acceptor_nodes:
|
|
172
|
+
correct_orientation = ((a_pos > d_pos and not self.rev) or
|
|
173
|
+
(a_pos < d_pos and self.rev))
|
|
174
|
+
distance_valid = abs(a_pos - d_pos) <= self.max_distance
|
|
175
|
+
if correct_orientation and distance_valid:
|
|
176
|
+
if not self.rev:
|
|
177
|
+
in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if d_pos < a < a_pos)
|
|
178
|
+
in_between_donors = sum(1 for d, _ in self.donor_nodes if d_pos < d < a_pos)
|
|
179
|
+
else:
|
|
180
|
+
in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if a_pos < a < d_pos)
|
|
181
|
+
in_between_donors = sum(1 for d, _ in self.donor_nodes if a_pos < d < d_pos)
|
|
182
|
+
|
|
183
|
+
if in_between_donors == 0 or in_between_acceptors == 0:
|
|
184
|
+
adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob))
|
|
185
|
+
running_prob -= a_prob
|
|
186
|
+
else:
|
|
187
|
+
if running_prob > 0:
|
|
188
|
+
adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob * running_prob))
|
|
189
|
+
running_prob -= a_prob
|
|
190
|
+
else:
|
|
191
|
+
break
|
|
192
|
+
|
|
193
|
+
# 2. Connect each acceptor node to nearby donor nodes.
|
|
194
|
+
for a_pos, a_prob in self.acceptor_nodes:
|
|
195
|
+
running_prob = 1
|
|
196
|
+
for d_pos, d_prob in self.donor_nodes:
|
|
197
|
+
correct_orientation = ((d_pos > a_pos and not self.rev) or
|
|
198
|
+
(d_pos < a_pos and self.rev))
|
|
199
|
+
distance_valid = abs(d_pos - a_pos) <= self.max_distance
|
|
200
|
+
if correct_orientation and distance_valid:
|
|
201
|
+
if not self.rev:
|
|
202
|
+
in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if a_pos < a < d_pos)
|
|
203
|
+
in_between_donors = sum(1 for d, _ in self.donor_nodes if a_pos < d < d_pos)
|
|
204
|
+
else:
|
|
205
|
+
in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if d_pos < a < a_pos)
|
|
206
|
+
in_between_donors = sum(1 for d, _ in self.donor_nodes if d_pos < d < a_pos)
|
|
207
|
+
tag = 'donor' if d_pos != self.transcript_end else 'transcript_end'
|
|
208
|
+
if in_between_acceptors == 0:
|
|
209
|
+
adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob))
|
|
210
|
+
running_prob -= d_prob
|
|
211
|
+
else:
|
|
212
|
+
if running_prob > 0:
|
|
213
|
+
adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob * running_prob))
|
|
214
|
+
running_prob -= d_prob
|
|
215
|
+
else:
|
|
216
|
+
break
|
|
217
|
+
|
|
218
|
+
# 3. Connect transcript_start to donor nodes within max_distance.
|
|
219
|
+
running_prob = 1
|
|
220
|
+
for d_pos, d_prob in self.donor_nodes:
|
|
221
|
+
correct_orientation = ((d_pos > self.transcript_start and not self.rev) or
|
|
222
|
+
(d_pos < self.transcript_start and self.rev))
|
|
223
|
+
distance_valid = abs(d_pos - self.transcript_start) <= self.max_distance
|
|
224
|
+
if correct_orientation and distance_valid:
|
|
225
|
+
adjacency_list[(self.transcript_start, 'transcript_start')].append((d_pos, 'donor', d_prob))
|
|
226
|
+
running_prob -= d_prob
|
|
227
|
+
if running_prob <= 0:
|
|
228
|
+
break
|
|
229
|
+
|
|
230
|
+
# Normalize each outgoing edge list so that probabilities sum to 1.
|
|
231
|
+
for key, next_nodes in adjacency_list.items():
|
|
232
|
+
total_prob = sum(prob for (_, _, prob) in next_nodes)
|
|
233
|
+
if total_prob > 0:
|
|
234
|
+
adjacency_list[key] = [(pos, typ, round(prob / total_prob, 3))
|
|
235
|
+
for pos, typ, prob in next_nodes]
|
|
236
|
+
return adjacency_list
|
|
237
|
+
|
|
238
|
+
def find_all_paths(self,
|
|
239
|
+
graph: Dict[Tuple[int, str], List[Tuple[int, str, float]]],
|
|
240
|
+
start: Tuple[int, str],
|
|
241
|
+
end: Tuple[int, str],
|
|
242
|
+
path: List[Tuple[int, str]] = None,
|
|
243
|
+
probability: float = 1.0) -> Generator[Tuple[List[Tuple[int, str]], float], None, None]:
|
|
244
|
+
"""
|
|
245
|
+
Recursively traverses the graph to yield all complete paths from start to end.
|
|
246
|
+
|
|
247
|
+
Args:
|
|
248
|
+
graph (Dict): The adjacency list graph.
|
|
249
|
+
start (Tuple[int, str]): The current node.
|
|
250
|
+
end (Tuple[int, str]): The target node.
|
|
251
|
+
path (List[Tuple[int, str]], optional): The current path. Defaults to None.
|
|
252
|
+
probability (float, optional): The cumulative probability along the current path.
|
|
253
|
+
|
|
254
|
+
Yields:
|
|
255
|
+
Generator yielding tuples of (path, cumulative_probability).
|
|
256
|
+
"""
|
|
257
|
+
if path is None:
|
|
258
|
+
path = [start]
|
|
259
|
+
else:
|
|
260
|
+
path = path + [start]
|
|
261
|
+
if start == end:
|
|
262
|
+
yield path, probability
|
|
263
|
+
return
|
|
264
|
+
if start not in graph:
|
|
265
|
+
return
|
|
266
|
+
for next_node, node_type, prob in graph[start]:
|
|
267
|
+
yield from self.find_all_paths(graph, (next_node, node_type), end, path, probability * prob)
|
|
268
|
+
|
|
269
|
+
def get_viable_paths(self) -> List[Tuple[List[Tuple[int, str]], float]]:
|
|
270
|
+
"""
|
|
271
|
+
Generates and returns all complete splice-site paths (from transcript_start to transcript_end),
|
|
272
|
+
sorted by overall likelihood in descending order.
|
|
273
|
+
|
|
274
|
+
Returns:
|
|
275
|
+
List[Tuple[List[Tuple[int, str]], float]]: Each tuple contains a path (list of (position, type))
|
|
276
|
+
and its overall probability.
|
|
277
|
+
"""
|
|
278
|
+
graph = self.generate_graph()
|
|
279
|
+
start_node = (self.transcript_start, 'transcript_start')
|
|
280
|
+
end_node = (self.transcript_end, 'transcript_end')
|
|
281
|
+
paths = list(self.find_all_paths(graph, start_node, end_node))
|
|
282
|
+
paths.sort(key=lambda x: x[1], reverse=True)
|
|
283
|
+
return paths
|
|
284
|
+
|
|
285
|
+
def get_viable_transcripts(self, metadata=False) -> Generator[tuple[Any, Series] | Any, Any, None]:
|
|
286
|
+
"""
|
|
287
|
+
Returns a list of transcript-like objects cloned from `self.transcript`,
|
|
288
|
+
each representing a valid splice path with updated donor/acceptor sites,
|
|
289
|
+
total path probability, and a unique hash based on exon/intron structure.
|
|
290
|
+
"""
|
|
291
|
+
graph = self.generate_graph()
|
|
292
|
+
start_node = (self.transcript_start, 'transcript_start')
|
|
293
|
+
end_node = (self.transcript_end, 'transcript_end')
|
|
294
|
+
|
|
295
|
+
paths = list(self.find_all_paths(graph, start_node, end_node))
|
|
296
|
+
paths.sort(key=lambda x: x[1], reverse=True)
|
|
297
|
+
|
|
298
|
+
viable_transcripts = []
|
|
299
|
+
|
|
300
|
+
for path, prob in paths:
|
|
301
|
+
donors = [pos for pos, typ in path if typ == 'donor']
|
|
302
|
+
acceptors = [pos for pos, typ in path if typ == 'acceptor']
|
|
303
|
+
|
|
304
|
+
transcript = self.transcript.clone() # Make sure this creates a deep copy
|
|
305
|
+
|
|
306
|
+
transcript.donors = [d for d in donors if d != transcript.transcript_end]
|
|
307
|
+
transcript.acceptors = [a for a in acceptors if a != transcript.transcript_start]
|
|
308
|
+
transcript.path_weight = prob
|
|
309
|
+
transcript.path_hash = short_hash_of_list(tuple(donors + acceptors)) # or use a better hash function if needed
|
|
310
|
+
transcript.generate_mature_mrna().generate_protein()
|
|
311
|
+
if metadata:
|
|
312
|
+
md = pd.concat([self.compare_splicing_to_reference(transcript), pd.Series({'isoform_prevalence': transcript.path_weight, 'isoform_id': transcript.path_hash})])
|
|
313
|
+
yield transcript, md
|
|
314
|
+
else:
|
|
315
|
+
yield transcript
|
|
316
|
+
|
|
317
|
+
def find_splice_site_proximity(self, pos):
|
|
318
|
+
def result(region, index, start, end):
|
|
319
|
+
return pd.Series({
|
|
320
|
+
'region': region,
|
|
321
|
+
'index': index + 1,
|
|
322
|
+
"5'_dist": abs(pos - min(start, end)),
|
|
323
|
+
"3'_dist": abs(pos - max(start, end))
|
|
324
|
+
})
|
|
325
|
+
|
|
326
|
+
if not hasattr(self.transcript, 'exons') or not hasattr(self.transcript, 'introns'):
|
|
327
|
+
return pd.Series({'region': None, 'index': None, "5'_dist": np.inf, "3'_dist": np.inf})
|
|
328
|
+
|
|
329
|
+
for i, (start, end) in enumerate(self.transcript.exons):
|
|
330
|
+
if min(start, end) <= pos <= max(start, end):
|
|
331
|
+
return result('exon', i, start, end)
|
|
332
|
+
|
|
333
|
+
for i, (start, end) in enumerate(self.transcript.introns):
|
|
334
|
+
if min(start, end) <= pos <= max(start, end):
|
|
335
|
+
return result('intron', i, start, end)
|
|
336
|
+
|
|
337
|
+
return pd.Series({'region': None, 'index': None, "5'_dist": np.inf, "3'_dist": np.inf})
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def define_missplicing_events(self, var):
|
|
341
|
+
"""
|
|
342
|
+
Compares a reference transcript and a variant to detect splicing abnormalities.
|
|
343
|
+
Returns string descriptions of each type of missplicing event.
|
|
344
|
+
"""
|
|
345
|
+
|
|
346
|
+
ref = self.transcript
|
|
347
|
+
ref_introns, ref_exons = getattr(ref, 'introns', []), getattr(ref, 'exons', [])
|
|
348
|
+
var_introns, var_exons = getattr(var, 'introns', []), getattr(var, 'exons', [])
|
|
349
|
+
|
|
350
|
+
num_ref_exons = len(ref_exons)
|
|
351
|
+
num_ref_introns = len(ref_introns)
|
|
352
|
+
|
|
353
|
+
pes = []
|
|
354
|
+
pir = []
|
|
355
|
+
es = []
|
|
356
|
+
ne = []
|
|
357
|
+
ir = []
|
|
358
|
+
|
|
359
|
+
for exon_count, (t1, t2) in enumerate(ref_exons):
|
|
360
|
+
for (s1, s2) in var_exons:
|
|
361
|
+
if not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)) or \
|
|
362
|
+
(ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))):
|
|
363
|
+
pes.append(f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}')
|
|
364
|
+
|
|
365
|
+
for intron_count, (t1, t2) in enumerate(ref_introns):
|
|
366
|
+
for (s1, s2) in var_introns:
|
|
367
|
+
if not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)) or \
|
|
368
|
+
(ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))):
|
|
369
|
+
pir.append(f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}')
|
|
370
|
+
|
|
371
|
+
for exon_count, (t1, t2) in enumerate(ref_exons):
|
|
372
|
+
if t1 not in var.acceptors and t2 not in var.donors:
|
|
373
|
+
es.append(f'Exon {exon_count + 1}/{num_ref_exons} skipped: {(t1, t2)}')
|
|
374
|
+
|
|
375
|
+
for (s1, s2) in var_exons:
|
|
376
|
+
if s1 not in ref.acceptors and s2 not in ref.donors:
|
|
377
|
+
ne.append(f'Novel Exon: {(s1, s2)}')
|
|
378
|
+
|
|
379
|
+
for intron_count, (t1, t2) in enumerate(ref_introns):
|
|
380
|
+
if t1 not in var.donors and t2 not in var.acceptors:
|
|
381
|
+
ir.append(f'Intron {intron_count + 1}/{num_ref_introns} retained: {(t1, t2)}')
|
|
382
|
+
|
|
383
|
+
return ','.join(pes), ','.join(pir), ','.join(es), ','.join(ne), ','.join(ir)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def summarize_missplicing_event(self, pes, pir, es, ne, ir):
|
|
387
|
+
"""
|
|
388
|
+
Given raw missplicing event strings, returns a compact classification tag.
|
|
389
|
+
"""
|
|
390
|
+
event = []
|
|
391
|
+
if pes: event.append('PES')
|
|
392
|
+
if es: event.append('ES')
|
|
393
|
+
if pir: event.append('PIR')
|
|
394
|
+
if ir: event.append('IR')
|
|
395
|
+
if ne: event.append('NE')
|
|
396
|
+
return ','.join(event) if event else '-'
|
|
397
|
+
|
|
398
|
+
def compare_splicing_to_reference(self, transcript_variant):
|
|
399
|
+
pes, pir, es, ne, ir = self.define_missplicing_events(transcript_variant)
|
|
400
|
+
return pd.Series({
|
|
401
|
+
'pes': pes,
|
|
402
|
+
'pir': pir,
|
|
403
|
+
'es': es,
|
|
404
|
+
'ne': ne,
|
|
405
|
+
'ir': ir,
|
|
406
|
+
'summary': self.summarize_missplicing_event(pes, pir, es, ne, ir)
|
|
407
|
+
})
|
geney/Transcript.py
CHANGED
|
@@ -3,9 +3,10 @@ from typing import Any, Optional, Union
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import copy
|
|
5
5
|
from Bio.Seq import Seq # Assuming Biopython is used
|
|
6
|
-
from . import
|
|
7
|
-
from .
|
|
8
|
-
from .
|
|
6
|
+
from . import config
|
|
7
|
+
from .utils import unload_pickle
|
|
8
|
+
from .utils.SeqMats import SeqMat #, MutSeqMat
|
|
9
|
+
from .utils.Fasta_segment import Fasta_segment
|
|
9
10
|
|
|
10
11
|
class Transcript:
|
|
11
12
|
"""
|
|
@@ -40,7 +41,7 @@ class Transcript:
|
|
|
40
41
|
AssertionError: If required attributes are missing.
|
|
41
42
|
"""
|
|
42
43
|
# Convert certain attributes to NumPy arrays for consistent processing
|
|
43
|
-
array_fields = {'acceptors', 'donors', 'cons_vector'}
|
|
44
|
+
array_fields = {'acceptors', 'donors', 'cons_vector', 'rev'}
|
|
44
45
|
for k, v in d.items():
|
|
45
46
|
if k in array_fields and v is not None:
|
|
46
47
|
v = np.array(v)
|
|
@@ -54,6 +55,7 @@ class Transcript:
|
|
|
54
55
|
if missing:
|
|
55
56
|
raise AssertionError(f"Transcript is missing required attributes: {missing}")
|
|
56
57
|
|
|
58
|
+
|
|
57
59
|
# Default fallback values for optional attributes
|
|
58
60
|
if not hasattr(self, 'donors') or self.donors is None:
|
|
59
61
|
self.donors = np.array([])
|
|
@@ -134,6 +136,16 @@ class Transcript:
|
|
|
134
136
|
return False
|
|
135
137
|
return np.all(np.isin(subvalue.seqmat[1, :], self.pre_mrna.seqmat[1, :]))
|
|
136
138
|
|
|
139
|
+
|
|
140
|
+
def clone(self) -> Transcript:
|
|
141
|
+
"""
|
|
142
|
+
Returns a deep copy of this Transcript instance.
|
|
143
|
+
|
|
144
|
+
Returns:
|
|
145
|
+
Transcript: A new Transcript object that is a deep copy of the current instance.
|
|
146
|
+
"""
|
|
147
|
+
return copy.deepcopy(self)
|
|
148
|
+
|
|
137
149
|
@property
|
|
138
150
|
def exons(self) -> list[tuple[int, int]]:
|
|
139
151
|
"""
|
|
@@ -265,45 +277,45 @@ class Transcript:
|
|
|
265
277
|
Returns:
|
|
266
278
|
Transcript: The current Transcript object (for chaining).
|
|
267
279
|
"""
|
|
268
|
-
pre_mrna = SeqMat
|
|
280
|
+
pre_mrna = SeqMat(**self.pull_pre_mrna_pos())
|
|
269
281
|
if self.rev:
|
|
270
282
|
pre_mrna.reverse_complement()
|
|
271
283
|
self.pre_mrna = pre_mrna
|
|
272
284
|
return self
|
|
273
285
|
|
|
274
|
-
def mutate(self, mutation: MutSeqMat, inplace: bool = False) -> Union[Transcript, SeqMat]:
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
286
|
+
# def mutate(self, mutation: MutSeqMat, inplace: bool = False) -> Union[Transcript, SeqMat]:
|
|
287
|
+
# """
|
|
288
|
+
# Apply a mutation to the pre_mRNA sequence of this Transcript.
|
|
289
|
+
#
|
|
290
|
+
# If the transcript is on the reverse strand (self.rev is True),
|
|
291
|
+
# the mutation is first reverse-complemented to ensure strand compatibility.
|
|
292
|
+
#
|
|
293
|
+
# Args:
|
|
294
|
+
# mutation (SeqMat): The mutation to apply. Must be a SeqMat or a compatible object that supports .mutate().
|
|
295
|
+
# inplace (bool): If True, apply the mutation directly to this Transcript's pre_mRNA
|
|
296
|
+
# and return 'self'. If False, return a new SeqMat with the mutated sequence.
|
|
297
|
+
#
|
|
298
|
+
# Returns:
|
|
299
|
+
# Transcript: If inplace=True, returns the updated Transcript object.
|
|
300
|
+
# SeqMat: If inplace=False, returns a new SeqMat object representing the mutated sequence.
|
|
301
|
+
# """
|
|
302
|
+
# # If transcript is reversed, reverse-complement the mutation first
|
|
303
|
+
# if self.rev:
|
|
304
|
+
# mutation.reverse_complement(inplace=True)
|
|
305
|
+
#
|
|
306
|
+
# # Attempt the mutation operation
|
|
307
|
+
# mutated_seqmat = self.pre_mrna.mutate(mutation).seqmat
|
|
308
|
+
# if inplace:
|
|
309
|
+
# # Update this Transcript's pre_mRNA and return the Transcript itself
|
|
310
|
+
# self.pre_mrna = SeqMat(mutated_seqmat)
|
|
311
|
+
# return self
|
|
312
|
+
#
|
|
313
|
+
# else:
|
|
314
|
+
# # Create a copy of the current Transcript and update its pre_mrna
|
|
315
|
+
# # Assuming you have a way to clone the Transcript; if not, manually recreate it.
|
|
316
|
+
# new_transcript = copy.deepcopy(self)
|
|
317
|
+
# new_transcript.pre_mrna = SeqMat(mutated_seqmat)
|
|
318
|
+
# return new_transcript
|
|
307
319
|
|
|
308
320
|
def generate_mature_mrna(self, inplace: bool = True) -> Union[Transcript, SeqMat]:
|
|
309
321
|
"""
|
|
@@ -317,17 +329,11 @@ class Transcript:
|
|
|
317
329
|
"""
|
|
318
330
|
self._fix_and_check_introns()
|
|
319
331
|
|
|
320
|
-
mature_mrna = SeqMat.empty()
|
|
321
|
-
pos_mrna = self.pre_mrna
|
|
322
|
-
|
|
323
|
-
for exon_start, exon_end in self.exons:
|
|
324
|
-
# Add each exon region to the mature_mrna
|
|
325
|
-
mature_mrna += pos_mrna[exon_start:exon_end]
|
|
326
|
-
|
|
327
332
|
if inplace:
|
|
328
|
-
self.mature_mrna =
|
|
333
|
+
self.mature_mrna = self.pre_mrna.cut_out(self.introns)
|
|
329
334
|
return self
|
|
330
|
-
|
|
335
|
+
|
|
336
|
+
return self.pre_mrna.splice_out(self.introns)
|
|
331
337
|
|
|
332
338
|
@property
|
|
333
339
|
def orf(self, tis=None):
|
|
@@ -343,16 +349,8 @@ class Transcript:
|
|
|
343
349
|
|
|
344
350
|
if tis is None:
|
|
345
351
|
tis = self.TIS
|
|
346
|
-
return self.mature_mrna.orf_seqmat(tis)
|
|
347
352
|
|
|
348
|
-
|
|
349
|
-
"""
|
|
350
|
-
Returns a deep copy of this Transcript instance.
|
|
351
|
-
|
|
352
|
-
Returns:
|
|
353
|
-
Transcript: A new Transcript object that is a deep copy of the current instance.
|
|
354
|
-
"""
|
|
355
|
-
return copy.deepcopy(self)
|
|
353
|
+
return self.mature_mrna.open_reading_frame(tis)
|
|
356
354
|
|
|
357
355
|
def generate_protein(self, inplace: bool = True, domains: Optional[np.ndarray] = None) -> Union[
|
|
358
356
|
Transcript, tuple[str, np.ndarray]]:
|
|
@@ -367,7 +365,7 @@ class Transcript:
|
|
|
367
365
|
Transcript or (protein: str, cons_vector: np.ndarray): The Transcript object if inplace=True, else the protein and cons_vector.
|
|
368
366
|
"""
|
|
369
367
|
if not self.protein_coding:
|
|
370
|
-
print("No protein can be generated without TIS/TTS.")
|
|
368
|
+
# print("No protein can be generated without TIS/TTS.")
|
|
371
369
|
return self if inplace else ("", np.array([]))
|
|
372
370
|
|
|
373
371
|
# Translate the ORF to protein
|
geney/__init__.py
CHANGED
|
@@ -1,27 +1,55 @@
|
|
|
1
|
-
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from .utils.Fasta_segment import Fasta_segment
|
|
6
|
+
from . import utils # this will now load all modules in utils/
|
|
7
|
+
|
|
8
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
9
|
+
# Configuration Loader
|
|
10
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
def get_config():
    """Load the OncoSplice configuration from the user's home directory.

    Reads ~/.oncosplice_setup_1_2/config.json, converts every path entry to a
    pathlib.Path, and overlays a few fixed hg38 resource locations. Prints a
    warning and returns an empty dict when the file is absent.
    """
    config_file = Path.home() / '.oncosplice_setup_1_2' / 'config.json'
    if not config_file.exists():
        print("⚠️ OncoSplice config not found at expected location.")
        return {}

    with open(config_file) as handle:
        raw = json.load(handle)

    settings = {
        section: {name: Path(location) for name, location in entries.items()}
        for section, entries in raw.items()
    }
    # Override or extend paths for hg38
    hg38 = settings.setdefault('hg38', {})
    hg38['titer_path'] = Path('/tamir2/nicolaslynn/tools/titer')
    hg38['yoram_path'] = Path('/tamir2/yoramzar/Projects/Cancer_mut/Utils')
    hg38['splicing_db'] = Path('/tamir2/nicolaslynn/data/OncosplicePredictions/hg38/splicing')
    return settings
|
|
32
|
+
|
|
33
|
+
# Load config once
|
|
2
34
|
config = get_config()
|
|
3
|
-
|
|
4
|
-
|
|
35
|
+
|
|
36
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
37
|
+
# Constants and Example IDs
|
|
38
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
5
39
|
|
|
6
40
|
mut_id = 'KRAS:12:25227343:G:T'
|
|
7
41
|
epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'
|
|
8
42
|
|
|
43
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
44
|
+
# Public API: available_genes
|
|
45
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
46
|
+
|
|
9
47
|
def available_genes(organism='hg38'):
    """Yield gene names found in the MRNA_PATH/protein_coding directory."""
    # Module-level `config` maps organism -> {path name -> Path}; an unknown
    # organism (or missing MRNA_PATH entry) is a configuration error.
    mrna_path = config.get(organism, {}).get('MRNA_PATH')
    if not mrna_path:
        raise ValueError(f"MRNA_PATH not found in config for organism '{organism}'")
    for file in os.listdir(mrna_path / 'protein_coding'):
        # File names end in "_<gene>.pkl"; recover the gene symbol.
        gene = file.split('_')[-1].removesuffix('.pkl')
        yield gene
|
|
14
55
|
|
|
15
|
-
|
|
16
|
-
# import os
|
|
17
|
-
# import json
|
|
18
|
-
# from pathlib import Path
|
|
19
|
-
#
|
|
20
|
-
# config_file = os.path.join(os.path.expanduser('~'), '.oncosplice_setup', 'config.json')
|
|
21
|
-
# if Path(config_file).exists():
|
|
22
|
-
# config_setup = {k: Path(p) for k, p in json.loads(open(config_file).read()).items()}
|
|
23
|
-
#
|
|
24
|
-
# else:
|
|
25
|
-
# print("Database not set up.")
|
|
26
|
-
# config_setup = {}
|
|
27
|
-
#
|
geney/_config_setup.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
def get_config():
    """Load the OncoSplice configuration file into nested Path mappings.

    Reads ~/.oncosplice_setup_1_2/config.json, converts every leaf value to a
    pathlib.Path, then overlays hard-coded hg38 resource locations. If the
    file is missing, prints a notice and returns an empty dict.

    Returns:
        dict: {section: {name: Path}} configuration mapping (possibly empty).
    """
    config_file = os.path.join(os.path.expanduser('~'), '.oncosplice_setup_1_2', 'config.json')
    if Path(config_file).exists():
        # Use a context manager so the handle is closed promptly; the previous
        # json.loads(open(config_file).read()) leaked the file handle.
        with open(config_file) as f:
            raw = json.load(f)
        config_setup = {
            k: {k_in: Path(p_in) for k_in, p_in in p.items()}
            for k, p in raw.items()
        }
        # NOTE(review): assumes an 'hg38' section exists in the file
        # (preserved from the original, which also indexed it directly).
        config_setup['hg38']['titer_path'] = Path('/tamir2/nicolaslynn/tools/titer')
        config_setup['hg38']['yoram_path'] = Path('/tamir2/yoramzar/Projects/Cancer_mut/Utils')
        config_setup['hg38']['splicing_db'] = Path('/tamir2/nicolaslynn/data/OncosplicePredictions/hg38/splicing')
    else:
        print("Database not set up.")
        config_setup = {}

    return config_setup
|