geney 1.3.78__py2.py3-none-any.whl → 1.4.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of geney might be problematic.

@@ -0,0 +1,407 @@
+ import numpy as np
+ import pandas as pd
+ from collections import defaultdict
+ from typing import List, Tuple, Dict, Generator, Any
+ from pandas import Series
+ from .utils.utils import short_hash_of_list
+
+
+ class SpliceSimulator:
+     def __init__(self, splicing_df: pd.DataFrame, transcript, max_distance: int, feature='event'):
+         """
+         Initializes the SpliceSimulator.
+
+         Args:
+             splicing_df (pd.DataFrame): DataFrame containing splicing information.
+                 Expected to provide 'donors' and 'acceptors' tables of splice-site
+                 positions with 'ref_prob' and event probability columns.
+             transcript: Transcript object providing strand orientation (rev),
+                 transcript boundaries, and annotated donor/acceptor sites.
+             max_distance (int): Maximum allowable distance for connecting splice sites.
+             feature (str): Prefix of the probability column compared against the
+                 reference (e.g., 'event' -> 'event_prob').
+         """
+         self.full_df = splicing_df
+         self.feature = feature
+         self.rev = transcript.rev
+         self.transcript_start = transcript.transcript_start
+         self.transcript_end = transcript.transcript_end
+         self.donors = transcript.donors
+         self.acceptors = transcript.acceptors
+         self.transcript = transcript
+         self.max_distance = max_distance
+
+         # Build sorted node lists from DataFrame columns.
+         self.set_donor_nodes()
+         self.set_acceptor_nodes()
+
+     def _compute_splice_df(self, site_type: str) -> pd.DataFrame:
+         """
+         Generic method to compute the donor or acceptor DataFrame with delta
+         calculations and priority scores.
+
+         Args:
+             site_type (str): 'donor' or 'acceptor'.
+
+         Returns:
+             pd.DataFrame: Annotated and scored splice site DataFrame. The feature
+                 column compared against 'ref_prob' is derived from self.feature
+                 (e.g., 'event' -> 'event_prob').
+         """
+         feature_col = f'{self.feature}_prob'
+         df = getattr(self.full_df, site_type + 's').copy()
+         site_set = getattr(self, site_type + 's')
+
+         # Ensure all known sites are included
+         missing = set(site_set) - set(df.index)
+         if missing:
+             df = pd.concat([df, pd.DataFrame(index=list(missing))], axis=0)
+             df.loc[list(missing), ['annotated', 'ref_prob', feature_col]] = [True, 1, 1]
+
+         # Ensure 'annotated' column exists and is boolean
+         if 'annotated' not in df.columns:
+             df['annotated'] = False
+         else:
+             df['annotated'] = df['annotated'].where(df['annotated'].notna(), False).astype(bool)
+
+         # Sort by genomic position (respect strand orientation)
+         df.sort_index(ascending=not self.rev, inplace=True)
+
+         # === DELTA COMPUTATIONS ===
+         MIN_INCREASE_RATIO = 0.2
+
+         df['discovered_delta'] = np.where(
+             ~df['annotated'],
+             (df[feature_col] - df['ref_prob']),
+             np.nan
+         )
+         df['discovered_delta'] = df['discovered_delta'].where(df['discovered_delta'] >= MIN_INCREASE_RATIO, 0)
+
+         with np.errstate(divide='ignore', invalid='ignore'):
+             df['deleted_delta'] = np.where(
+                 (df['ref_prob'] > 0) & df['annotated'],
+                 (df[feature_col] - df['ref_prob']) / df['ref_prob'],
+                 0
+             )
+         df['deleted_delta'] = df['deleted_delta'].clip(upper=0)
+
+         df['P'] = df['annotated'].astype(float) + df['discovered_delta'] + df['deleted_delta']
+         return df
+
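The P score above blends three signals: a baseline of 1 for annotated sites, a gain term for novel sites whose probability rises by at least MIN_INCREASE_RATIO, and a relative-loss term for annotated sites that weaken. A minimal standalone sketch of the same arithmetic on an invented two-site table (column names follow the code above; the numbers are made up):

import numpy as np
import pandas as pd

# Toy table: one annotated site that weakens, one novel site that gains strength.
df = pd.DataFrame({
    'annotated':  [True, False],
    'ref_prob':   [0.90, 0.05],
    'event_prob': [0.30, 0.60],
}, index=[1000, 1045])

MIN_INCREASE_RATIO = 0.2

df['discovered_delta'] = np.where(~df['annotated'],
                                  df['event_prob'] - df['ref_prob'], np.nan)
df['discovered_delta'] = df['discovered_delta'].where(
    df['discovered_delta'] >= MIN_INCREASE_RATIO, 0)   # 0.55 for the novel site

df['deleted_delta'] = np.where(df['annotated'] & (df['ref_prob'] > 0),
                               (df['event_prob'] - df['ref_prob']) / df['ref_prob'], 0)
df['deleted_delta'] = df['deleted_delta'].clip(upper=0)  # -0.667 for the weakened site

df['P'] = df['annotated'].astype(float) + df['discovered_delta'] + df['deleted_delta']
# annotated site: 1 + 0 - 0.667 = 0.333; novel site: 0 + 0.55 + 0 = 0.55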
+     @property
+     def donor_df(self) -> pd.DataFrame:
+         return self._compute_splice_df('donor')
+
+     @property
+     def acceptor_df(self) -> pd.DataFrame:
+         return self._compute_splice_df('acceptor')
+
+     def report(self, pos):
+         metadata = self.find_splice_site_proximity(pos)
+         donor_df, acceptor_df = self.donor_df, self.acceptor_df
+         donor_hits = (donor_df.deleted_delta.abs() > 0.2) | (donor_df.discovered_delta.abs() > 0.2)
+         acceptor_hits = (acceptor_df.deleted_delta.abs() > 0.2) | (acceptor_df.discovered_delta.abs() > 0.2)
+         metadata['donor_events'] = donor_df[donor_hits].reset_index().to_json()
+         metadata['acceptor_events'] = acceptor_df[acceptor_hits].reset_index().to_json()
+         metadata['missplicing'] = self.max_splicing_delta()
+         return metadata
+
+     def max_splicing_delta(self, event=None) -> pd.Series:
+         """
+         Computes the maximum missplicing delta for both donor and acceptor sites.
+
+         Args:
+             event: The event probability column to compare against the reference.
+                 Defaults to the feature column configured on the instance.
+
+         Returns:
+             pd.Series: A series keyed by 'donors' and 'acceptors' containing the
+                 maximum absolute differences.
+         """
+         if event is None:
+             event = f'{self.feature}_prob'
+         max_missplicing = {}
+         for site_type in ['donors', 'acceptors']:
+             df = self.full_df[site_type]
+             max_missplicing[site_type] = (df[event] - df['ref_prob']).abs().max()
+         return pd.Series(max_missplicing)
+
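As a worked number: an annotated donor with ref_prob 0.9 whose event probability falls to 0.3 contributes |0.3 - 0.9| = 0.6, and the reported value for each site type is the maximum such difference across its table.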
+     def set_donor_nodes(self) -> None:
+         """
+         Builds a sorted list of donor nodes.
+         A working copy is made from the donor scores; the transcript_end is then
+         appended as a candidate with probability 1. The list is sorted by position,
+         respecting strand orientation.
+         """
+         donors = self.donor_df.P
+         donor_list = list(donors[donors > 0].round(2).items())  # Each tuple is (position, P)
+         donor_list.append((self.transcript_end, 1))
+         self.donor_nodes = sorted(
+             donor_list,
+             key=lambda x: int(x[0]),
+             reverse=bool(self.rev)
+         )
+
+     def set_acceptor_nodes(self) -> None:
+         """
+         Builds a sorted list of acceptor nodes, with the transcript_start prepended
+         as the starting point with probability 1.
+         """
+         acceptors = self.acceptor_df.P
+         acceptor_list = list(acceptors[acceptors > 0].round(2).items())  # Each tuple is (position, P)
+         acceptor_list.insert(0, (self.transcript_start, 1.0))  # starting point
+         self.acceptor_nodes = sorted(
+             acceptor_list,
+             key=lambda x: int(x[0]),
+             reverse=bool(self.rev)
+         )
+
+     def generate_graph(self) -> Dict[Tuple[int, str], List[Tuple[int, str, float]]]:
+         """
+         Builds a directed graph (as an adjacency list) where keys are nodes (position, type)
+         and values are lists of downstream connections as tuples:
+             (next_position, next_type, adjusted_probability)
+
+         The construction is done in three steps:
+             1. Connect each donor node to acceptor nodes within max_distance.
+             2. Connect each acceptor node to donor nodes within max_distance.
+             3. Connect the transcript_start to donor nodes within max_distance.
+
+         Returns:
+             Dict: The adjacency list representing possible splice site transitions.
+         """
+         adjacency_list = defaultdict(list)
+
+         # 1. Connect each donor node to nearby acceptor nodes.
+         for d_pos, d_prob in self.donor_nodes:
+             running_prob = 1
+             for a_pos, a_prob in self.acceptor_nodes:
+                 correct_orientation = ((a_pos > d_pos and not self.rev) or
+                                        (a_pos < d_pos and self.rev))
+                 distance_valid = abs(a_pos - d_pos) <= self.max_distance
+                 if correct_orientation and distance_valid:
+                     if not self.rev:
+                         in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if d_pos < a < a_pos)
+                         in_between_donors = sum(1 for d, _ in self.donor_nodes if d_pos < d < a_pos)
+                     else:
+                         in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if a_pos < a < d_pos)
+                         in_between_donors = sum(1 for d, _ in self.donor_nodes if a_pos < d < d_pos)
+
+                     if in_between_donors == 0 or in_between_acceptors == 0:
+                         adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob))
+                         running_prob -= a_prob
+                     else:
+                         if running_prob > 0:
+                             adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob * running_prob))
+                             running_prob -= a_prob
+                         else:
+                             break
+
+         # 2. Connect each acceptor node to nearby donor nodes.
+         for a_pos, a_prob in self.acceptor_nodes:
+             running_prob = 1
+             for d_pos, d_prob in self.donor_nodes:
+                 correct_orientation = ((d_pos > a_pos and not self.rev) or
+                                        (d_pos < a_pos and self.rev))
+                 distance_valid = abs(d_pos - a_pos) <= self.max_distance
+                 if correct_orientation and distance_valid:
+                     if not self.rev:
+                         in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if a_pos < a < d_pos)
+                         in_between_donors = sum(1 for d, _ in self.donor_nodes if a_pos < d < d_pos)
+                     else:
+                         in_between_acceptors = sum(1 for a, _ in self.acceptor_nodes if d_pos < a < a_pos)
+                         in_between_donors = sum(1 for d, _ in self.donor_nodes if d_pos < d < a_pos)
+                     tag = 'donor' if d_pos != self.transcript_end else 'transcript_end'
+                     if in_between_acceptors == 0:
+                         adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob))
+                         running_prob -= d_prob
+                     else:
+                         if running_prob > 0:
+                             adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob * running_prob))
+                             running_prob -= d_prob
+                         else:
+                             break
+
+         # 3. Connect transcript_start to donor nodes within max_distance.
+         running_prob = 1
+         for d_pos, d_prob in self.donor_nodes:
+             correct_orientation = ((d_pos > self.transcript_start and not self.rev) or
+                                    (d_pos < self.transcript_start and self.rev))
+             distance_valid = abs(d_pos - self.transcript_start) <= self.max_distance
+             if correct_orientation and distance_valid:
+                 adjacency_list[(self.transcript_start, 'transcript_start')].append((d_pos, 'donor', d_prob))
+                 running_prob -= d_prob
+                 if running_prob <= 0:
+                     break
+
+         # Normalize each outgoing edge list so that probabilities sum to 1.
+         for key, next_nodes in adjacency_list.items():
+             total_prob = sum(prob for (_, _, prob) in next_nodes)
+             if total_prob > 0:
+                 adjacency_list[key] = [(pos, typ, round(prob / total_prob, 3))
+                                        for pos, typ, prob in next_nodes]
+         return adjacency_list
+
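The returned mapping is a plain dict of (position, type) keys to normalized outgoing edges. For a hypothetical forward-strand, two-exon transcript (positions and probabilities invented), it could look like:

{
    (100, 'transcript_start'): [(250, 'donor', 1.0)],
    (250, 'donor'): [(400, 'acceptor', 0.8), (460, 'acceptor', 0.2)],
    (400, 'acceptor'): [(900, 'transcript_end', 1.0)],
    (460, 'acceptor'): [(900, 'transcript_end', 1.0)],
}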
+     def find_all_paths(self,
+                        graph: Dict[Tuple[int, str], List[Tuple[int, str, float]]],
+                        start: Tuple[int, str],
+                        end: Tuple[int, str],
+                        path: List[Tuple[int, str]] = None,
+                        probability: float = 1.0) -> Generator[Tuple[List[Tuple[int, str]], float], None, None]:
+         """
+         Recursively traverses the graph to yield all complete paths from start to end.
+
+         Args:
+             graph (Dict): The adjacency list graph.
+             start (Tuple[int, str]): The current node.
+             end (Tuple[int, str]): The target node.
+             path (List[Tuple[int, str]], optional): The current path. Defaults to None.
+             probability (float, optional): The cumulative probability along the current path.
+
+         Yields:
+             Tuples of (path, cumulative_probability).
+         """
+         if path is None:
+             path = [start]
+         else:
+             path = path + [start]
+         if start == end:
+             yield path, probability
+             return
+         if start not in graph:
+             return
+         for next_node, node_type, prob in graph[start]:
+             yield from self.find_all_paths(graph, (next_node, node_type), end, path, probability * prob)
+
+     def get_viable_paths(self) -> List[Tuple[List[Tuple[int, str]], float]]:
+         """
+         Generates and returns all complete splice-site paths (from transcript_start to transcript_end),
+         sorted by overall likelihood in descending order.
+
+         Returns:
+             List[Tuple[List[Tuple[int, str]], float]]: Each tuple contains a path
+                 (list of (position, type)) and its overall probability.
+         """
+         graph = self.generate_graph()
+         start_node = (self.transcript_start, 'transcript_start')
+         end_node = (self.transcript_end, 'transcript_end')
+         paths = list(self.find_all_paths(graph, start_node, end_node))
+         paths.sort(key=lambda x: x[1], reverse=True)
+         return paths
+
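The traversal itself is an ordinary depth-first enumeration. A self-contained re-implementation for illustration only (not the package API), run on the hypothetical graph shown earlier:

def all_paths(graph, start, end, path=None, prob=1.0):
    # Depth-first walk that multiplies edge probabilities along the way.
    path = (path or []) + [start]
    if start == end:
        yield path, prob
        return
    for nxt_pos, nxt_type, p in graph.get(start, []):
        yield from all_paths(graph, (nxt_pos, nxt_type), end, path, prob * p)

graph = {
    (100, 'transcript_start'): [(250, 'donor', 1.0)],
    (250, 'donor'): [(400, 'acceptor', 0.8), (460, 'acceptor', 0.2)],
    (400, 'acceptor'): [(900, 'transcript_end', 1.0)],
    (460, 'acceptor'): [(900, 'transcript_end', 1.0)],
}
for path, p in sorted(all_paths(graph, (100, 'transcript_start'), (900, 'transcript_end')),
                      key=lambda x: x[1], reverse=True):
    print(round(p, 3), path)  # 0.8 for the 400-acceptor path, 0.2 for the 460-acceptor path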
+     def get_viable_transcripts(self, metadata=False) -> Generator[tuple[Any, Series] | Any, None, None]:
+         """
+         Yields transcript-like objects cloned from `self.transcript`,
+         each representing a valid splice path with updated donor/acceptor sites,
+         total path probability, and a unique hash based on exon/intron structure.
+         When metadata=True, yields (transcript, metadata) tuples instead.
+         """
+         graph = self.generate_graph()
+         start_node = (self.transcript_start, 'transcript_start')
+         end_node = (self.transcript_end, 'transcript_end')
+
+         paths = list(self.find_all_paths(graph, start_node, end_node))
+         paths.sort(key=lambda x: x[1], reverse=True)
+
+         for path, prob in paths:
+             donors = [pos for pos, typ in path if typ == 'donor']
+             acceptors = [pos for pos, typ in path if typ == 'acceptor']
+
+             transcript = self.transcript.clone()  # clone() returns a deep copy
+
+             transcript.donors = [d for d in donors if d != transcript.transcript_end]
+             transcript.acceptors = [a for a in acceptors if a != transcript.transcript_start]
+             transcript.path_weight = prob
+             transcript.path_hash = short_hash_of_list(tuple(donors + acceptors))
+             transcript.generate_mature_mrna().generate_protein()
+             if metadata:
+                 md = pd.concat([
+                     self.compare_splicing_to_reference(transcript),
+                     pd.Series({'isoform_prevalence': transcript.path_weight,
+                                'isoform_id': transcript.path_hash})
+                 ])
+                 yield transcript, md
+             else:
+                 yield transcript
+
+     def find_splice_site_proximity(self, pos):
+         def result(region, index, start, end):
+             return pd.Series({
+                 'region': region,
+                 'index': index + 1,
+                 "5'_dist": abs(pos - min(start, end)),
+                 "3'_dist": abs(pos - max(start, end))
+             })
+
+         if not hasattr(self.transcript, 'exons') or not hasattr(self.transcript, 'introns'):
+             return pd.Series({'region': None, 'index': None, "5'_dist": np.inf, "3'_dist": np.inf})
+
+         for i, (start, end) in enumerate(self.transcript.exons):
+             if min(start, end) <= pos <= max(start, end):
+                 return result('exon', i, start, end)
+
+         for i, (start, end) in enumerate(self.transcript.introns):
+             if min(start, end) <= pos <= max(start, end):
+                 return result('intron', i, start, end)
+
+         return pd.Series({'region': None, 'index': None, "5'_dist": np.inf, "3'_dist": np.inf})
+
+     def define_missplicing_events(self, var):
+         """
+         Compares a reference transcript and a variant to detect splicing abnormalities.
+         Returns string descriptions of each type of missplicing event.
+         """
+         ref = self.transcript
+         ref_introns, ref_exons = getattr(ref, 'introns', []), getattr(ref, 'exons', [])
+         var_introns, var_exons = getattr(var, 'introns', []), getattr(var, 'exons', [])
+
+         num_ref_exons = len(ref_exons)
+         num_ref_introns = len(ref_introns)
+
+         pes, pir, es, ne, ir = [], [], [], [], []
+
+         for exon_count, (t1, t2) in enumerate(ref_exons):
+             for (s1, s2) in var_exons:
+                 if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or \
+                         (ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))):
+                     pes.append(f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}')
+
+         for intron_count, (t1, t2) in enumerate(ref_introns):
+             for (s1, s2) in var_introns:
+                 if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or \
+                         (ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))):
+                     pir.append(f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}')
+
+         for exon_count, (t1, t2) in enumerate(ref_exons):
+             if t1 not in var.acceptors and t2 not in var.donors:
+                 es.append(f'Exon {exon_count + 1}/{num_ref_exons} skipped: {(t1, t2)}')
+
+         for (s1, s2) in var_exons:
+             if s1 not in ref.acceptors and s2 not in ref.donors:
+                 ne.append(f'Novel Exon: {(s1, s2)}')
+
+         for intron_count, (t1, t2) in enumerate(ref_introns):
+             if t1 not in var.donors and t2 not in var.acceptors:
+                 ir.append(f'Intron {intron_count + 1}/{num_ref_introns} retained: {(t1, t2)}')
+
+         return ','.join(pes), ','.join(pir), ','.join(es), ','.join(ne), ','.join(ir)
+
+     def summarize_missplicing_event(self, pes, pir, es, ne, ir):
+         """
+         Given raw missplicing event strings, returns a compact classification tag.
+         """
+         event = []
+         if pes: event.append('PES')
+         if es: event.append('ES')
+         if pir: event.append('PIR')
+         if ir: event.append('IR')
+         if ne: event.append('NE')
+         return ','.join(event) if event else '-'
+
+     def compare_splicing_to_reference(self, transcript_variant):
+         pes, pir, es, ne, ir = self.define_missplicing_events(transcript_variant)
+         return pd.Series({
+             'pes': pes,
+             'pir': pir,
+             'es': es,
+             'ne': ne,
+             'ir': ir,
+             'summary': self.summarize_missplicing_event(pes, pir, es, ne, ir)
+         })
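Putting the pieces together, a plausible end-to-end call sequence (splicing_df, transcript, and the max_distance value here are stand-ins, not verified fixtures from the package):

sim = SpliceSimulator(splicing_df, transcript, max_distance=50000, feature='event')

# Rank all viable splice paths by cumulative probability.
for path, prob in sim.get_viable_paths():
    print(round(prob, 3), path)

# Materialize isoforms as transcript clones, with per-isoform metadata.
for isoform, md in sim.get_viable_transcripts(metadata=True):
    print(md['isoform_id'], md['isoform_prevalence'], md['summary'])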
geney/Transcript.py CHANGED
@@ -3,9 +3,10 @@ from typing import Any, Optional, Union
  import numpy as np
  import copy
  from Bio.Seq import Seq  # Assuming Biopython is used
- from . import unload_pickle, config
- from .SeqMats import SeqMat, MutSeqMat
- from .Fasta_segment import Fasta_segment
+ from . import config
+ from .utils import unload_pickle
+ from .utils.SeqMats import SeqMat  # , MutSeqMat
+ from .utils.Fasta_segment import Fasta_segment

  class Transcript:
      """
@@ -40,7 +41,7 @@ class Transcript:
          AssertionError: If required attributes are missing.
          """
          # Convert certain attributes to NumPy arrays for consistent processing
-         array_fields = {'acceptors', 'donors', 'cons_vector'}
+         array_fields = {'acceptors', 'donors', 'cons_vector', 'rev'}
          for k, v in d.items():
              if k in array_fields and v is not None:
                  v = np.array(v)
@@ -54,6 +55,7 @@
          if missing:
              raise AssertionError(f"Transcript is missing required attributes: {missing}")

+
          # Default fallback values for optional attributes
          if not hasattr(self, 'donors') or self.donors is None:
              self.donors = np.array([])
@@ -134,6 +136,16 @@
              return False
          return np.all(np.isin(subvalue.seqmat[1, :], self.pre_mrna.seqmat[1, :]))

+
+     def clone(self) -> Transcript:
+         """
+         Returns a deep copy of this Transcript instance.
+
+         Returns:
+             Transcript: A new Transcript object that is a deep copy of the current instance.
+         """
+         return copy.deepcopy(self)
+
      @property
      def exons(self) -> list[tuple[int, int]]:
          """
@@ -265,45 +277,45 @@ class Transcript:
          Returns:
              Transcript: The current Transcript object (for chaining).
          """
-         pre_mrna = SeqMat.from_seq(self.pull_pre_mrna_pos())
+         pre_mrna = SeqMat(**self.pull_pre_mrna_pos())
          if self.rev:
              pre_mrna.reverse_complement()
          self.pre_mrna = pre_mrna
          return self

-     def mutate(self, mutation: MutSeqMat, inplace: bool = False) -> Union[Transcript, SeqMat]:
-         """
-         Apply a mutation to the pre_mRNA sequence of this Transcript.
-
-         If the transcript is on the reverse strand (self.rev is True),
-         the mutation is first reverse-complemented to ensure strand compatibility.
-
-         Args:
-             mutation (SeqMat): The mutation to apply. Must be a SeqMat or a compatible object that supports .mutate().
-             inplace (bool): If True, apply the mutation directly to this Transcript's pre_mRNA
-                 and return 'self'. If False, return a new SeqMat with the mutated sequence.
-
-         Returns:
-             Transcript: If inplace=True, returns the updated Transcript object.
-             SeqMat: If inplace=False, returns a new SeqMat object representing the mutated sequence.
-         """
-         # If transcript is reversed, reverse-complement the mutation first
-         if self.rev:
-             mutation.reverse_complement()
-
-         # Attempt the mutation operation
-         mutated_seqmat = self.pre_mrna.mutate(mutation).seqmat
-         if inplace:
-             # Update this Transcript's pre_mRNA and return the Transcript itself
-             self.pre_mrna = SeqMat(mutated_seqmat)
-             return self
-
-         else:
-             # Create a copy of the current Transcript and update its pre_mrna
-             # Assuming you have a way to clone the Transcript; if not, manually recreate it.
-             new_transcript = copy.deepcopy(self)
-             new_transcript.pre_mrna = SeqMat(mutated_seqmat)
-             return new_transcript
+     # def mutate(self, mutation: MutSeqMat, inplace: bool = False) -> Union[Transcript, SeqMat]:
+     #     """
+     #     Apply a mutation to the pre_mRNA sequence of this Transcript.
+     #
+     #     If the transcript is on the reverse strand (self.rev is True),
+     #     the mutation is first reverse-complemented to ensure strand compatibility.
+     #
+     #     Args:
+     #         mutation (SeqMat): The mutation to apply. Must be a SeqMat or a compatible object that supports .mutate().
+     #         inplace (bool): If True, apply the mutation directly to this Transcript's pre_mRNA
+     #             and return 'self'. If False, return a new SeqMat with the mutated sequence.
+     #
+     #     Returns:
+     #         Transcript: If inplace=True, returns the updated Transcript object.
+     #         SeqMat: If inplace=False, returns a new SeqMat object representing the mutated sequence.
+     #     """
+     #     # If transcript is reversed, reverse-complement the mutation first
+     #     if self.rev:
+     #         mutation.reverse_complement(inplace=True)
+     #
+     #     # Attempt the mutation operation
+     #     mutated_seqmat = self.pre_mrna.mutate(mutation).seqmat
+     #     if inplace:
+     #         # Update this Transcript's pre_mRNA and return the Transcript itself
+     #         self.pre_mrna = SeqMat(mutated_seqmat)
+     #         return self
+     #
+     #     else:
+     #         # Create a copy of the current Transcript and update its pre_mrna
+     #         # Assuming you have a way to clone the Transcript; if not, manually recreate it.
+     #         new_transcript = copy.deepcopy(self)
+     #         new_transcript.pre_mrna = SeqMat(mutated_seqmat)
+     #         return new_transcript

      def generate_mature_mrna(self, inplace: bool = True) -> Union[Transcript, SeqMat]:
          """
@@ -317,17 +329,11 @@
          """
          self._fix_and_check_introns()

-         mature_mrna = SeqMat.empty()
-         pos_mrna = self.pre_mrna
-
-         for exon_start, exon_end in self.exons:
-             # Add each exon region to the mature_mrna
-             mature_mrna += pos_mrna[exon_start:exon_end]
-
          if inplace:
-             self.mature_mrna = mature_mrna
+             self.mature_mrna = self.pre_mrna.cut_out(self.introns)
              return self
-         return mature_mrna
+
+         return self.pre_mrna.splice_out(self.introns)

      @property
      def orf(self, tis=None):
@@ -343,16 +349,8 @@

          if tis is None:
              tis = self.TIS
-         return self.mature_mrna.orf_seqmat(tis)

-     def clone(self) -> Transcript:
-         """
-         Returns a deep copy of this Transcript instance.
-
-         Returns:
-             Transcript: A new Transcript object that is a deep copy of the current instance.
-         """
-         return copy.deepcopy(self)
+         return self.mature_mrna.open_reading_frame(tis)

      def generate_protein(self, inplace: bool = True, domains: Optional[np.ndarray] = None) -> Union[
          Transcript, tuple[str, np.ndarray]]:
@@ -367,7 +365,7 @@
              Transcript or (protein: str, cons_vector: np.ndarray): The Transcript object if inplace=True, else the protein and cons_vector.
          """
          if not self.protein_coding:
-             print("No protein can be generated without TIS/TTS.")
+             # print("No protein can be generated without TIS/TTS.")
              return self if inplace else ("", np.array([]))

          # Translate the ORF to protein
geney/__init__.py CHANGED
@@ -1,27 +1,55 @@
- from .config_setup import get_config
+ import os
+ import json
+ from pathlib import Path
+
+ from .utils.Fasta_segment import Fasta_segment
+ from . import utils  # this will now load all modules in utils/
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Configuration Loader
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ def get_config():
+     config_file = Path.home() / '.oncosplice_setup_1_2' / 'config.json'
+     if config_file.exists():
+         with open(config_file) as f:
+             config_json = json.load(f)
+         config_setup = {
+             k: {k_in: Path(p_in) for k_in, p_in in v.items()}
+             for k, v in config_json.items()
+         }
+         # Override or extend paths for hg38
+         config_setup.setdefault('hg38', {}).update({
+             'titer_path': Path('/tamir2/nicolaslynn/tools/titer'),
+             'yoram_path': Path('/tamir2/yoramzar/Projects/Cancer_mut/Utils'),
+             'splicing_db': Path('/tamir2/nicolaslynn/data/OncosplicePredictions/hg38/splicing'),
+         })
+     else:
+         print("⚠️ OncoSplice config not found at expected location.")
+         config_setup = {}
+
+     return config_setup
+
+ # Load config once
  config = get_config()
- from .Fasta_segment import Fasta_segment
- from .utils import *
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # Constants and Example IDs
+ # ─────────────────────────────────────────────────────────────────────────────

  mut_id = 'KRAS:12:25227343:G:T'
  epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'

+ # ─────────────────────────────────────────────────────────────────────────────
+ # Public API: available_genes
+ # ─────────────────────────────────────────────────────────────────────────────
+
  def available_genes(organism='hg38'):
-     import os
-     for file in os.listdir(config[organism]['MRNA_PATH'] / 'protein_coding'):
-         gene = file.split('_')[-1].strip('.pkl')
+     """Yield gene names found in the MRNA_PATH/protein_coding directory."""
+     mrna_path = config.get(organism, {}).get('MRNA_PATH')
+     if not mrna_path:
+         raise ValueError(f"MRNA_PATH not found in config for organism '{organism}'")
+     for file in os.listdir(mrna_path / 'protein_coding'):
+         gene = file.split('_')[-1].removesuffix('.pkl')
          yield gene

-
- # import os
- # import json
- # from pathlib import Path
- #
- # config_file = os.path.join(os.path.expanduser('~'), '.oncosplice_setup', 'config.json')
- # if Path(config_file).exists():
- #     config_setup = {k: Path(p) for k, p in json.loads(open(config_file).read()).items()}
- #
- # else:
- #     print("Database not set up.")
- #     config_setup = {}
- #
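For reference, get_config expects a two-level mapping of names to filesystem paths in ~/.oncosplice_setup_1_2/config.json. A hypothetical minimal setup script (MRNA_PATH is the key available_genes actually reads; the path value is invented):

import json
from pathlib import Path

# Hypothetical minimal config matching the loader's access pattern.
cfg = {'hg38': {'MRNA_PATH': '/data/annotations/hg38/mrna'}}
target = Path.home() / '.oncosplice_setup_1_2' / 'config.json'
target.parent.mkdir(exist_ok=True)
with open(target, 'w') as f:
    json.dump(cfg, f, indent=2)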
geney/_config_setup.py ADDED
@@ -0,0 +1,16 @@
+ import os
+ import json
+ from pathlib import Path
+
+ def get_config():
+     config_file = os.path.join(os.path.expanduser('~'), '.oncosplice_setup_1_2', 'config.json')
+     if Path(config_file).exists():
+         config_setup = {k: {k_in: Path(p_in) for k_in, p_in in p.items()} for k, p in json.loads(open(config_file).read()).items()}
+         config_setup['hg38']['titer_path'] = Path('/tamir2/nicolaslynn/tools/titer')
+         config_setup['hg38']['yoram_path'] = Path('/tamir2/yoramzar/Projects/Cancer_mut/Utils')
+         config_setup['hg38']['splicing_db'] = Path('/tamir2/nicolaslynn/data/OncosplicePredictions/hg38/splicing')
+     else:
+         print("Database not set up.")
+         config_setup = {}
+
+     return config_setup