geney 1.4.40__py3-none-any.whl → 1.4.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/pipelines.py CHANGED
@@ -4,36 +4,24 @@ from __future__ import annotations
4
4
  from datetime import datetime
5
5
  import pandas as pd
6
6
 
7
- from seqmat import Gene # external dependency
7
+ from seqmat import Gene
8
8
 
9
9
  from .splice_graph import SpliceSimulator
10
10
  from .transcripts import TranscriptLibrary
11
11
  from .variants import MutationalEvent
12
- from .oncosplice import Oncosplice # your existing oncosplice core
12
+ from .oncosplice import Oncosplice
13
13
 
14
14
 
15
- def max_splicing_delta(mut_id, transcript_id=None, splicing_engine='spliceai', organism='hg38'):
16
- print("we are here")
17
- m = MutationalEvent(mut_id)
18
- assert m.compatible(), 'Mutations in event are incompatible'
19
- reference_transcript = Gene.from_file(
20
- m.gene, organism=organism).transcript(transcript_id).generate_pre_mrna().generate_mature_mrna().generate_protein()
21
- tl = TranscriptLibrary(reference_transcript, m)
22
- splicing_results = tl.predict_splicing(m.position, engine=splicing_engine, inplace=True).get_event_columns('event')
23
- ss = SpliceSimulator(splicing_results, tl.event, feature='event', max_distance=100_000_000)
24
- return ss.max_splicing_delta('event_prob')
25
-
26
-
27
-
28
- def oncosplice_pipeline_single_transcript(
15
+ def oncosplice_pipeline(
29
16
  mut_id: str,
30
17
  transcript_id: str | None = None,
31
18
  splicing_engine: str = "spliceai",
32
19
  organism: str = "hg38",
33
20
  ) -> pd.DataFrame:
34
21
  """
35
- High-level pipeline:
36
- mutation event -> transcript -> splicing -> splice graph -> isoforms -> oncosplice scores
22
+ Run the full oncosplice pipeline for a mutation.
23
+
24
+ Returns DataFrame with all viable isoforms and their oncosplice scores.
37
25
  """
38
26
  m = MutationalEvent(mut_id)
39
27
  assert m.compatible(), "Mutations in event are incompatible"
@@ -47,7 +35,6 @@ def oncosplice_pipeline_single_transcript(
47
35
  )
48
36
 
49
37
  tl = TranscriptLibrary(reference_transcript, m)
50
-
51
38
  central_pos = m.central_position
52
39
 
53
40
  tl.predict_splicing(central_pos, engine=splicing_engine, inplace=True)
@@ -57,18 +44,16 @@ def oncosplice_pipeline_single_transcript(
57
44
  splicing_results, tl.event, feature="event", max_distance=100_000_000
58
45
  )
59
46
 
60
- base_report = pd.Series(
61
- {
62
- "mut_id": mut_id,
63
- "gene": m.gene,
64
- "transcript_id": reference_transcript.transcript_id,
65
- "primary_transcript": reference_transcript.primary_transcript,
66
- "splicing_engine": splicing_engine,
67
- "central_position": central_pos,
68
- "mutation_count": len(m.positions),
69
- "time_of_execution": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
70
- }
71
- )
47
+ base_report = pd.Series({
48
+ "mut_id": mut_id,
49
+ "gene": m.gene,
50
+ "transcript_id": reference_transcript.transcript_id,
51
+ "primary_transcript": reference_transcript.primary_transcript,
52
+ "splicing_engine": splicing_engine,
53
+ "central_position": central_pos,
54
+ "mutation_count": len(m.positions),
55
+ "time_of_execution": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
56
+ })
72
57
 
73
58
  ss_metadata = ss.report(central_pos)
74
59
  rows = []
@@ -79,19 +64,76 @@ def oncosplice_pipeline_single_transcript(
79
64
  reference_transcript.cons_vector,
80
65
  )
81
66
  rows.append(
82
- pd.concat(
83
- [
84
- base_report,
85
- ss_metadata,
86
- isoform_metadata,
87
- pd.Series(
88
- {
89
- "reference_mrna": reference_transcript.mature_mrna.seq,
90
- "variant_mrna": variant_transcript.mature_mrna.seq,
91
- }
92
- ),
93
- onco.get_analysis_series(),
94
- ]
95
- )
67
+ pd.concat([
68
+ base_report,
69
+ ss_metadata,
70
+ isoform_metadata,
71
+ pd.Series({
72
+ "reference_mrna": reference_transcript.mature_mrna.seq,
73
+ "variant_mrna": variant_transcript.mature_mrna.seq,
74
+ }),
75
+ onco.get_analysis_series(),
76
+ ])
96
77
  )
97
- return pd.DataFrame(rows)
78
+
79
+ return pd.DataFrame(rows)
80
+
81
+
82
+ def oncosplice_top_isoform(
83
+ mut_id: str,
84
+ transcript_id: str | None = None,
85
+ splicing_engine: str = "spliceai",
86
+ organism: str = "hg38",
87
+ ) -> pd.Series | None:
88
+ """
89
+ Get the most likely non-reference isoform for a mutation.
90
+
91
+ Returns Series with full oncosplice analysis, or None if no missplicing detected.
92
+ """
93
+ df = oncosplice_pipeline(mut_id, transcript_id, splicing_engine, organism)
94
+
95
+ if df.empty:
96
+ return None
97
+
98
+ variants = df[df["summary"] != "-"]
99
+
100
+ if variants.empty:
101
+ return None
102
+
103
+ return variants.iloc[0]
104
+
105
+
106
+ def max_splicing_delta(
107
+ mut_id: str,
108
+ transcript_id: str | None = None,
109
+ splicing_engine: str = "spliceai",
110
+ organism: str = "hg38",
111
+ ) -> float:
112
+ """
113
+ Get the maximum splice site probability change for a mutation.
114
+ """
115
+ m = MutationalEvent(mut_id)
116
+ assert m.compatible(), "Mutations in event are incompatible"
117
+
118
+ reference_transcript = (
119
+ Gene.from_file(m.gene, organism=organism)
120
+ .transcript(transcript_id)
121
+ .generate_pre_mrna()
122
+ .generate_mature_mrna()
123
+ .generate_protein()
124
+ )
125
+
126
+ tl = TranscriptLibrary(reference_transcript, m)
127
+ splicing_results = tl.predict_splicing(
128
+ m.central_position, engine=splicing_engine, inplace=True
129
+ ).get_event_columns("event")
130
+
131
+ ss = SpliceSimulator(
132
+ splicing_results, tl.event, feature="event", max_distance=100_000_000
133
+ )
134
+
135
+ return ss.max_splicing_delta("event_prob")
136
+
137
+
138
+ # Keep old name for backwards compatibility
139
+ oncosplice_pipeline_single_transcript = oncosplice_pipeline
geney/splice_graph.py CHANGED
@@ -2,13 +2,18 @@
2
2
  from __future__ import annotations
3
3
 
4
4
  from collections import defaultdict
5
- from typing import Any, Dict, Generator, List, Tuple
5
+ import hashlib
6
+ from typing import Dict, Generator, List, Tuple
6
7
 
7
8
  import numpy as np
8
9
  import pandas as pd
9
10
  from pandas import Series
10
11
 
11
- from .utils import short_hash_of_list # type: ignore
12
+
13
+ def _short_hash(items: Tuple) -> str:
14
+ """Generate a short hash string from a tuple."""
15
+ encoded = repr(items).encode('utf-8')
16
+ return hashlib.sha256(encoded).hexdigest()[:8]
12
17
 
13
18
 
14
19
  class SpliceSimulator:
@@ -90,6 +95,65 @@ class SpliceSimulator:
90
95
  metadata["missplicing"] = self.max_splicing_delta("event_prob")
91
96
  return metadata
92
97
 
98
+ def summarize_events(self, threshold: float = 0.2) -> str:
99
+ """
100
+ Generate human-readable summary of splice site changes.
101
+
102
+ Returns text describing discovered and deleted donors/acceptors.
103
+ Format: "D(position) ref_prob -> event_prob" or "A(position) ref_prob -> event_prob"
104
+ """
105
+ feature_col = f"{self.feature}_prob"
106
+ lines = []
107
+
108
+ # Process donors
109
+ donor_df = self.donor_df
110
+ discovered_donors = donor_df[donor_df["discovered_delta"].abs() >= threshold]
111
+ deleted_donors = donor_df[donor_df["deleted_delta"].abs() >= threshold]
112
+
113
+ if len(discovered_donors) > 0 or len(deleted_donors) > 0:
114
+ lines.append("=== DONORS ===")
115
+
116
+ if len(discovered_donors) > 0:
117
+ lines.append("Discovered:")
118
+ for pos, row in discovered_donors.iterrows():
119
+ ref = row.get("ref_prob", 0)
120
+ evt = row.get(feature_col, row.get("event_prob", 0))
121
+ lines.append(f" D({pos}) {ref:.2f} -> {evt:.2f} [+{evt-ref:.2f}]")
122
+
123
+ if len(deleted_donors) > 0:
124
+ lines.append("Deleted:")
125
+ for pos, row in deleted_donors.iterrows():
126
+ ref = row.get("ref_prob", 0)
127
+ evt = row.get(feature_col, row.get("event_prob", 0))
128
+ lines.append(f" D({pos}) {ref:.2f} -> {evt:.2f} [{evt-ref:.2f}]")
129
+
130
+ # Process acceptors
131
+ acceptor_df = self.acceptor_df
132
+ discovered_acceptors = acceptor_df[acceptor_df["discovered_delta"].abs() >= threshold]
133
+ deleted_acceptors = acceptor_df[acceptor_df["deleted_delta"].abs() >= threshold]
134
+
135
+ if len(discovered_acceptors) > 0 or len(deleted_acceptors) > 0:
136
+ lines.append("=== ACCEPTORS ===")
137
+
138
+ if len(discovered_acceptors) > 0:
139
+ lines.append("Discovered:")
140
+ for pos, row in discovered_acceptors.iterrows():
141
+ ref = row.get("ref_prob", 0)
142
+ evt = row.get(feature_col, row.get("event_prob", 0))
143
+ lines.append(f" A({pos}) {ref:.2f} -> {evt:.2f} [+{evt-ref:.2f}]")
144
+
145
+ if len(deleted_acceptors) > 0:
146
+ lines.append("Deleted:")
147
+ for pos, row in deleted_acceptors.iterrows():
148
+ ref = row.get("ref_prob", 0)
149
+ evt = row.get(feature_col, row.get("event_prob", 0))
150
+ lines.append(f" A({pos}) {ref:.2f} -> {evt:.2f} [{evt-ref:.2f}]")
151
+
152
+ if not lines:
153
+ return "No significant splice site changes detected."
154
+
155
+ return "\n".join(lines)
156
+
93
157
  def max_splicing_delta(self, event: str) -> float:
94
158
  all_diffs = []
95
159
  for site_type in ["donors", "acceptors"]:
@@ -236,6 +300,110 @@ class SpliceSimulator:
236
300
  paths.sort(key=lambda x: x[1], reverse=True)
237
301
  return paths
238
302
 
303
+ def isoforms_df(self) -> pd.DataFrame:
304
+ """
305
+ Return a DataFrame of all viable isoforms with probabilities and missplicing descriptions.
306
+
307
+ Columns:
308
+ - isoform_id: unique hash of the splice path
309
+ - probability: probability/prevalence of this isoform
310
+ - splicing_changes: short missplicing event codes (ES, IR, PES, PIR, NE, or "-" for canonical)
311
+ - exon_skipping: full exon skipping details
312
+ - partial_exon_skipping: partial exon skipping (truncation) details
313
+ - intron_retention: full intron retention details
314
+ - partial_intron_retention: partial intron retention details
315
+ - novel_exon: novel/cryptic exon details
316
+ """
317
+ rows = []
318
+ for t, md in self.get_viable_transcripts(metadata=True):
319
+ rows.append({
320
+ "isoform_id": md.get("isoform_id", ""),
321
+ "probability": md.get("isoform_prevalence", 0.0),
322
+ "splicing_changes": md.get("summary", "-"),
323
+ "exon_skipping": md.get("es", ""),
324
+ "partial_exon_skipping": md.get("pes", ""),
325
+ "intron_retention": md.get("ir", ""),
326
+ "partial_intron_retention": md.get("pir", ""),
327
+ "novel_exon": md.get("ne", ""),
328
+ })
329
+
330
+ if not rows:
331
+ return pd.DataFrame()
332
+
333
+ return pd.DataFrame(rows)
334
+
335
+ def _is_implausible_ir_path(self, var_transcript) -> bool:
336
+ """
337
+ Check if this transcript has intron retention that is implausible
338
+ because nearby cryptic splice sites compensate for the lost original sites.
339
+
340
+ Returns True if the path should be filtered out.
341
+
342
+ Key insight: If the variant uses ANY splice site near a reference intron
343
+ boundary, the intron is being spliced (possibly at a shifted position).
344
+ True IR only occurs when NO splice sites are used near BOTH boundaries.
345
+ """
346
+ ref_introns = getattr(self.transcript, "introns", [])
347
+
348
+ if not ref_introns:
349
+ return False
350
+
351
+ TOLERANCE = 500 # bp - consider splice sites within this distance as "covering" the boundary
352
+ MIN_TOTAL_PROB = 0.5 # if total prob >= this, cryptic sites could compensate
353
+
354
+ var_donors = set(var_transcript.donors)
355
+ var_acceptors = set(var_transcript.acceptors)
356
+
357
+ donor_df = self.donor_df
358
+ acceptor_df = self.acceptor_df
359
+
360
+ for t1, t2 in ref_introns:
361
+ # Determine which end is donor and which is acceptor based on strand
362
+ if not self.rev:
363
+ donor_pos, acceptor_pos = t1, t2 # + strand
364
+ else:
365
+ donor_pos, acceptor_pos = t2, t1 # - strand
366
+
367
+ # Check if variant uses ANY donor near the reference donor position
368
+ donor_used = any(
369
+ abs(d - donor_pos) <= TOLERANCE
370
+ for d in var_donors
371
+ )
372
+
373
+ # Check if variant uses ANY acceptor near the reference acceptor position
374
+ acceptor_used = any(
375
+ abs(a - acceptor_pos) <= TOLERANCE
376
+ for a in var_acceptors
377
+ )
378
+
379
+ # If both boundaries are used (possibly at shifted positions),
380
+ # the intron is being spliced out - NOT retained
381
+ if donor_used and acceptor_used:
382
+ continue
383
+
384
+ # At least one boundary is NOT used - this path has potential IR
385
+ # Check if cryptic sites with high probability exist but aren't being used
386
+ # (which would make this IR path implausible)
387
+
388
+ nearby_donors = donor_df.loc[
389
+ (donor_df.index >= donor_pos - TOLERANCE) &
390
+ (donor_df.index <= donor_pos + TOLERANCE)
391
+ ]
392
+ total_donor_prob = nearby_donors["P"].sum() if len(nearby_donors) > 0 else 0
393
+
394
+ nearby_acceptors = acceptor_df.loc[
395
+ (acceptor_df.index >= acceptor_pos - TOLERANCE) &
396
+ (acceptor_df.index <= acceptor_pos + TOLERANCE)
397
+ ]
398
+ total_acceptor_prob = nearby_acceptors["P"].sum() if len(nearby_acceptors) > 0 else 0
399
+
400
+ # If both boundaries have high probability cryptic sites available,
401
+ # but this path doesn't use them, the IR is implausible
402
+ if total_donor_prob >= MIN_TOTAL_PROB and total_acceptor_prob >= MIN_TOTAL_PROB:
403
+ return True
404
+
405
+ return False
406
+
239
407
  def get_viable_transcripts(self, metadata: bool = False):
240
408
  graph = self.generate_graph()
241
409
  start_node = (self.transcript_start, "transcript_start")
@@ -251,8 +419,13 @@ class SpliceSimulator:
251
419
  t.donors = [d for d in donors if d != t.transcript_end]
252
420
  t.acceptors = [a for a in acceptors if a != t.transcript_start]
253
421
  t.path_weight = prob
254
- t.path_hash = short_hash_of_list(tuple(donors + acceptors))
422
+ t.path_hash = _short_hash(tuple(donors + acceptors))
255
423
  t.generate_mature_mrna().generate_protein()
424
+
425
+ # Filter out implausible IR paths (where cryptic sites compensate)
426
+ if self._is_implausible_ir_path(t):
427
+ continue
428
+
256
429
  if metadata:
257
430
  md = pd.concat(
258
431
  [
@@ -306,7 +479,9 @@ class SpliceSimulator:
306
479
  num_ref_introns = len(ref_introns)
307
480
 
308
481
  pes, pir, es, ne, ir = [], [], [], [], []
482
+ pir_intron_indices = set() # Track which introns have PIR
309
483
 
484
+ # Partial exon skipping (exon truncation)
310
485
  for exon_count, (t1, t2) in enumerate(ref_exons):
311
486
  for (s1, s2) in var_exons:
312
487
  if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or (
@@ -316,30 +491,61 @@ class SpliceSimulator:
316
491
  f"Exon {exon_count+1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}"
317
492
  )
318
493
 
494
+ # Partial intron retention (one boundary preserved, other shifted)
319
495
  for intron_count, (t1, t2) in enumerate(ref_introns):
320
496
  for (s1, s2) in var_introns:
497
+ # Check if one boundary matches and the intron is shorter (partial retention)
321
498
  if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or (
322
499
  ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))
323
500
  ):
324
501
  pir.append(
325
502
  f"Intron {intron_count+1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}"
326
503
  )
504
+ pir_intron_indices.add(intron_count)
327
505
 
506
+ # Exon skipping (both boundaries missing)
328
507
  for exon_count, (t1, t2) in enumerate(ref_exons):
329
508
  if t1 not in var.acceptors and t2 not in var.donors:
330
509
  es.append(
331
510
  f"Exon {exon_count+1}/{num_ref_exons} skipped: {(t1, t2)}"
332
511
  )
333
512
 
513
+ # Novel exon (boundaries not in reference)
334
514
  for (s1, s2) in var_exons:
335
515
  if s1 not in ref.acceptors and s2 not in ref.donors:
336
516
  ne.append(f"Novel Exon: {(s1, s2)}")
337
517
 
518
+ # Full intron retention - only if NOT already partial retention
519
+ # AND no splice sites are being used near the intron boundaries
520
+ TOLERANCE = 500 # bp - consider splice sites within this distance as "covering" the boundary
521
+
338
522
  for intron_count, (t1, t2) in enumerate(ref_introns):
339
- if t1 not in var.donors and t2 not in var.acceptors:
340
- ir.append(
341
- f"Intron {intron_count+1}/{num_ref_introns} retained: {(t1, t2)}"
342
- )
523
+ if intron_count in pir_intron_indices:
524
+ continue # Already classified as PIR
525
+
526
+ # Check if the intron is preserved exactly in variant
527
+ intron_preserved = any(s1 == t1 and s2 == t2 for s1, s2 in var_introns)
528
+ if intron_preserved:
529
+ continue # Intron is properly spliced
530
+
531
+ # Determine donor/acceptor positions based on strand
532
+ if not ref.rev:
533
+ donor_pos, acceptor_pos = t1, t2 # + strand
534
+ else:
535
+ donor_pos, acceptor_pos = t2, t1 # - strand
536
+
537
+ # Check if variant uses ANY splice site near each boundary
538
+ # If so, the intron is being spliced (at shifted positions), not retained
539
+ donor_used = any(abs(d - donor_pos) <= TOLERANCE for d in var.donors)
540
+ acceptor_used = any(abs(a - acceptor_pos) <= TOLERANCE for a in var.acceptors)
541
+
542
+ if donor_used and acceptor_used:
543
+ continue # Intron is being spliced at shifted positions, not retained
544
+
545
+ # If we get here, the intron is truly retained
546
+ ir.append(
547
+ f"Intron {intron_count+1}/{num_ref_introns} retained: {(t1, t2)}"
548
+ )
343
549
 
344
550
  return ",".join(pes), ",".join(pir), ",".join(es), ",".join(ne), ",".join(ir)
345
551
 
geney/transcripts.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
 
4
4
  from typing import Dict, Iterable, List, Tuple, Optional
5
5
 
6
- from .splicing_table import adjoin_splicing_outcomes, predict_splicing
6
+ from .engines import adjoin_splicing_outcomes, predict_splicing
7
7
 
8
8
 
9
9
  class TranscriptLibrary:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: geney
3
- Version: 1.4.40
3
+ Version: 1.4.41
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -22,6 +22,7 @@ Requires-Dist: tensorflow>=2.8.0
22
22
  Requires-Dist: keras>=2.8.0
23
23
  Requires-Dist: torch
24
24
  Requires-Dist: seqmat
25
+ Requires-Dist: h5py
25
26
  Dynamic: author
26
27
  Dynamic: author-email
27
28
  Dynamic: classifier
@@ -0,0 +1,11 @@
1
+ geney/__init__.py,sha256=nkhniqCNWJzrb7xHgTDFEXSvRVdggb9ZCJ7ih7HEYq8,966
2
+ geney/engines.py,sha256=9_oNsoluJsjdLC3cyWttjHF3cuQoy65FWgS4r7ehzek,14296
3
+ geney/oncosplice.py,sha256=eGQQl9ftmoFENMYBWoJtenKWmzyxR9N1of5cZst_bHQ,18014
4
+ geney/pipelines.py,sha256=gsy-gmHIi260SC5MKQ9IBSE0wko8Tvd7IC3wj083mPQ,3996
5
+ geney/splice_graph.py,sha256=PANtLUAQiz578NZwxVlTSgboetnToHnQSkYpT0zbi_w,23931
6
+ geney/transcripts.py,sha256=BBgyeqF4jeIiHaD_bXxgOTXz19kdUgjcPVo4ClpcSUg,2594
7
+ geney/variants.py,sha256=vjbiBH-duZ4TJZyXwXbQ_VmJxCFafjeDwLNTZg3ubSc,11832
8
+ geney-1.4.41.dist-info/METADATA,sha256=zuzWKIEeHSaFr08eRUjq3ZSiloOepcCD_QRG5ifS8j0,972
9
+ geney-1.4.41.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
10
+ geney-1.4.41.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
11
+ geney-1.4.41.dist-info/RECORD,,
geney/samples.py DELETED
@@ -1,3 +0,0 @@
1
- mut_id = 'KRAS:12:25227343:G:T'
2
- epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'
3
-
geney/splicing_table.py DELETED
@@ -1,142 +0,0 @@
1
- # oncosplice/splicing_table.py
2
- from __future__ import annotations
3
-
4
- from typing import Dict, Optional, Union
5
- import numpy as np
6
- import pandas as pd
7
-
8
- from .engines import run_splicing_engine
9
-
10
-
11
-
12
- def predict_splicing(s, position: int, engine: str = 'spliceai', context: int = 7500,
13
- ) -> Union['SeqMat', pd.DataFrame]:
14
- """
15
- Predict splicing probabilities at a given position using the specified engine.
16
-
17
- Args:
18
- position (int): The genomic position to predict splicing probabilities for.
19
- engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
20
- context (int): The length of the target central region (default: 7500).
21
- format (str): Output format for the splicing engine results.
22
-
23
- Returns:
24
- pd.DataFrame: A DataFrame containing:
25
- - position: The genomic position
26
- - donor_prob: Probability of being a donor splice site
27
- - acceptor_prob: Probability of being an acceptor splice site
28
- - nucleotides: The nucleotide sequence at that position
29
-
30
- Raises:
31
- ValueError: If an unsupported engine is provided.
32
- IndexError: If the position is not found in the sequence.
33
- """
34
- # Validate position is within sequence bounds
35
- if position < s.index.min() or position > s.index.max():
36
- raise ValueError(f"Position {position} is outside sequence bounds [{s.index.min()}, {s.index.max()}]")
37
-
38
- # Retrieve extended context (includes flanks) around the position.
39
- target = s.clone(position - context, position + context)
40
-
41
- # Check if target clone resulted in empty sequence
42
- if len(target.seq) == 0:
43
- raise ValueError(f"No sequence data found around position {position} with context {context}")
44
-
45
- seq, indices = target.seq, target.index
46
-
47
- # Validate indices array is not empty
48
- if len(indices) == 0:
49
- raise ValueError(f"No indices found in sequence around position {position}")
50
-
51
- # Find relative position within the context window
52
- rel_pos = np.abs(indices - position).argmin()
53
- left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
54
- # print(left_missing, right_missing)
55
- if left_missing > 0 or right_missing > 0:
56
- step = -1 if s.rev else 1
57
-
58
- if left_missing > 0:
59
- left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
60
- else:
61
- left_pad = np.array([], dtype=indices.dtype)
62
-
63
- if right_missing > 0:
64
- right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
65
- else:
66
- right_pad = np.array([], dtype=indices.dtype)
67
-
68
- seq = 'N' * left_missing + seq + 'N' * right_missing
69
- indices = np.concatenate([left_pad, indices, right_pad])
70
-
71
- # Run the splicing prediction engine (function assumed to be defined externally)
72
- donor_probs, acceptor_probs = run_splicing_engine(seq=seq, engine=engine)
73
- # Trim off the fixed flanks before returning results.
74
- seq = seq[5000:-5000]
75
- indices = indices[5000:-5000]
76
- df = pd.DataFrame({
77
- 'position': indices,
78
- 'donor_prob': donor_probs,
79
- 'acceptor_prob': acceptor_probs,
80
- 'nucleotides': list(seq)
81
- }).set_index('position').round(3)
82
-
83
- df.attrs['name'] = s.name
84
- return df
85
-
86
-
87
-
88
- def adjoin_splicing_outcomes(
89
- splicing_predictions: Dict[str, pd.DataFrame],
90
- transcript: Optional[object] = None,
91
- ) -> pd.DataFrame:
92
- """
93
- Combine splicing predictions for multiple mutations into a multi-index DataFrame.
94
-
95
- splicing_predictions: {label -> DF with 'donor_prob','acceptor_prob','nucleotides'}
96
- transcript: optional transcript (must have .acceptors, .donors, .rev)
97
- """
98
- if not splicing_predictions:
99
- raise ValueError("splicing_predictions cannot be empty")
100
-
101
- dfs = []
102
- for label, df in splicing_predictions.items():
103
- if not isinstance(df, pd.DataFrame):
104
- raise TypeError(f"Expected DataFrame for '{label}', got {type(df).__name__}")
105
-
106
- required_cols = ["donor_prob", "acceptor_prob", "nucleotides"]
107
- missing = [c for c in required_cols if c not in df.columns]
108
- if missing:
109
- raise ValueError(
110
- f"DataFrame for '{label}' missing required columns: {missing}"
111
- )
112
-
113
- var_df = df.rename(
114
- columns={
115
- "donor_prob": ("donors", f"{label}_prob"),
116
- "acceptor_prob": ("acceptors", f"{label}_prob"),
117
- "nucleotides": ("nts", f"{label}"),
118
- }
119
- )
120
- dfs.append(var_df)
121
-
122
- try:
123
- full_df = pd.concat(dfs, axis=1)
124
- except Exception as e:
125
- raise ValueError(f"Failed to concatenate DataFrames: {e}") from e
126
-
127
- if not isinstance(full_df.columns, pd.MultiIndex):
128
- full_df.columns = pd.MultiIndex.from_tuples(full_df.columns)
129
-
130
- if transcript is not None:
131
- full_df[("acceptors", "annotated")] = full_df.apply(
132
- lambda row: row.name in transcript.acceptors, axis=1
133
- )
134
- full_df[("donors", "annotated")] = full_df.apply(
135
- lambda row: row.name in transcript.donors, axis=1
136
- )
137
- full_df.sort_index(axis=1, level=0, inplace=True)
138
- full_df.sort_index(ascending=not transcript.rev, inplace=True)
139
- else:
140
- full_df.sort_index(axis=1, level=0, inplace=True)
141
-
142
- return full_df