geney 1.4.40__py3-none-any.whl → 1.4.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/__init__.py +18 -5
- geney/engines.py +313 -204
- geney/pipelines.py +88 -46
- geney/splice_graph.py +213 -7
- geney/transcripts.py +1 -1
- {geney-1.4.40.dist-info → geney-1.4.41.dist-info}/METADATA +2 -1
- geney-1.4.41.dist-info/RECORD +11 -0
- geney/samples.py +0 -3
- geney/splicing_table.py +0 -142
- geney/utils.py +0 -254
- geney-1.4.40.dist-info/RECORD +0 -14
- {geney-1.4.40.dist-info → geney-1.4.41.dist-info}/WHEEL +0 -0
- {geney-1.4.40.dist-info → geney-1.4.41.dist-info}/top_level.txt +0 -0
geney/pipelines.py
CHANGED
|
@@ -4,36 +4,24 @@ from __future__ import annotations
|
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
7
|
-
from seqmat import Gene
|
|
7
|
+
from seqmat import Gene
|
|
8
8
|
|
|
9
9
|
from .splice_graph import SpliceSimulator
|
|
10
10
|
from .transcripts import TranscriptLibrary
|
|
11
11
|
from .variants import MutationalEvent
|
|
12
|
-
from .oncosplice import Oncosplice
|
|
12
|
+
from .oncosplice import Oncosplice
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
def
|
|
16
|
-
print("we are here")
|
|
17
|
-
m = MutationalEvent(mut_id)
|
|
18
|
-
assert m.compatible(), 'Mutations in event are incompatible'
|
|
19
|
-
reference_transcript = Gene.from_file(
|
|
20
|
-
m.gene, organism=organism).transcript(transcript_id).generate_pre_mrna().generate_mature_mrna().generate_protein()
|
|
21
|
-
tl = TranscriptLibrary(reference_transcript, m)
|
|
22
|
-
splicing_results = tl.predict_splicing(m.position, engine=splicing_engine, inplace=True).get_event_columns('event')
|
|
23
|
-
ss = SpliceSimulator(splicing_results, tl.event, feature='event', max_distance=100_000_000)
|
|
24
|
-
return ss.max_splicing_delta('event_prob')
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def oncosplice_pipeline_single_transcript(
|
|
15
|
+
def oncosplice_pipeline(
|
|
29
16
|
mut_id: str,
|
|
30
17
|
transcript_id: str | None = None,
|
|
31
18
|
splicing_engine: str = "spliceai",
|
|
32
19
|
organism: str = "hg38",
|
|
33
20
|
) -> pd.DataFrame:
|
|
34
21
|
"""
|
|
35
|
-
|
|
36
|
-
|
|
22
|
+
Run the full oncosplice pipeline for a mutation.
|
|
23
|
+
|
|
24
|
+
Returns DataFrame with all viable isoforms and their oncosplice scores.
|
|
37
25
|
"""
|
|
38
26
|
m = MutationalEvent(mut_id)
|
|
39
27
|
assert m.compatible(), "Mutations in event are incompatible"
|
|
@@ -47,7 +35,6 @@ def oncosplice_pipeline_single_transcript(
|
|
|
47
35
|
)
|
|
48
36
|
|
|
49
37
|
tl = TranscriptLibrary(reference_transcript, m)
|
|
50
|
-
|
|
51
38
|
central_pos = m.central_position
|
|
52
39
|
|
|
53
40
|
tl.predict_splicing(central_pos, engine=splicing_engine, inplace=True)
|
|
@@ -57,18 +44,16 @@ def oncosplice_pipeline_single_transcript(
|
|
|
57
44
|
splicing_results, tl.event, feature="event", max_distance=100_000_000
|
|
58
45
|
)
|
|
59
46
|
|
|
60
|
-
base_report = pd.Series(
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
}
|
|
71
|
-
)
|
|
47
|
+
base_report = pd.Series({
|
|
48
|
+
"mut_id": mut_id,
|
|
49
|
+
"gene": m.gene,
|
|
50
|
+
"transcript_id": reference_transcript.transcript_id,
|
|
51
|
+
"primary_transcript": reference_transcript.primary_transcript,
|
|
52
|
+
"splicing_engine": splicing_engine,
|
|
53
|
+
"central_position": central_pos,
|
|
54
|
+
"mutation_count": len(m.positions),
|
|
55
|
+
"time_of_execution": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
|
56
|
+
})
|
|
72
57
|
|
|
73
58
|
ss_metadata = ss.report(central_pos)
|
|
74
59
|
rows = []
|
|
@@ -79,19 +64,76 @@ def oncosplice_pipeline_single_transcript(
|
|
|
79
64
|
reference_transcript.cons_vector,
|
|
80
65
|
)
|
|
81
66
|
rows.append(
|
|
82
|
-
pd.concat(
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
),
|
|
93
|
-
onco.get_analysis_series(),
|
|
94
|
-
]
|
|
95
|
-
)
|
|
67
|
+
pd.concat([
|
|
68
|
+
base_report,
|
|
69
|
+
ss_metadata,
|
|
70
|
+
isoform_metadata,
|
|
71
|
+
pd.Series({
|
|
72
|
+
"reference_mrna": reference_transcript.mature_mrna.seq,
|
|
73
|
+
"variant_mrna": variant_transcript.mature_mrna.seq,
|
|
74
|
+
}),
|
|
75
|
+
onco.get_analysis_series(),
|
|
76
|
+
])
|
|
96
77
|
)
|
|
97
|
-
|
|
78
|
+
|
|
79
|
+
return pd.DataFrame(rows)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def oncosplice_top_isoform(
|
|
83
|
+
mut_id: str,
|
|
84
|
+
transcript_id: str | None = None,
|
|
85
|
+
splicing_engine: str = "spliceai",
|
|
86
|
+
organism: str = "hg38",
|
|
87
|
+
) -> pd.Series | None:
|
|
88
|
+
"""
|
|
89
|
+
Get the most likely non-reference isoform for a mutation.
|
|
90
|
+
|
|
91
|
+
Returns Series with full oncosplice analysis, or None if no missplicing detected.
|
|
92
|
+
"""
|
|
93
|
+
df = oncosplice_pipeline(mut_id, transcript_id, splicing_engine, organism)
|
|
94
|
+
|
|
95
|
+
if df.empty:
|
|
96
|
+
return None
|
|
97
|
+
|
|
98
|
+
variants = df[df["summary"] != "-"]
|
|
99
|
+
|
|
100
|
+
if variants.empty:
|
|
101
|
+
return None
|
|
102
|
+
|
|
103
|
+
return variants.iloc[0]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def max_splicing_delta(
|
|
107
|
+
mut_id: str,
|
|
108
|
+
transcript_id: str | None = None,
|
|
109
|
+
splicing_engine: str = "spliceai",
|
|
110
|
+
organism: str = "hg38",
|
|
111
|
+
) -> float:
|
|
112
|
+
"""
|
|
113
|
+
Get the maximum splice site probability change for a mutation.
|
|
114
|
+
"""
|
|
115
|
+
m = MutationalEvent(mut_id)
|
|
116
|
+
assert m.compatible(), "Mutations in event are incompatible"
|
|
117
|
+
|
|
118
|
+
reference_transcript = (
|
|
119
|
+
Gene.from_file(m.gene, organism=organism)
|
|
120
|
+
.transcript(transcript_id)
|
|
121
|
+
.generate_pre_mrna()
|
|
122
|
+
.generate_mature_mrna()
|
|
123
|
+
.generate_protein()
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
tl = TranscriptLibrary(reference_transcript, m)
|
|
127
|
+
splicing_results = tl.predict_splicing(
|
|
128
|
+
m.central_position, engine=splicing_engine, inplace=True
|
|
129
|
+
).get_event_columns("event")
|
|
130
|
+
|
|
131
|
+
ss = SpliceSimulator(
|
|
132
|
+
splicing_results, tl.event, feature="event", max_distance=100_000_000
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
return ss.max_splicing_delta("event_prob")
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# Keep old name for backwards compatibility
|
|
139
|
+
oncosplice_pipeline_single_transcript = oncosplice_pipeline
|
geney/splice_graph.py
CHANGED
|
@@ -2,13 +2,18 @@
|
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
4
|
from collections import defaultdict
|
|
5
|
-
|
|
5
|
+
import hashlib
|
|
6
|
+
from typing import Dict, Generator, List, Tuple
|
|
6
7
|
|
|
7
8
|
import numpy as np
|
|
8
9
|
import pandas as pd
|
|
9
10
|
from pandas import Series
|
|
10
11
|
|
|
11
|
-
|
|
12
|
+
|
|
13
|
+
def _short_hash(items: Tuple) -> str:
|
|
14
|
+
"""Generate a short hash string from a tuple."""
|
|
15
|
+
encoded = repr(items).encode('utf-8')
|
|
16
|
+
return hashlib.sha256(encoded).hexdigest()[:8]
|
|
12
17
|
|
|
13
18
|
|
|
14
19
|
class SpliceSimulator:
|
|
@@ -90,6 +95,65 @@ class SpliceSimulator:
|
|
|
90
95
|
metadata["missplicing"] = self.max_splicing_delta("event_prob")
|
|
91
96
|
return metadata
|
|
92
97
|
|
|
98
|
+
def summarize_events(self, threshold: float = 0.2) -> str:
|
|
99
|
+
"""
|
|
100
|
+
Generate human-readable summary of splice site changes.
|
|
101
|
+
|
|
102
|
+
Returns text describing discovered and deleted donors/acceptors.
|
|
103
|
+
Format: "D(position) ref_prob -> event_prob" or "A(position) ref_prob -> event_prob"
|
|
104
|
+
"""
|
|
105
|
+
feature_col = f"{self.feature}_prob"
|
|
106
|
+
lines = []
|
|
107
|
+
|
|
108
|
+
# Process donors
|
|
109
|
+
donor_df = self.donor_df
|
|
110
|
+
discovered_donors = donor_df[donor_df["discovered_delta"].abs() >= threshold]
|
|
111
|
+
deleted_donors = donor_df[donor_df["deleted_delta"].abs() >= threshold]
|
|
112
|
+
|
|
113
|
+
if len(discovered_donors) > 0 or len(deleted_donors) > 0:
|
|
114
|
+
lines.append("=== DONORS ===")
|
|
115
|
+
|
|
116
|
+
if len(discovered_donors) > 0:
|
|
117
|
+
lines.append("Discovered:")
|
|
118
|
+
for pos, row in discovered_donors.iterrows():
|
|
119
|
+
ref = row.get("ref_prob", 0)
|
|
120
|
+
evt = row.get(feature_col, row.get("event_prob", 0))
|
|
121
|
+
lines.append(f" D({pos}) {ref:.2f} -> {evt:.2f} [+{evt-ref:.2f}]")
|
|
122
|
+
|
|
123
|
+
if len(deleted_donors) > 0:
|
|
124
|
+
lines.append("Deleted:")
|
|
125
|
+
for pos, row in deleted_donors.iterrows():
|
|
126
|
+
ref = row.get("ref_prob", 0)
|
|
127
|
+
evt = row.get(feature_col, row.get("event_prob", 0))
|
|
128
|
+
lines.append(f" D({pos}) {ref:.2f} -> {evt:.2f} [{evt-ref:.2f}]")
|
|
129
|
+
|
|
130
|
+
# Process acceptors
|
|
131
|
+
acceptor_df = self.acceptor_df
|
|
132
|
+
discovered_acceptors = acceptor_df[acceptor_df["discovered_delta"].abs() >= threshold]
|
|
133
|
+
deleted_acceptors = acceptor_df[acceptor_df["deleted_delta"].abs() >= threshold]
|
|
134
|
+
|
|
135
|
+
if len(discovered_acceptors) > 0 or len(deleted_acceptors) > 0:
|
|
136
|
+
lines.append("=== ACCEPTORS ===")
|
|
137
|
+
|
|
138
|
+
if len(discovered_acceptors) > 0:
|
|
139
|
+
lines.append("Discovered:")
|
|
140
|
+
for pos, row in discovered_acceptors.iterrows():
|
|
141
|
+
ref = row.get("ref_prob", 0)
|
|
142
|
+
evt = row.get(feature_col, row.get("event_prob", 0))
|
|
143
|
+
lines.append(f" A({pos}) {ref:.2f} -> {evt:.2f} [+{evt-ref:.2f}]")
|
|
144
|
+
|
|
145
|
+
if len(deleted_acceptors) > 0:
|
|
146
|
+
lines.append("Deleted:")
|
|
147
|
+
for pos, row in deleted_acceptors.iterrows():
|
|
148
|
+
ref = row.get("ref_prob", 0)
|
|
149
|
+
evt = row.get(feature_col, row.get("event_prob", 0))
|
|
150
|
+
lines.append(f" A({pos}) {ref:.2f} -> {evt:.2f} [{evt-ref:.2f}]")
|
|
151
|
+
|
|
152
|
+
if not lines:
|
|
153
|
+
return "No significant splice site changes detected."
|
|
154
|
+
|
|
155
|
+
return "\n".join(lines)
|
|
156
|
+
|
|
93
157
|
def max_splicing_delta(self, event: str) -> float:
|
|
94
158
|
all_diffs = []
|
|
95
159
|
for site_type in ["donors", "acceptors"]:
|
|
@@ -236,6 +300,110 @@ class SpliceSimulator:
|
|
|
236
300
|
paths.sort(key=lambda x: x[1], reverse=True)
|
|
237
301
|
return paths
|
|
238
302
|
|
|
303
|
+
def isoforms_df(self) -> pd.DataFrame:
|
|
304
|
+
"""
|
|
305
|
+
Return a DataFrame of all viable isoforms with probabilities and missplicing descriptions.
|
|
306
|
+
|
|
307
|
+
Columns:
|
|
308
|
+
- isoform_id: unique hash of the splice path
|
|
309
|
+
- probability: probability/prevalence of this isoform
|
|
310
|
+
- splicing_changes: short missplicing event codes (ES, IR, PES, PIR, NE, or "-" for canonical)
|
|
311
|
+
- exon_skipping: full exon skipping details
|
|
312
|
+
- partial_exon_skipping: partial exon skipping (truncation) details
|
|
313
|
+
- intron_retention: full intron retention details
|
|
314
|
+
- partial_intron_retention: partial intron retention details
|
|
315
|
+
- novel_exon: novel/cryptic exon details
|
|
316
|
+
"""
|
|
317
|
+
rows = []
|
|
318
|
+
for t, md in self.get_viable_transcripts(metadata=True):
|
|
319
|
+
rows.append({
|
|
320
|
+
"isoform_id": md.get("isoform_id", ""),
|
|
321
|
+
"probability": md.get("isoform_prevalence", 0.0),
|
|
322
|
+
"splicing_changes": md.get("summary", "-"),
|
|
323
|
+
"exon_skipping": md.get("es", ""),
|
|
324
|
+
"partial_exon_skipping": md.get("pes", ""),
|
|
325
|
+
"intron_retention": md.get("ir", ""),
|
|
326
|
+
"partial_intron_retention": md.get("pir", ""),
|
|
327
|
+
"novel_exon": md.get("ne", ""),
|
|
328
|
+
})
|
|
329
|
+
|
|
330
|
+
if not rows:
|
|
331
|
+
return pd.DataFrame()
|
|
332
|
+
|
|
333
|
+
return pd.DataFrame(rows)
|
|
334
|
+
|
|
335
|
+
def _is_implausible_ir_path(self, var_transcript) -> bool:
|
|
336
|
+
"""
|
|
337
|
+
Check if this transcript has intron retention that is implausible
|
|
338
|
+
because nearby cryptic splice sites compensate for the lost original sites.
|
|
339
|
+
|
|
340
|
+
Returns True if the path should be filtered out.
|
|
341
|
+
|
|
342
|
+
Key insight: If the variant uses ANY splice site near a reference intron
|
|
343
|
+
boundary, the intron is being spliced (possibly at a shifted position).
|
|
344
|
+
True IR only occurs when NO splice sites are used near BOTH boundaries.
|
|
345
|
+
"""
|
|
346
|
+
ref_introns = getattr(self.transcript, "introns", [])
|
|
347
|
+
|
|
348
|
+
if not ref_introns:
|
|
349
|
+
return False
|
|
350
|
+
|
|
351
|
+
TOLERANCE = 500 # bp - consider splice sites within this distance as "covering" the boundary
|
|
352
|
+
MIN_TOTAL_PROB = 0.5 # if total prob >= this, cryptic sites could compensate
|
|
353
|
+
|
|
354
|
+
var_donors = set(var_transcript.donors)
|
|
355
|
+
var_acceptors = set(var_transcript.acceptors)
|
|
356
|
+
|
|
357
|
+
donor_df = self.donor_df
|
|
358
|
+
acceptor_df = self.acceptor_df
|
|
359
|
+
|
|
360
|
+
for t1, t2 in ref_introns:
|
|
361
|
+
# Determine which end is donor and which is acceptor based on strand
|
|
362
|
+
if not self.rev:
|
|
363
|
+
donor_pos, acceptor_pos = t1, t2 # + strand
|
|
364
|
+
else:
|
|
365
|
+
donor_pos, acceptor_pos = t2, t1 # - strand
|
|
366
|
+
|
|
367
|
+
# Check if variant uses ANY donor near the reference donor position
|
|
368
|
+
donor_used = any(
|
|
369
|
+
abs(d - donor_pos) <= TOLERANCE
|
|
370
|
+
for d in var_donors
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
# Check if variant uses ANY acceptor near the reference acceptor position
|
|
374
|
+
acceptor_used = any(
|
|
375
|
+
abs(a - acceptor_pos) <= TOLERANCE
|
|
376
|
+
for a in var_acceptors
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
# If both boundaries are used (possibly at shifted positions),
|
|
380
|
+
# the intron is being spliced out - NOT retained
|
|
381
|
+
if donor_used and acceptor_used:
|
|
382
|
+
continue
|
|
383
|
+
|
|
384
|
+
# At least one boundary is NOT used - this path has potential IR
|
|
385
|
+
# Check if cryptic sites with high probability exist but aren't being used
|
|
386
|
+
# (which would make this IR path implausible)
|
|
387
|
+
|
|
388
|
+
nearby_donors = donor_df.loc[
|
|
389
|
+
(donor_df.index >= donor_pos - TOLERANCE) &
|
|
390
|
+
(donor_df.index <= donor_pos + TOLERANCE)
|
|
391
|
+
]
|
|
392
|
+
total_donor_prob = nearby_donors["P"].sum() if len(nearby_donors) > 0 else 0
|
|
393
|
+
|
|
394
|
+
nearby_acceptors = acceptor_df.loc[
|
|
395
|
+
(acceptor_df.index >= acceptor_pos - TOLERANCE) &
|
|
396
|
+
(acceptor_df.index <= acceptor_pos + TOLERANCE)
|
|
397
|
+
]
|
|
398
|
+
total_acceptor_prob = nearby_acceptors["P"].sum() if len(nearby_acceptors) > 0 else 0
|
|
399
|
+
|
|
400
|
+
# If both boundaries have high probability cryptic sites available,
|
|
401
|
+
# but this path doesn't use them, the IR is implausible
|
|
402
|
+
if total_donor_prob >= MIN_TOTAL_PROB and total_acceptor_prob >= MIN_TOTAL_PROB:
|
|
403
|
+
return True
|
|
404
|
+
|
|
405
|
+
return False
|
|
406
|
+
|
|
239
407
|
def get_viable_transcripts(self, metadata: bool = False):
|
|
240
408
|
graph = self.generate_graph()
|
|
241
409
|
start_node = (self.transcript_start, "transcript_start")
|
|
@@ -251,8 +419,13 @@ class SpliceSimulator:
|
|
|
251
419
|
t.donors = [d for d in donors if d != t.transcript_end]
|
|
252
420
|
t.acceptors = [a for a in acceptors if a != t.transcript_start]
|
|
253
421
|
t.path_weight = prob
|
|
254
|
-
t.path_hash =
|
|
422
|
+
t.path_hash = _short_hash(tuple(donors + acceptors))
|
|
255
423
|
t.generate_mature_mrna().generate_protein()
|
|
424
|
+
|
|
425
|
+
# Filter out implausible IR paths (where cryptic sites compensate)
|
|
426
|
+
if self._is_implausible_ir_path(t):
|
|
427
|
+
continue
|
|
428
|
+
|
|
256
429
|
if metadata:
|
|
257
430
|
md = pd.concat(
|
|
258
431
|
[
|
|
@@ -306,7 +479,9 @@ class SpliceSimulator:
|
|
|
306
479
|
num_ref_introns = len(ref_introns)
|
|
307
480
|
|
|
308
481
|
pes, pir, es, ne, ir = [], [], [], [], []
|
|
482
|
+
pir_intron_indices = set() # Track which introns have PIR
|
|
309
483
|
|
|
484
|
+
# Partial exon skipping (exon truncation)
|
|
310
485
|
for exon_count, (t1, t2) in enumerate(ref_exons):
|
|
311
486
|
for (s1, s2) in var_exons:
|
|
312
487
|
if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or (
|
|
@@ -316,30 +491,61 @@ class SpliceSimulator:
|
|
|
316
491
|
f"Exon {exon_count+1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}"
|
|
317
492
|
)
|
|
318
493
|
|
|
494
|
+
# Partial intron retention (one boundary preserved, other shifted)
|
|
319
495
|
for intron_count, (t1, t2) in enumerate(ref_introns):
|
|
320
496
|
for (s1, s2) in var_introns:
|
|
497
|
+
# Check if one boundary matches and the intron is shorter (partial retention)
|
|
321
498
|
if (not ref.rev and ((s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2))) or (
|
|
322
499
|
ref.rev and ((s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2))
|
|
323
500
|
):
|
|
324
501
|
pir.append(
|
|
325
502
|
f"Intron {intron_count+1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}"
|
|
326
503
|
)
|
|
504
|
+
pir_intron_indices.add(intron_count)
|
|
327
505
|
|
|
506
|
+
# Exon skipping (both boundaries missing)
|
|
328
507
|
for exon_count, (t1, t2) in enumerate(ref_exons):
|
|
329
508
|
if t1 not in var.acceptors and t2 not in var.donors:
|
|
330
509
|
es.append(
|
|
331
510
|
f"Exon {exon_count+1}/{num_ref_exons} skipped: {(t1, t2)}"
|
|
332
511
|
)
|
|
333
512
|
|
|
513
|
+
# Novel exon (boundaries not in reference)
|
|
334
514
|
for (s1, s2) in var_exons:
|
|
335
515
|
if s1 not in ref.acceptors and s2 not in ref.donors:
|
|
336
516
|
ne.append(f"Novel Exon: {(s1, s2)}")
|
|
337
517
|
|
|
518
|
+
# Full intron retention - only if NOT already partial retention
|
|
519
|
+
# AND no splice sites are being used near the intron boundaries
|
|
520
|
+
TOLERANCE = 500 # bp - consider splice sites within this distance as "covering" the boundary
|
|
521
|
+
|
|
338
522
|
for intron_count, (t1, t2) in enumerate(ref_introns):
|
|
339
|
-
if
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
523
|
+
if intron_count in pir_intron_indices:
|
|
524
|
+
continue # Already classified as PIR
|
|
525
|
+
|
|
526
|
+
# Check if the intron is preserved exactly in variant
|
|
527
|
+
intron_preserved = any(s1 == t1 and s2 == t2 for s1, s2 in var_introns)
|
|
528
|
+
if intron_preserved:
|
|
529
|
+
continue # Intron is properly spliced
|
|
530
|
+
|
|
531
|
+
# Determine donor/acceptor positions based on strand
|
|
532
|
+
if not ref.rev:
|
|
533
|
+
donor_pos, acceptor_pos = t1, t2 # + strand
|
|
534
|
+
else:
|
|
535
|
+
donor_pos, acceptor_pos = t2, t1 # - strand
|
|
536
|
+
|
|
537
|
+
# Check if variant uses ANY splice site near each boundary
|
|
538
|
+
# If so, the intron is being spliced (at shifted positions), not retained
|
|
539
|
+
donor_used = any(abs(d - donor_pos) <= TOLERANCE for d in var.donors)
|
|
540
|
+
acceptor_used = any(abs(a - acceptor_pos) <= TOLERANCE for a in var.acceptors)
|
|
541
|
+
|
|
542
|
+
if donor_used and acceptor_used:
|
|
543
|
+
continue # Intron is being spliced at shifted positions, not retained
|
|
544
|
+
|
|
545
|
+
# If we get here, the intron is truly retained
|
|
546
|
+
ir.append(
|
|
547
|
+
f"Intron {intron_count+1}/{num_ref_introns} retained: {(t1, t2)}"
|
|
548
|
+
)
|
|
343
549
|
|
|
344
550
|
return ",".join(pes), ",".join(pir), ",".join(es), ",".join(ne), ",".join(ir)
|
|
345
551
|
|
geney/transcripts.py
CHANGED
{geney-1.4.40.dist-info → geney-1.4.41.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: geney
|
|
3
|
-
Version: 1.4.40
|
|
3
|
+
Version: 1.4.41
|
|
4
4
|
Summary: A Python package for gene expression modeling.
|
|
5
5
|
Home-page: https://github.com/nicolaslynn/geney
|
|
6
6
|
Author: Nicolas Lynn
|
|
@@ -22,6 +22,7 @@ Requires-Dist: tensorflow>=2.8.0
|
|
|
22
22
|
Requires-Dist: keras>=2.8.0
|
|
23
23
|
Requires-Dist: torch
|
|
24
24
|
Requires-Dist: seqmat
|
|
25
|
+
Requires-Dist: h5py
|
|
25
26
|
Dynamic: author
|
|
26
27
|
Dynamic: author-email
|
|
27
28
|
Dynamic: classifier
|
geney-1.4.41.dist-info/RECORD
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
geney/__init__.py,sha256=nkhniqCNWJzrb7xHgTDFEXSvRVdggb9ZCJ7ih7HEYq8,966
|
|
2
|
+
geney/engines.py,sha256=9_oNsoluJsjdLC3cyWttjHF3cuQoy65FWgS4r7ehzek,14296
|
|
3
|
+
geney/oncosplice.py,sha256=eGQQl9ftmoFENMYBWoJtenKWmzyxR9N1of5cZst_bHQ,18014
|
|
4
|
+
geney/pipelines.py,sha256=gsy-gmHIi260SC5MKQ9IBSE0wko8Tvd7IC3wj083mPQ,3996
|
|
5
|
+
geney/splice_graph.py,sha256=PANtLUAQiz578NZwxVlTSgboetnToHnQSkYpT0zbi_w,23931
|
|
6
|
+
geney/transcripts.py,sha256=BBgyeqF4jeIiHaD_bXxgOTXz19kdUgjcPVo4ClpcSUg,2594
|
|
7
|
+
geney/variants.py,sha256=vjbiBH-duZ4TJZyXwXbQ_VmJxCFafjeDwLNTZg3ubSc,11832
|
|
8
|
+
geney-1.4.41.dist-info/METADATA,sha256=zuzWKIEeHSaFr08eRUjq3ZSiloOepcCD_QRG5ifS8j0,972
|
|
9
|
+
geney-1.4.41.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
10
|
+
geney-1.4.41.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
11
|
+
geney-1.4.41.dist-info/RECORD,,
|
geney/samples.py
DELETED
geney/splicing_table.py
DELETED
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
# oncosplice/splicing_table.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
from typing import Dict, Optional, Union
|
|
5
|
-
import numpy as np
|
|
6
|
-
import pandas as pd
|
|
7
|
-
|
|
8
|
-
from .engines import run_splicing_engine
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def predict_splicing(s, position: int, engine: str = 'spliceai', context: int = 7500,
|
|
13
|
-
) -> Union['SeqMat', pd.DataFrame]:
|
|
14
|
-
"""
|
|
15
|
-
Predict splicing probabilities at a given position using the specified engine.
|
|
16
|
-
|
|
17
|
-
Args:
|
|
18
|
-
position (int): The genomic position to predict splicing probabilities for.
|
|
19
|
-
engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
|
|
20
|
-
context (int): The length of the target central region (default: 7500).
|
|
21
|
-
format (str): Output format for the splicing engine results.
|
|
22
|
-
|
|
23
|
-
Returns:
|
|
24
|
-
pd.DataFrame: A DataFrame containing:
|
|
25
|
-
- position: The genomic position
|
|
26
|
-
- donor_prob: Probability of being a donor splice site
|
|
27
|
-
- acceptor_prob: Probability of being an acceptor splice site
|
|
28
|
-
- nucleotides: The nucleotide sequence at that position
|
|
29
|
-
|
|
30
|
-
Raises:
|
|
31
|
-
ValueError: If an unsupported engine is provided.
|
|
32
|
-
IndexError: If the position is not found in the sequence.
|
|
33
|
-
"""
|
|
34
|
-
# Validate position is within sequence bounds
|
|
35
|
-
if position < s.index.min() or position > s.index.max():
|
|
36
|
-
raise ValueError(f"Position {position} is outside sequence bounds [{s.index.min()}, {s.index.max()}]")
|
|
37
|
-
|
|
38
|
-
# Retrieve extended context (includes flanks) around the position.
|
|
39
|
-
target = s.clone(position - context, position + context)
|
|
40
|
-
|
|
41
|
-
# Check if target clone resulted in empty sequence
|
|
42
|
-
if len(target.seq) == 0:
|
|
43
|
-
raise ValueError(f"No sequence data found around position {position} with context {context}")
|
|
44
|
-
|
|
45
|
-
seq, indices = target.seq, target.index
|
|
46
|
-
|
|
47
|
-
# Validate indices array is not empty
|
|
48
|
-
if len(indices) == 0:
|
|
49
|
-
raise ValueError(f"No indices found in sequence around position {position}")
|
|
50
|
-
|
|
51
|
-
# Find relative position within the context window
|
|
52
|
-
rel_pos = np.abs(indices - position).argmin()
|
|
53
|
-
left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
|
|
54
|
-
# print(left_missing, right_missing)
|
|
55
|
-
if left_missing > 0 or right_missing > 0:
|
|
56
|
-
step = -1 if s.rev else 1
|
|
57
|
-
|
|
58
|
-
if left_missing > 0:
|
|
59
|
-
left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
|
|
60
|
-
else:
|
|
61
|
-
left_pad = np.array([], dtype=indices.dtype)
|
|
62
|
-
|
|
63
|
-
if right_missing > 0:
|
|
64
|
-
right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
|
|
65
|
-
else:
|
|
66
|
-
right_pad = np.array([], dtype=indices.dtype)
|
|
67
|
-
|
|
68
|
-
seq = 'N' * left_missing + seq + 'N' * right_missing
|
|
69
|
-
indices = np.concatenate([left_pad, indices, right_pad])
|
|
70
|
-
|
|
71
|
-
# Run the splicing prediction engine (function assumed to be defined externally)
|
|
72
|
-
donor_probs, acceptor_probs = run_splicing_engine(seq=seq, engine=engine)
|
|
73
|
-
# Trim off the fixed flanks before returning results.
|
|
74
|
-
seq = seq[5000:-5000]
|
|
75
|
-
indices = indices[5000:-5000]
|
|
76
|
-
df = pd.DataFrame({
|
|
77
|
-
'position': indices,
|
|
78
|
-
'donor_prob': donor_probs,
|
|
79
|
-
'acceptor_prob': acceptor_probs,
|
|
80
|
-
'nucleotides': list(seq)
|
|
81
|
-
}).set_index('position').round(3)
|
|
82
|
-
|
|
83
|
-
df.attrs['name'] = s.name
|
|
84
|
-
return df
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
def adjoin_splicing_outcomes(
|
|
89
|
-
splicing_predictions: Dict[str, pd.DataFrame],
|
|
90
|
-
transcript: Optional[object] = None,
|
|
91
|
-
) -> pd.DataFrame:
|
|
92
|
-
"""
|
|
93
|
-
Combine splicing predictions for multiple mutations into a multi-index DataFrame.
|
|
94
|
-
|
|
95
|
-
splicing_predictions: {label -> DF with 'donor_prob','acceptor_prob','nucleotides'}
|
|
96
|
-
transcript: optional transcript (must have .acceptors, .donors, .rev)
|
|
97
|
-
"""
|
|
98
|
-
if not splicing_predictions:
|
|
99
|
-
raise ValueError("splicing_predictions cannot be empty")
|
|
100
|
-
|
|
101
|
-
dfs = []
|
|
102
|
-
for label, df in splicing_predictions.items():
|
|
103
|
-
if not isinstance(df, pd.DataFrame):
|
|
104
|
-
raise TypeError(f"Expected DataFrame for '{label}', got {type(df).__name__}")
|
|
105
|
-
|
|
106
|
-
required_cols = ["donor_prob", "acceptor_prob", "nucleotides"]
|
|
107
|
-
missing = [c for c in required_cols if c not in df.columns]
|
|
108
|
-
if missing:
|
|
109
|
-
raise ValueError(
|
|
110
|
-
f"DataFrame for '{label}' missing required columns: {missing}"
|
|
111
|
-
)
|
|
112
|
-
|
|
113
|
-
var_df = df.rename(
|
|
114
|
-
columns={
|
|
115
|
-
"donor_prob": ("donors", f"{label}_prob"),
|
|
116
|
-
"acceptor_prob": ("acceptors", f"{label}_prob"),
|
|
117
|
-
"nucleotides": ("nts", f"{label}"),
|
|
118
|
-
}
|
|
119
|
-
)
|
|
120
|
-
dfs.append(var_df)
|
|
121
|
-
|
|
122
|
-
try:
|
|
123
|
-
full_df = pd.concat(dfs, axis=1)
|
|
124
|
-
except Exception as e:
|
|
125
|
-
raise ValueError(f"Failed to concatenate DataFrames: {e}") from e
|
|
126
|
-
|
|
127
|
-
if not isinstance(full_df.columns, pd.MultiIndex):
|
|
128
|
-
full_df.columns = pd.MultiIndex.from_tuples(full_df.columns)
|
|
129
|
-
|
|
130
|
-
if transcript is not None:
|
|
131
|
-
full_df[("acceptors", "annotated")] = full_df.apply(
|
|
132
|
-
lambda row: row.name in transcript.acceptors, axis=1
|
|
133
|
-
)
|
|
134
|
-
full_df[("donors", "annotated")] = full_df.apply(
|
|
135
|
-
lambda row: row.name in transcript.donors, axis=1
|
|
136
|
-
)
|
|
137
|
-
full_df.sort_index(axis=1, level=0, inplace=True)
|
|
138
|
-
full_df.sort_index(ascending=not transcript.rev, inplace=True)
|
|
139
|
-
else:
|
|
140
|
-
full_df.sort_index(axis=1, level=0, inplace=True)
|
|
141
|
-
|
|
142
|
-
return full_df
|