geney 1.3.0__tar.gz → 1.4.45__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geney-1.3.0 → geney-1.4.45}/PKG-INFO +19 -7
- geney-1.4.45/README.md +380 -0
- geney-1.4.45/geney/__init__.py +38 -0
- geney-1.4.45/geney/engines.py +354 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs10.pt +0 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs11.pt +0 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs12.pt +0 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs13.pt +0 -0
- geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs14.pt +0 -0
- geney-1.4.45/geney/oncosplice.py +412 -0
- geney-1.4.45/geney/pipelines.py +139 -0
- geney-1.4.45/geney/splice_graph.py +637 -0
- geney-1.4.45/geney/splice_graph_archive.py +948 -0
- geney-1.4.45/geney/transcripts.py +68 -0
- geney-1.4.45/geney/variants.py +389 -0
- {geney-1.3.0 → geney-1.4.45}/geney.egg-info/PKG-INFO +19 -7
- geney-1.4.45/geney.egg-info/SOURCES.txt +22 -0
- geney-1.4.45/geney.egg-info/requires.txt +8 -0
- {geney-1.3.0 → geney-1.4.45}/setup.py +5 -2
- geney-1.3.0/geney/Fasta_segment.py +0 -257
- geney-1.3.0/geney/Gene.py +0 -177
- geney-1.3.0/geney/SeqMats.py +0 -492
- geney-1.3.0/geney/Transcript.py +0 -379
- geney-1.3.0/geney/__init__.py +0 -27
- geney-1.3.0/geney/_mutation_utils.py +0 -38
- geney-1.3.0/geney/config_setup.py +0 -15
- geney-1.3.0/geney/data_setup.py +0 -306
- geney-1.3.0/geney/graphic_utils.py +0 -269
- geney-1.3.0/geney/gtex_utils.py +0 -68
- geney-1.3.0/geney/immune_utils.py +0 -125
- geney-1.3.0/geney/oncosplice.py +0 -484
- geney-1.3.0/geney/pangolin_utils.py +0 -82
- geney-1.3.0/geney/spliceai_utils.py +0 -76
- geney-1.3.0/geney/splicing_utils.py +0 -466
- geney-1.3.0/geney/survival_utils.py +0 -143
- geney-1.3.0/geney/tcga_utils.py +0 -406
- geney-1.3.0/geney/tis_utils.py +0 -163
- geney-1.3.0/geney/translation_initiation/__init__.py +0 -0
- geney-1.3.0/geney/translation_initiation/resources/kozak_pssm.json +0 -1
- geney-1.3.0/geney/translation_initiation/resources/tis_regressor_model.joblib +0 -0
- geney-1.3.0/geney/translation_initiation/tis_utils.py +0 -124
- geney-1.3.0/geney/utils.py +0 -80
- geney-1.3.0/geney.egg-info/SOURCES.txt +0 -31
- geney-1.3.0/geney.egg-info/requires.txt +0 -17
- geney-1.3.0/tests/test_oncosplice.py +0 -25
- {geney-1.3.0 → geney-1.4.45}/MANIFEST.in +0 -0
- {geney-1.3.0 → geney-1.4.45}/geney.egg-info/dependency_links.txt +0 -0
- {geney-1.3.0 → geney-1.4.45}/geney.egg-info/top_level.txt +0 -0
- {geney-1.3.0 → geney-1.4.45}/setup.cfg +0 -0
|
@@ -1,19 +1,31 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: geney
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.4.45
|
|
4
4
|
Summary: A Python package for gene expression modeling.
|
|
5
5
|
Home-page: https://github.com/nicolaslynn/geney
|
|
6
6
|
Author: Nicolas Lynn
|
|
7
7
|
Author-email: nicolasalynn@gmail.com
|
|
8
8
|
License: Free for non-commercial use
|
|
9
|
-
Platform: UNKNOWN
|
|
10
9
|
Classifier: Development Status :: 1 - Planning
|
|
11
10
|
Classifier: Intended Audience :: Science/Research
|
|
12
11
|
Classifier: License :: Free for non-commercial use
|
|
13
12
|
Classifier: Operating System :: POSIX :: Linux
|
|
14
13
|
Classifier: Operating System :: MacOS
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
-
Requires-Python: >3.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
15
|
+
Requires-Python: >3.10
|
|
16
|
+
Requires-Dist: numpy<2.0
|
|
17
|
+
Requires-Dist: pandas==2.1.4
|
|
18
|
+
Requires-Dist: biopython>=1.81
|
|
19
|
+
Requires-Dist: matplotlib
|
|
20
|
+
Requires-Dist: seaborn
|
|
21
|
+
Requires-Dist: torch
|
|
22
|
+
Requires-Dist: openspliceai
|
|
23
|
+
Requires-Dist: seqmat
|
|
24
|
+
Dynamic: author
|
|
25
|
+
Dynamic: author-email
|
|
26
|
+
Dynamic: classifier
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: requires-dist
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
Dynamic: summary
|
geney-1.4.45/README.md
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
# Geney - Splicing and Oncosplice Analysis Library
|
|
2
|
+
|
|
3
|
+
A Python library for analyzing splicing events and their impact on protein conservation in cancer genomics.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Geney provides tools for:
|
|
8
|
+
1. **Variant representation** - Parse and validate genomic mutations
|
|
9
|
+
2. **Splicing prediction** - Predict splice site changes using SpliceAI or Pangolin
|
|
10
|
+
3. **Splice simulation** - Generate all viable transcript isoforms from predicted splicing
|
|
11
|
+
4. **Oncosplice scoring** - Assess impact of splicing changes on conserved protein domains
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Install dependencies
|
|
17
|
+
pip install -r requirements.txt
|
|
18
|
+
|
|
19
|
+
# Note: spliceai must be installed via conda
|
|
20
|
+
conda install -c bioconda spliceai
|
|
21
|
+
|
|
22
|
+
# Install geney in development mode
|
|
23
|
+
pip install -e .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Core Classes
|
|
27
|
+
|
|
28
|
+
### 1. `MutationalEvent` (from `variants.py`)
|
|
29
|
+
|
|
30
|
+
Represents one or more genomic mutations.
|
|
31
|
+
|
|
32
|
+
**Input:**
|
|
33
|
+
```python
|
|
34
|
+
from geney.variants import MutationalEvent
|
|
35
|
+
|
|
36
|
+
# Single mutation
|
|
37
|
+
m = MutationalEvent("KRAS:12:25227343:G:T")
|
|
38
|
+
|
|
39
|
+
# Multiple mutations (epistasis)
|
|
40
|
+
m = MutationalEvent("KRAS:12:25227343:G:T|KRAS:12:25227344:A:T")
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
**Key Properties:**
|
|
44
|
+
- `.gene` - Gene name (str)
|
|
45
|
+
- `.central_position` - Central position (density center) of all mutations (int)
|
|
46
|
+
- `.position` - Alias for `.central_position` (backward compatibility)
|
|
47
|
+
- `.positions` - List of all mutation positions (List[int])
|
|
48
|
+
- `.compatible()` - Returns True if mutations don't overlap (bool)
|
|
49
|
+
- Iterable: yields `(pos, ref, alt)` tuples
|
|
50
|
+
|
|
51
|
+
**Note on Central Position:**
|
|
52
|
+
- For single mutations: equals the mutation position
|
|
53
|
+
- For multiple mutations: equals the mean (centroid) of all positions
|
|
54
|
+
- Used as the analysis point for splicing predictions
|
|
55
|
+
|
|
56
|
+
**Output:** Structured mutation object for downstream analysis
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
### 2. `TranscriptLibrary` (from `transcripts.py`)
|
|
61
|
+
|
|
62
|
+
Creates reference and mutated transcript variants, then predicts splicing changes.
|
|
63
|
+
|
|
64
|
+
**Input:**
|
|
65
|
+
```python
|
|
66
|
+
from geney import TranscriptLibrary
|
|
67
|
+
|
|
68
|
+
tl = TranscriptLibrary(
|
|
69
|
+
reference_transcript, # seqmat Transcript object
|
|
70
|
+
mutations # MutationalEvent object (iterable)
|
|
71
|
+
)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**Key Methods:**
|
|
75
|
+
```python
|
|
76
|
+
# Predict splicing for all transcripts
|
|
77
|
+
tl.predict_splicing(
|
|
78
|
+
pos=25227343, # Position to analyze
|
|
79
|
+
engine='spliceai', # 'spliceai' or 'pangolin'
|
|
80
|
+
inplace=True # Returns self if True
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# Get splicing results for specific event
|
|
84
|
+
splicing_df = tl.get_event_columns('event')
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**Output:**
|
|
88
|
+
- `splicing_df` is a DataFrame with MultiIndex columns:
|
|
89
|
+
- **Rows:** Genomic positions
|
|
90
|
+
- **Columns:** MultiIndex with:
|
|
91
|
+
- Level 0: `'donors'` or `'acceptors'`
|
|
92
|
+
- Level 1: `'event_prob'`, `'ref_prob'`, `'annotated'`
|
|
93
|
+
- **Values:** Splice site probabilities (0-1)
|
|
94
|
+
|
|
95
|
+
**Key Attributes:**
|
|
96
|
+
- `.ref` - Reference transcript
|
|
97
|
+
- `.event` - Mutated transcript with all mutations applied
|
|
98
|
+
- `.splicing_results` - Full splicing prediction DataFrame
|
|
99
|
+
|
|
100
|
+
**⚠️ Important:** This class depends on seqmat's Transcript objects having:
|
|
101
|
+
- `.clone()` method
|
|
102
|
+
- `.pre_mrna.apply_mutations((pos, ref, alt))` method
|
|
103
|
+
- `.pre_mrna.predict_splicing(pos, engine, inplace)` method
|
|
104
|
+
- `.pre_mrna.predicted_splicing` attribute
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
### 3. `SpliceSimulator` (from `splice_graph.py`)
|
|
109
|
+
|
|
110
|
+
Generates all viable transcript isoforms based on splice site predictions.
|
|
111
|
+
|
|
112
|
+
**Input:**
|
|
113
|
+
```python
|
|
114
|
+
from geney import SpliceSimulator
|
|
115
|
+
|
|
116
|
+
ss = SpliceSimulator(
|
|
117
|
+
splicing_df=splicing_results, # From TranscriptLibrary
|
|
118
|
+
transcript=tl.event, # Mutated transcript
|
|
119
|
+
max_distance=100_000_000, # Max intron size
|
|
120
|
+
feature='event' # Column prefix to use
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**Key Methods:**
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# Get summary statistics
|
|
128
|
+
metadata = ss.report(position)
|
|
129
|
+
# Returns pd.Series with:
|
|
130
|
+
# - 'region': 'exon', 'intron', or None
|
|
131
|
+
# - 'index': Region index
|
|
132
|
+
# - "5'_dist": Distance to 5' end
|
|
133
|
+
# - "3'_dist": Distance to 3' end
|
|
134
|
+
# - 'donor_events': JSON of altered donor sites
|
|
135
|
+
# - 'acceptor_events': JSON of altered acceptor sites
|
|
136
|
+
# - 'missplicing': Max splicing delta
|
|
137
|
+
|
|
138
|
+
# Iterate through viable isoforms
|
|
139
|
+
for variant_transcript, isoform_metadata in ss.get_viable_transcripts(metadata=True):
|
|
140
|
+
# variant_transcript is a cloned transcript with:
|
|
141
|
+
# - .donors, .acceptors updated
|
|
142
|
+
# - .mature_mrna generated
|
|
143
|
+
# - .protein generated
|
|
144
|
+
# - .path_weight (probability)
|
|
145
|
+
# - .path_hash (unique identifier)
|
|
146
|
+
|
|
147
|
+
# isoform_metadata is pd.Series with:
|
|
148
|
+
# - 'isoform_prevalence': Path probability
|
|
149
|
+
# - 'isoform_id': Unique hash
|
|
150
|
+
# - Plus comparison metrics to reference
|
|
151
|
+
pass
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Output:**
|
|
155
|
+
- **Yields:** `(transcript, metadata)` tuples for each viable isoform
|
|
156
|
+
- Each transcript has `.protein` and `.mature_mrna.seq` attributes
|
|
157
|
+
- Ordered by path probability (highest first)
|
|
158
|
+
|
|
159
|
+
**⚠️ Important:** Requires seqmat Transcript to have:
|
|
160
|
+
- `.clone()` method (deep copy)
|
|
161
|
+
- `.generate_mature_mrna()` method
|
|
162
|
+
- `.generate_protein()` method
|
|
163
|
+
- `.donors`, `.acceptors`, `.rev`, `.transcript_start`, `.transcript_end` attributes
|
|
164
|
+
- `.exons`, `.introns` attributes (optional, for region detection)
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
### 4. `Oncosplice` (from `oncosplice.py`)
|
|
169
|
+
|
|
170
|
+
Scores protein-level impact of splicing changes based on conservation.
|
|
171
|
+
|
|
172
|
+
**Input:**
|
|
173
|
+
```python
|
|
174
|
+
from geney.oncosplice import Oncosplice
|
|
175
|
+
|
|
176
|
+
onco = Oncosplice(
|
|
177
|
+
reference_protein="MTEYK...", # Reference protein sequence (str)
|
|
178
|
+
variant_protein="MTEYKV...", # Variant protein sequence (str)
|
|
179
|
+
conservation_vector=np.array([...]) # Conservation scores (numpy array)
|
|
180
|
+
)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Automatic Analysis:**
|
|
184
|
+
- Aligns reference and variant proteins
|
|
185
|
+
- Identifies insertions and deletions
|
|
186
|
+
- Calculates conservation-weighted impact score
|
|
187
|
+
|
|
188
|
+
**Key Methods:**
|
|
189
|
+
```python
|
|
190
|
+
# Get summary as pandas Series
|
|
191
|
+
analysis = onco.get_analysis_series()
|
|
192
|
+
# Returns pd.Series with:
|
|
193
|
+
# - 'reference_protein': Reference sequence
|
|
194
|
+
# - 'variant_protein': Variant sequence
|
|
195
|
+
# - 'reference_length': Length of reference
|
|
196
|
+
# - 'variant_length': Length of variant
|
|
197
|
+
# - 'oncosplice_score': Conservation-weighted impact score
|
|
198
|
+
# - 'oncosplice_percentile': Percentile of score
|
|
199
|
+
# - 'deletion_count': Number of deleted positions
|
|
200
|
+
# - 'insertion_count': Number of inserted positions
|
|
201
|
+
# - 'modified_positions_count': Total modified positions
|
|
202
|
+
|
|
203
|
+
# Visualize conservation and changes
|
|
204
|
+
onco.plot()
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
**Output:**
|
|
208
|
+
- **Score:** Higher scores = more impact on conserved regions
|
|
209
|
+
- **Percentile:** Percentile rank of the score
|
|
210
|
+
- **Series:** Structured analysis results
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## Pipeline: `oncosplice_pipeline_single_transcript`
|
|
215
|
+
|
|
216
|
+
Complete workflow from mutation to oncosplice score.
|
|
217
|
+
|
|
218
|
+
### Current Implementation:
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from geney.pipelines import oncosplice_pipeline_single_transcript
|
|
222
|
+
|
|
223
|
+
report = oncosplice_pipeline_single_transcript(
|
|
224
|
+
mut_id="KRAS:12:25227343:G:T",
|
|
225
|
+
transcript_id="ENST00000311936",
|
|
226
|
+
splicing_engine='spliceai',
|
|
227
|
+
organism='hg38'
|
|
228
|
+
)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Pipeline Flow:
|
|
232
|
+
|
|
233
|
+
```
|
|
234
|
+
1. MutationalEvent(mut_id)
|
|
235
|
+
↓ (validates and parses mutations)
|
|
236
|
+
|
|
237
|
+
2. Gene.from_file(gene, organism).transcript(id)
|
|
238
|
+
↓ (loads gene annotation)
|
|
239
|
+
|
|
240
|
+
3. TranscriptLibrary(ref_transcript, mutations)
|
|
241
|
+
↓ (applies mutations, predicts splicing)
|
|
242
|
+
|
|
243
|
+
4. SpliceSimulator(splicing_results, mutated_transcript)
|
|
244
|
+
↓ (generates viable isoforms)
|
|
245
|
+
|
|
246
|
+
5. For each isoform:
|
|
247
|
+
Oncosplice(ref_protein, variant_protein, cons_vector)
|
|
248
|
+
↓ (scores conservation impact)
|
|
249
|
+
|
|
250
|
+
6. Returns DataFrame with all isoforms and scores
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### ✅ **Design Decisions:**
|
|
254
|
+
|
|
255
|
+
1. **Central Position for Analysis:**
|
|
256
|
+
- The pipeline uses `m.central_position` (mean of all mutation positions) as the focal point
|
|
257
|
+
- For single mutations: this equals the mutation position
|
|
258
|
+
- For compound events: this represents the density center
|
|
259
|
+
- Both splicing prediction and metadata reporting use this central position
|
|
260
|
+
- **Rationale:** Provides a single consistent reference point for analysis of mutation clusters
|
|
261
|
+
|
|
262
|
+
2. **Multi-mutation Handling:**
|
|
263
|
+
- All mutations are applied to the transcript via `TranscriptLibrary`
|
|
264
|
+
- Splicing is predicted at the central position to capture regional effects
|
|
265
|
+
- Individual mutation positions are preserved in `.positions` for detailed analysis if needed
|
|
266
|
+
|
|
267
|
+
3. **Dependencies on seqmat:**
|
|
268
|
+
- Requires `reference_transcript.cons_vector` - the origin of this conservation vector is not documented here and should be confirmed in seqmat
|
|
269
|
+
- Requires `transcript.mature_mrna.seq` - ensure seqmat provides this
|
|
270
|
+
- Requires `transcript.protein` - ensure seqmat provides this
|
|
271
|
+
|
|
272
|
+
### Output Schema:
|
|
273
|
+
|
|
274
|
+
Returns `pd.DataFrame` where each row is a viable isoform with:
|
|
275
|
+
|
|
276
|
+
**Base Information:**
|
|
277
|
+
- `mut_id`: Original mutation ID
|
|
278
|
+
- `gene`: Gene name
|
|
279
|
+
- `transcript_id`: Transcript identifier
|
|
280
|
+
- `primary_transcript`: Boolean flag
|
|
281
|
+
- `splicing_engine`: Engine used ('spliceai' or 'pangolin')
|
|
282
|
+
- `central_position`: Central position of mutation event
|
|
283
|
+
- `mutation_count`: Number of mutations in the event
|
|
284
|
+
- `time_of_execution`: Timestamp
|
|
285
|
+
|
|
286
|
+
**Splice Metadata:** (from `ss.report()`)
|
|
287
|
+
- `region`: 'exon', 'intron', or None
|
|
288
|
+
- `index`: Region index
|
|
289
|
+
- `5'_dist`, `3'_dist`: Distances to region boundaries
|
|
290
|
+
- `donor_events`: JSON of altered donors
|
|
291
|
+
- `acceptor_events`: JSON of altered acceptors
|
|
292
|
+
- `missplicing`: Max splicing delta
|
|
293
|
+
|
|
294
|
+
**Isoform Metadata:** (from `ss.get_viable_transcripts()`)
|
|
295
|
+
- `isoform_prevalence`: Probability of this isoform
|
|
296
|
+
- `isoform_id`: Unique hash identifier
|
|
297
|
+
- Plus comparison metrics to reference
|
|
298
|
+
|
|
299
|
+
**Sequence Data:**
|
|
300
|
+
- `reference_mrna`: Reference mRNA sequence
|
|
301
|
+
- `variant_mrna`: Variant mRNA sequence
|
|
302
|
+
|
|
303
|
+
**Oncosplice Analysis:** (from `onco.get_analysis_series()`)
|
|
304
|
+
- `reference_protein`: Reference protein sequence
|
|
305
|
+
- `variant_protein`: Variant protein sequence
|
|
306
|
+
- `reference_length`, `variant_length`: Sequence lengths
|
|
307
|
+
- `oncosplice_score`: Conservation impact score
|
|
308
|
+
- `oncosplice_percentile`: Score percentile
|
|
309
|
+
- `deletion_count`, `insertion_count`: Change counts
|
|
310
|
+
- `modified_positions_count`: Total modifications
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
## Requirements
|
|
315
|
+
|
|
316
|
+
### Core Dependencies:
|
|
317
|
+
- `numpy` - Numerical operations
|
|
318
|
+
- `pandas` - Data manipulation
|
|
319
|
+
- `biopython` - Sequence alignment
|
|
320
|
+
- `matplotlib`, `seaborn` - Visualization
|
|
321
|
+
- `tensorflow`, `keras` - SpliceAI models
|
|
322
|
+
- `torch` - Pangolin models
|
|
323
|
+
- `joblib` - Model persistence
|
|
324
|
+
- **`seqmat`** - Gene/Transcript handling (external)
|
|
325
|
+
- **`pangolin`** - Splicing prediction (optional)
|
|
326
|
+
|
|
327
|
+
### Conda-only:
|
|
328
|
+
```bash
|
|
329
|
+
conda install -c bioconda spliceai
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
## Example Usage
|
|
335
|
+
|
|
336
|
+
```python
|
|
337
|
+
from geney.variants import MutationalEvent
|
|
338
|
+
from geney.pipelines import oncosplice_pipeline_single_transcript
|
|
339
|
+
|
|
340
|
+
# Analyze a single mutation
|
|
341
|
+
report_df = oncosplice_pipeline_single_transcript(
|
|
342
|
+
mut_id="KRAS:12:25227343:G:T",
|
|
343
|
+
transcript_id="ENST00000311936",
|
|
344
|
+
splicing_engine='spliceai',
|
|
345
|
+
organism='hg38'
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
# View top isoforms by prevalence
|
|
349
|
+
print(report_df.sort_values('isoform_prevalence', ascending=False).head())
|
|
350
|
+
|
|
351
|
+
# Find isoforms with high oncosplice scores
|
|
352
|
+
high_impact = report_df[report_df['oncosplice_score'] > 0.8]
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
---
|
|
356
|
+
|
|
357
|
+
## Notes & Caveats
|
|
358
|
+
|
|
359
|
+
1. **Multi-mutation events:** The pipeline may not correctly handle compound mutations. Review lines 20 and 30 in `pipelines.py`.
|
|
360
|
+
|
|
361
|
+
2. **seqmat dependency:** This library heavily depends on seqmat's Gene and Transcript classes. Ensure seqmat provides all required methods and attributes.
|
|
362
|
+
|
|
363
|
+
3. **Conservation vectors:** The source of conservation scores (`cons_vector`) must be documented in seqmat.
|
|
364
|
+
|
|
365
|
+
4. **Memory usage:** Generating all viable isoforms can be memory-intensive for genes with many splice sites.
|
|
366
|
+
|
|
367
|
+
5. **Splicing engines:**
|
|
368
|
+
- `'spliceai'` - Requires conda installation
|
|
369
|
+
- `'pangolin'` - Alternative engine
|
|
370
|
+
- `'spliceai-pytorch'` - Deprecated (raises error)
|
|
371
|
+
|
|
372
|
+
---
|
|
373
|
+
|
|
374
|
+
## Contributing
|
|
375
|
+
|
|
376
|
+
When modifying the pipeline:
|
|
377
|
+
1. Ensure compatibility with `MutationalEvent` iteration format
|
|
378
|
+
2. Test with both single and multi-mutation events
|
|
379
|
+
3. Verify seqmat integration points
|
|
380
|
+
4. Update this README with any changes to output schemas
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# geney/__init__.py
|
|
2
|
+
from .variants import Mutation, MutationalEvent, MutationLibrary
|
|
3
|
+
from .engines import (
|
|
4
|
+
sai_predict_probs,
|
|
5
|
+
run_spliceai_seq,
|
|
6
|
+
run_splicing_engine,
|
|
7
|
+
predict_splicing,
|
|
8
|
+
adjoin_splicing_outcomes,
|
|
9
|
+
)
|
|
10
|
+
from .transcripts import TranscriptLibrary
|
|
11
|
+
from .splice_graph import SpliceSimulator
|
|
12
|
+
from .pipelines import (
|
|
13
|
+
oncosplice_pipeline,
|
|
14
|
+
oncosplice_top_isoform,
|
|
15
|
+
max_splicing_delta,
|
|
16
|
+
oncosplice_pipeline_single_transcript, # backwards compat
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"Mutation",
|
|
21
|
+
"MutationalEvent",
|
|
22
|
+
"MutationLibrary",
|
|
23
|
+
"sai_predict_probs",
|
|
24
|
+
"run_spliceai_seq",
|
|
25
|
+
"run_splicing_engine",
|
|
26
|
+
"predict_splicing",
|
|
27
|
+
"adjoin_splicing_outcomes",
|
|
28
|
+
"TranscriptLibrary",
|
|
29
|
+
"SpliceSimulator",
|
|
30
|
+
"oncosplice_pipeline",
|
|
31
|
+
"oncosplice_top_isoform",
|
|
32
|
+
"max_splicing_delta",
|
|
33
|
+
"oncosplice_pipeline_single_transcript",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
mut_id = 'KRAS:12:25227343:G:T'
|
|
38
|
+
epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'
|