geney 1.3.0__tar.gz → 1.4.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. {geney-1.3.0 → geney-1.4.45}/PKG-INFO +19 -7
  2. geney-1.4.45/README.md +380 -0
  3. geney-1.4.45/geney/__init__.py +38 -0
  4. geney-1.4.45/geney/engines.py +354 -0
  5. geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs10.pt +0 -0
  6. geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs11.pt +0 -0
  7. geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs12.pt +0 -0
  8. geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs13.pt +0 -0
  9. geney-1.4.45/geney/models/openspliceai-mane/10000nt/model_10000nt_rs14.pt +0 -0
  10. geney-1.4.45/geney/oncosplice.py +412 -0
  11. geney-1.4.45/geney/pipelines.py +139 -0
  12. geney-1.4.45/geney/splice_graph.py +637 -0
  13. geney-1.4.45/geney/splice_graph_archive.py +948 -0
  14. geney-1.4.45/geney/transcripts.py +68 -0
  15. geney-1.4.45/geney/variants.py +389 -0
  16. {geney-1.3.0 → geney-1.4.45}/geney.egg-info/PKG-INFO +19 -7
  17. geney-1.4.45/geney.egg-info/SOURCES.txt +22 -0
  18. geney-1.4.45/geney.egg-info/requires.txt +8 -0
  19. {geney-1.3.0 → geney-1.4.45}/setup.py +5 -2
  20. geney-1.3.0/geney/Fasta_segment.py +0 -257
  21. geney-1.3.0/geney/Gene.py +0 -177
  22. geney-1.3.0/geney/SeqMats.py +0 -492
  23. geney-1.3.0/geney/Transcript.py +0 -379
  24. geney-1.3.0/geney/__init__.py +0 -27
  25. geney-1.3.0/geney/_mutation_utils.py +0 -38
  26. geney-1.3.0/geney/config_setup.py +0 -15
  27. geney-1.3.0/geney/data_setup.py +0 -306
  28. geney-1.3.0/geney/graphic_utils.py +0 -269
  29. geney-1.3.0/geney/gtex_utils.py +0 -68
  30. geney-1.3.0/geney/immune_utils.py +0 -125
  31. geney-1.3.0/geney/oncosplice.py +0 -484
  32. geney-1.3.0/geney/pangolin_utils.py +0 -82
  33. geney-1.3.0/geney/spliceai_utils.py +0 -76
  34. geney-1.3.0/geney/splicing_utils.py +0 -466
  35. geney-1.3.0/geney/survival_utils.py +0 -143
  36. geney-1.3.0/geney/tcga_utils.py +0 -406
  37. geney-1.3.0/geney/tis_utils.py +0 -163
  38. geney-1.3.0/geney/translation_initiation/__init__.py +0 -0
  39. geney-1.3.0/geney/translation_initiation/resources/kozak_pssm.json +0 -1
  40. geney-1.3.0/geney/translation_initiation/resources/tis_regressor_model.joblib +0 -0
  41. geney-1.3.0/geney/translation_initiation/tis_utils.py +0 -124
  42. geney-1.3.0/geney/utils.py +0 -80
  43. geney-1.3.0/geney.egg-info/SOURCES.txt +0 -31
  44. geney-1.3.0/geney.egg-info/requires.txt +0 -17
  45. geney-1.3.0/tests/test_oncosplice.py +0 -25
  46. {geney-1.3.0 → geney-1.4.45}/MANIFEST.in +0 -0
  47. {geney-1.3.0 → geney-1.4.45}/geney.egg-info/dependency_links.txt +0 -0
  48. {geney-1.3.0 → geney-1.4.45}/geney.egg-info/top_level.txt +0 -0
  49. {geney-1.3.0 → geney-1.4.45}/setup.cfg +0 -0
@@ -1,19 +1,31 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: geney
3
- Version: 1.3.0
3
+ Version: 1.4.45
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
7
7
  Author-email: nicolasalynn@gmail.com
8
8
  License: Free for non-commercial use
9
- Platform: UNKNOWN
10
9
  Classifier: Development Status :: 1 - Planning
11
10
  Classifier: Intended Audience :: Science/Research
12
11
  Classifier: License :: Free for non-commercial use
13
12
  Classifier: Operating System :: POSIX :: Linux
14
13
  Classifier: Operating System :: MacOS
15
14
  Classifier: Programming Language :: Python :: 3.9
16
- Requires-Python: >3.9
17
-
18
- UNKNOWN
19
-
15
+ Requires-Python: >3.10
16
+ Requires-Dist: numpy<2.0
17
+ Requires-Dist: pandas==2.1.4
18
+ Requires-Dist: biopython>=1.81
19
+ Requires-Dist: matplotlib
20
+ Requires-Dist: seaborn
21
+ Requires-Dist: torch
22
+ Requires-Dist: openspliceai
23
+ Requires-Dist: seqmat
24
+ Dynamic: author
25
+ Dynamic: author-email
26
+ Dynamic: classifier
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
geney-1.4.45/README.md ADDED
@@ -0,0 +1,380 @@
1
+ # Geney - Splicing and Oncosplice Analysis Library
2
+
3
+ A Python library for analyzing splicing events and their impact on protein conservation in cancer genomics.
4
+
5
+ ## Overview
6
+
7
+ Geney provides tools for:
8
+ 1. **Variant representation** - Parse and validate genomic mutations
9
+ 2. **Splicing prediction** - Predict splice site changes using SpliceAI or Pangolin
10
+ 3. **Splice simulation** - Generate all viable transcript isoforms from predicted splicing
11
+ 4. **Oncosplice scoring** - Assess impact of splicing changes on conserved protein domains
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ # Install dependencies
17
+ pip install -r requirements.txt
18
+
19
+ # Note: spliceai must be installed via conda
20
+ conda install -c bioconda spliceai
21
+
22
+ # Install geney in development mode
23
+ pip install -e .
24
+ ```
25
+
26
+ ## Core Classes
27
+
28
+ ### 1. `MutationalEvent` (from `variants.py`)
29
+
30
+ Represents one or more genomic mutations.
31
+
32
+ **Input:**
33
+ ```python
34
+ from geney.variants import MutationalEvent
35
+
36
+ # Single mutation
37
+ m = MutationalEvent("KRAS:12:25227343:G:T")
38
+
39
+ # Multiple mutations (epistasis)
40
+ m = MutationalEvent("KRAS:12:25227343:G:T|KRAS:12:25227344:A:T")
41
+ ```
42
+
43
+ **Key Properties:**
44
+ - `.gene` - Gene name (str)
45
+ - `.central_position` - Central position (density center) of all mutations (int)
46
+ - `.position` - Alias for `.central_position` (backward compatibility)
47
+ - `.positions` - List of all mutation positions (List[int])
48
+ - `.compatible()` - Returns True if mutations don't overlap (bool)
49
+ - Iterable: yields `(pos, ref, alt)` tuples
50
+
51
+ **Note on Central Position:**
52
+ - For single mutations: equals the mutation position
53
+ - For multiple mutations: equals the mean (centroid) of all positions
54
+ - Used as the analysis point for splicing predictions
55
+
56
+ **Output:** Structured mutation object for downstream analysis
57
+
58
+ ---
59
+
60
+ ### 2. `TranscriptLibrary` (from `transcripts.py`)
61
+
62
+ Creates reference and mutated transcript variants, then predicts splicing changes.
63
+
64
+ **Input:**
65
+ ```python
66
+ from geney import TranscriptLibrary
67
+
68
+ tl = TranscriptLibrary(
69
+ reference_transcript, # seqmat Transcript object
70
+ mutations # MutationalEvent object (iterable)
71
+ )
72
+ ```
73
+
74
+ **Key Methods:**
75
+ ```python
76
+ # Predict splicing for all transcripts
77
+ tl.predict_splicing(
78
+ pos=25227343, # Position to analyze
79
+ engine='spliceai', # 'spliceai' or 'pangolin'
80
+ inplace=True # Returns self if True
81
+ )
82
+
83
+ # Get splicing results for specific event
84
+ splicing_df = tl.get_event_columns('event')
85
+ ```
86
+
87
+ **Output:**
88
+ - `splicing_df` is a MultiIndex DataFrame with:
89
+ - **Rows:** Genomic positions
90
+ - **Columns:** MultiIndex with:
91
+ - Level 0: `'donors'` or `'acceptors'`
92
+ - Level 1: `'event_prob'`, `'ref_prob'`, `'annotated'`
93
+ - **Values:** Splice site probabilities (0-1)
94
+
95
+ **Key Attributes:**
96
+ - `.ref` - Reference transcript
97
+ - `.event` - Mutated transcript with all mutations applied
98
+ - `.splicing_results` - Full splicing prediction DataFrame
99
+
100
+ **⚠️ Important:** This class depends on seqmat's Transcript objects having:
101
+ - `.clone()` method
102
+ - `.pre_mrna.apply_mutations((pos, ref, alt))` method
103
+ - `.pre_mrna.predict_splicing(pos, engine, inplace)` method
104
+ - `.pre_mrna.predicted_splicing` attribute
105
+
106
+ ---
107
+
108
+ ### 3. `SpliceSimulator` (from `splice_graph.py`)
109
+
110
+ Generates all viable transcript isoforms based on splice site predictions.
111
+
112
+ **Input:**
113
+ ```python
114
+ from geney import SpliceSimulator
115
+
116
+ ss = SpliceSimulator(
117
+ splicing_df=splicing_results, # From TranscriptLibrary
118
+ transcript=tl.event, # Mutated transcript
119
+ max_distance=100_000_000, # Max intron size
120
+ feature='event' # Column prefix to use
121
+ )
122
+ ```
123
+
124
+ **Key Methods:**
125
+
126
+ ```python
127
+ # Get summary statistics
128
+ metadata = ss.report(position)
129
+ # Returns pd.Series with:
130
+ # - 'region': 'exon', 'intron', or None
131
+ # - 'index': Region index
132
+ # - "5'_dist": Distance to 5' end
133
+ # - "3'_dist": Distance to 3' end
134
+ # - 'donor_events': JSON of altered donor sites
135
+ # - 'acceptor_events': JSON of altered acceptor sites
136
+ # - 'missplicing': Max splicing delta
137
+
138
+ # Iterate through viable isoforms
139
+ for variant_transcript, isoform_metadata in ss.get_viable_transcripts(metadata=True):
140
+ # variant_transcript is a cloned transcript with:
141
+ # - .donors, .acceptors updated
142
+ # - .mature_mrna generated
143
+ # - .protein generated
144
+ # - .path_weight (probability)
145
+ # - .path_hash (unique identifier)
146
+
147
+ # isoform_metadata is pd.Series with:
148
+ # - 'isoform_prevalence': Path probability
149
+ # - 'isoform_id': Unique hash
150
+ # - Plus comparison metrics to reference
151
+ pass
152
+ ```
153
+
154
+ **Output:**
155
+ - **Yields:** `(transcript, metadata)` tuples for each viable isoform
156
+ - Each transcript has `.protein` and `.mature_mrna.seq` attributes
157
+ - Ordered by path probability (highest first)
158
+
159
+ **⚠️ Important:** Requires seqmat Transcript to have:
160
+ - `.clone()` method (deep copy)
161
+ - `.generate_mature_mrna()` method
162
+ - `.generate_protein()` method
163
+ - `.donors`, `.acceptors`, `.rev`, `.transcript_start`, `.transcript_end` attributes
164
+ - `.exons`, `.introns` attributes (optional, for region detection)
165
+
166
+ ---
167
+
168
+ ### 4. `Oncosplice` (from `oncosplice.py`)
169
+
170
+ Scores protein-level impact of splicing changes based on conservation.
171
+
172
+ **Input:**
173
+ ```python
174
+ from geney.oncosplice import Oncosplice
175
+
176
+ onco = Oncosplice(
177
+ reference_protein="MTEYK...", # Reference protein sequence (str)
178
+ variant_protein="MTEYKV...", # Variant protein sequence (str)
179
+ conservation_vector=np.array([...]) # Conservation scores (numpy array)
180
+ )
181
+ ```
182
+
183
+ **Automatic Analysis:**
184
+ - Aligns reference and variant proteins
185
+ - Identifies insertions and deletions
186
+ - Calculates conservation-weighted impact score
187
+
188
+ **Key Methods:**
189
+ ```python
190
+ # Get summary as pandas Series
191
+ analysis = onco.get_analysis_series()
192
+ # Returns pd.Series with:
193
+ # - 'reference_protein': Reference sequence
194
+ # - 'variant_protein': Variant sequence
195
+ # - 'reference_length': Length of reference
196
+ # - 'variant_length': Length of variant
197
+ # - 'oncosplice_score': Conservation-weighted impact score
198
+ # - 'oncosplice_percentile': Percentile of score
199
+ # - 'deletion_count': Number of deleted positions
200
+ # - 'insertion_count': Number of inserted positions
201
+ # - 'modified_positions_count': Total modified positions
202
+
203
+ # Visualize conservation and changes
204
+ onco.plot()
205
+ ```
206
+
207
+ **Output:**
208
+ - **Score:** Higher scores = more impact on conserved regions
209
+ - **Percentile:** Percentile rank of the score
210
+ - **Series:** Structured analysis results
211
+
212
+ ---
213
+
214
+ ## Pipeline: `oncosplice_pipeline_single_transcript`
215
+
216
+ Complete workflow from mutation to oncosplice score.
217
+
218
+ ### Current Implementation:
219
+
220
+ ```python
221
+ from geney.pipelines import oncosplice_pipeline_single_transcript
222
+
223
+ report = oncosplice_pipeline_single_transcript(
224
+ mut_id="KRAS:12:25227343:G:T",
225
+ transcript_id="ENST00000311936",
226
+ splicing_engine='spliceai',
227
+ organism='hg38'
228
+ )
229
+ ```
230
+
231
+ ### Pipeline Flow:
232
+
233
+ ```
234
+ 1. MutationalEvent(mut_id)
235
+ ↓ (validates and parses mutations)
236
+
237
+ 2. Gene.from_file(gene, organism).transcript(id)
238
+ ↓ (loads gene annotation)
239
+
240
+ 3. TranscriptLibrary(ref_transcript, mutations)
241
+ ↓ (applies mutations, predicts splicing)
242
+
243
+ 4. SpliceSimulator(splicing_results, mutated_transcript)
244
+ ↓ (generates viable isoforms)
245
+
246
+ 5. For each isoform:
247
+ Oncosplice(ref_protein, variant_protein, cons_vector)
248
+ ↓ (scores conservation impact)
249
+
250
+ 6. Returns DataFrame with all isoforms and scores
251
+ ```
252
+
253
+ ### ✅ **Design Decisions:**
254
+
255
+ 1. **Central Position for Analysis:**
256
+ - The pipeline uses `m.central_position` (mean of all mutation positions) as the focal point
257
+ - For single mutations: this equals the mutation position
258
+ - For compound events: this represents the density center
259
+ - Both splicing prediction and metadata reporting use this central position
260
+ - **Rationale:** Provides a single consistent reference point for analysis of mutation clusters
261
+
262
+ 2. **Multi-mutation Handling:**
263
+ - All mutations are applied to the transcript via `TranscriptLibrary`
264
+ - Splicing is predicted at the central position to capture regional effects
265
+ - Individual mutation positions are preserved in `.positions` for detailed analysis if needed
266
+
267
+ 3. **Dependencies on seqmat:**
268
+ - Requires `reference_transcript.cons_vector` - where does this come from?
269
+ - Requires `transcript.mature_mrna.seq` - ensure seqmat provides this
270
+ - Requires `transcript.protein` - ensure seqmat provides this
271
+
272
+ ### Output Schema:
273
+
274
+ Returns `pd.DataFrame` where each row is a viable isoform with:
275
+
276
+ **Base Information:**
277
+ - `mut_id`: Original mutation ID
278
+ - `gene`: Gene name
279
+ - `transcript_id`: Transcript identifier
280
+ - `primary_transcript`: Boolean flag
281
+ - `splicing_engine`: Engine used ('spliceai' or 'pangolin')
282
+ - `central_position`: Central position of mutation event
283
+ - `mutation_count`: Number of mutations in the event
284
+ - `time_of_execution`: Timestamp
285
+
286
+ **Splice Metadata:** (from `ss.report()`)
287
+ - `region`: 'exon', 'intron', or None
288
+ - `index`: Region index
289
+ - `5'_dist`, `3'_dist`: Distances to region boundaries
290
+ - `donor_events`: JSON of altered donors
291
+ - `acceptor_events`: JSON of altered acceptors
292
+ - `missplicing`: Max splicing delta
293
+
294
+ **Isoform Metadata:** (from `ss.get_viable_transcripts()`)
295
+ - `isoform_prevalence`: Probability of this isoform
296
+ - `isoform_id`: Unique hash identifier
297
+ - Plus comparison metrics to reference
298
+
299
+ **Sequence Data:**
300
+ - `reference_mrna`: Reference mRNA sequence
301
+ - `variant_mrna`: Variant mRNA sequence
302
+
303
+ **Oncosplice Analysis:** (from `onco.get_analysis_series()`)
304
+ - `reference_protein`: Reference protein sequence
305
+ - `variant_protein`: Variant protein sequence
306
+ - `reference_length`, `variant_length`: Sequence lengths
307
+ - `oncosplice_score`: Conservation impact score
308
+ - `oncosplice_percentile`: Score percentile
309
+ - `deletion_count`, `insertion_count`: Change counts
310
+ - `modified_positions_count`: Total modifications
311
+
312
+ ---
313
+
314
+ ## Requirements
315
+
316
+ ### Core Dependencies:
317
+ - `numpy` - Numerical operations
318
+ - `pandas` - Data manipulation
319
+ - `biopython` - Sequence alignment
320
+ - `matplotlib`, `seaborn` - Visualization
321
+ - `tensorflow`, `keras` - SpliceAI models
322
+ - `torch` - Pangolin models
323
+ - `joblib` - Model persistence
324
+ - **`seqmat`** - Gene/Transcript handling (external)
325
+ - **`pangolin`** - Splicing prediction (optional)
326
+
327
+ ### Conda-only:
328
+ ```bash
329
+ conda install -c bioconda spliceai
330
+ ```
331
+
332
+ ---
333
+
334
+ ## Example Usage
335
+
336
+ ```python
337
+ from geney.variants import MutationalEvent
338
+ from geney.pipelines import oncosplice_pipeline_single_transcript
339
+
340
+ # Analyze a single mutation
341
+ report_df = oncosplice_pipeline_single_transcript(
342
+ mut_id="KRAS:12:25227343:G:T",
343
+ transcript_id="ENST00000311936",
344
+ splicing_engine='spliceai',
345
+ organism='hg38'
346
+ )
347
+
348
+ # View top isoforms by prevalence
349
+ print(report_df.sort_values('isoform_prevalence', ascending=False).head())
350
+
351
+ # Find isoforms with high oncosplice scores
352
+ high_impact = report_df[report_df['oncosplice_score'] > 0.8]
353
+ ```
354
+
355
+ ---
356
+
357
+ ## Notes & Caveats
358
+
359
+ 1. **Multi-mutation events:** The pipeline may not correctly handle compound mutations. Review lines 20 and 30 in `pipelines.py`.
360
+
361
+ 2. **seqmat dependency:** This library heavily depends on seqmat's Gene and Transcript classes. Ensure seqmat provides all required methods and attributes.
362
+
363
+ 3. **Conservation vectors:** The source of conservation scores (`cons_vector`) must be documented in seqmat.
364
+
365
+ 4. **Memory usage:** Generating all viable isoforms can be memory-intensive for genes with many splice sites.
366
+
367
+ 5. **Splicing engines:**
368
+ - `'spliceai'` - Requires conda installation
369
+ - `'pangolin'` - Alternative engine
370
+ - `'spliceai-pytorch'` - Deprecated (raises error)
371
+
372
+ ---
373
+
374
+ ## Contributing
375
+
376
+ When modifying the pipeline:
377
+ 1. Ensure compatibility with `MutationalEvent` iteration format
378
+ 2. Test with both single and multi-mutation events
379
+ 3. Verify seqmat integration points
380
+ 4. Update this README with any changes to output schemas
@@ -0,0 +1,38 @@
1
+ # geney/__init__.py
2
+ from .variants import Mutation, MutationalEvent, MutationLibrary
3
+ from .engines import (
4
+ sai_predict_probs,
5
+ run_spliceai_seq,
6
+ run_splicing_engine,
7
+ predict_splicing,
8
+ adjoin_splicing_outcomes,
9
+ )
10
+ from .transcripts import TranscriptLibrary
11
+ from .splice_graph import SpliceSimulator
12
+ from .pipelines import (
13
+ oncosplice_pipeline,
14
+ oncosplice_top_isoform,
15
+ max_splicing_delta,
16
+ oncosplice_pipeline_single_transcript, # backwards compat
17
+ )
18
+
19
+ __all__ = [
20
+ "Mutation",
21
+ "MutationalEvent",
22
+ "MutationLibrary",
23
+ "sai_predict_probs",
24
+ "run_spliceai_seq",
25
+ "run_splicing_engine",
26
+ "predict_splicing",
27
+ "adjoin_splicing_outcomes",
28
+ "TranscriptLibrary",
29
+ "SpliceSimulator",
30
+ "oncosplice_pipeline",
31
+ "oncosplice_top_isoform",
32
+ "max_splicing_delta",
33
+ "oncosplice_pipeline_single_transcript",
34
+ ]
35
+
36
+
37
+ mut_id = 'KRAS:12:25227343:G:T'
38
+ epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'