geney 1.3.14__py2.py3-none-any.whl → 1.3.16__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
geney/splicing_utils.py
CHANGED
|
@@ -4,6 +4,12 @@ from .Gene import Gene
|
|
|
4
4
|
from .SeqMats import MutSeqMat
|
|
5
5
|
from collections import defaultdict
|
|
6
6
|
from . import config
|
|
7
|
+
import sqlite3
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
from typing import Any, Dict, Optional
|
|
11
|
+
|
|
12
|
+
# Path of the SQLite cache file: 'mutation_data.db' joined onto the location stored
# under config['hg38']['splicing_db'] (presumably a directory -- confirm against the
# config schema). This runs at import time, so the config lookup must succeed for
# the module to import. In the hunks visible here DB_PATH is referenced only by
# commented-out code; SplicingDatabase takes its path as an argument instead.
DB_PATH = os.path.join(config['hg38']['splicing_db'], 'mutation_data.db')
|
|
7
13
|
|
|
8
14
|
def generate_adjacency_list(acceptors, donors, transcript_start, transcript_end, max_distance=50, rev=False):
|
|
9
15
|
# Append the transcript end to donors to allow connection to the end point
|
|
@@ -205,9 +211,15 @@ def find_transcript_splicing(transcript, engine: str = 'spliceai') -> Tuple[Dict
|
|
|
205
211
|
return donor_probs, acceptor_probs
|
|
206
212
|
|
|
207
213
|
|
|
208
|
-
def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='spliceai', organism='hg38'):
|
|
214
|
+
def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='spliceai', organism='hg38', db=None, force_recompute=False):
|
|
209
215
|
gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
|
|
210
216
|
reference_transcript = gene.transcript(transcript) if transcript is not None else gene.transcript()
|
|
217
|
+
|
|
218
|
+
if db is not None:
|
|
219
|
+
cached_data = db.get_mutation_data(engine, gene.gene_name, mut_id, reference_transcript.transcript_id)
|
|
220
|
+
if cached_data and not force_recompute:
|
|
221
|
+
return cached_data
|
|
222
|
+
|
|
211
223
|
variant_transcript = reference_transcript.clone()
|
|
212
224
|
mutations = [MutSeqMat.from_mutid(m) for m in mut_id.split('|')]
|
|
213
225
|
mutations = [m for m in mutations if m in reference_transcript]
|
|
@@ -218,8 +230,12 @@ def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='
|
|
|
218
230
|
for mutation in mutations:
|
|
219
231
|
variant_transcript.mutate(mutation, inplace=True)
|
|
220
232
|
|
|
221
|
-
|
|
233
|
+
missplicing = find_transcript_missplicing_seqs(reference_transcript.pre_mrna.get_context(center, 7500, padding='N'), variant_transcript.pre_mrna.get_context(center, 7500, padding='N'), reference_transcript.donors, reference_transcript.acceptors, threshold=threshold, engine=engine)
|
|
222
234
|
|
|
235
|
+
if db is not None:
|
|
236
|
+
db.store_mutation_data(engine, gene.gene_name, mut_id, reference_transcript.transcript_id, missplicing.missplicing)
|
|
237
|
+
|
|
238
|
+
return missplicing
|
|
223
239
|
|
|
224
240
|
# from functools import reduce
|
|
225
241
|
# ref = transcript.pre_mrna
|
|
@@ -352,7 +368,7 @@ def find_transcript_missplicing_seqs(ref_seq, var_seq, donors, acceptors, thresh
|
|
|
352
368
|
return Missplicing(temp, threshold=threshold)
|
|
353
369
|
|
|
354
370
|
|
|
355
|
-
def process_pairwise_epistasis(mids, engine='pangolin', fprint=False):
|
|
371
|
+
def process_pairwise_epistasis(mids, engine='pangolin', fprint=False, db=None):
|
|
356
372
|
results = []
|
|
357
373
|
for mid in mids:
|
|
358
374
|
m1, m2 = mid.split('|')
|
|
@@ -360,9 +376,9 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False):
|
|
|
360
376
|
# missplicing2 = find_transcript_missplicing(m2, threshold=0.25, engine=engine)
|
|
361
377
|
# missplicing_both = find_transcript_missplicing(mid, threshold=0.25, engine=engine)
|
|
362
378
|
|
|
363
|
-
missplicing1 = Missplicing(
|
|
364
|
-
missplicing2 = Missplicing(
|
|
365
|
-
missplicing_both = Missplicing(
|
|
379
|
+
missplicing1 = Missplicing(find_transcript_missplicing(m1, engine=engine, db=db), threshold=0.25)
|
|
380
|
+
missplicing2 = Missplicing(find_transcript_missplicing(m2, engine=engine, db=db), threshold=0.25)
|
|
381
|
+
missplicing_both = Missplicing(find_transcript_missplicing(mid, engine=engine, db=db), threshold=0.25)
|
|
366
382
|
|
|
367
383
|
if fprint:
|
|
368
384
|
print(missplicing1)
|
|
@@ -586,85 +602,162 @@ def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
|
|
|
586
602
|
return len(correct_donor_preds) / num_introns, len(correct_acceptor_preds) / num_introns, len(transcript.introns)
|
|
587
603
|
|
|
588
604
|
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
605
|
+
class SplicingDatabase:
    """
    Thin wrapper around the SQLite table that caches splicing results.

    Rows live in a single ``mutations`` table keyed by
    ``(engine, gene, mut_id, transcript_id)``; the payload is stored as JSON text.

    Each instance opens its own connection in autocommit mode
    (``isolation_level=None``) with ``check_same_thread=False``, so the instance
    may be used from threads other than the one that created it. Every query
    runs on its own cursor, so one thread's fetch cannot pick up the result set
    of another thread's query. Writes are not otherwise serialized by this class.

    Instances can be used as context managers; the connection is closed on exit.
    """

    _CREATE_TABLE_SQL = '''
        CREATE TABLE IF NOT EXISTS mutations (
            engine TEXT,
            gene TEXT,
            mut_id TEXT,
            transcript_id TEXT,
            data TEXT,
            PRIMARY KEY (engine, gene, mut_id, transcript_id)
        )
    '''

    _SELECT_SQL = '''
        SELECT data FROM mutations
        WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?
    '''

    _REPLACE_SQL = '''
        REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data)
        VALUES (?, ?, ?, ?, ?)
    '''

    def __init__(self, db_path: str):
        """
        Open the SQLite database and make sure the ``mutations`` table exists.

        Args:
            db_path (str): Path to the SQLite database file.
        """
        self.db_path = db_path
        # isolation_level=None -> autocommit; check_same_thread=False disables
        # sqlite3's "used from another thread" guard.
        self.conn = sqlite3.connect(self.db_path, isolation_level=None, check_same_thread=False)
        # Kept so existing callers that reach for ``db.cursor`` keep working;
        # the methods of this class use per-call cursors instead.
        self.cursor = self.conn.cursor()
        try:
            self._initialize_table()
        except Exception:
            # Do not leak the open connection if the schema cannot be created.
            self.conn.close()
            raise

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block; exceptions propagate."""
        self.close()
        return False

    def _initialize_table(self):
        """
        Create the mutations table if it doesn't exist.
        """
        self.conn.execute(self._CREATE_TABLE_SQL)

    def get_mutation_data(self, engine: str, gene: str, mut_id: str, transcript_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve mutation data from the database.

        Args:
            engine (str): Name of the tool used for computation.
            gene (str): Gene name or identifier.
            mut_id (str): A unique identifier for the mutation.
            transcript_id (str): ID for the transcript.

        Returns:
            Optional[Dict[str, Any]]: The JSON-decoded mutation data if a row
            exists for the key, else None. Because the payload round-trips
            through JSON, mapping keys come back as strings.
        """
        # Connection.execute creates a fresh cursor, so the fetch below always
        # belongs to this query even if another thread queries concurrently.
        row = self.conn.execute(self._SELECT_SQL, (engine, gene, mut_id, transcript_id)).fetchone()
        if row is None:
            return None
        return json.loads(row[0])

    def store_mutation_data(self, engine: str, gene: str, mut_id: str, transcript_id: str, data: Dict[str, Any]):
        """
        Store mutation data in the database, replacing any existing row for the key.

        Args:
            engine (str): Name of the tool used for computation.
            gene (str): Gene name or identifier.
            mut_id (str): A unique identifier for the mutation.
            transcript_id (str): ID for the transcript.
            data (Dict[str, Any]): The mutation data to store.
        """
        # Convert NumPy types to native Python types so json.dumps accepts them.
        data_native = convert_numpy_to_native(data)
        data_json = json.dumps(data_native)
        self.conn.execute(self._REPLACE_SQL, (engine, gene, mut_id, transcript_id, data_json))

    def close(self):
        """
        Close the database connection.
        """
        self.conn.close()
|
|
684
|
+
|
|
685
|
+
|
|
592
686
|
|
|
593
687
|
# Global connection and cursor (adjust to your architecture)
|
|
594
688
|
# Ideally, initialize this once in your application startup code.
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
#
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
return computed_data
|
|
689
|
+
# conn = sqlite3.connect(DB_PATH, isolation_level=None) # autocommit mode
|
|
690
|
+
# cursor = conn.cursor()
|
|
691
|
+
#
|
|
692
|
+
# # Create table once at startup, not in the function
|
|
693
|
+
# cursor.execute('''
|
|
694
|
+
# CREATE TABLE IF NOT EXISTS mutations (
|
|
695
|
+
# engine TEXT,
|
|
696
|
+
# gene TEXT,
|
|
697
|
+
# mut_id TEXT,
|
|
698
|
+
# transcript_id TEXT,
|
|
699
|
+
# data TEXT,
|
|
700
|
+
# PRIMARY KEY (engine, gene, mut_id, transcript_id)
|
|
701
|
+
# )''')
|
|
702
|
+
# #
|
|
703
|
+
#
|
|
704
|
+
# def get_splicing(engine, gene, mut_id, transcript_id, force_recompute=False):
|
|
705
|
+
# """
|
|
706
|
+
# Retrieve computed splicing data for a given mutation from a database,
|
|
707
|
+
# Args:
|
|
708
|
+
# engine (str): Name of the tool used for computation.
|
|
709
|
+
# gene (str): Gene name or identifier.
|
|
710
|
+
# mut_id (str): A unique identifier for the mutation.
|
|
711
|
+
# transcript_id (str): ID for the transcript.
|
|
712
|
+
# force_recompute (bool): If True, ignore cached value and recompute.
|
|
713
|
+
# Returns:
|
|
714
|
+
# dict: The splicing data.
|
|
715
|
+
# """
|
|
716
|
+
# # Lookup in the database
|
|
717
|
+
# cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
|
|
718
|
+
# (engine, gene, mut_id, transcript_id))
|
|
719
|
+
# row = cursor.fetchone()
|
|
720
|
+
# # If found and no force recompute, return cached data
|
|
721
|
+
# if row:
|
|
722
|
+
# return json.loads(row[0])
|
|
723
|
+
# return None
|
|
724
|
+
#
|
|
725
|
+
# def save_splicing(engine, gene, mut_id, transcript_id, splicing):
|
|
726
|
+
# data_json = json.dumps(convert_numpy_to_native(splicing))
|
|
727
|
+
# cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
|
|
728
|
+
# (engine, gene, mut_id, transcript_id, data_json))
|
|
729
|
+
# return None
|
|
730
|
+
#
|
|
731
|
+
# def get_or_compute_splicing(mut_id, transcript_id=None, engine='spliceai', force_recompute=False):
|
|
732
|
+
# """
|
|
733
|
+
# Retrieve computed splicing data for a given mutation from a database,
|
|
734
|
+
# or compute and store it if not found or if force_recompute is True.
|
|
735
|
+
# Args:
|
|
736
|
+
# engine (str): Name of the tool used for computation.
|
|
737
|
+
# mut_id (str): A unique identifier for the mutation.
|
|
738
|
+
# transcript_id (str): ID for the transcript.
|
|
739
|
+
# force_recompute (bool): If True, ignore cached value and recompute.
|
|
740
|
+
# Returns:
|
|
741
|
+
# dict: The computed splicing data.
|
|
742
|
+
# """
|
|
743
|
+
# gene = mut_id.split(':')[0]
|
|
744
|
+
# if transcript_id is None:
|
|
745
|
+
# transcript_id = Gene.from_file(gene).transcript().transcript_id
|
|
746
|
+
#
|
|
747
|
+
# # Lookup in the database
|
|
748
|
+
# cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
|
|
749
|
+
# (engine, gene, mut_id, transcript_id))
|
|
750
|
+
# row = cursor.fetchone()
|
|
751
|
+
# # If found and no force recompute, return cached data
|
|
752
|
+
# if row and not force_recompute:
|
|
753
|
+
# return json.loads(row[0])
|
|
754
|
+
# # Otherwise, compute the data
|
|
755
|
+
# computed_data = convert_numpy_to_native(find_transcript_missplicing(mut_id, transcript=transcript_id, engine=engine).missplicing) # Replace with your actual function
|
|
756
|
+
# # Store computed data in DB
|
|
757
|
+
# data_json = json.dumps(computed_data)
|
|
758
|
+
# cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
|
|
759
|
+
# (engine, gene, mut_id, transcript_id, data_json))
|
|
760
|
+
# return computed_data
|
|
668
761
|
|
|
669
762
|
|
|
670
763
|
def convert_numpy_to_native(obj):
|
|
@@ -16,7 +16,7 @@ geney/pangolin_utils.py,sha256=i5j5vEMCWOTIa1mRP2377BAhlUFZjHBzTQBips4lA_4,2934
|
|
|
16
16
|
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
17
17
|
geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
|
|
18
18
|
geney/spliceai_utils.py,sha256=PFIhTK8Ihrj-cv5tgRN0UFPYEmC4uxtqXSP9bBLnZRM,3077
|
|
19
|
-
geney/splicing_utils.py,sha256=
|
|
19
|
+
geney/splicing_utils.py,sha256=VLSUJ1SFsnz9-Tt3Ywqp786WeVSJY7-VmwtdTgT2cXk,35828
|
|
20
20
|
geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
|
|
21
21
|
geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
|
|
22
22
|
geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
|
|
@@ -25,7 +25,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
25
25
|
geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
|
|
26
26
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
27
27
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
28
|
-
geney-1.3.
|
|
29
|
-
geney-1.3.
|
|
30
|
-
geney-1.3.
|
|
31
|
-
geney-1.3.
|
|
28
|
+
geney-1.3.16.dist-info/METADATA,sha256=UFbVToYRg7aFtUlIwXnsQPqP78-eEpQ5AwGKys4feqQ,971
|
|
29
|
+
geney-1.3.16.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
30
|
+
geney-1.3.16.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
31
|
+
geney-1.3.16.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|