geney 1.3.13__py2.py3-none-any.whl → 1.3.15__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
geney/splicing_utils.py
CHANGED
|
@@ -4,6 +4,12 @@ from .Gene import Gene
|
|
|
4
4
|
from .SeqMats import MutSeqMat
|
|
5
5
|
from collections import defaultdict
|
|
6
6
|
from . import config
|
|
7
|
+
import sqlite3
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
from typing import Dict, Any
|
|
11
|
+
|
|
12
|
+
DB_PATH = os.path.join(config['hg38']['splicing_db'], 'mutation_data.db')
|
|
7
13
|
|
|
8
14
|
def generate_adjacency_list(acceptors, donors, transcript_start, transcript_end, max_distance=50, rev=False):
|
|
9
15
|
# Append the transcript end to donors to allow connection to the end point
|
|
@@ -205,9 +211,15 @@ def find_transcript_splicing(transcript, engine: str = 'spliceai') -> Tuple[Dict
|
|
|
205
211
|
return donor_probs, acceptor_probs
|
|
206
212
|
|
|
207
213
|
|
|
208
|
-
def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='spliceai', organism='hg38'):
|
|
214
|
+
def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='spliceai', organism='hg38', db=None, force_recompute=False):
|
|
209
215
|
gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
|
|
210
216
|
reference_transcript = gene.transcript(transcript) if transcript is not None else gene.transcript()
|
|
217
|
+
|
|
218
|
+
if db is not None:
|
|
219
|
+
cached_data = db.get_mutation_data(engine, gene.gene_name, mut_id, reference_transcript.transcript_id)
|
|
220
|
+
if cached_data and not force_recompute:
|
|
221
|
+
return cached_data
|
|
222
|
+
|
|
211
223
|
variant_transcript = reference_transcript.clone()
|
|
212
224
|
mutations = [MutSeqMat.from_mutid(m) for m in mut_id.split('|')]
|
|
213
225
|
mutations = [m for m in mutations if m in reference_transcript]
|
|
@@ -218,8 +230,12 @@ def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='
|
|
|
218
230
|
for mutation in mutations:
|
|
219
231
|
variant_transcript.mutate(mutation, inplace=True)
|
|
220
232
|
|
|
221
|
-
|
|
233
|
+
missplicing = find_transcript_missplicing_seqs(reference_transcript.pre_mrna.get_context(center, 7500), variant_transcript.pre_mrna.get_context(center, 7500), reference_transcript.donors, reference_transcript.acceptors, threshold=threshold, engine=engine)
|
|
222
234
|
|
|
235
|
+
if db is not None:
|
|
236
|
+
db.store_mutation_data(engine, gene.gene_name, mut_id, reference_transcript.transcript_id, missplicing.missplicing)
|
|
237
|
+
|
|
238
|
+
return missplicing
|
|
223
239
|
|
|
224
240
|
# from functools import reduce
|
|
225
241
|
# ref = transcript.pre_mrna
|
|
@@ -388,6 +404,8 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False):
|
|
|
388
404
|
m2.get(k, {}).get('reference') or
|
|
389
405
|
mb.get(k, {}).get('reference')
|
|
390
406
|
)
|
|
407
|
+
if ref_val is None:
|
|
408
|
+
ref_val = 0
|
|
391
409
|
|
|
392
410
|
|
|
393
411
|
# Compute deltas
|
|
@@ -415,12 +433,12 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False):
|
|
|
415
433
|
flag['cummulative'] += 1
|
|
416
434
|
cummulative_deltas.append(max((deltab - delta1, deltab - delta2), key=abs))
|
|
417
435
|
|
|
418
|
-
if (
|
|
436
|
+
if (0.4 <= ref_val <= 0.75) and (
|
|
419
437
|
((delta1 > 0.25 or delta2 > 0.25) and deltab < 0.25) or
|
|
420
438
|
(delta1 < 0.25 and delta2 < 0.25 and deltab > 0.25)
|
|
421
439
|
) and (
|
|
422
|
-
abs(delta1 - deltab) > 0.
|
|
423
|
-
abs(delta2 - deltab) > 0.
|
|
440
|
+
abs(delta1 - deltab) > 0.3 or
|
|
441
|
+
abs(delta2 - deltab) > 0.3
|
|
424
442
|
):
|
|
425
443
|
increased_canonical_splicing = True
|
|
426
444
|
|
|
@@ -584,85 +602,162 @@ def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
|
|
|
584
602
|
return len(correct_donor_preds) / num_introns, len(correct_acceptor_preds) / num_introns, len(transcript.introns)
|
|
585
603
|
|
|
586
604
|
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
605
|
+
class SplicingDatabase:
|
|
606
|
+
"""
|
|
607
|
+
A class to handle interactions with the splicing SQLite database.
|
|
608
|
+
Each instance maintains its own connection, opened with check_same_thread=False; callers sharing one instance across threads must serialize access themselves.
|
|
609
|
+
"""
|
|
610
|
+
|
|
611
|
+
def __init__(self, db_path: str):
|
|
612
|
+
"""
|
|
613
|
+
Initialize the SplicingDatabase with a connection to the SQLite database.
|
|
614
|
+
|
|
615
|
+
Args:
|
|
616
|
+
db_path (str): Path to the SQLite database file.
|
|
617
|
+
"""
|
|
618
|
+
self.db_path = db_path
|
|
619
|
+
self.conn = sqlite3.connect(self.db_path, isolation_level=None, check_same_thread=False) # Disable thread check
|
|
620
|
+
self.cursor = self.conn.cursor()
|
|
621
|
+
self._initialize_table()
|
|
622
|
+
|
|
623
|
+
def _initialize_table(self):
|
|
624
|
+
"""
|
|
625
|
+
Create the mutations table if it doesn't exist.
|
|
626
|
+
"""
|
|
627
|
+
self.cursor.execute('''
|
|
628
|
+
CREATE TABLE IF NOT EXISTS mutations (
|
|
629
|
+
engine TEXT,
|
|
630
|
+
gene TEXT,
|
|
631
|
+
mut_id TEXT,
|
|
632
|
+
transcript_id TEXT,
|
|
633
|
+
data TEXT,
|
|
634
|
+
PRIMARY KEY (engine, gene, mut_id, transcript_id)
|
|
635
|
+
)
|
|
636
|
+
''')
|
|
637
|
+
|
|
638
|
+
def get_mutation_data(self, engine: str, gene: str, mut_id: str, transcript_id: str) -> Dict[str, Any]:
|
|
639
|
+
"""
|
|
640
|
+
Retrieve mutation data from the database.
|
|
641
|
+
|
|
642
|
+
Args:
|
|
643
|
+
engine (str): Name of the tool used for computation.
|
|
644
|
+
gene (str): Gene name or identifier.
|
|
645
|
+
mut_id (str): A unique identifier for the mutation.
|
|
646
|
+
transcript_id (str): ID for the transcript.
|
|
647
|
+
|
|
648
|
+
Returns:
|
|
649
|
+
Dict[str, Any]: The mutation data if found, else None.
|
|
650
|
+
"""
|
|
651
|
+
self.cursor.execute('''
|
|
652
|
+
SELECT data FROM mutations
|
|
653
|
+
WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?
|
|
654
|
+
''', (engine, gene, mut_id, transcript_id))
|
|
655
|
+
row = self.cursor.fetchone()
|
|
656
|
+
if row:
|
|
657
|
+
return json.loads(row[0])
|
|
658
|
+
return None
|
|
659
|
+
|
|
660
|
+
def store_mutation_data(self, engine: str, gene: str, mut_id: str, transcript_id: str, data: Dict[str, Any]):
|
|
661
|
+
"""
|
|
662
|
+
Store mutation data in the database.
|
|
663
|
+
|
|
664
|
+
Args:
|
|
665
|
+
engine (str): Name of the tool used for computation.
|
|
666
|
+
gene (str): Gene name or identifier.
|
|
667
|
+
mut_id (str): A unique identifier for the mutation.
|
|
668
|
+
transcript_id (str): ID for the transcript.
|
|
669
|
+
data (Dict[str, Any]): The mutation data to store.
|
|
670
|
+
"""
|
|
671
|
+
# Convert NumPy types to native Python types
|
|
672
|
+
data_native = convert_numpy_to_native(data)
|
|
673
|
+
data_json = json.dumps(data_native)
|
|
674
|
+
self.cursor.execute('''
|
|
675
|
+
REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data)
|
|
676
|
+
VALUES (?, ?, ?, ?, ?)
|
|
677
|
+
''', (engine, gene, mut_id, transcript_id, data_json))
|
|
678
|
+
|
|
679
|
+
def close(self):
|
|
680
|
+
"""
|
|
681
|
+
Close the database connection.
|
|
682
|
+
"""
|
|
683
|
+
self.conn.close()
|
|
684
|
+
|
|
685
|
+
|
|
590
686
|
|
|
591
687
|
# Global connection and cursor (adjust to your architecture)
|
|
592
688
|
# Ideally, initialize this once in your application startup code.
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
#
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
return computed_data
|
|
689
|
+
# conn = sqlite3.connect(DB_PATH, isolation_level=None) # autocommit mode
|
|
690
|
+
# cursor = conn.cursor()
|
|
691
|
+
#
|
|
692
|
+
# # Create table once at startup, not in the function
|
|
693
|
+
# cursor.execute('''
|
|
694
|
+
# CREATE TABLE IF NOT EXISTS mutations (
|
|
695
|
+
# engine TEXT,
|
|
696
|
+
# gene TEXT,
|
|
697
|
+
# mut_id TEXT,
|
|
698
|
+
# transcript_id TEXT,
|
|
699
|
+
# data TEXT,
|
|
700
|
+
# PRIMARY KEY (engine, gene, mut_id, transcript_id)
|
|
701
|
+
# )''')
|
|
702
|
+
# #
|
|
703
|
+
#
|
|
704
|
+
# def get_splicing(engine, gene, mut_id, transcript_id, force_recompute=False):
|
|
705
|
+
# """
|
|
706
|
+
# Retrieve computed splicing data for a given mutation from a database.
|
|
707
|
+
# Args:
|
|
708
|
+
# engine (str): Name of the tool used for computation.
|
|
709
|
+
# gene (str): Gene name or identifier.
|
|
710
|
+
# mut_id (str): A unique identifier for the mutation.
|
|
711
|
+
# transcript_id (str): ID for the transcript.
|
|
712
|
+
# force_recompute (bool): If True, ignore cached value and recompute.
|
|
713
|
+
# Returns:
|
|
714
|
+
# dict: The splicing data.
|
|
715
|
+
# """
|
|
716
|
+
# # Lookup in the database
|
|
717
|
+
# cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
|
|
718
|
+
# (engine, gene, mut_id, transcript_id))
|
|
719
|
+
# row = cursor.fetchone()
|
|
720
|
+
# # If found and no force recompute, return cached data
|
|
721
|
+
# if row:
|
|
722
|
+
# return json.loads(row[0])
|
|
723
|
+
# return None
|
|
724
|
+
#
|
|
725
|
+
# def save_splicing(engine, gene, mut_id, transcript_id, splicing):
|
|
726
|
+
# data_json = json.dumps(convert_numpy_to_native(splicing))
|
|
727
|
+
# cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
|
|
728
|
+
# (engine, gene, mut_id, transcript_id, data_json))
|
|
729
|
+
# return None
|
|
730
|
+
#
|
|
731
|
+
# def get_or_compute_splicing(mut_id, transcript_id=None, engine='spliceai', force_recompute=False):
|
|
732
|
+
# """
|
|
733
|
+
# Retrieve computed splicing data for a given mutation from a database,
|
|
734
|
+
# or compute and store it if not found or if force_recompute is True.
|
|
735
|
+
# Args:
|
|
736
|
+
# engine (str): Name of the tool used for computation.
|
|
737
|
+
# mut_id (str): A unique identifier for the mutation.
|
|
738
|
+
# transcript_id (str): ID for the transcript.
|
|
739
|
+
# force_recompute (bool): If True, ignore cached value and recompute.
|
|
740
|
+
# Returns:
|
|
741
|
+
# dict: The computed splicing data.
|
|
742
|
+
# """
|
|
743
|
+
# gene = mut_id.split(':')[0]
|
|
744
|
+
# if transcript_id is None:
|
|
745
|
+
# transcript_id = Gene.from_file(gene).transcript().transcript_id
|
|
746
|
+
#
|
|
747
|
+
# # Lookup in the database
|
|
748
|
+
# cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
|
|
749
|
+
# (engine, gene, mut_id, transcript_id))
|
|
750
|
+
# row = cursor.fetchone()
|
|
751
|
+
# # If found and no force recompute, return cached data
|
|
752
|
+
# if row and not force_recompute:
|
|
753
|
+
# return json.loads(row[0])
|
|
754
|
+
# # Otherwise, compute the data
|
|
755
|
+
# computed_data = convert_numpy_to_native(find_transcript_missplicing(mut_id, transcript=transcript_id, engine=engine).missplicing) # Replace with your actual function
|
|
756
|
+
# # Store computed data in DB
|
|
757
|
+
# data_json = json.dumps(computed_data)
|
|
758
|
+
# cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
|
|
759
|
+
# (engine, gene, mut_id, transcript_id, data_json))
|
|
760
|
+
# return computed_data
|
|
666
761
|
|
|
667
762
|
|
|
668
763
|
def convert_numpy_to_native(obj):
|
|
@@ -16,7 +16,7 @@ geney/pangolin_utils.py,sha256=i5j5vEMCWOTIa1mRP2377BAhlUFZjHBzTQBips4lA_4,2934
|
|
|
16
16
|
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
17
17
|
geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
|
|
18
18
|
geney/spliceai_utils.py,sha256=PFIhTK8Ihrj-cv5tgRN0UFPYEmC4uxtqXSP9bBLnZRM,3077
|
|
19
|
-
geney/splicing_utils.py,sha256=
|
|
19
|
+
geney/splicing_utils.py,sha256=shRi6eaIrXg6ZfnTEIVMb7Cl51a1E3swpXH-qz0whKo,35760
|
|
20
20
|
geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
|
|
21
21
|
geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
|
|
22
22
|
geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
|
|
@@ -25,7 +25,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
25
25
|
geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
|
|
26
26
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
27
27
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
28
|
-
geney-1.3.
|
|
29
|
-
geney-1.3.
|
|
30
|
-
geney-1.3.
|
|
31
|
-
geney-1.3.
|
|
28
|
+
geney-1.3.15.dist-info/METADATA,sha256=QUp06fi0VTP_o0jI2M9v65lQq8rRzYIN9XRtefazSOA,971
|
|
29
|
+
geney-1.3.15.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
30
|
+
geney-1.3.15.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
31
|
+
geney-1.3.15.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|