geney 1.3.13__py2.py3-none-any.whl → 1.3.15__py2.py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic; more details are available via the link on the original page.

geney/splicing_utils.py CHANGED
@@ -4,6 +4,12 @@ from .Gene import Gene
4
4
  from .SeqMats import MutSeqMat
5
5
  from collections import defaultdict
6
6
  from . import config
7
+ import sqlite3
8
+ import json
9
+ import os
10
+ from typing import Dict, Any
11
+
12
+ DB_PATH = os.path.join(config['hg38']['splicing_db'], 'mutation_data.db')
7
13
 
8
14
  def generate_adjacency_list(acceptors, donors, transcript_start, transcript_end, max_distance=50, rev=False):
9
15
  # Append the transcript end to donors to allow connection to the end point
@@ -205,9 +211,15 @@ def find_transcript_splicing(transcript, engine: str = 'spliceai') -> Tuple[Dict
205
211
  return donor_probs, acceptor_probs
206
212
 
207
213
 
208
- def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='spliceai', organism='hg38'):
214
+ def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='spliceai', organism='hg38', db=None, force_recompute=False):
209
215
  gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
210
216
  reference_transcript = gene.transcript(transcript) if transcript is not None else gene.transcript()
217
+
218
+ if db is not None:
219
+ cached_data = db.get_mutation_data(engine, gene.gene_name, mut_id, reference_transcript.transcript_id)
220
+ if cached_data and not force_recompute:
221
+ return cached_data
222
+
211
223
  variant_transcript = reference_transcript.clone()
212
224
  mutations = [MutSeqMat.from_mutid(m) for m in mut_id.split('|')]
213
225
  mutations = [m for m in mutations if m in reference_transcript]
@@ -218,8 +230,12 @@ def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='
218
230
  for mutation in mutations:
219
231
  variant_transcript.mutate(mutation, inplace=True)
220
232
 
221
- return find_transcript_missplicing_seqs(reference_transcript.pre_mrna.get_context(center, 7500), variant_transcript.pre_mrna.get_context(center, 7500), reference_transcript.donors, reference_transcript.acceptors, threshold=threshold, engine=engine)
233
+ missplicing = find_transcript_missplicing_seqs(reference_transcript.pre_mrna.get_context(center, 7500), variant_transcript.pre_mrna.get_context(center, 7500), reference_transcript.donors, reference_transcript.acceptors, threshold=threshold, engine=engine)
222
234
 
235
+ if db is not None:
236
+ db.store_mutation_data(engine, gene.gene_name, mut_id, reference_transcript.transcript_id, missplicing.missplicing)
237
+
238
+ return missplicing
223
239
 
224
240
  # from functools import reduce
225
241
  # ref = transcript.pre_mrna
@@ -388,6 +404,8 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False):
388
404
  m2.get(k, {}).get('reference') or
389
405
  mb.get(k, {}).get('reference')
390
406
  )
407
+ if ref_val is None:
408
+ ref_val = 0
391
409
 
392
410
 
393
411
  # Compute deltas
@@ -415,12 +433,12 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False):
415
433
  flag['cummulative'] += 1
416
434
  cummulative_deltas.append(max((deltab - delta1, deltab - delta2), key=abs))
417
435
 
418
- if ((0.4 <= ref_val) <= 0.75) and (
436
+ if (0.4 <= ref_val <= 0.75) and (
419
437
  ((delta1 > 0.25 or delta2 > 0.25) and deltab < 0.25) or
420
438
  (delta1 < 0.25 and delta2 < 0.25 and deltab > 0.25)
421
439
  ) and (
422
- abs(delta1 - deltab) > 0.25 or
423
- abs(delta2 - deltab) > 0.25
440
+ abs(delta1 - deltab) > 0.3 or
441
+ abs(delta2 - deltab) > 0.3
424
442
  ):
425
443
  increased_canonical_splicing = True
426
444
 
@@ -584,85 +602,162 @@ def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
584
602
  return len(correct_donor_preds) / num_introns, len(correct_acceptor_preds) / num_introns, len(transcript.introns)
585
603
 
586
604
 
587
- import sqlite3
588
- import json
589
- import os
605
+ class SplicingDatabase:
606
+ """
607
+ A class to handle interactions with the splicing SQLite database.
608
+ Each instance maintains its own connection, suitable for multi-threaded environments.
609
+ """
610
+
611
+ def __init__(self, db_path: str):
612
+ """
613
+ Initialize the SplicingDatabase with a connection to the SQLite database.
614
+
615
+ Args:
616
+ db_path (str): Path to the SQLite database file.
617
+ """
618
+ self.db_path = db_path
619
+ self.conn = sqlite3.connect(self.db_path, isolation_level=None, check_same_thread=False) # Disable thread check
620
+ self.cursor = self.conn.cursor()
621
+ self._initialize_table()
622
+
623
+ def _initialize_table(self):
624
+ """
625
+ Create the mutations table if it doesn't exist.
626
+ """
627
+ self.cursor.execute('''
628
+ CREATE TABLE IF NOT EXISTS mutations (
629
+ engine TEXT,
630
+ gene TEXT,
631
+ mut_id TEXT,
632
+ transcript_id TEXT,
633
+ data TEXT,
634
+ PRIMARY KEY (engine, gene, mut_id, transcript_id)
635
+ )
636
+ ''')
637
+
638
+ def get_mutation_data(self, engine: str, gene: str, mut_id: str, transcript_id: str) -> Dict[str, Any]:
639
+ """
640
+ Retrieve mutation data from the database.
641
+
642
+ Args:
643
+ engine (str): Name of the tool used for computation.
644
+ gene (str): Gene name or identifier.
645
+ mut_id (str): A unique identifier for the mutation.
646
+ transcript_id (str): ID for the transcript.
647
+
648
+ Returns:
649
+ Dict[str, Any]: The mutation data if found, else None.
650
+ """
651
+ self.cursor.execute('''
652
+ SELECT data FROM mutations
653
+ WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?
654
+ ''', (engine, gene, mut_id, transcript_id))
655
+ row = self.cursor.fetchone()
656
+ if row:
657
+ return json.loads(row[0])
658
+ return None
659
+
660
+ def store_mutation_data(self, engine: str, gene: str, mut_id: str, transcript_id: str, data: Dict[str, Any]):
661
+ """
662
+ Store mutation data in the database.
663
+
664
+ Args:
665
+ tool (str): Name of the tool used for computation.
666
+ gene (str): Gene name or identifier.
667
+ mutation_id (str): A unique identifier for the mutation.
668
+ transcript_id (str): ID for the transcript.
669
+ data (Dict[str, Any]): The mutation data to store.
670
+ """
671
+ # Convert NumPy types to native Python types
672
+ data_native = convert_numpy_to_native(data)
673
+ data_json = json.dumps(data_native)
674
+ self.cursor.execute('''
675
+ REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data)
676
+ VALUES (?, ?, ?, ?, ?)
677
+ ''', (engine, gene, mut_id, transcript_id, data_json))
678
+
679
+ def close(self):
680
+ """
681
+ Close the database connection.
682
+ """
683
+ self.conn.close()
684
+
685
+
590
686
 
591
687
  # Global connection and cursor (adjust to your architecture)
592
688
  # Ideally, initialize this once in your application startup code.
593
- DB_PATH = os.path.join(config['hg38']['splicing_db'], 'mutation_data.db')
594
- conn = sqlite3.connect(DB_PATH, isolation_level=None) # autocommit mode
595
- cursor = conn.cursor()
596
-
597
- # Create table once at startup, not in the function
598
- cursor.execute('''
599
- CREATE TABLE IF NOT EXISTS mutations (
600
- engine TEXT,
601
- gene TEXT,
602
- mut_id TEXT,
603
- transcript_id TEXT,
604
- data TEXT,
605
- PRIMARY KEY (engine, gene, mut_id, transcript_id)
606
- )''')
607
-
608
-
609
- def get_splicing(engine, gene, mut_id, transcript_id, force_recompute=False):
610
- """
611
- Retrieve computed splicing data for a given mutation from a database,
612
- Args:
613
- engine (str): Name of the tool used for computation.
614
- gene (str): Gene name or identifier.
615
- mut_id (str): A unique identifier for the mutation.
616
- transcript_id (str): ID for the transcript.
617
- force_recompute (bool): If True, ignore cached value and recompute.
618
- Returns:
619
- dict: The splicing data.
620
- """
621
- # Lookup in the database
622
- cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
623
- (engine, gene, mut_id, transcript_id))
624
- row = cursor.fetchone()
625
- # If found and no force recompute, return cached data
626
- if row:
627
- return json.loads(row[0])
628
- return None
629
-
630
- def save_splicing(engine, gene, mut_id, transcript_id, splicing):
631
- data_json = json.dumps(convert_numpy_to_native(splicing))
632
- cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
633
- (engine, gene, mut_id, transcript_id, data_json))
634
- return None
635
-
636
- def get_or_compute_splicing(mut_id, transcript_id=None, engine='spliceai', force_recompute=False):
637
- """
638
- Retrieve computed splicing data for a given mutation from a database,
639
- or compute and store it if not found or if force_recompute is True.
640
- Args:
641
- engine (str): Name of the tool used for computation.
642
- mut_id (str): A unique identifier for the mutation.
643
- transcript_id (str): ID for the transcript.
644
- force_recompute (bool): If True, ignore cached value and recompute.
645
- Returns:
646
- dict: The computed splicing data.
647
- """
648
- gene = mut_id.split(':')[0]
649
- if transcript_id is None:
650
- transcript_id = Gene.from_file(gene).transcript().transcript_id
651
-
652
- # Lookup in the database
653
- cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
654
- (engine, gene, mut_id, transcript_id))
655
- row = cursor.fetchone()
656
- # If found and no force recompute, return cached data
657
- if row and not force_recompute:
658
- return json.loads(row[0])
659
- # Otherwise, compute the data
660
- computed_data = convert_numpy_to_native(find_transcript_missplicing(mut_id, transcript=transcript_id, engine=engine).missplicing) # Replace with your actual function
661
- # Store computed data in DB
662
- data_json = json.dumps(computed_data)
663
- cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
664
- (engine, gene, mut_id, transcript_id, data_json))
665
- return computed_data
689
+ # conn = sqlite3.connect(DB_PATH, isolation_level=None) # autocommit mode
690
+ # cursor = conn.cursor()
691
+ #
692
+ # # Create table once at startup, not in the function
693
+ # cursor.execute('''
694
+ # CREATE TABLE IF NOT EXISTS mutations (
695
+ # engine TEXT,
696
+ # gene TEXT,
697
+ # mut_id TEXT,
698
+ # transcript_id TEXT,
699
+ # data TEXT,
700
+ # PRIMARY KEY (engine, gene, mut_id, transcript_id)
701
+ # )''')
702
+ # #
703
+ #
704
+ # def get_splicing(engine, gene, mut_id, transcript_id, force_recompute=False):
705
+ # """
706
+ # Retrieve computed splicing data for a given mutation from a database,
707
+ # Args:
708
+ # engine (str): Name of the tool used for computation.
709
+ # gene (str): Gene name or identifier.
710
+ # mut_id (str): A unique identifier for the mutation.
711
+ # transcript_id (str): ID for the transcript.
712
+ # force_recompute (bool): If True, ignore cached value and recompute.
713
+ # Returns:
714
+ # dict: The splicing data.
715
+ # """
716
+ # # Lookup in the database
717
+ # cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
718
+ # (engine, gene, mut_id, transcript_id))
719
+ # row = cursor.fetchone()
720
+ # # If found and no force recompute, return cached data
721
+ # if row:
722
+ # return json.loads(row[0])
723
+ # return None
724
+ #
725
+ # def save_splicing(engine, gene, mut_id, transcript_id, splicing):
726
+ # data_json = json.dumps(convert_numpy_to_native(splicing))
727
+ # cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
728
+ # (engine, gene, mut_id, transcript_id, data_json))
729
+ # return None
730
+ #
731
+ # def get_or_compute_splicing(mut_id, transcript_id=None, engine='spliceai', force_recompute=False):
732
+ # """
733
+ # Retrieve computed splicing data for a given mutation from a database,
734
+ # or compute and store it if not found or if force_recompute is True.
735
+ # Args:
736
+ # engine (str): Name of the tool used for computation.
737
+ # mut_id (str): A unique identifier for the mutation.
738
+ # transcript_id (str): ID for the transcript.
739
+ # force_recompute (bool): If True, ignore cached value and recompute.
740
+ # Returns:
741
+ # dict: The computed splicing data.
742
+ # """
743
+ # gene = mut_id.split(':')[0]
744
+ # if transcript_id is None:
745
+ # transcript_id = Gene.from_file(gene).transcript().transcript_id
746
+ #
747
+ # # Lookup in the database
748
+ # cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
749
+ # (engine, gene, mut_id, transcript_id))
750
+ # row = cursor.fetchone()
751
+ # # If found and no force recompute, return cached data
752
+ # if row and not force_recompute:
753
+ # return json.loads(row[0])
754
+ # # Otherwise, compute the data
755
+ # computed_data = convert_numpy_to_native(find_transcript_missplicing(mut_id, transcript=transcript_id, engine=engine).missplicing) # Replace with your actual function
756
+ # # Store computed data in DB
757
+ # data_json = json.dumps(computed_data)
758
+ # cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
759
+ # (engine, gene, mut_id, transcript_id, data_json))
760
+ # return computed_data
666
761
 
667
762
 
668
763
  def convert_numpy_to_native(obj):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.3.13
3
+ Version: 1.3.15
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -16,7 +16,7 @@ geney/pangolin_utils.py,sha256=i5j5vEMCWOTIa1mRP2377BAhlUFZjHBzTQBips4lA_4,2934
16
16
  geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
17
17
  geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
18
18
  geney/spliceai_utils.py,sha256=PFIhTK8Ihrj-cv5tgRN0UFPYEmC4uxtqXSP9bBLnZRM,3077
19
- geney/splicing_utils.py,sha256=mBZGTF57BZ3caTnkQ_8zMhv7z0iRPrGKBSmjeDnTiUc,32222
19
+ geney/splicing_utils.py,sha256=shRi6eaIrXg6ZfnTEIVMb7Cl51a1E3swpXH-qz0whKo,35760
20
20
  geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
21
21
  geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
22
22
  geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
@@ -25,7 +25,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
25
25
  geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
26
26
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
27
27
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
28
- geney-1.3.13.dist-info/METADATA,sha256=yi0VHNzHaCHWqKRpAMHtF4tgWICSO0uYi1TJMDq5BIw,971
29
- geney-1.3.13.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
30
- geney-1.3.13.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
31
- geney-1.3.13.dist-info/RECORD,,
28
+ geney-1.3.15.dist-info/METADATA,sha256=QUp06fi0VTP_o0jI2M9v65lQq8rRzYIN9XRtefazSOA,971
29
+ geney-1.3.15.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
30
+ geney-1.3.15.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
31
+ geney-1.3.15.dist-info/RECORD,,
File without changes