PyPI - geney - Versions diffs - 1.3.14__py2.py3-none-any.whl → 1.3.16__py2.py3-none-any.whl - Mend

geney 1.3.14__py2.py3-none-any.whl → 1.3.16__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of geney might be problematic. Click here for more details.

Files changed (5) hide show

geney/splicing_utils.py CHANGED Viewed

@@ -4,6 +4,12 @@ from .Gene import Gene
 from .SeqMats import MutSeqMat
 from collections import defaultdict
 from . import config
+import sqlite3
+import json
+import os
+from typing import Dict, Any
+DB_PATH = os.path.join(config['hg38']['splicing_db'], 'mutation_data.db')
 def generate_adjacency_list(acceptors, donors, transcript_start, transcript_end, max_distance=50, rev=False):
     # Append the transcript end to donors to allow connection to the end point
@@ -205,9 +211,15 @@ def find_transcript_splicing(transcript, engine: str = 'spliceai') -> Tuple[Dict
     return donor_probs, acceptor_probs
-def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='spliceai', organism='hg38'):
+def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='spliceai', organism='hg38', db=None, force_recompute=False):
     gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
     reference_transcript = gene.transcript(transcript) if transcript is not None else gene.transcript()
+    if db is not None:
+        cached_data = db.get_mutation_data(engine, gene.gene_name, mut_id, reference_transcript.transcript_id)
+        if cached_data and not force_recompute:
+            return cached_data
     variant_transcript = reference_transcript.clone()
     mutations = [MutSeqMat.from_mutid(m) for m in mut_id.split('|')]
     mutations = [m for m in mutations if m in reference_transcript]
@@ -218,8 +230,12 @@ def find_transcript_missplicing(mut_id, transcript=None, threshold=0.5, engine='
     for mutation in mutations:
         variant_transcript.mutate(mutation, inplace=True)
-    return find_transcript_missplicing_seqs(reference_transcript.pre_mrna.get_context(center, 7500), variant_transcript.pre_mrna.get_context(center, 7500), reference_transcript.donors, reference_transcript.acceptors, threshold=threshold, engine=engine)
+    missplicing = find_transcript_missplicing_seqs(reference_transcript.pre_mrna.get_context(center, 7500, padding='N'), variant_transcript.pre_mrna.get_context(center, 7500, padding='N'), reference_transcript.donors, reference_transcript.acceptors, threshold=threshold, engine=engine)
+    if db is not None:
+        db.store_mutation_data(engine, gene.gene_name, mut_id, reference_transcript.transcript_id, missplicing.missplicing)
+    return missplicing
     # from functools import reduce
     # ref = transcript.pre_mrna
@@ -352,7 +368,7 @@ def find_transcript_missplicing_seqs(ref_seq, var_seq, donors, acceptors, thresh
     return Missplicing(temp, threshold=threshold)
-def process_pairwise_epistasis(mids, engine='pangolin', fprint=False):
+def process_pairwise_epistasis(mids, engine='pangolin', fprint=False, db=None):
     results = []
     for mid in mids:
         m1, m2 = mid.split('|')
@@ -360,9 +376,9 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False):
         # missplicing2 = find_transcript_missplicing(m2, threshold=0.25, engine=engine)
         # missplicing_both = find_transcript_missplicing(mid, threshold=0.25, engine=engine)
-        missplicing1 = Missplicing(get_or_compute_splicing(m1, engine=engine), threshold=0.25)
-        missplicing2 = Missplicing(get_or_compute_splicing(m2, engine=engine), threshold=0.25)
-        missplicing_both = Missplicing(get_or_compute_splicing(mid, engine=engine), threshold=0.25)
+        missplicing1 = Missplicing(find_transcript_missplicing(m1, engine=engine, db=db), threshold=0.25)
+        missplicing2 = Missplicing(find_transcript_missplicing(m2, engine=engine, db=db), threshold=0.25)
+        missplicing_both = Missplicing(find_transcript_missplicing(mid, engine=engine, db=db), threshold=0.25)
         if fprint:
             print(missplicing1)
@@ -586,85 +602,162 @@ def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
     return len(correct_donor_preds) / num_introns, len(correct_acceptor_preds) / num_introns, len(transcript.introns)
-import sqlite3
-import json
-import os
+class SplicingDatabase:
+    """
+    A class to handle interactions with the splicing SQLite database.
+    Each instance maintains its own connection, suitable for multi-threaded environments.
+    """
+    def __init__(self, db_path: str):
+        """
+        Initialize the SplicingDatabase with a connection to the SQLite database.
+        Args:
+            db_path (str): Path to the SQLite database file.
+        """
+        self.db_path = db_path
+        self.conn = sqlite3.connect(self.db_path, isolation_level=None, check_same_thread=False)  # Disable thread check
+        self.cursor = self.conn.cursor()
+        self._initialize_table()
+    def _initialize_table(self):
+        """
+        Create the mutations table if it doesn't exist.
+        """
+        self.cursor.execute('''
+            CREATE TABLE IF NOT EXISTS mutations (
+                engine TEXT,
+                gene TEXT,
+                mut_id TEXT,
+                transcript_id TEXT,
+                data TEXT,
+                PRIMARY KEY (engine, gene, mut_id, transcript_id)
+            )
+        ''')
+    def get_mutation_data(self, engine: str, gene: str, mut_id: str, transcript_id: str) -> Dict[str, Any]:
+        """
+        Retrieve mutation data from the database.
+        Args:
+            engine (str): Name of the tool used for computation.
+            gene (str): Gene name or identifier.
+            mut_id (str): A unique identifier for the mutation.
+            transcript_id (str): ID for the transcript.
+        Returns:
+            Dict[str, Any]: The mutation data if found, else None.
+        """
+        self.cursor.execute('''
+            SELECT data FROM mutations
+            WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?
+        ''', (engine, gene, mut_id, transcript_id))
+        row = self.cursor.fetchone()
+        if row:
+            return json.loads(row[0])
+        return None
+    def store_mutation_data(self, engine: str, gene: str, mut_id: str, transcript_id: str, data: Dict[str, Any]):
+        """
+        Store mutation data in the database.
+        Args:
+            tool (str): Name of the tool used for computation.
+            gene (str): Gene name or identifier.
+            mutation_id (str): A unique identifier for the mutation.
+            transcript_id (str): ID for the transcript.
+            data (Dict[str, Any]): The mutation data to store.
+        """
+        # Convert NumPy types to native Python types
+        data_native = convert_numpy_to_native(data)
+        data_json = json.dumps(data_native)
+        self.cursor.execute('''
+            REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data)
+            VALUES (?, ?, ?, ?, ?)
+        ''', (engine, gene, mut_id, transcript_id, data_json))
+    def close(self):
+        """
+        Close the database connection.
+        """
+        self.conn.close()
 # Global connection and cursor (adjust to your architecture)
 # Ideally, initialize this once in your application startup code.
-DB_PATH = os.path.join(config['hg38']['splicing_db'], 'mutation_data.db')
-conn = sqlite3.connect(DB_PATH, isolation_level=None)  # autocommit mode
-cursor = conn.cursor()
-# Create table once at startup, not in the function
-cursor.execute('''
-CREATE TABLE IF NOT EXISTS mutations (
-    engine TEXT,
-    gene TEXT,
-    mut_id TEXT,
-    transcript_id TEXT,
-    data TEXT,
-    PRIMARY KEY (engine, gene, mut_id, transcript_id)
-)''')
-def get_splicing(engine, gene, mut_id, transcript_id, force_recompute=False):
-    """
-    Retrieve computed splicing data for a given mutation from a database,
-    Args:
-        engine (str): Name of the tool used for computation.
-        gene (str): Gene name or identifier.
-        mut_id (str): A unique identifier for the mutation.
-        transcript_id (str): ID for the transcript.
-        force_recompute (bool): If True, ignore cached value and recompute.
-    Returns:
-        dict: The splicing data.
-    """
-    # Lookup in the database
-    cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
-                   (engine, gene, mut_id, transcript_id))
-    row = cursor.fetchone()
-    # If found and no force recompute, return cached data
-    if row:
-        return json.loads(row[0])
-    return None
-def save_splicing(engine, gene, mut_id, transcript_id, splicing):
-    data_json = json.dumps(convert_numpy_to_native(splicing))
-    cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
-                   (engine, gene, mut_id, transcript_id, data_json))
-    return None
-def get_or_compute_splicing(mut_id, transcript_id=None, engine='spliceai', force_recompute=False):
-    """
-    Retrieve computed splicing data for a given mutation from a database,
-    or compute and store it if not found or if force_recompute is True.
-    Args:
-        engine (str): Name of the tool used for computation.
-        mut_id (str): A unique identifier for the mutation.
-        transcript_id (str): ID for the transcript.
-        force_recompute (bool): If True, ignore cached value and recompute.
-    Returns:
-        dict: The computed splicing data.
-    """
-    gene = mut_id.split(':')[0]
-    if transcript_id is None:
-        transcript_id = Gene.from_file(gene).transcript().transcript_id
-    # Lookup in the database
-    cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
-                   (engine, gene, mut_id, transcript_id))
-    row = cursor.fetchone()
-    # If found and no force recompute, return cached data
-    if row and not force_recompute:
-        return json.loads(row[0])
-    # Otherwise, compute the data
-    computed_data = convert_numpy_to_native(find_transcript_missplicing(mut_id, transcript=transcript_id, engine=engine).missplicing) # Replace with your actual function
-    # Store computed data in DB
-    data_json = json.dumps(computed_data)
-    cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
-                   (engine, gene, mut_id, transcript_id, data_json))
-    return computed_data
+# conn = sqlite3.connect(DB_PATH, isolation_level=None)  # autocommit mode
+# cursor = conn.cursor()
+#
+# # Create table once at startup, not in the function
+# cursor.execute('''
+# CREATE TABLE IF NOT EXISTS mutations (
+#     engine TEXT,
+#     gene TEXT,
+#     mut_id TEXT,
+#     transcript_id TEXT,
+#     data TEXT,
+#     PRIMARY KEY (engine, gene, mut_id, transcript_id)
+# )''')
+# #
+#
+# def get_splicing(engine, gene, mut_id, transcript_id, force_recompute=False):
+#     """
+#     Retrieve computed splicing data for a given mutation from a database,
+#     Args:
+#         engine (str): Name of the tool used for computation.
+#         gene (str): Gene name or identifier.
+#         mut_id (str): A unique identifier for the mutation.
+#         transcript_id (str): ID for the transcript.
+#         force_recompute (bool): If True, ignore cached value and recompute.
+#     Returns:
+#         dict: The splicing data.
+#     """
+#     # Lookup in the database
+#     cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
+#                    (engine, gene, mut_id, transcript_id))
+#     row = cursor.fetchone()
+#     # If found and no force recompute, return cached data
+#     if row:
+#         return json.loads(row[0])
+#     return None
+#
+# def save_splicing(engine, gene, mut_id, transcript_id, splicing):
+#     data_json = json.dumps(convert_numpy_to_native(splicing))
+#     cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
+#                    (engine, gene, mut_id, transcript_id, data_json))
+#     return None
+#
+# def get_or_compute_splicing(mut_id, transcript_id=None, engine='spliceai', force_recompute=False):
+#     """
+#     Retrieve computed splicing data for a given mutation from a database,
+#     or compute and store it if not found or if force_recompute is True.
+#     Args:
+#         engine (str): Name of the tool used for computation.
+#         mut_id (str): A unique identifier for the mutation.
+#         transcript_id (str): ID for the transcript.
+#         force_recompute (bool): If True, ignore cached value and recompute.
+#     Returns:
+#         dict: The computed splicing data.
+#     """
+#     gene = mut_id.split(':')[0]
+#     if transcript_id is None:
+#         transcript_id = Gene.from_file(gene).transcript().transcript_id
+#
+#     # Lookup in the database
+#     cursor.execute('SELECT data FROM mutations WHERE engine=? AND gene=? AND mut_id=? AND transcript_id=?',
+#                    (engine, gene, mut_id, transcript_id))
+#     row = cursor.fetchone()
+#     # If found and no force recompute, return cached data
+#     if row and not force_recompute:
+#         return json.loads(row[0])
+#     # Otherwise, compute the data
+#     computed_data = convert_numpy_to_native(find_transcript_missplicing(mut_id, transcript=transcript_id, engine=engine).missplicing) # Replace with your actual function
+#     # Store computed data in DB
+#     data_json = json.dumps(computed_data)
+#     cursor.execute('REPLACE INTO mutations (engine, gene, mut_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
+#                    (engine, gene, mut_id, transcript_id, data_json))
+#     return computed_data
 def convert_numpy_to_native(obj):

{geney-1.3.14.dist-info → geney-1.3.16.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.3.14
+Version: 1.3.16
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn

{geney-1.3.14.dist-info → geney-1.3.16.dist-info}/RECORD RENAMED Viewed

@@ -16,7 +16,7 @@ geney/pangolin_utils.py,sha256=i5j5vEMCWOTIa1mRP2377BAhlUFZjHBzTQBips4lA_4,2934
 geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
 geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
 geney/spliceai_utils.py,sha256=PFIhTK8Ihrj-cv5tgRN0UFPYEmC4uxtqXSP9bBLnZRM,3077
-geney/splicing_utils.py,sha256=OLes_xEoD5RZvcxyc8kwlUlWpTbJgl9AgA2PFRyOtdE,32286
+geney/splicing_utils.py,sha256=VLSUJ1SFsnz9-Tt3Ywqp786WeVSJY7-VmwtdTgT2cXk,35828
 geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
 geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
 geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
@@ -25,7 +25,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
 geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
-geney-1.3.14.dist-info/METADATA,sha256=fdwqYA-nm1pUEIeZIECpRZ0oEjPyWKIGuNSIuwy3_v4,971
-geney-1.3.14.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
-geney-1.3.14.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
-geney-1.3.14.dist-info/RECORD,,
+geney-1.3.16.dist-info/METADATA,sha256=UFbVToYRg7aFtUlIwXnsQPqP78-eEpQ5AwGKys4feqQ,971
+geney-1.3.16.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
+geney-1.3.16.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.3.16.dist-info/RECORD,,

{geney-1.3.14.dist-info → geney-1.3.16.dist-info}/WHEEL RENAMED Viewed

File without changes

{geney-1.3.14.dist-info → geney-1.3.16.dist-info}/top_level.txt RENAMED Viewed

File without changes

geney 1.3.14__py2.py3-none-any.whl → 1.3.16__py2.py3-none-any.whl

Potentially problematic release.

geney 1.3.14__py2.py3-none-any.whl → 1.3.16__py2.py3-none-any.whl