geney 1.3.2__py2.py3-none-any.whl → 1.3.4__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/oncosplice.py +109 -105
- geney/splicing_utils.py +317 -184
- {geney-1.3.2.dist-info → geney-1.3.4.dist-info}/METADATA +1 -1
- {geney-1.3.2.dist-info → geney-1.3.4.dist-info}/RECORD +6 -6
- {geney-1.3.2.dist-info → geney-1.3.4.dist-info}/WHEEL +0 -0
- {geney-1.3.2.dist-info → geney-1.3.4.dist-info}/top_level.txt +0 -0
geney/oncosplice.py
CHANGED
@@ -6,6 +6,10 @@ import pandas as pd
 import numpy as np
 from .SeqMats import SeqMat, MutSeqMat
 from .splicing_utils import find_transcript_missplicing_seqs, develop_aberrant_splicing
+from .Gene import Gene
+import copy
+from . import config
+
 from .tis_utils import find_tis

 def short_hash_of_list(numbers, length=5):
@@ -301,7 +305,7 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut, ref_attri


 def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False,
-               window_length=13, organism='hg38', engine='spliceai'
+               window_length=13, organism='hg38', engine='spliceai'):
     gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
     reference_gene_proteins = {
         transcript.generate_pre_mrna().generate_mature_mrna().generate_protein().protein: transcript.transcript_id for
@@ -374,110 +378,110 @@ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_require
         'mutation_distance_from_3', 'engine', 'reference_resemblance', 'oncosplice_score', 'percentile',
         'isoform_prevalence', 'reference_protein', 'variant_protein']]

-
-import asyncio
-async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False,
- ... (old lines 380-480 not rendered in this diff view)
+#
+# import asyncio
+# async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False,
+#                                window_length=13, organism='hg38', engine='spliceai', use_cons=True, require_cons=False):
+#     import sys, os
+#     needed_file1 = config[organism]['yoram_path'] / 'rest_api_utils.py'
+#     needed_file2 = config[organism]['yoram_path'] / 'uniprot_utils.py'
+#
+#     if sys.platform == 'linux' and (needed_file1.is_file() and os.access(needed_file1, os.R_OK)) and (
+#             needed_file2.is_file() and os.access(needed_file2, os.R_OK)):
+#         sys.path.append(str(config[organism]['yoram_path']))
+#         import uniprot_utils as uput
+#
+#     else:
+#         raise SystemError(
+#             "Oncosplice Prototype can only be run on Power with access to the /tamir2/yoramzar/Projects/Cancer_mut/Utils folder.")
+#
+#     from .tis_utils import find_tis
+#
+#     # Define async functions
+#     async def background_request(ensb_id, Uniprot_features=["Topological domain", "Transmembrane", "Domain"]):
+#         return uput.retrieve_protein_data_features_subset(uput.ensembl_id2uniprot_id(ensb_id), Uniprot_features)
+#
+#     def inspect_domain(row, modified_vector, conservation_vector):
+#         v1, v2 = modified_vector[row.start:row.end], conservation_vector[row.start:row.end]
+#         if sum(v2) == 0:
+#             return pd.Series([f'{row.type}|{row.start}|{row.end}|{row.description}', 0],
+#                              index=['domain_identifier', 'score'])
+#
+#         return pd.Series([f'{row.type}|{row.start}|{row.end}|{row.description}', sum(v1 * v2) / sum(v2)],
+#                          index=['domain_identifier', 'score'])
+#
+#     gene = Gene(mut_id.split(':')[0], organism=organism)
+#     reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
+#     mutations = [get_mutation(mut_id, rev=gene.rev) for mut_id in mut_id.split('|')]
+#     results = []
+#     for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
+#         if require_cons and not transcript.cons_available:
+#             continue
+#
+#         if all(mutation not in transcript for mutation in mutations):
+#             # results.append({'transcript_id': transcript.transcript_id})
+#             continue
+#
+#         task1 = asyncio.create_task(background_request(tid))
+#         transcript.generate_pre_mrna()
+#         transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
+#         transcript.generate_mature_mrna().generate_protein(inplace=True)
+#         ref_protein, cons_vector = transcript.protein, transcript.cons_vector
+#
+#         if not use_cons:
+#             cons_vector = np.ones(len(ref_protein))
+#
+#         if sum(cons_vector) == 0:
+#             cons_vector = np.ones(len(ref_protein)) #/len(ref_protein)
+#
+#         reference_transcript = copy.deepcopy(transcript)
+#
+#         assert len(ref_protein) == len(
+#             cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)} must be same length."
+#
+#         missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold),
+#                                   threshold=splicing_threshold)
+#         for mutation in mutations:
+#             transcript.pre_mrna += mutation
+#
+#         domains_df = await task1
+#         for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
+#             transcript.acceptors = new_boundaries['acceptors']
+#             transcript.donors = new_boundaries['donors']
+#             transcript.generate_mature_mrna()
+#             transcript.TIS = find_tis(ref_seq=reference_transcript, mut_seq=transcript)
+#             transcript.generate_protein()
+#
+#             alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
+#             deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
+#             modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
+#             temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
+#             affected_cons_scores = max(temp_cons)
+#             percentile = (
+#                 sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
+#                     cons_vector))
+#
+#             out = domains_df.apply(lambda row: inspect_domain(row, modified_positions, cons_vector), axis=1)
+#             domains_affected = '+'.join([f'{a}:{round(b, 3)}' for a, b in list(zip(out.domain_identifier, out.score))])
+#
+#             report = OncospliceAnnotator(reference_transcript, transcript, mutation)
+#             report['mut_id'] = mut_id
+#             report['oncosplice_score'] = affected_cons_scores
+#             report['cons_available'] = transcript.cons_available
+#             report['transcript_id'] = transcript.transcript_id
+#             report['percentile'] = percentile
+#             report['isoform_id'] = i
+#             report['isoform_prevalence'] = new_boundaries['path_weight']
+#             report['full_missplicing'] = missplicing.aberrant_splicing
+#             report['missplicing'] = max(missplicing)
+#             report['domains'] = domains_affected
+#             report['max_domain_score'] = out.score.max()
+#
+#             report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
+#             results.append(pd.Series(report))
+#
+#     report = pd.concat(results, axis=1).T
+#     return report


 if __name__ == '__main__':
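Note on the oncosplice.py changes above: 1.3.4 restores the closing "):" on the `oncosplice` signature (the 1.3.2 line ended at `engine='spliceai'`, leaving the def unterminated) and moves the async `oncosplice_prototype` experiment into comments. Below is a minimal usage sketch of the public entry point; the mutation identifier is hypothetical (the diff only shows that the gene name precedes the first ':' and that multiple variants are joined with '|'), and the call assumes the local annotation files that `Gene.from_file` expects are available:

    from geney.oncosplice import oncosplice

    # Hypothetical mut_id; geney defines the real format elsewhere.
    mut_id = 'KRAS:12:25245350:C:T'
    report = oncosplice(mut_id, splicing_threshold=0.5, window_length=13,
                        organism='hg38', engine='spliceai')
    # Columns named in the diff's selection list:
    print(report[['oncosplice_score', 'percentile', 'isoform_prevalence']].head())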
geney/splicing_utils.py
CHANGED
@@ -1,7 +1,6 @@
 import numpy as np
-from .
-from .
-
+from .Gene import Gene
+from .SeqMats import MutSeqMat
 from collections import defaultdict

@@ -145,75 +144,185 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
     return discovered_pos, deleted_pos


- ... (old lines 148-162 not rendered in this diff view)
-    if len(mutations) == 0:
-        return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}
+from typing import Tuple, Dict
+
+def run_splicing_engine(seq, engine='spliceai'):
+    match engine:
+        case 'spliceai':
+            from .spliceai_utils import sai_predict_probs, sai_models
+            donor_probs, acceptor_probs = sai_predict_probs(seq, models=sai_models)
+
+        case 'pangolin':
+            from .pangolin_utils import pangolin_predict_probs, pang_models
+            donor_probs, acceptor_probs = pangolin_predict_probs(seq, models=pang_models)
+
+        case _:
+            raise ValueError(f"{engine} not implemented")
+    return donor_probs, acceptor_probs

-    var = reduce(lambda acc, mutation: acc + mutation, mutations, ref)
-    center = int(np.mean([mutation.position for mutation in mutations]) // 1)

- ... (old lines 169-173 not rendered in this diff view)
+def find_transcript_splicing(transcript, engine: str = 'spliceai') -> Tuple[Dict[int, float], Dict[int, float]]:
+    """
+    Predict splice site probabilities for a given transcript using the specified engine.
+    This function uses a padding of 5000 'N's on each side of the transcript sequence
+    to align with the model's required context length.
+
+    Args:
+        transcript: An object representing a transcript, expected to have:
+            - an `indices` attribute that returns a sequence of positions.
+            - a `seq` attribute that returns the sequence string.
+        engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
+
+    Returns:
+        (donor_probs, acceptor_probs) as two dictionaries keyed by position with probability values.
+
+    Raises:
+        ValueError: If an unsupported engine is provided.
+        AssertionError: If the length of predicted probabilities does not match the length of indices.
+    """
+    # Prepare reference sequence with padding
+    ref_indices = transcript.indices
+    ref_seq = 'N' * 5000 + transcript.seq + 'N' * 5000
+    ref_seq_acceptor_probs, ref_seq_donor_probs = run_splicing_engine(ref_seq, engine)
+
+    # Verify lengths
+    assert len(ref_seq_donor_probs) == len(ref_indices), (
+        f"Donor probabilities length ({len(ref_seq_donor_probs)}) does not match "
+        f"indices length ({len(ref_indices)})."
+    )
+    assert len(ref_seq_acceptor_probs) == len(ref_indices), (
+        f"Acceptor probabilities length ({len(ref_seq_acceptor_probs)}) does not match "
+        f"indices length ({len(ref_indices)})."
+    )
+
+    # Create dictionaries and sort them by probability in descending order
+    donor_probs = dict(sorted((i, p) for i, p in zip(ref_indices, ref_seq_donor_probs)),
+                       key=lambda item: item[1], reverse=True)
+    acceptor_probs = dict(sorted((i, p) for i, p in zip(ref_indices, ref_seq_acceptor_probs)),
+                          key=lambda item: item[1], reverse=True)
+
+    return donor_probs, acceptor_probs

-    length = var.seqmat.shape[-1]
-    center_index = var.rel_pos(center)
-    if center_index is None:
-        raise IndexError("Center index must not be none... Issue with mutations... They must not be within the transcript.")
-    var_start_pad = max(0, total_context - center_index)
-    var_end_pad = max(0, total_context - (length - center_index))

- ... (old lines 182-183 not rendered in this diff view)
+def find_transcript_missplicing(mut_id, transcript='primary', threshold=0.5, engine='spliceai', organism='hg38'):
+    gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
+    reference_transcript = gene.transcript(transcript) if transcript is not None else gene.transcript()
+    variant_transcript = reference_transcript.clone()
+    mutations = [MutSeqMat.from_mutid(m) for m in mut_id.split('|')]
+    mutations = [m for m in mutations if m in reference_transcript]
+    if len(mutations) == 0:
+        return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}

- ... (old lines 185-186 not rendered in this diff view)
+    center = np.mean([m.indices[0] for m in mutations]) // 1
+    for mutation in mutations:
+        variant_transcript.mutate(mutation, inplace=True)

- ...
-    mut_indices = mut_indices[context:-context]
+    return find_transcript_missplicing_seqs(reference_transcript.get_context(center, 7500), variant_transcript.get_context(center, 7500), reference_transcript.donors, reference_transcript.acceptors, threshold=threshold, engine=engine)

-    ref_seq = 'N'*ref_start_pad + ref.seq + 'N'*ref_end_pad
-    var_seq = 'N'*var_start_pad + var.seq + 'N'*var_end_pad

- ... (old lines 194-197 not rendered in this diff view)
+    # from functools import reduce
+    # ref = transcript.pre_mrna
+    # mutations = [mutation for mutation in mutations if mutation.position in ref.indices]
+    # if len(mutations) == 0:
+    #     return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}
+    #
+    # var = reduce(lambda acc, mutation: acc + mutation, mutations, ref)
+    # center = int(np.mean([mutation.position for mutation in mutations]) // 1)
+    #
+    # total_context = context + window
+    # length = ref.seqmat.shape[-1]
+    # center_index = ref.rel_pos(center)
+    # ref_start_pad = max(0, total_context - center_index)
+    # ref_end_pad = max(0, total_context - (length - center_index))
+    #
+    # length = var.seqmat.shape[-1]
+    # center_index = var.rel_pos(center)
+    # if center_index is None:
+    #     raise IndexError("Center index must not be none... Issue with mutations... They must not be within the transcript.")
+    #
+    # var_start_pad = max(0, total_context - center_index)
+    # var_end_pad = max(0, total_context - (length - center_index))
+    #
+    # ref = ref.inspect(center, context=total_context)
+    # var = var.inspect(center, context=total_context)
+    #
+    # ref_indices = np.concatenate([np.zeros(ref_start_pad), ref.indices, np.zeros(ref_end_pad)])
+    # mut_indices = np.concatenate([np.zeros(var_start_pad), var.indices, np.zeros(var_end_pad)])
+    #
+    # ref_indices = ref_indices[context:-context]
+    # mut_indices = mut_indices[context:-context]
+    #
+    # ref_seq = 'N'*ref_start_pad + ref.seq + 'N'*ref_end_pad
+    # var_seq = 'N'*var_start_pad + var.seq + 'N'*var_end_pad
+    #
+    # if engine == 'spliceai':
+    #     from .spliceai_utils import sai_predict_probs, sai_models
+    #     ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(ref_seq, models=sai_models)
+    #     mut_seq_acceptor_probs, mut_seq_donor_probs = sai_predict_probs(var_seq, models=sai_models)
+    #
+    # elif engine == 'pangolin':
+    #     from .pangolin_utils import pangolin_predict_probs, pang_models
+    #     ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models, just_ss=just_ss)
+    #     mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models, just_ss=just_ss)
+    #
+    # else:
+    #     raise ValueError(f"{engine} not implemented")
+    #
+    # visible_donors = np.intersect1d(transcript.donors, ref_indices)
+    # visible_acceptors = np.intersect1d(transcript.acceptors, ref_indices)
+    #
+    # assert len(ref_indices) == len(ref_seq_acceptor_probs), f'Reference pos ({len(ref_indices)}) not the same as probs ({len(ref_seq_acceptor_probs)})'
+    # assert len(mut_indices) == len(mut_seq_acceptor_probs), f'Mut pos ({len(mut_indices)}) not the same as probs ({len(mut_seq_acceptor_probs)})'
+    #
+    # iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
+    #                            {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
+    #                            visible_acceptors,
+    #                            threshold=threshold)
+    #
+    # assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
+    # assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
+    #
+    # idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
+    #                            {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
+    #                            visible_donors,
+    #                            threshold=threshold)
+    #
+    # ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
+    # ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
+    #
+    # lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
+    #                   visible_acceptors if p not in mut_indices and p not in dap}
+    # lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors
+    #                if p not in mut_indices and p not in ddp}
+    # dap.update(lost_acceptors)
+    # ddp.update(lost_donors)
+    #
+    # missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
+    # missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
+    # temp = {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
+    return temp

-    elif engine == 'pangolin':
-        from .pangolin_utils import pangolin_predict_probs, pang_models
-        ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models, just_ss=just_ss)
-        mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models, just_ss=just_ss)

- ... (old lines 204-205 not rendered in this diff view)
+def find_transcript_missplicing_seqs(ref_seq, var_seq, donors, acceptors, threshold=0.5, engine='spliceai'):
+    if ref_seq.seq == var_seq.seq:
+        return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}

- ... (old lines 207-208 not rendered in this diff view)
+    ref_seq_acceptor_probs, ref_seq_donor_probs = run_splicing_engine(ref_seq.seq, engine)
+    mut_seq_acceptor_probs, mut_seq_donor_probs = run_splicing_engine(var_seq.seq, engine)
+    ref_indices = ref_seq.indices[5000:-5000]
+    mut_indices = var_seq.indices[5000:-5000]
+    visible_donors = np.intersect1d(donors, ref_indices)
+    visible_acceptors = np.intersect1d(acceptors, ref_indices)

-    assert len(ref_indices) == len(
- ...
+    assert len(ref_indices) == len(
+        ref_seq_acceptor_probs), f'Reference pos ({len(ref_indices)}) not the same as probs ({len(ref_seq_acceptor_probs)})'
+    assert len(mut_indices) == len(
+        mut_seq_acceptor_probs), f'Mut pos ({len(mut_indices)}) not the same as probs ({len(mut_seq_acceptor_probs)})'

     iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
                                {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
                                visible_acceptors,
-                               threshold=
+                               threshold=0.1)

     assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
     assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
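Two API notes on the hunk above: the engine dispatch now lives in `run_splicing_engine`, whose `match`/`case` statement requires Python 3.10 or newer, and the public `find_transcript_missplicing` signature changes from `(transcript, mutations, context=5000, window=2500, ...)` to `(mut_id, transcript='primary', threshold=0.5, engine='spliceai', organism='hg38')`. The snippet below is a self-contained sketch of the same dispatch pattern using invented stub predictors (the real ones come from `geney.spliceai_utils` and `geney.pangolin_utils`):

    def _stub_predict(seq):
        # Stand-in predictor: one donor and one acceptor probability per base.
        return [0.0] * len(seq), [0.0] * len(seq)

    def run_engine_sketch(seq, engine='spliceai'):
        # Same shape as run_splicing_engine: one case per engine, error otherwise.
        match engine:
            case 'spliceai':
                donor_probs, acceptor_probs = _stub_predict(seq)
            case 'pangolin':
                donor_probs, acceptor_probs = _stub_predict(seq)
            case _:
                raise ValueError(f"{engine} not implemented")
        return donor_probs, acceptor_probs

    donors, acceptors = run_engine_sketch('N' * 5000 + 'ACGT' + 'N' * 5000)
    print(len(donors), len(acceptors))  # both equal the padded sequence length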
@@ -221,113 +330,147 @@ def find_transcript_missplicing(transcript, mutations, context=5000, window=2500
     idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
                                {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
                                visible_donors,
-                               threshold=
+                               threshold=0.1)

     ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
     ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}

     lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
                       visible_acceptors if p not in mut_indices and p not in dap}
-    lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in
+    lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in
+                   visible_donors
                    if p not in mut_indices and p not in ddp}
     dap.update(lost_acceptors)
     ddp.update(lost_donors)

-    missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap,
+    missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap,
+                   'discovered_donors': idp}
     missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
-    temp =
- ...
+    temp = {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in
+            missplicing.items()}
+    return Missplicing(temp, threshold=threshold)


 class Missplicing:
     def __init__(self, splicing_dict, threshold=0.5):
+        """
+        Initialize a Missplicing object.
+
+        Args:
+            splicing_dict (dict): Dictionary containing splicing events and their details.
+                Example:
+                    {
+                        "missed_acceptors": {100: {"absolute": 0.0, "delta": -0.3}, ...},
+                        "missed_donors": { ... },
+                        "discovered_acceptors": { ... },
+                        "discovered_donors": { ... }
+                    }
+            threshold (float): The threshold above which a delta is considered significant.
+        """
         self.missplicing = splicing_dict
         self.threshold = threshold

-    # def __repr__(self):
-    #     return f'Missplicing({self.modification.mut_id}) --> {self.missplicing}'
- ...
     def __str__(self):
- ...
+        """String representation displays the filtered splicing events passing the threshold."""
+        return str(self.significant_events)

     def __bool__(self):
- ... (old lines 254-256 not rendered in this diff view)
+        """
+        Boolean evaluation: True if any event surpasses the threshold, False otherwise.
+        """
+        return self.first_significant_event() is not None

     def __iter__(self):
- ... (old lines 259-266 not rendered in this diff view)
-        # return not flag
+        """
+        Iterate over all delta values from all events. The first yielded value is 0 (for compatibility),
+        followed by all deltas in self.missplicing.
+        """
+        yield 0
+        for details in self.missplicing.values():
+            for d in details.values():
+                yield d['delta']

     @property
-    def
- ...
+    def significant_events(self):
+        """
+        Returns a filtered version of missplicing events that meet or exceed the current threshold.
+        """
+        return self.filter_by_threshold(self.threshold)
+
+    def filter_by_threshold(self, threshold=None):
+        """
+        Filter self.missplicing to only include events where abs(delta) >= threshold.
+
+        Args:
+            threshold (float, optional): The threshold to apply. Defaults to self.threshold.
+
+        Returns:
+            dict: A new dictionary with filtered events.
+        """
+        if threshold is None:
+            threshold = self.threshold

- ... (old lines 273-275 not rendered in this diff view)
+        return {
+            event: {
+                pos: detail for pos, detail in details.items()
+                if abs(detail['delta']) >= threshold
+            }
+            for event, details in self.missplicing.items()
+        }
+
+    def first_significant_event(self, splicing_dict=None, threshold=None):
+        """
+        Check if there is any event surpassing a given threshold and return the dictionary if found.
+
+        Args:
+            splicing_dict (dict, optional): Dictionary to check. Defaults to self.missplicing.
+            threshold (float, optional): Threshold to apply. Defaults to self.threshold.
+
+        Returns:
+            dict or None: Returns the dictionary if a delta surpasses the threshold, otherwise None.
+        """
+        if splicing_dict is None:
+            splicing_dict = self.missplicing
+        if threshold is None:
             threshold = self.threshold

- ...
-        for
- ...
-            for e, d in details.items():
-                if abs(d['delta']) >= threshold:
-                    in_dict[e] = d
-            # return splicing_dict
-            new_dict[event] = in_dict
-        return new_dict
- ...
-    def apply_sai_threshold_alt(self, splicing_dict=None, threshold=None):
-        splicing_dict = self.missplicing if not splicing_dict else splicing_dict
-        threshold = self.threshold if not threshold else threshold
-        for event, details in splicing_dict.items():
-            for e, d in details.items():
-                if abs(d['delta']) >= threshold:
-                    return splicing_dict
+        # Check if any event meets the threshold
+        if any(abs(detail['delta']) >= threshold for details in splicing_dict.values() for detail in details.values()):
+            return splicing_dict
         return None

-    def
- ... (old lines 298-328 not rendered in this diff view)
-    acceptor_probs = dict(sorted(acceptor_probs.items(), key=lambda item: item[1], reverse=True))
-    return donor_probs, acceptor_probs
+    def max_delta(self):
+        """
+        Returns the maximum absolute delta found in all events.
+
+        Returns:
+            float: The maximum absolute delta, or 0 if no events.
+        """
+        deltas = [detail['delta'] for details in self.missplicing.values() for detail in details.values()]
+        return max(deltas, key=abs, default=0.0)
+
+
+# def find_transcript_splicing(transcript, engine='spliceai'):
+#     ref_indices = transcript.indices
+#     ref_seq = 'N' * 5000 + transcript.seq + 'N' * 5000
+#     if engine == 'spliceai':
+#         from .spliceai_utils import sai_predict_probs, sai_models
+#         ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(ref_seq, sai_models)
+#
+#     elif engine == 'pangolin':
+#         from .pangolin_utils import pangolin_predict_probs, pang_models
+#         ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
+#
+#     else:
+#         raise ValueError(f"{engine} not implemented")
+#
+#     assert len(ref_seq_donor_probs) == len(ref_indices), f'{len(ref_seq_donor_probs)} vs. {len(ref_indices)}'
+#     donor_probs = {i: p for i, p in list(zip(ref_indices, ref_seq_donor_probs))}
+#     donor_probs = dict(sorted(donor_probs.items(), key=lambda item: item[1], reverse=True))
+#
+#     acceptor_probs = {i: p for i, p in list(zip(ref_indices, ref_seq_acceptor_probs))}
+#     acceptor_probs = dict(sorted(acceptor_probs.items(), key=lambda item: item[1], reverse=True))
+#     return donor_probs, acceptor_probs


 def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
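The expanded `Missplicing` class above gains a `significant_events` property plus `filter_by_threshold`, `first_significant_event`, and `max_delta`. A short usage sketch, assuming geney is installed, built from the toy event dictionary in the class docstring:

    from geney.splicing_utils import Missplicing

    events = {
        "missed_acceptors": {100: {"absolute": 0.0, "delta": -0.3}},
        "missed_donors": {},
        "discovered_acceptors": {},
        "discovered_donors": {},
    }

    ms = Missplicing(events, threshold=0.25)
    print(bool(ms))                     # True: |-0.3| >= 0.25
    print(ms.max_delta())               # -0.3 (largest delta by absolute value)
    print(ms.filter_by_threshold(0.5))  # every event dict comes back empty at this threshold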
@@ -337,7 +480,7 @@ def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
         return None, None

     transcript.generate_pre_mrna()
-    predicted_donor_sites, predicted_acceptor_sites = find_transcript_splicing(transcript, engine=engine)
+    predicted_donor_sites, predicted_acceptor_sites = find_transcript_splicing(transcript.pre_mrna, engine=engine)
     num_introns = len(transcript.introns)
     predicted_donors = list(predicted_donor_sites.keys())[:num_introns]
     predicted_acceptors = list(predicted_acceptor_sites.keys())[:num_introns]
@@ -346,68 +489,58 @@ def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
     return len(correct_donor_preds) / num_introns, len(correct_acceptor_preds) / num_introns, len(transcript.introns)


-def missplicing(mut_id, splicing_threshold=0.5, primary_transcript=True, organism='hg38', engine='spliceai'):
-    gene = Gene(mut_id.split(':')[0], organism=organism)
-    mutation = get_mutation(mut_id, rev=gene.rev)
-    results = {}
-
-    for tid, transcript in gene.run_transcripts():
-        # if not transcript.primary_transcript and primary_transcript:
-        #     continue
-        #
-        if mutation not in transcript:
-            continue
-
-        good_tid = tid
-
-        transcript.generate_pre_mrna()
-        results[tid] = Missplicing(find_transcript_missplicing(transcript, mutation, engine=engine),
-                                   threshold=splicing_threshold)
-
-    # if len(results) == 0:
-    #     return None
-    #
-    # if primary_transcript and good_tid in results:
-    #     return results[good_tid]
-    # else:
-    #     return None
-
-    return results
-
-
 import sqlite3
 import json
+import os
+
+# Global connection and cursor (adjust to your architecture)
+# Ideally, initialize this once in your application startup code.
+DB_PATH = os.path.join(config['splicing_db'], 'mutation_data.db')
+conn = sqlite3.connect(DB_PATH, isolation_level=None)  # autocommit mode
+cursor = conn.cursor()
+
+# Create table once at startup, not in the function
+cursor.execute('''
+    CREATE TABLE IF NOT EXISTS mutations (
+        tool TEXT,
+        gene TEXT,
+        mutation_id TEXT,
+        transcript_id TEXT,
+        data TEXT,
+        PRIMARY KEY (tool, gene, mutation_id, transcript_id)
+    )''')

 def get_or_compute_splicing(tool, gene, mutation_id, transcript_id, force_recompute=False):
- ... (old lines 382-394 not rendered in this diff view)
+    """
+    Retrieve computed splicing data for a given mutation from a database,
+    or compute and store it if not found or if force_recompute is True.
+
+    Args:
+        tool (str): Name of the tool used for computation.
+        gene (str): Gene name or identifier.
+        mutation_id (str): A unique identifier for the mutation.
+        transcript_id (str): ID for the transcript.
+        force_recompute (bool): If True, ignore cached value and recompute.
+
+    Returns:
+        dict: The computed splicing data.
+    """
+
+    # Lookup in the database
     cursor.execute('SELECT data FROM mutations WHERE tool=? AND gene=? AND mutation_id=? AND transcript_id=?',
                    (tool, gene, mutation_id, transcript_id))
     row = cursor.fetchone()

-    # If
+    # If found and no force recompute, return cached data
     if row and not force_recompute:
         return json.loads(row[0])

-    # Otherwise, compute the
-    computed_data = find_transcript_missplicing(
+    # Otherwise, compute the data
+    computed_data = find_transcript_missplicing(mutation_id, transcript_id=transcript_id, engine=tool)  # Replace with your actual function

-    # Store in
+    # Store computed data in DB
     data_json = json.dumps(computed_data)
-    cursor.execute('REPLACE INTO mutations (tool, mutation_id, transcript_id, data) VALUES (?, ?, ?, ?)',
-                   (tool, mutation_id, transcript_id, data_json))
-    conn.commit()
-    conn.close()
+    cursor.execute('REPLACE INTO mutations (tool, gene, mutation_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
+                   (tool, gene, mutation_id, transcript_id, data_json))

     return computed_data
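The module-level cache above opens one SQLite connection in autocommit mode and keys rows on (tool, gene, mutation_id, transcript_id); 1.3.4 also adds the gene column to the REPLACE INTO statement and stops closing the shared connection after each call. Below is a self-contained sketch of the same get-or-compute pattern against an in-memory database, with a stand-in compute function (only the table schema is taken from the diff; the rest is illustrative):

    import json
    import sqlite3

    conn = sqlite3.connect(':memory:', isolation_level=None)  # autocommit, like the module-level connection
    cur = conn.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS mutations (
        tool TEXT, gene TEXT, mutation_id TEXT, transcript_id TEXT, data TEXT,
        PRIMARY KEY (tool, gene, mutation_id, transcript_id))''')

    def fake_compute(mutation_id):
        # Stand-in for find_transcript_missplicing; returns the empty-event shape used in the diff.
        return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}

    def get_or_compute(tool, gene, mutation_id, transcript_id, force_recompute=False):
        cur.execute('SELECT data FROM mutations WHERE tool=? AND gene=? AND mutation_id=? AND transcript_id=?',
                    (tool, gene, mutation_id, transcript_id))
        row = cur.fetchone()
        if row and not force_recompute:
            return json.loads(row[0])          # cache hit
        data = fake_compute(mutation_id)       # cache miss: compute
        cur.execute('REPLACE INTO mutations (tool, gene, mutation_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
                    (tool, gene, mutation_id, transcript_id, json.dumps(data)))
        return data

    print(get_or_compute('spliceai', 'KRAS', 'KRAS:fake_mut', 'primary'))  # computed, then cached
    print(get_or_compute('spliceai', 'KRAS', 'KRAS:fake_mut', 'primary'))  # served from the cache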
{geney-1.3.2.dist-info → geney-1.3.4.dist-info}/RECORD
CHANGED

@@ -10,12 +10,12 @@ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
 geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
 geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
-geney/oncosplice.py,sha256=
+geney/oncosplice.py,sha256=1xphL2LeAObwUKBXgcyyKbNO9bAryKDZesK7OpUpFfA,22336
 geney/pangolin_utils.py,sha256=i5j5vEMCWOTIa1mRP2377BAhlUFZjHBzTQBips4lA_4,2934
 geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
 geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
 geney/spliceai_utils.py,sha256=PFIhTK8Ihrj-cv5tgRN0UFPYEmC4uxtqXSP9bBLnZRM,3077
-geney/splicing_utils.py,sha256=
+geney/splicing_utils.py,sha256=6TlSeNK8BWGnm5AqYKa_qObqHQVcgYtRY08JmfK9yZ8,26338
 geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
 geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
 geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396

@@ -24,7 +24,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
 geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
-geney-1.3.
-geney-1.3.
-geney-1.3.
-geney-1.3.
+geney-1.3.4.dist-info/METADATA,sha256=ONsBA4xTOrs0KaNJR9pBrwlHE06WC8YUuCFfH5vV2ag,994
+geney-1.3.4.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
+geney-1.3.4.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.3.4.dist-info/RECORD,,
{geney-1.3.2.dist-info → geney-1.3.4.dist-info}/WHEEL
File without changes

{geney-1.3.2.dist-info → geney-1.3.4.dist-info}/top_level.txt
File without changes