PyPI - geney - Versions diffs - 1.2.55__py2.py3-none-any.whl → 1.2.57__py2.py3-none-any.whl - Mend

geney 1.2.55__py2.py3-none-any.whl → 1.2.57__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

geney/oncosplice.py +55 -97
geney/pangolin_utils.py +5 -3
geney/spliceai_utils.py +2 -2
geney/splicing_utils.py +3 -3
geney/tis_utils.py +5 -17
{geney-1.2.55.dist-info → geney-1.2.57.dist-info}/METADATA +1 -1
{geney-1.2.55.dist-info → geney-1.2.57.dist-info}/RECORD +9 -9
{geney-1.2.55.dist-info → geney-1.2.57.dist-info}/WHEEL +0 -0
{geney-1.2.55.dist-info → geney-1.2.57.dist-info}/top_level.txt +0 -0

geney/oncosplice.py CHANGED Viewed

@@ -9,7 +9,6 @@ from .seqmat_utils import *
 from .mutation_utils import *
 from .tis_utils import find_tis
-### Scoring
 def find_continuous_gaps(sequence):
     """Find continuous gap sequences in an alignment."""
     return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
@@ -121,43 +120,6 @@ def transform_conservation_vector(conservation_vector, window=13, factor=4):
     return exp_factors
-# def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
-#     """
-#     Identify unmodified positions in a sequence given deletions and insertions.
-#
-#     :param sequence_length: Length of the sequence.
-#     :param deletions: Dictionary of deletions.
-#     :param insertions: Dictionary of insertions.
-#     :param reach_limit: Limit for considering the effect of insertions/deletions.
-#     :return: Array indicating unmodified positions.
-#     """
-#     unmodified_positions = np.zeros(sequence_length, dtype=float)
-#
-#     for pos, insertion in insertions.items():
-#         # if pos >= sequence_length:
-#         #     pos = sequence_length - 1
-#         #     add_factor = 1
-#
-#         reach = min(len(insertion) // 2, reach_limit)
-#         front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
-#         len_start, len_end = pos - front_end, back_end - pos
-#         try:
-#             gradient_front = np.linspace(0, 1, len_start, endpoint=False)
-#             gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
-#             combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
-#             unmodified_positions[front_end:back_end + 1] = combined_gradient
-#
-#         except ValueError as e:
-#             print(
-#                 f"Error: {e} | Lengths: unmodified_positions_slice={back_end - front_end}.")
-#             unmodified_positions[front_end:back_end] = np.zeros(back_end - front_end)
-#
-#     for pos, deletion in deletions.items():
-#         deletion_length = len(deletion)
-#         unmodified_positions[pos:pos + deletion_length] = 1
-#
-#     return unmodified_positions
 def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
     """
     Identify unmodified positions in a sequence given deletions and insertions.
@@ -251,12 +213,7 @@ def moving_average_conv(vector, window_size, factor=1):
     return np.convolve(vector, np.ones(window_size), mode='same') / window_size
 def find_splice_site_proximity(pos, transcript):
     for i, (ex_start, ex_end) in enumerate(transcript.exons):
         if min(ex_start, ex_end) <= pos <= max(ex_start, ex_end):
             return i + 1, None, abs(pos - ex_start), abs(pos - ex_end)
@@ -323,7 +280,7 @@ def summarize_missplicing_event(pes, pir, es, ne, ir):
 # Annotating
 def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
-    affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut.indices[0],
+    affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(np.floor(mut.indices[0]),
                                                                                                   reference_transcript)
     report = {}
@@ -361,59 +318,60 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
     return report
-# def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
-#     gene = Gene(mut_id.split(':')[0], organism=organism)
-#     reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
-#     mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
-#
-#     results = []
-#     for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
-#         if cons_required and not transcript.cons_available:
-#             continue
-#
-#         if all(mutation not in transcript for mutation in mutations):
-#             # results.append({'transcript_id': transcript.transcript_id})
-#             continue
-#
-#         transcript.generate_pre_mrna()
-#         transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
-#         transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
-#         ref_protein, cons_vector = transcript.protein, transcript.cons_vector
-#         reference_transcript = copy.deepcopy(transcript)
-#
-#         assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)}) must be same length. {ref_protein}, \n>{cons_vector}\n>{transcript.cons_seq}"
-#
-#         missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold), threshold=splicing_threshold)
-#         for mutation in mutations:
-#             transcript.pre_mrna += mutation
-#
-#         for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
-#             transcript.acceptors = new_boundaries['acceptors']
-#             transcript.donors = new_boundaries['donors']
-#             transcript.generate_mature_mrna().generate_protein()
-#
-#             alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
-#             deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
-#             modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
-#             temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
-#             affected_cons_scores = max(temp_cons)
-#             percentile = (
-#                         sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
-#                     cons_vector))
-#
-#             report = OncospliceAnnotator(reference_transcript, transcript, mutation)
-#             report['mut_id'] = mut_id
-#             report['oncosplice_score'] = affected_cons_scores
-#             report['percentile'] = percentile
-#             report['isoform_id'] = i
-#             report['isoform_prevalence'] = new_boundaries['path_weight']
-#             report['full_missplicing'] = missplicing.aberrant_splicing
-#             report['missplicing'] = max(missplicing)
-#             report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
-#             results.append(report)
-#
-#     report = pd.DataFrame(results)
-#     return report
+def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
+    gene = Gene(mut_id.split(':')[0], organism=organism)
+    reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
+    mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
+    results = []
+    for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
+        if cons_required and not transcript.cons_available:
+            continue
+        if all(mutation not in transcript for mutation in mutations):
+            continue
+        transcript.generate_pre_mrna()
+        transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
+        transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
+        ref_protein, cons_vector = transcript.protein, transcript.cons_vector
+        reference_transcript = copy.deepcopy(transcript)
+        assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)}) must be same length. {ref_protein}, \n>{cons_vector}\n>{transcript.cons_seq}"
+        missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold), threshold=splicing_threshold)
+        for mutation in mutations:
+            transcript.pre_mrna += mutation
+        for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
+            transcript.acceptors = new_boundaries['acceptors']
+            transcript.donors = new_boundaries['donors']
+            transcript.generate_mature_mrna().generate_protein()
+            alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
+            deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
+            modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
+            temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
+            affected_cons_scores = max(temp_cons)
+            percentile = (
+                        sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
+                    cons_vector))
+            report = OncospliceAnnotator(reference_transcript, transcript, mutation)
+            report['mut_id'] = mut_id
+            report['oncosplice_score'] = affected_cons_scores
+            report['percentile'] = percentile
+            report['isoform_id'] = i
+            report['isoform_prevalence'] = new_boundaries['path_weight']
+            report['full_missplicing'] = missplicing.aberrant_splicing
+            report['missplicing'] = max(missplicing)
+            report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
+            results.append(report)
+    if len(results) == 0:
+        return None
+    return pd.DataFrame(results)
 import asyncio

geney/pangolin_utils.py CHANGED Viewed

@@ -46,10 +46,12 @@ def pang_one_hot_encode(seq):
-def pangolin_predict_probs(true_seq, models):
+def pangolin_predict_probs(true_seq, models, just_ss=False):
     # print(f"Running pangolin on: {true_seq}")
-    model_nums = [0, 2, 4, 6]
-    model_nums = [0, 1, 2, 3, 4, 5, 6]
+    if just_ss:
+        model_nums = [0, 2, 4, 6]
+    else:
+        model_nums = [0, 1, 2, 3, 4, 5, 6, 7]
     INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
     seq = true_seq

geney/spliceai_utils.py CHANGED Viewed

@@ -12,8 +12,8 @@ if tf.config.list_physical_devices('GPU'):
 else:
     print("Running on CPU.")
-# tf.config.threading.set_intra_op_parallelism_threads(1)
-# tf.config.threading.set_inter_op_parallelism_threads(1)
+tf.config.threading.set_intra_op_parallelism_threads(1)
+tf.config.threading.set_inter_op_parallelism_threads(1)
 sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
 sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]

geney/splicing_utils.py CHANGED Viewed

@@ -145,7 +145,7 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
     return discovered_pos, deleted_pos
-def find_transcript_missplicing(transcript, mutations, context=5000, window=2500, threshold=0.5, engine='spliceai'):
+def find_transcript_missplicing(transcript, mutations, context=5000, window=2500, threshold=0.5, engine='spliceai', just_ss=False):
     from functools import reduce
     ref = transcript.pre_mrna
     var = reduce(lambda acc, mutation: acc + mutation, mutations, ref)
@@ -182,8 +182,8 @@ def find_transcript_missplicing(transcript, mutations, context=5000, window=2500
     elif engine == 'pangolin':
         from .pangolin_utils import pangolin_predict_probs, pang_models
-        ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
-        mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models)
+        ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models, just_ss=just_ss)
+        mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models, just_ss=just_ss)
     else:
         raise ValueError(f"{engine} not implemented")

geney/tis_utils.py CHANGED Viewed

@@ -28,26 +28,13 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
                                                                                                 right_context=right_context,
                                                                                                 padding='$')
-        # 3. If condition 2 is not met, we perform a TIS reaquisition. If condition 2 is met, then we return the reference TIS to be used in the mutated sequence
     if context_conserved:
-        return tis_coords[0]
-    # 4. Reaquisition of TIS follows:
-    #### The logic:
-    #  a. We need to find all possible start codon candidates as relative indices
-    #  b. We need to find what proteins each alternative start codon would create
-    #  c. We need to make sure we are only looking at a region around a mutation
-    #  d. We need the titer score rank relative to all titer score reference ranks and relative to the reference score
+        return [(tis_coords[0], 1, 'canonical')]
     sc_table = pd.read_pickle(config['titer_path'] / 'titer_tis_scores.pickle')
-    # target_transcript = sc_table[sc_table.transcript_id == ref_id]
-    # if len(target_transcript) == 0:
-    ### reaquire TIS score for ref
-    # pass
     ref_seq_tis_context = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
                                                     right_context=right_context, padding='$')
-    # target_ref_titer_score = target_transcript.tis_score
     ref_titer_score = retrieve_titer_score(ref_seq_tis_context)
     ref_titer_rank = percentileofscore(sc_table['tis_score'], ref_titer_score)
     ref_protein = ref_seq.translate(tis_coords[0])
@@ -56,7 +43,8 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
     candidate_positions = np.array(
         [p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score if candidate_positions[i] == True else 0
          for i in range(len(ref_seq.seq))])
-    candidate_positions = candidate_positions > sorted(candidate_positions)[-5]
+    candidate_positions = candidate_positions > sorted(candidate_positions)[-5] # implement correct logic
     candidate_positions = np.array([retrieve_titer_score(
         mut_seq.asymmetric_subseq(tis_coords[0], left_context=left_context, right_context=right_context,
                                   padding='$')) if candidate_positions[i] > 0 else False for i in
@@ -66,7 +54,7 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
          in range(len(ref_seq.seq))])
     best_position = np.where(candidate_positions == min(candidate_positions))[0][0]
     out = mut_seq.seqmat[1, best_position]
-    return out
+    return out  #output: [(genomic_coord1, probability, filter_tag), (genomic_coord2, probability, filter_tag)]
 def seq_matrix(seq_list):

{geney-1.2.55.dist-info → geney-1.2.57.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.2.55
+Version: 1.2.57
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn

{geney-1.2.55.dist-info → geney-1.2.57.dist-info}/RECORD RENAMED Viewed

@@ -6,21 +6,21 @@ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
 geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
 geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
-geney/oncosplice.py,sha256=-_b0ZSxWa-bSYDoVMt605lJlx8-rXf0WsKsFrMoF6Vg,23707
-geney/pangolin_utils.py,sha256=NJEdY43L_2lielY1hZOjlak0baHqXTa1ITrvx8Tkg5o,2878
+geney/oncosplice.py,sha256=eWgY2Lcj894UBFnIVhbxiVz5oqASHg-Ot1wFbjlJbI8,21857
+geney/pangolin_utils.py,sha256=HvXfdLhHWTDXNmYtc8K3p64iTvDtsBq6-Jml5tpg7JI,2930
 geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
 geney/seqmat_utils.py,sha256=2cRXT_Ox4IdzCM8x3H2HexxFZzjo5WHs0HZiUQv8fBM,18347
-geney/spliceai_utils.py,sha256=gIGPC8u3J15A7EQrk2Elho5PbF9MmUUNopGGH-eEV8s,1873
-geney/splicing_utils.py,sha256=t0vE5KTAdYOYJLa9wjaSJ1jqiHhsDxZs64OxrgR-Sqc,16811
+geney/spliceai_utils.py,sha256=21_TaiLW3faRuPegMgsVvIf1G1a03penZSiydQ-hOTA,1869
+geney/splicing_utils.py,sha256=34xdarFpTHsHZkhi7VrHby9DaIBZ2xCLqPMrTmasEgE,16860
 geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
 geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
-geney/tis_utils.py,sha256=vA2ci4gNfwwQZlCjPpO5ehvL2NRVeM7lHI_VyfT-_10,8049
+geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
 geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
 geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
 geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
-geney-1.2.55.dist-info/METADATA,sha256=bMKlTktE8jhYNpbxWMnp6Z168gk4NafThjukv45vYI4,948
-geney-1.2.55.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
-geney-1.2.55.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
-geney-1.2.55.dist-info/RECORD,,
+geney-1.2.57.dist-info/METADATA,sha256=UFirGNGhFN_aJnqSO8WHagJCmEfKoHdfRZojLfKymsE,948
+geney-1.2.57.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
+geney-1.2.57.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.2.57.dist-info/RECORD,,

{geney-1.2.55.dist-info → geney-1.2.57.dist-info}/WHEEL RENAMED Viewed

File without changes

{geney-1.2.55.dist-info → geney-1.2.57.dist-info}/top_level.txt RENAMED Viewed

File without changes

geney 1.2.55__py2.py3-none-any.whl → 1.2.57__py2.py3-none-any.whl

geney 1.2.55__py2.py3-none-any.whl → 1.2.57__py2.py3-none-any.whl