PyPI - geney - Versions diffs - 1.2.32__py2.py3-none-any.whl → 1.2.34__py2.py3-none-any.whl - Mend

geney 1.2.32__py2.py3-none-any.whl → 1.2.34__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of geney might be problematic. Click here for more details.

Files changed (9) [hide | show]

geney/config_setup.py +2 -1
geney/oncosplice.py +41 -27
geney/seqmat_utils.py +1 -1
geney/tis_utils.py +175 -0
geney/translation_initiation/tis_utils.py +2 -0
{geney-1.2.32.dist-info → geney-1.2.34.dist-info}/METADATA +2 -2
{geney-1.2.32.dist-info → geney-1.2.34.dist-info}/RECORD +9 -8
{geney-1.2.32.dist-info → geney-1.2.34.dist-info}/WHEEL +0 -0
{geney-1.2.32.dist-info → geney-1.2.34.dist-info}/top_level.txt +0 -0

geney/config_setup.py CHANGED Viewed

@@ -6,7 +6,8 @@ def get_config():
     config_file = os.path.join(os.path.expanduser('~'), '.oncosplice_setup_1_2', 'config.json')
     if Path(config_file).exists():
         config_setup = {k: {k_in: Path(p_in) for k_in, p_in in p.items()} for k, p in json.loads(open(config_file).read()).items()}
+        config_setup['hg38']['titer_path'] = Path('/tamir2/nicolaslynn/tools/titer')
+        config_setup['hg38']['yoram_path'] = Path('/tamir2/yoramzar/Projects/Cancer_mut/Utils')
     else:
         print("Database not set up.")
         config_setup = {}

geney/oncosplice.py CHANGED Viewed

@@ -7,7 +7,7 @@ import pandas as pd
 from .splicing_utils import find_transcript_missplicing, develop_aberrant_splicing, Missplicing
 from .seqmat_utils import *
 from .mutation_utils import *
+from .tis_utils import find_tis
 ### Scoring
 def find_continuous_gaps(sequence):
@@ -416,58 +416,69 @@ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_tran
 import asyncio
-async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai'):
+async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False,
+                               window_length=13, organism='hg38', engine='spliceai'):
     import sys, os
-    from pathlib import Path
-    needed_path = Path('/tamir2/yoramzar/Projects/Cancer_mut/Utils')
-    needed_file1 = needed_path / 'rest_api_utils.py'
-    needed_file2 = needed_path / 'uniprot_utils.py'
+    needed_file1 = config[organism]['yoram_path'] / 'rest_api_utils.py'
+    needed_file2 = config[organism]['yoram_path'] / 'uniprot_utils.py'
-    if sys.platform == 'linux' and (needed_file1.is_file() and os.access(needed_file1, os.X_OK)) and (needed_file2.is_file() and os.access(needed_file2, os.X_OK)):
-        sys.path.append(str(needed_path))
+    if sys.platform == 'linux' and (needed_file1.is_file() and os.access(needed_file1, os.R_OK)) and (
+            needed_file2.is_file() and os.access(needed_file2, os.R_OK)):
+        sys.path.append(str(config[organism]['yoram_path']))
         import uniprot_utils as uput
     else:
-        raise SystemError("Oncosplice Prototype can only be run on Power with access to the /tamir2/yoramzar/Projects/Cancer_mut/Utils folder.")
+        raise SystemError(
+            "Oncosplice Prototype can only be run on Power with access to the /tamir2/yoramzar/Projects/Cancer_mut/Utils folder.")
+    from .tis_utils import find_tis
     # Define async functions
     async def background_request(ensb_id, Uniprot_features=["Topological domain", "Transmembrane", "Domain"]):
         return uput.retrieve_protein_data_features_subset(uput.ensembl_id2uniprot_id(ensb_id), Uniprot_features)
+    def inspect_domain(row, modified_vector, conservation_vector):
+        v1, v2 = modified_vector[row.start:row.end], conservation_vector[row.start:row.end]
+        return pd.Series([f'{row.type}|{row.start}|{row.end}|{row.description}', sum(v1 * v2) / sum(v2)],
+                         index=['domain_identifier', 'score'])
     gene = Gene(mut_id.split(':')[0], organism=organism)
-    # request_thread = threading.Thread(target=background_request, args=(gene.transcript_ids, domains))
-    # request_thread.start()
-    mutation = get_mutation(mut_id, rev=gene.rev)
+    mutations = [get_mutation(mut_id, rev=gene.rev) for mut_id in mut_id.split('|')]
     results = []
     for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
         if not transcript.cons_available:
             continue
-        if mutation not in transcript:
+        if all(mutation not in transcript for mutation in mutations):
             results.append({'transcript_id': transcript.transcript_id})
             continue
-        task1 = asyncio.create_task(background_request(transcript.transcript_id))
+        task1 = asyncio.create_task(background_request(tid))
         transcript.generate_pre_mrna()
         transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
-        transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
+        transcript.generate_mature_mrna().generate_protein(inplace=True)
         ref_protein, cons_vector = transcript.protein, transcript.cons_vector
         reference_transcript = copy.deepcopy(transcript)
-        assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)} must be same length."
+        assert len(ref_protein) == len(
+            cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)} must be same length."
-        missplicing = Missplicing(find_transcript_missplicing(transcript, mutation, engine=engine), threshold=splicing_threshold)
-        transcript.pre_mrna += mutation
-        result1 = await task1
-        print(result1)
+        missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine),
+                                  threshold=splicing_threshold)
+        for mutation in mutations:
+            transcript.pre_mrna += mutation
+        domains_df = await task1
         for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
             transcript.acceptors = new_boundaries['acceptors']
             transcript.donors = new_boundaries['donors']
-            transcript.generate_mature_mrna().generate_protein()
+            transcript.generate_mature_mrna()
+            transcript.TIS = find_tis(ref_seq=reference_transcript, mut_seq=transcript)
+            transcript.generate_protein()
             alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
             deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
@@ -475,8 +486,11 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
             temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
             affected_cons_scores = max(temp_cons)
             percentile = (
-                        sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
-                    cons_vector))
+                    sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
+                cons_vector))
+            out = domains_df.apply(lambda row: inspect_domain(row, va, vb), axis=1)
+            domains_affected = '+'.join([f'{a}:{b}' for a, b in list(zip(out.domain_identifier, out.score))])
             report = OncospliceAnnotator(reference_transcript, transcript, mutation)
             report['mut_id'] = mut_id
@@ -486,13 +500,13 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
             report['isoform_prevalence'] = new_boundaries['path_weight']
             report['full_missplicing'] = missplicing.aberrant_splicing
             report['missplicing'] = max(missplicing)
+            report['domains_affected'] = domains_affected
             # report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
-            results.append(report)
+            results.append(pd.Series(report))
-    report = pd.DataFrame(results)
+    report = pd.concat(results, axis=1).T
     return report
 if __name__ == '__main__':
     pass

geney/seqmat_utils.py CHANGED Viewed

@@ -203,7 +203,7 @@ class SeqMat:
             return SeqMat('ATG')
     def translate(self, tis_index):
-        from Bio import Seq
+        from Bio.Seq import Seq
         return Seq(self.orf_seqmat(tis_index).seq).translate()

geney/tis_utils.py ADDED Viewed

@@ -0,0 +1,175 @@
+import numpy as np
+import pandas as pd
+import os
+from scipy.stats import percentileofscore
+import shelve
+from Bio.Align import PairwiseAligner
+from geney import config
+p = PairwiseAligner()
+def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
+    tis_coords = ref_seq.mature_mrna.asymmetric_indices(ref_seq.TIS, left_context=0, right_context=3)
+    ref_seq, mut_seq = ref_seq.mature_mrna, mut_seq.mature_mrna
+    # 1. Is the start codon (the indices) conserved in the mut sequence?
+    assert all(a in ref_seq.seqmat[1, :] for a in
+               tis_coords), f"Start codon indices specified not found in the reference sequence."
+    tis_conserved = all(a in mut_seq.seqmat[1, :] for a in tis_coords)
+    # 2. If condition 1 is passed, is the context around that start codon the same in both the reference and the mutated?
+    context_conserved = False
+    if tis_conserved:
+        context_conserved = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
+                                                      right_context=right_context,
+                                                      padding='$') == mut_seq.asymmetric_subseq(tis_coords[0],
+                                                                                                left_context=left_context,
+                                                                                                right_context=right_context,
+                                                                                                padding='$')
+        # 3. If condition 2 is not met, we perform a TIS reaquisition. If condition 2 is met, then we return the reference TIS to be used in the mutated sequence
+    if context_conserved:
+        return tis_coords[0]
+    # 4. Reaquisition of TIS follows:
+    #### The logic:
+    #  a. We need to find all possible start codon candidates as relative indices
+    #  b. We need to find what proteins each alternative start codon would create
+    #  c. We need to make sure we are only looking at a region around a mutation
+    #  d. We need the titer score rank relative to all titer score reference ranks and relative to the reference score
+    sc_table = pd.read_pickle(config['titer_path'] / 'titer_tis_scores.pickle')
+    # target_transcript = sc_table[sc_table.transcript_id == ref_id]
+    # if len(target_transcript) == 0:
+    ### reaquire TIS score for ref
+    # pass
+    ref_seq_tis_context = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
+                                                    right_context=right_context, padding='$')
+    # target_ref_titer_score = target_transcript.tis_score
+    ref_titer_score = retrieve_titer_score(ref_seq_tis_context)
+    ref_titer_rank = percentileofscore(sc_table['tis_score'], ref_titer_score)
+    ref_protein = ref_seq.translate(tis_coords[0])
+    candidate_positions = np.array([mut_seq.seq[i:i + 3] in TITER_acceptable_TISs for i in range(len(mut_seq.seq))])
+    candidate_positions = np.array(
+        [p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score if candidate_positions[i] == True else 0
+         for i in range(len(ref_seq.seq))])
+    candidate_positions = candidate_positions > sorted(candidate_positions)[-5]
+    candidate_positions = np.array([retrieve_titer_score(
+        mut_seq.asymmetric_subseq(tis_coords[0], left_context=left_context, right_context=right_context,
+                                  padding='$')) if candidate_positions[i] > 0 else False for i in
+                                    range(len(ref_seq.seq))])
+    candidate_positions = np.array(
+        [percentileofscore(sc_table.tis_score, candidate_positions[i]) if candidate_positions[i] != False else 100 for i
+         in range(len(ref_seq.seq))])
+    best_position = np.where(candidate_positions == min(candidate_positions))[0][0]
+    out = mut_seq.seqmat[1, best_position]
+    return out
+def seq_matrix(seq_list):
+    tensor = np.zeros((len(seq_list), 203, 8))
+    for i in range(len(seq_list)):
+        seq = seq_list[i]
+        j = 0
+        for s in seq:
+            if s == 'A' and (j < 100 or j > 102):
+                tensor[i][j] = [1, 0, 0, 0, 0, 0, 0, 0]
+            if s == 'T' and (j < 100 or j > 102):
+                tensor[i][j] = [0, 1, 0, 0, 0, 0, 0, 0]
+            if s == 'C' and (j < 100 or j > 102):
+                tensor[i][j] = [0, 0, 1, 0, 0, 0, 0, 0]
+            if s == 'G' and (j < 100 or j > 102):
+                tensor[i][j] = [0, 0, 0, 1, 0, 0, 0, 0]
+            if s == '$':
+                tensor[i][j] = [0, 0, 0, 0, 0, 0, 0, 0]
+            if s == 'A' and (j >= 100 and j <= 102):
+                tensor[i][j] = [0, 0, 0, 0, 1, 0, 0, 0]
+            if s == 'T' and (j >= 100 and j <= 102):
+                tensor[i][j] = [0, 0, 0, 0, 0, 1, 0, 0]
+            if s == 'C' and (j >= 100 and j <= 102):
+                tensor[i][j] = [0, 0, 0, 0, 0, 0, 1, 0]
+            if s == 'G' and (j >= 100 and j <= 102):
+                tensor[i][j] = [0, 0, 0, 0, 0, 0, 0, 1]
+            j += 1
+    return tensor
+def build_titer_model(TITER_path=config['titer_setup']):
+    print('Building TITER model...')
+    from tensorflow.keras.constraints import MaxNorm
+    from tensorflow.keras.layers import Conv1D, MaxPool1D, LSTM, Dropout, Flatten, Dense, Activation
+    from tensorflow.keras import Sequential, Input
+    model = Sequential()
+    model.add(Input(shape=(203, 8)))
+    model.add(Conv1D(filters=128,
+                     kernel_size=3,
+                     padding='valid',
+                     kernel_constraint=MaxNorm(3),
+                     activation='relu'))
+    model.add(MaxPool1D(3))
+    model.add(Dropout(rate=0.21370950078747658))
+    model.add(LSTM(units=256,
+                   return_sequences=True))
+    model.add(Dropout(rate=0.7238091317104384))
+    model.add(Flatten())
+    model.add(Dense(1))
+    model.add(Activation('sigmoid'))
+    model.compile(loss='binary_crossentropy',
+                  optimizer='nadam',
+                  metrics=['accuracy'])
+    models = []
+    # Load weights into multiple instances of the model
+    for i in range(32):
+        model_copy = Sequential(model.layers)  # Create a new model instance with the same architecture
+        weights_path = os.path.join(TITER_path, f"bestmodel_{i}.hdf5")
+        if os.path.exists(weights_path):
+            model_copy.load_weights(weights_path)  # Load weights into the new model instance
+            models.append(model_copy)
+            print(f"Loaded model {i} with weights from {weights_path}")
+        else:
+            print(f"Warning: Weights file {weights_path} not found")
+    return models
+def calculate_titer_score(candidate_seq, titer_model=None):  # , prior):
+    if titer_model is None:
+        titer_model = TITER_MODEL
+    processed_seq = seq_matrix([candidate_seq])  # Wrap in list to keep dimensions consistent
+    # prior = np.array([prior]).reshape(1, 1)
+    analyzed_score = np.zeros((1, 1))
+    # Iterate through the models (assuming 32 models) and calculate the score
+    for i in range(32):
+        y_pred = titer_model[i].predict(processed_seq, verbose=0)
+        analyzed_score += y_pred  # * prior
+    print(analyzed_score)
+    return analyzed_score[0][0]
+def retrieve_titer_score(sequence, filename='sequences_shelve.db'):
+    # Open the shelf (acts like a dictionary, stored in a file)
+    with shelve.open(filename) as db:
+        # Check if sequence is already in the shelf
+        if sequence in db:
+            return db[sequence]
+        else:
+            # If not, run the function, store the result, and return it
+            value = calculate_titer_score(sequence, TITER_MODEL)
+            db[sequence] = value
+            return value
+TITER_acceptable_TISs = ['ATG', 'CTG', 'ACG', 'TTG', 'GTG']
+codon_tis_prior = {'ATG': 3.5287101354987644, 'CTG': 1.746859242328512, 'ACG': 1.3535552403706805,
+                   'TTG': 1.1364995562364615, 'GTG': 1.218573747658257}
+stop_codons = ['TAA', 'TAG', 'TGA']
+TITER_MODEL = build_titer_model()

geney/translation_initiation/tis_utils.py CHANGED Viewed

@@ -120,3 +120,5 @@ def get_end_codon(seq, start_position):
 def calculate_titer_score(seq, pos):
     return 0

{geney-1.2.32.dist-info → geney-1.2.34.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.2.32
+Version: 1.2.34
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn
@@ -13,7 +13,7 @@ Classifier: License :: Free for non-commercial use
 Classifier: Operating System :: POSIX :: Linux
 Classifier: Operating System :: MacOS
 Classifier: Programming Language :: Python :: 3.9
-Requires-Python: >3.9
+Requires-Python: ==3.10
 Requires-Dist: numpy
 Requires-Dist: pandas
 Requires-Dist: networkx

{geney-1.2.32.dist-info → geney-1.2.34.dist-info}/RECORD RENAMED Viewed

@@ -1,25 +1,26 @@
 geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
 geney/__init__.py,sha256=eBdDl42N6UhcYeZDjOnv199Z88fI5_8Y6xW8447OKXM,755
-geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
+geney/config_setup.py,sha256=klm_k7Ca_703DpeGBcGoDqz1XwHQhNXENPKjj_xfSQw,608
 geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
 geney/graphic_utils.py,sha256=tjm6IDQ1BdfSeuPYzjlqAUHFQoDYH9jXTzJjKFS4Hh4,11078
 geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
 geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
-geney/oncosplice.py,sha256=7wf0_-Gkc_G9HhUXjORHk3buZ66JzVzSFVQ4EZOtUAE,21787
+geney/oncosplice.py,sha256=QETLNIzc3T1CYausLD3W_jCSJveDkg2F6WnIMagVLT0,22536
 geney/pangolin_utils.py,sha256=ETTGpuaQgdZ1v8H0NP8sbTEfGWu0VXUFUS7wsURsTc4,2991
 geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
-geney/seqmat_utils.py,sha256=TDWhE5oVTGJceaO6YmE7I_BEWRxWLT74_3rkmY1M0Fs,18368
+geney/seqmat_utils.py,sha256=YV5DFLbfjXLIswPGvqK1-eEfwn9TUby0b2kewdGAKws,18372
 geney/spliceai_utils.py,sha256=gIGPC8u3J15A7EQrk2Elho5PbF9MmUUNopGGH-eEV8s,1873
 geney/splicing_utils.py,sha256=q47EdcsHrp4aLIPVWvkGBJSzS3l3DKiD9DNDsPpZdHk,16075
 geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
 geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
+geney/tis_utils.py,sha256=GlzyO_QvMFt5tM4kewQ1L2l1KAYrCixgw8ny_WsGsYQ,8040
 geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
 geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-geney/translation_initiation/tis_utils.py,sha256=iXrWVijyPe-f8I9rEVGdxNnXBrOGPoKFjmvaOEnQYNE,4446
+geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
 geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
-geney-1.2.32.dist-info/METADATA,sha256=aHeSBHWq3b1li4G_CI2ClUEHJc5SfWHowqKrkZbQPGk,948
-geney-1.2.32.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
-geney-1.2.32.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
-geney-1.2.32.dist-info/RECORD,,
+geney-1.2.34.dist-info/METADATA,sha256=LfYqiCiEw25eyzdGGYy2OrJ7rGC05l1lnaF8eupWrTE,950
+geney-1.2.34.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
+geney-1.2.34.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.2.34.dist-info/RECORD,,

{geney-1.2.32.dist-info → geney-1.2.34.dist-info}/WHEEL RENAMED Viewed

File without changes

{geney-1.2.32.dist-info → geney-1.2.34.dist-info}/top_level.txt RENAMED Viewed

File without changes

geney 1.2.32__py2.py3-none-any.whl → 1.2.34__py2.py3-none-any.whl

Potentially problematic release.

geney 1.2.32__py2.py3-none-any.whl → 1.2.34__py2.py3-none-any.whl