PyPI - geney - Versions diffs - 1.1.14__py2.py3-none-any.whl → 1.1.15__py2.py3-none-any.whl - Mend

geney 1.1.14__py2.py3-none-any.whl → 1.1.15__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of geney might be problematic. Click here for more details.

Files changed (7)

geney/immune_utils.py +12 -14
geney/oncosplice.py +2 -2
geney/oncosplice_mouse.py +277 -0
{geney-1.1.14.dist-info → geney-1.1.15.dist-info}/METADATA +4 -1
{geney-1.1.14.dist-info → geney-1.1.15.dist-info}/RECORD +7 -6
{geney-1.1.14.dist-info → geney-1.1.15.dist-info}/WHEEL +0 -0
{geney-1.1.14.dist-info → geney-1.1.15.dist-info}/top_level.txt +0 -0

geney/immune_utils.py CHANGED Viewed

@@ -2,6 +2,8 @@ import subprocess
 import logging
 import tempfile
 from geney import config_setup
+import re
+from io import StringIO
 import pandas as pd
@@ -37,17 +39,17 @@ class NetChop(object):
                 logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
                 raise
         parsed = self.parse_netchop(output)
-        return parsed
+        # return parsed
         #
-        # assert len(parsed) == len(sequences), \
-        #     "Expected %d results but got %d" % (
-        #         len(sequences), len(parsed))
-        # assert [len(x) for x in parsed] == [len(x) for x in sequences]
-        # filtered_proteosomes = []
-        # for scores, seq in list(zip(parsed, sequences)):
-        #     proteosome = self.chop_protein(seq, [s > threshold for s in scores])
-        #     filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
-        # return filtered_proteosomes
+        assert len(parsed) == len(sequences), \
+            "Expected %d results but got %d" % (
+                len(sequences), len(parsed))
+        assert [len(x) for x in parsed] == [len(x) for x in sequences]
+        filtered_proteosomes = []
+        for scores, seq in list(zip(parsed, sequences)):
+            proteosome = self.chop_protein(seq, [s > threshold for s in scores])
+            filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
+        return filtered_proteosomes
     @staticmethod
     def parse_netchop(netchop_output):
         """
@@ -99,10 +101,6 @@ class NetChop(object):
         return pd.DataFrame(cut_sequences)
-import re
-import StringIO
-import pandas as pd
 def run_mhc(sequences):
     with tempfile.NamedTemporaryFile(dir='/tamir2/nicolaslynn/temp', suffix=".pep", mode="w") as input_fd:
         for (i, sequence) in enumerate(sequences):

geney/oncosplice.py CHANGED Viewed

@@ -530,8 +530,8 @@ class Transcript:
         for i, j in self.exons_pos:
             rel_start, rel_end = pre_indices_pos.index(i), pre_indices_pos.index(j)
             mature_mrna_pos += pre_seq_pos[rel_start:rel_end + 1]
-            pre_indices_pos.extend(pre_indices_pos[rel_start:rel_end + 1])
-        return mature_mrna_pos, pre_indices_pos
+            mature_indices_pos.extend(pre_indices_pos[rel_start:rel_end + 1])
+        return mature_mrna_pos, mature_indices_pos
     def generate_mature_mrna(self, inplace=True):
         if inplace:

geney/oncosplice_mouse.py ADDED Viewed

@@ -0,0 +1,277 @@
+from geney.oncosplice import *
+from copy import deepcopy
+import pandas as pd
+import numpy as np
+from geney.Fasta_segment import Fasta_segment
+import torch
+config_setup = { "BASE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse",
+                 "ONCOSPLICE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/oncosplice",
+                 "CHROM_SOURCE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/chromosomes",
+                 "MRNA_PATH": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/annotations",
+                 "MISSPLICING_PATH": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/missplicing"}
+from pkg_resources import resource_filename
+from pangolin.model import *
+IN_MAP = np.asarray([[0, 0, 0, 0],
+                     [1, 0, 0, 0],
+                     [0, 1, 0, 0],
+                     [0, 0, 1, 0],
+                     [0, 0, 0, 1]])
+INDEX_MAP = {0:1, 1:2, 2:4, 3:5, 4:7, 5:8, 6:10, 7:11}
+model_nums = [1, 3, 5, 7]
+models = []
+for i in model_nums:
+    for j in range(1, 6):
+        model = Pangolin(L, W, AR)
+        if torch.cuda.is_available():
+            model.cuda()
+            weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)))
+        else:
+            weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)),
+                                 map_location=torch.device('cpu'))
+        model.load_state_dict(weights)
+        model.eval()
+        models.append(model)
+def one_hot_encode(seq, strand='+'):
+    seq = seq.upper().replace('A', '1').replace('C', '2')
+    seq = seq.replace('G', '3').replace('T', '4').replace('N', '0')
+    if strand == '+':
+        seq = np.asarray(list(map(int, list(seq))))
+    elif strand == '-':
+        seq = np.asarray(list(map(int, list(seq[::-1]))))
+        seq = (5 - seq) % 5  # Reverse complement
+    return IN_MAP[seq.astype('int8')]
+def run_pangolin_seq(seq):
+    seq = one_hot_encode(seq, '+').T
+    seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()
+    if torch.cuda.is_available():
+        seq = seq.to(torch.device("cuda"))
+    score = []
+    for j, model_num in enumerate(model_nums):
+        # score = []
+        # Average across 5 models
+        for model in models[5*j:5*j+5]:
+            with torch.no_grad():
+                score.append(model(seq)[0][INDEX_MAP[model_num],:].cpu().numpy())
+    return np.mean(score, axis=0)
+# Missplicing Detection
+def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
+    '''
+    :param ref_dct:  the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the reference sequence
+    :param mut_dct:  the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the mutated sequence
+    :param known_splice_sites: the indices (by genomic position) that serve as known splice sites
+    :param threshold: the threshold for detection (difference between reference and mutated probabilities)
+    :return: two dictionaries; discovered_pos is a dictionary containing all the positions that meat the threshold for discovery
+            and deleted_pos containing all the positions that meet the threshold for missing and the condition for missing
+    '''
+    new_dict = {v: mut_dct.get(v, 0) - ref_dct.get(v, 0) for v in
+                list(set(list(ref_dct.keys()) + list(mut_dct.keys())))}
+    discovered_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct[k]), 3)} for k, v in
+                      new_dict.items() if v >= threshold and k not in known_splice_sites}   # if (k not in known_splice_sites and v >= threshold) or (v > 0.45)}
+    deleted_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct.get(k, 0)), 3)} for k, v in
+                   new_dict.items() if -v >= threshold and k in known_splice_sites}      #if k in known_splice_sites and v <= -threshold}
+    return discovered_pos, deleted_pos
+def run_pangolin_comparison(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
+    positions = mutations.positions
+    end_positions = [m.start + len(m.ref) for m in mutations.variants]
+    positions.extend(end_positions)
+    seq_start_pos = min(positions) - sai_mrg_context - min_coverage
+    seq_end_pos = max(positions) + sai_mrg_context + min_coverage
+    fasta_obj = Fasta_segment()
+    ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
+        config_setup['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
+        seq_start_pos,
+        seq_end_pos)
+    transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
+    start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
+    end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
+    end_pad = len(ref_indices) - end_cutoff
+    ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
+    ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
+    mut_seq, mut_indices = ref_seq, ref_indices
+    for mut in mutations:
+        mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
+    ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
+    mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
+    visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
+    visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
+    if rev:
+        ref_seq = reverse_complement(ref_seq)
+        mut_seq = reverse_complement(mut_seq)
+        ref_indices = ref_indices[::-1]
+        mut_indices = mut_indices[::-1]
+    ref_seq_probs = run_pangolin_seq(ref_seq)
+    mut_seq_probs = run_pangolin_seq(mut_seq)
+    assert len(ref_indices) == len(ref_seq_probs), 'Reference pos not the same'
+    assert len(mut_indices) == len(mut_seq_probs), 'Mut pos not the same'
+    iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_probs))},
+                               {p: v for p, v in list(zip(mut_indices, mut_seq_probs))},
+                               visible_acceptors,
+                               threshold=sai_threshold)
+    idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_probs))},
+                               {p: v for p, v in list(zip(mut_indices, mut_seq_probs))},
+                               visible_donors,
+                               threshold=sai_threshold)
+    ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_probs))}
+    ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_probs))}
+    lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in visible_acceptors if p not in mut_indices and p not in dap}
+    lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors if p not in mut_indices and p not in ddp}
+    dap.update(lost_acceptors)
+    ddp.update(lost_donors)
+    missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
+    missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
+    return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
+class PredictPangolin:
+    def __init__(self, mutation, gene_data,
+                threshold=0.5, context=5000, coverage=2500):
+        self.modification = mutation
+        self.threshold = threshold
+        self.transcript_id = gene_data.transcript_id
+        self.spliceai_db = config_setup['MISSPLICING_PATH'] / f'spliceai_epistatic'
+        self.missplicing = {}
+        self.missplicing = run_pangolin_comparison(self.modification, transcript_data=gene_data, sai_mrg_context=context, min_coverage=coverage, sai_threshold=0.1)
+    def __repr__(self):
+        return f'Missplicing({self.modification.mut_id}) --> {self.missplicing}'
+    def __str__(self):
+        return self.aberrant_splicing
+    def __bool__(self):
+        for event, details in self.aberrant_splicing.items():
+            if details:
+                return True
+        return False
+    def __eq__(self, alt_splicing):
+        flag, _ = check_splicing_difference(self.missplicing, alt_splicing, self.threshold)
+        return not flag
+    def __iter__(self):
+        penetrances = [abs(d_in['delta']) for d in self.missplicing.values() for d_in in d.values()] + [0]
+        return iter(penetrances)
+    @property
+    def aberrant_splicing(self):
+        return self.apply_sai_threshold(self.missplicing, self.threshold)
+    def apply_sai_threshold(self, splicing_dict=None, threshold=None):
+        splicing_dict = self.missplicing if not splicing_dict else splicing_dict
+        threshold = self.threshold if not threshold else threshold
+        new_dict = {}
+        for event, details in splicing_dict.items():
+            for e, d in details.items():
+                if abs(d['delta']) >= threshold:
+                    return splicing_dict
+        return new_dict
+    def apply_sai_threshold_primary(self, splicing_dict=None, threshold=None):
+        splicing_dict = self.missplicing if not splicing_dict else splicing_dict
+        threshold = self.threshold if not threshold else threshold
+        new_dict = {}
+        for event, details in splicing_dict.items():
+            new_dict_in = {}
+            for e, d in details.items():
+                if abs(d['delta']) >= threshold:
+                    new_dict_in[e] = d
+            new_dict[event] = new_dict_in
+        return new_dict
+    def get_max_missplicing_delta(self):
+        max_delta = 0
+        for event, details in self.missplicing.items():
+            for e, d in details.items():
+                if abs(d['delta']) > max_delta:
+                    max_delta = abs(d['delta'])
+        return max_delta
+def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, per_transcript_missplicing=False, window_length=13, save_spliceai_results=False, force_spliceai=False):
+    mutation = Variations(mut_id)
+    try:
+        reference_gene = Gene(mutation.gene)
+    except FileNotFoundError:
+        return pd.DataFrame()
+    reference_gene_proteines = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
+    mutated_gene = Gene(mutation.gene, mut_id)
+    results = []
+    for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
+        reference = reference_gene.transcript(variant.transcript_id)
+        if mutation not in reference or reference.protein == '' or len(reference.protein) < window_length:
+            continue
+        cons_vector = transform_conservation_vector(reference.cons_vector, window=window_length)
+        # if per_transcript_missplicing:
+        missplicing_obj = PredictSpliceAI(mutation, reference, threshold=sai_threshold, force=force_spliceai, save_results=save_spliceai_results)
+        missplicing = missplicing_obj.apply_sai_threshold_primary(threshold=sai_threshold)
+        # print(missplicing)
+        for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
+            variant_isoform = deepcopy(variant)
+            variant_isoform.reset_acceptors(acceptors=new_boundaries['acceptors']).reset_donors(donors=new_boundaries['donors']).organize().generate_protein()
+            alignment = get_logical_alignment(reference.protein, variant_isoform.protein)
+            deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
+            modified_positions = find_modified_positions(len(reference.protein), deleted, inserted)
+            temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
+            affected_cons_scores = max(temp_cons)
+            percentile = (
+                        sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
+                    cons_vector))
+            report = OncospliceAnnotator(reference, variant_isoform, mutation)
+            report['original_cons'] = reference.cons_vector
+            report['oncosplice_score'] = affected_cons_scores
+            report['percentile'] = percentile
+            report['modified_positions'] = modified_positions
+            report['cons_vector'] = cons_vector
+            report['isoform_id'] = i
+            report['isoform_prevalence'] = new_boundaries['path_weight']
+            report['full_missplicing'] = missplicing
+            report['missplicing'] = max(missplicing_obj)
+            report['reference_resemblance'] = reference_gene_proteines.get(variant_isoform.protein, None)
+            results.append(report)
+    report = pd.DataFrame(results)
+    return report

{geney-1.1.14.dist-info → geney-1.1.15.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.1.14
+Version: 1.1.15
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn
@@ -27,9 +27,12 @@ Requires-Dist: joblib ==1.3.2
 Requires-Dist: gtfparse ==1.3.0
 Requires-Dist: sh ==2.0.6
 Requires-Dist: termplotlib ==0.3.9
+Requires-Dist: torch
 Requires-Dist: lifelines
 Requires-Dist: notebook
 Requires-Dist: matplotlib
 Requires-Dist: dask[complete]
 Requires-Dist: dask-jobqueue
+Requires-Dist: gffutils
+Requires-Dist: pyfastx

{geney-1.1.14.dist-info → geney-1.1.15.dist-info}/RECORD RENAMED Viewed

@@ -7,9 +7,10 @@ geney/config_setup.py,sha256=SePeooA4RWAtR_KAT1-W1hkD3MT5tH6YMyp80t_RNPQ,385
 geney/data_setup.py,sha256=DZeksRPr2ZT7bszMo33W0r3OwmqHokVXtZ4gx5Lu_Mo,10725
 geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
-geney/immune_utils.py,sha256=elxjQyB52lYXrrt3sX6vtYlr_pTFEeCFzmEMP2qlPwA,5300
+geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
 geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
-geney/oncosplice.py,sha256=hVSsQulgER5NZtmQB59LTLl5tOWPeWcpOpHquW_Z-DM,68965
+geney/oncosplice.py,sha256=vHKRq5Zkc0qhsMAe8sZKbGjjK6-Wgk_Si0EDHUU_BOY,68971
+geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
 geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
 geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
 geney/power_utils.py,sha256=6InuDm1jSrsgR-F_LmdMTbuQwty2OdYjwfGGaAPhaRI,7268
@@ -44,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
 geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-geney-1.1.14.dist-info/METADATA,sha256=zIhA9HkRpvesCUHmRo9Aml2qSmXBEYa6XBsISeWTtt0,1131
-geney-1.1.14.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
-geney-1.1.14.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
-geney-1.1.14.dist-info/RECORD,,
+geney-1.1.15.dist-info/METADATA,sha256=DMZ8ovJT_dpSe2rmM_m2LAc9nIZsOf4VUlLE__kscfY,1199
+geney-1.1.15.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+geney-1.1.15.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.1.15.dist-info/RECORD,,

{geney-1.1.14.dist-info → geney-1.1.15.dist-info}/WHEEL RENAMED Viewed

File without changes

{geney-1.1.14.dist-info → geney-1.1.15.dist-info}/top_level.txt RENAMED Viewed

File without changes

geney 1.1.14__py2.py3-none-any.whl → 1.1.15__py2.py3-none-any.whl

Potentially problematic release.

geney 1.1.14__py2.py3-none-any.whl → 1.1.15__py2.py3-none-any.whl