PyPI - geney - Versions diffs - 1.2.40__py2.py3-none-any.whl → 1.2.41__py2.py3-none-any.whl - Mend

geney 1.2.40__py2.py3-none-any.whl → 1.2.41__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

geney/graphic_utils.py +1 -2
geney/oncosplice.py +10 -6
geney/tcga_utils.py +66 -33
{geney-1.2.40.dist-info → geney-1.2.41.dist-info}/METADATA +1 -1
{geney-1.2.40.dist-info → geney-1.2.41.dist-info}/RECORD +7 -7
{geney-1.2.40.dist-info → geney-1.2.41.dist-info}/WHEEL +0 -0
{geney-1.2.40.dist-info → geney-1.2.41.dist-info}/top_level.txt +0 -0

geney/graphic_utils.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import matplotlib.pyplot as plt
 from matplotlib.patches import Rectangle
 import seaborn as sns
 from collections import namedtuple
-from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle, contains, unload_json, dump_json #, is_monotonic
+from geney.utils import unload_pickle, contains, unload_json, dump_json
 ### Graphical Stuff

geney/oncosplice.py CHANGED Viewed

@@ -331,19 +331,19 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
     report['primary_transcript'] = reference_transcript.primary_transcript
     report['transcript_id'] = reference_transcript.transcript_id
     # report['mut_id'] = mut.mut_id
-    report['cons_available'] = int(reference_transcript.cons_available)
+    # report['cons_available'] = int(reference_transcript.cons_available)
     # report['protein_coding'] = reference_transcript.transcript_biotype
     # report['reference_mrna'] = reference_transcript.transcript_seq
-    report['reference_cds_start'] = reference_transcript.TIS
+    # report['reference_cds_start'] = reference_transcript.TIS
     # report['reference_pre_mrna'] = reference_transcript.pre_mrna
     # report[
     #     'reference_orf'] = reference_transcript.orf  # pre_mrna[reference_transcript.transcript_indices.index(reference_transcript.TIS):reference_transcript.transcript_indices.index(reference_transcript.TTS)]
     report['reference_protein'] = reference_transcript.protein
-    report['reference_protein_length'] = len(reference_transcript.protein)
+    # report['reference_protein_length'] = len(reference_transcript.protein)
     # report['variant_mrna'] = variant_transcript.transcript_seq
-    report['variant_cds_start'] = variant_transcript.TIS
+    # report['variant_cds_start'] = variant_transcript.TIS
     # report[
     #     'variant_pre_mrna'] = variant_transcript.pre_mrna  # pre_mrna[variant_transcript.transcript_indices.index(variant_transcript.TIS):variant_transcript.transcript_indices.index(variant_transcript.TTS)]
     # report['variant_orf'] = variant_transcript.orf
@@ -363,6 +363,8 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
 def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
     gene = Gene(mut_id.split(':')[0], organism=organism)
+    reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
     mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
     results = []
@@ -408,7 +410,7 @@ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_tran
             report['isoform_prevalence'] = new_boundaries['path_weight']
             report['full_missplicing'] = missplicing.aberrant_splicing
             report['missplicing'] = max(missplicing)
-            # report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
+            report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
             results.append(report)
     report = pd.DataFrame(results)
@@ -445,6 +447,8 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
                          index=['domain_identifier', 'score'])
     gene = Gene(mut_id.split(':')[0], organism=organism)
+    reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
     mutations = [get_mutation(mut_id, rev=gene.rev) for mut_id in mut_id.split('|')]
     results = []
@@ -501,7 +505,7 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
             report['full_missplicing'] = missplicing.aberrant_splicing
             report['missplicing'] = max(missplicing)
             report['domains_affected'] = domains_affected
-            # report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
+            report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
             results.append(pd.Series(report))
     report = pd.concat(results, axis=1).T

geney/tcga_utils.py CHANGED Viewed

@@ -2,6 +2,7 @@
 import pandas as pd
 import random
 from pathlib import Path
+from tqdm import tqdm
 class TCGACase:
     def __init__(self, df):
@@ -98,38 +99,61 @@ class TCGACase:
 class TCGAGene:
     def __init__(self, gene, cancer_path=Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes/'),
                  valid_cases=None, extra_cols=[], exclude_filters=None, include_filter=None):
-        df = pd.read_csv(cancer_path / gene / 'GeneMutTble.txt',
-                         usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
-                                  'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
-                                  'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
-                                  'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type'] + extra_cols,
-                         low_memory=False).sort_values('Start_Position', ascending=True)
-        if df.empty:
-            self.df = df
+        file_path = cancer_path / gene / 'GeneMutTble.txt'
+        if not file_path.exists():
+            self.df = pd.DataFrame()
         else:
-            df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
-            df = df.astype({'Start_Position': int})
-            if include_filter is not None:
-                df = df[df.FILTER == include_filter]
+            df = pd.read_csv(file_path,
+                             usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
+                                      'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
+                                      'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
+                                      'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type',
+                                      'Variant_Classification'] + extra_cols,
+                             low_memory=False).sort_values('Start_Position', ascending=True)
-            elif exclude_filters is not None:
-                for exclude_filter in exclude_filters:
-                    df = df[~df.FILTER.str.contains(exclude_filter)]
+            df['attention'] = True
-            if valid_cases is not None:
-                df = df[df.case_id.isin(valid_cases)]
+            if df.empty:
+                self.df = df
-            df['mut_id'] = df.apply(lambda
-                                        row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
-                                    axis=1)
-            df['ratio'] = df.t_alt_count + df.t_ref_count
-            df = df[df.ratio > 0]
-            df['ratio'] = df.t_alt_count / df.ratio
-            self.df = df
+            else:
+                df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
+                df = df.astype({'Start_Position': int})
+                if include_filter is not None:
+                    # df = df[df.FILTER == include_filter]
+                    df.loc[~df['FILTER'].str.contains(include_filter), 'attention'] = False
+                elif exclude_filters is not None:
+                    for exclude_filter in exclude_filters:
+                        # df = df[~df.FILTER.str.contains(exclude_filter)]
+                        df.loc[df['FILTER'].str.contains(exclude_filter), 'attention'] = False
+                if valid_cases is not None:
+                    # df = df[df.case_id.isin(valid_cases)]
+                    df.loc[~df.case_id.isin(valid_cases), 'attention'] = False
+                df['mut_id'] = df.apply(lambda
+                                            row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
+                                        axis=1)
+                df['mut_id_yoram'] = df.apply(lambda
+                                                  row: f"{row.Gene_name}:{row.Chromosome}:{row.Variant_Classification}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
+                                              axis=1)
+                silent_mut_classes = ["3'Flank", "3'UTR", "Silent", "Splice_Site", "Splice_Region", "Intron", "5'Flank",
+                                      "3'Flank"]
+                df['silent'] = df.apply(lambda row: row.Variant_Classification in silent_mut_classes, axis=1)
+                df['ratio'] = df.t_alt_count + df.t_ref_count
+                df = df[df.ratio > 0]
+                df['ratio'] = df.t_alt_count / df.ratio
+                self.df = df
+    def __repr__(self):
+        return repr(self.df[self.df.attention])
+    @property
+    def data(self):
+        return self.df[self.df.attention]
     def affected_cases(self, mut_id=None, read_ratio=0, filters=[]):
         if mut_id is None:
@@ -164,18 +188,27 @@ class TCGAGene:
     def total_prevalence(self, mut_id):
         pass
-    def project_prevalence(self, mut_id):
-        pass
+    def project_prevalence(self, mut_id, df_p_proc):
+        mut_prevalence = {}
+        for i, g in tqdm(self.data.groupby(['mut_id', 'Transcript_ID'])):
+            mut_prevalence[i] = series_to_pretty_string((df_p_proc[g.case_id].value_counts() / project_counts).dropna())
+        return pd.Series(mut_prevalence)
     def project_counts(self, mut_id):
         pass
+    def filter_silent_muts(self):
+        self.df.loc[self.df.silent, 'attention'] = False
+        return self
+def series_to_pretty_string(series):
+    # Format each index-value pair, applying scientific notation to floats with 3 significant figures
+    pretty_str = "\n".join([
+        f"{index}: {value:.3e}" if isinstance(value, float) else f"{index}: {value}"
+        for index, value in series.items()
+    ])
+    return pretty_str
 # CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')

{geney-1.2.40.dist-info → geney-1.2.41.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.2.40
+Version: 1.2.41
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn

{geney-1.2.40.dist-info → geney-1.2.41.dist-info}/RECORD RENAMED Viewed

@@ -2,25 +2,25 @@ geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
 geney/__init__.py,sha256=eBdDl42N6UhcYeZDjOnv199Z88fI5_8Y6xW8447OKXM,755
 geney/config_setup.py,sha256=klm_k7Ca_703DpeGBcGoDqz1XwHQhNXENPKjj_xfSQw,608
 geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
-geney/graphic_utils.py,sha256=tjm6IDQ1BdfSeuPYzjlqAUHFQoDYH9jXTzJjKFS4Hh4,11078
+geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
 geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
 geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
-geney/oncosplice.py,sha256=J_nFs_xBSJtgMqeHv628QodRL0B2d-Zi1Ke7Pk7S4R4,22595
+geney/oncosplice.py,sha256=1K8p-sytnMUKTYwO_z_YJLelLosKj8TZpM0i5lHcMFI,22941
 geney/pangolin_utils.py,sha256=lLmnjJdJjqwWS85-1jlPLIjD2z14sWjzU87hS-8xxpQ,2873
 geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
 geney/seqmat_utils.py,sha256=YV5DFLbfjXLIswPGvqK1-eEfwn9TUby0b2kewdGAKws,18372
 geney/spliceai_utils.py,sha256=gIGPC8u3J15A7EQrk2Elho5PbF9MmUUNopGGH-eEV8s,1873
 geney/splicing_utils.py,sha256=lGBNknnAdKhcJ3MqPQ5c9oz_NKcL2lcFAr78StjKa6o,16151
 geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
-geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
+geney/tcga_utils.py,sha256=wM52QZ1M_54CrXZ_uj05R14ycZh23gTZUI8b0ZMtPd0,17615
 geney/tis_utils.py,sha256=vA2ci4gNfwwQZlCjPpO5ehvL2NRVeM7lHI_VyfT-_10,8049
 geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
 geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
 geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
-geney-1.2.40.dist-info/METADATA,sha256=ja7ULYnyNPbYYj-wloXQzHDH86TL2mg4LfgEmZaMcbE,948
-geney-1.2.40.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
-geney-1.2.40.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
-geney-1.2.40.dist-info/RECORD,,
+geney-1.2.41.dist-info/METADATA,sha256=e7eHu8HlNdNuNXLWxK17ok3lAetzKTJ7ie-8MRct1T8,948
+geney-1.2.41.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
+geney-1.2.41.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.2.41.dist-info/RECORD,,

{geney-1.2.40.dist-info → geney-1.2.41.dist-info}/WHEEL RENAMED Viewed

File without changes

{geney-1.2.40.dist-info → geney-1.2.41.dist-info}/top_level.txt RENAMED Viewed

File without changes

geney 1.2.40__py2.py3-none-any.whl → 1.2.41__py2.py3-none-any.whl

geney 1.2.40__py2.py3-none-any.whl → 1.2.41__py2.py3-none-any.whl