PyPI - geney - Versions diffs - 1.2.3__py2.py3-none-any.whl → 1.2.5__py2.py3-none-any.whl - Mend

geney 1.2.3__py2.py3-none-any.whl → 1.2.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of geney might be problematic. Click here for more details.

Files changed (8)

geney/data_setup.py +8 -2
geney/oncosplice.py +15 -20
geney/power_utils.py +3 -3
geney/tcga_utils.py +2 -2
{geney-1.2.3.dist-info → geney-1.2.5.dist-info}/METADATA +2 -2
{geney-1.2.3.dist-info → geney-1.2.5.dist-info}/RECORD +8 -8
{geney-1.2.3.dist-info → geney-1.2.5.dist-info}/WHEEL +0 -0
{geney-1.2.3.dist-info → geney-1.2.5.dist-info}/top_level.txt +0 -0

geney/data_setup.py CHANGED Viewed

@@ -7,7 +7,7 @@ from tqdm import tqdm
 import requests
 import argparse
 from sh import gunzip
+import shutil
 def download(external_url, local_path):
     print(f"Grabbing {external_url}")
@@ -94,6 +94,7 @@ def process_transcript(transcript_df, rev, chrm, cons_data):
         cds_start, cds_end = cds_start[0], cds_end[0]
         data.update({'TIS': cds_start, 'TTS': cds_end, 'protein_id': transcript.protein_id})
+    print(f"{transcript.transcript_id} in cons_data: {transcript.transcript_id in cons_data}")
     if transcript.transcript_id in cons_data:
         data.update({'cons_available': True, 'cons_vector': cons_data[transcript.transcript_id]['scores'], 'cons_seq': cons_data[transcript.transcript_id]['seq']})
@@ -104,6 +105,8 @@ def process_transcript(transcript_df, rev, chrm, cons_data):
 def retrieve_and_parse_ensembl_annotations(local_path, annotations_file, cons_data, gtex_file='', valid_biotypes=('protein_coding')):
+    print(cons_data.keys())
     if gtex_file:
         gtex_df = pd.read_csv(gtex_file, delimiter='\t', header=2)
         gtex_df.Name = gtex_df.apply(lambda row: row.Name.split('.')[0], axis=1)
@@ -225,21 +228,24 @@ def main():
         'TEMP': os.path.join(args.basepath, args.organism, 'temp')
     }
+    base_path = Path(args.basepath) / args.organism
     if config_file.exists():
         config_data = unload_json(config_file)
         overwrite = 'y'
         if args.organism in config_data:
             overwrite = input("Organism {args.organism} already configured... Overwrite? (y/n)")
         if overwrite == 'y':
             config_data[args.organism] = config_paths
             dump_json(config_file, config_data)
+            shutil.rmtree(base_path)
         else:
             raise SystemExit("Exiting configuration.")
     else:
         config_data = {args.organism: config_paths}
         dump_json(config_file, config_data)
-    base_path = Path(args.basepath) / args.organism
     if base_path.exists() and len(os.listdir(base_path)) > 0:
         raise FileExistsError(f"Directory {base_path} not empty.")

geney/oncosplice.py CHANGED Viewed

@@ -35,7 +35,7 @@ import torch
 from pkg_resources import resource_filename
 from pangolin.model import *
-pang_model_nums = [0, 2, 4, 6]
+pang_model_nums = [0, 1, 2, 3, 4, 5, 6]
 pang_models = []
 for i in pang_model_nums:
     for j in range(1, 6):
@@ -752,7 +752,7 @@ def pangolin_predict_probs(true_seq, models):
     splicing_pred = np.array(scores).max(axis=0)
     donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
     acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
-    return donor_probs, acceptor_probs
+    return donor_probs[5000:-5000], acceptor_probs[5000:-5000]
 def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
     positions = mutations.positions
@@ -803,9 +803,11 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
         mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
     elif engine == 'pangolin':
-        ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models)
-        mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models)
+        ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models=pang_models)
+        mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models=pang_models)
+    else:
+        raise ValueError(f"{engine} not implemented")
     assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
     assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
@@ -1400,18 +1402,15 @@ def moving_average_conv(vector, window_size, factor=1):
     return np.convolve(vector, np.ones(window_size), mode='same') / window_size
-def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, per_transcript_missplicing=False, window_length=13, save_spliceai_results=False, force_spliceai=False):
+def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, save_spliceai_results=False, force_spliceai=False, organism='hg38'):
     mutation = Variations(mut_id)
-    try:
-        reference_gene = Gene(mutation.gene)
-    except FileNotFoundError:
-        return pd.DataFrame()
+    # try:
+    reference_gene = Gene(mutation.gene, organism=organism)
+    # except FileNotFoundError:
+    #     return pd.DataFrame()
-    reference_gene_proteines = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
-    mutated_gene = Gene(mutation.gene, mut_id)
-    # if not per_transcript_missplicing:
-    #     missplicing_obj = PredictSpliceAI(mutation, reference_gene, threshold=sai_threshold, force=True, save_results=False)
-    #     missplicing = missplicing_obj.missplicing
+    reference_gene_proteins = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
+    mutated_gene = Gene(mutation.gene, mut_id, organism=organism)
     results = []
     for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
@@ -1420,10 +1419,9 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
             continue
         cons_vector = transform_conservation_vector(reference.cons_vector, window=window_length)
-        # if per_transcript_missplicing:
         missplicing_obj = PredictSpliceAI(mutation, reference, threshold=sai_threshold, force=force_spliceai, save_results=save_spliceai_results)
         missplicing = missplicing_obj.apply_sai_threshold_primary(threshold=sai_threshold)
-        # print(missplicing)
         for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
             variant_isoform = deepcopy(variant)
             variant_isoform.reset_acceptors(acceptors=new_boundaries['acceptors']).reset_donors(donors=new_boundaries['donors']).organize().generate_protein()
@@ -1432,9 +1430,6 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
             modified_positions = find_modified_positions(len(reference.protein), deleted, inserted)
             temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
             affected_cons_scores = max(temp_cons)
-            # temp_cons = np.convolve(cons_vector, np.ones(window_length))
-            # print(temp_cons)
-            # print(cons_vector)
             percentile = (
                         sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
                     cons_vector))
@@ -1449,7 +1444,7 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
             report['isoform_prevalence'] = new_boundaries['path_weight']
             report['full_missplicing'] = missplicing
             report['missplicing'] = max(missplicing_obj)
-            report['reference_resemblance'] = reference_gene_proteines.get(variant_isoform.protein, None)
+            report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
             results.append(report)
     report = pd.DataFrame(results)

geney/power_utils.py CHANGED Viewed

@@ -38,7 +38,7 @@ def write_executors(folder_path, script='geney.power_utils', input_file='/tamir2
 def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
                         walltime="24:00:00", dashboard_address=":23154",
-                        log_directory="dask-logs", slurm=False):
+                        log_directory="dask-logs", slurm=False, organism='hg38'):
     """
     Launch a Dask cluster using PBS.
@@ -63,7 +63,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
                 walltime='7200',
                 scheduler_options={"dashboard_address": dashboard_address},
                 log_directory=log_directory,
-                job_script_prologue=[f"cd {config_setup['BASE']}"]
+                job_script_prologue=[f"cd {config_setup[organism]['BASE']}"]
             )
         else:
@@ -75,7 +75,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
                 walltime=walltime,
                 scheduler_options={"dashboard_address": dashboard_address},
                 log_directory=log_directory,
-                job_script_prologue=[f"cd {config_setup['BASE']}"]
+                job_script_prologue=[f"cd {config_setup[organism]['BASE']}"]
             )
         dask_cluster.scale(num_workers)

geney/tcga_utils.py CHANGED Viewed

@@ -363,8 +363,8 @@ class TCGAGene:
 #     return cases
 #
 #
-# def create_mut_id(row):
-#     return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
+def create_mut_id(row):
+    return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
 #
 #
 # def is_in_exon(mut_id, tid):

{geney-1.2.3.dist-info → geney-1.2.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.2.3
+Version: 1.2.5
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn
@@ -17,7 +17,7 @@ Requires-Dist: numpy ==1.26.4
 Requires-Dist: pandas ==2.2.1
 Requires-Dist: networkx ==3.2.1
 Requires-Dist: viennarna ==2.6.4
-Requires-Dist: tqdm ==4.66.1
+Requires-Dist: tqdm >=4.66.1
 Requires-Dist: spliceai ==1.3.1
 Requires-Dist: scikit-learn ==1.0.2
 Requires-Dist: biopython ==1.81

{geney-1.2.3.dist-info → geney-1.2.5.dist-info}/RECORD RENAMED Viewed

@@ -4,20 +4,20 @@ geney/__init__.py,sha256=r-Yvpo_Tc236DcsqsFyexT21iVoYCVl9zoJj5pFuWEE,407
 geney/benchmark_clinvar.py,sha256=LLl77e95Qbg9Kd-m2yL8ilmzubSz9SKogeARwssT4Ks,5532
 geney/compare_sets.py,sha256=TcgL57V7BUPxBoW9lv3xr8qK2Acmykn85Ev3avicQr8,2977
 geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
-geney/data_setup.py,sha256=mV_sSCMT8C41q_PD_G34MIBvsBKA0Czrpw17-DcYmT4,12052
+geney/data_setup.py,sha256=LTiJMYPgv9KnIgUNw-D57Fu4nxL4OojXMpmdhE8QSYU,12228
 geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
 geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
-geney/oncosplice.py,sha256=xDzCLivFyurx-qlQo9cyrV-9KJ9VykYAb8lY9DDWl7Q,71810
+geney/oncosplice.py,sha256=9oZs9W_bI6O5h3284WvatkerhSCaxMZWfs1xVc1lJO0,71524
 geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
 geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
 geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
-geney/power_utils.py,sha256=GtEvKAbz34S-ILQST6tabt3g0M4L8_aa50HIAQZ7byM,7266
+geney/power_utils.py,sha256=nppfT1-bOC1dnvfRs55LipjoWDlRrOqWiuCMH0v1auU,7303
 geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
 geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
 geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
-geney/tcga_utils.py,sha256=uAjejr7F-XqcXS5uANGlsHLOlzMmGo4CTbWhMO0E318,15589
+geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
 geney/utils.py,sha256=xJi7fk3g7DkR2rKOb8WePLQNM1ib83rcHecwRdwd5lA,2036
 geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
@@ -45,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
 geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-geney-1.2.3.dist-info/METADATA,sha256=K6ufQEQw0PTbczOxkWFW26U4URgwy92Q9Aqp8BhsKIA,1198
-geney-1.2.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
-geney-1.2.3.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
-geney-1.2.3.dist-info/RECORD,,
+geney-1.2.5.dist-info/METADATA,sha256=9UptuZVJWZvVN6Y9KgPUxrC4gnijFVW4CtkkESxrY9E,1198
+geney-1.2.5.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+geney-1.2.5.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.2.5.dist-info/RECORD,,

{geney-1.2.3.dist-info → geney-1.2.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{geney-1.2.3.dist-info → geney-1.2.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

geney 1.2.3__py2.py3-none-any.whl → 1.2.5__py2.py3-none-any.whl

Potentially problematic release.

geney 1.2.3__py2.py3-none-any.whl → 1.2.5__py2.py3-none-any.whl