PyPI - geney - Versions diffs - 1.1.17__py2.py3-none-any.whl → 1.1.19__py2.py3-none-any.whl - Mend

geney 1.1.17__py2.py3-none-any.whl → 1.1.19__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of geney might be problematic. Click here for more details.

Files changed (8) hide show

geney/data_setup.py +66 -36
geney/oncosplice.py +73 -5
geney/power_utils.py +1 -1
geney/utils.py +2 -2
{geney-1.1.17.dist-info → geney-1.1.19.dist-info}/METADATA +1 -1
{geney-1.1.17.dist-info → geney-1.1.19.dist-info}/RECORD +8 -8
{geney-1.1.17.dist-info → geney-1.1.19.dist-info}/WHEEL +0 -0
{geney-1.1.17.dist-info → geney-1.1.19.dist-info}/top_level.txt +0 -0

geney/data_setup.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from pathlib import Path
 import os
 from gtfparse import read_gtf
-from geney.utils import dump_json, dump_pickle, unload_pickle
+from geney.utils import dump_json, dump_pickle, unload_pickle, unload_json
 import pandas as pd
 from tqdm import tqdm
 import requests
@@ -103,11 +103,13 @@ def process_transcript(transcript_df, rev, chrm, cons_data):
     return data
-def retrieve_and_parse_ensembl_annotations(local_path, annotations_file, gtex_file, cons_data, valid_biotypes=('protein_coding')):
-    gtex_df = pd.read_csv(gtex_file, delimiter='\t', header=2)
-    gtex_df.Name = gtex_df.apply(lambda row: row.Name.split('.')[0], axis=1)
-    gtex_df = gtex_df.set_index('Name').drop(columns=['Description'])
+def retrieve_and_parse_ensembl_annotations(local_path, annotations_file, cons_data, gtex_file='', valid_biotypes=('protein_coding')):
+    if gtex_file:
+        gtex_df = pd.read_csv(gtex_file, delimiter='\t', header=2)
+        gtex_df.Name = gtex_df.apply(lambda row: row.Name.split('.')[0], axis=1)
+        gtex_df = gtex_df.set_index('Name').drop(columns=['Description'])
+    else:
+        gtex_df = pd.DataFrame()
     annotations = read_gtf(annotations_file)
     temp = annotations[(annotations.gene_biotype == 'protein_coding') & (annotations.transcript_biotype == 'protein_coding')]
@@ -200,59 +202,87 @@ def write_sequence(output_directory, header, sequence):
 def main():
-    config_dir = Path(os.path.join(os.path.expanduser('~'), '.oncosplice_setup'))
-    if config_dir.exists():
-        for file in config_dir.glob('*'):
-            file.unlink()
-        config_dir.rmdir()
-    config_dir.mkdir()
+    config_dir = Path(os.path.join(os.path.expanduser('~'), '.oncosplice_setup_new'))
+    # if config_dir.exists():
+    #     for file in config_dir.glob('*'):
+    #         file.unlink()
+    #     config_dir.rmdir()
+    if not config_dir.exists():
+        config_dir.mkdir()
     parser = argparse.ArgumentParser(description="Geney database location")
     parser.add_argument("-b", "--basepath", help="The location of the data we are mounting.", required=True)
-    # parser.add_argument("-s", "--splicepath", help="The location of the data we are mounting.", required=False, default=None)
+    parser.add_argument("-o", "--organism", help="Which organism we are setting up for (mm39 or hg38).", required=False, default='hg38')
     args = parser.parse_args()
     config_file = config_dir / 'config.json'
     config_paths = {
-        'CHROM_SOURCE': os.path.join(args.basepath, 'chromosomes'),
-        'MRNA_PATH': os.path.join(args.basepath, 'annotations'),
-        'MISSPLICING_PATH': os.path.join(args.basepath, 'missplicing'),
-        'ONCOSPLICE_PATH': os.path.join(args.basepath, 'oncosplice'),
-        'BASE': args.basepath,
-        'NETCHOP': os.path.join(args.basepath, 'netchop')
+        'CHROM_SOURCE': os.path.join(args.basepath, args.organism, 'chromosomes'),
+        'MRNA_PATH': os.path.join(args.basepath, args.organism, 'annotations'),
+        'MISSPLICING_PATH': os.path.join(args.basepath, args.organism, 'missplicing'),
+        'ONCOSPLICE_PATH': os.path.join(args.basepath, args.organism, 'oncosplice'),
+        'BASE': os.path.join(args.basepath, args.organism),
+        'TEMP': os.path.join(args.basepath, args.organism, 'temp')
     }
-    dump_json(config_file, config_paths)
-    base_path = Path(args.basepath)
+    if config_file.exists():
+        config_data = unload_json(config_file)
+        overwrite = 'y'
+        if args.organism in config_data:
+            overwrite = input("Organism {args.organism} already configured... Overwrite? (y/n)")
+        if overwrite == 'y':
+            config_data[args.organism] = config_paths
+            dump_json(config_file, config_data)
+        else:
+            raise SystemExit("Exiting configuration.")
+    else:
+        config_data = {args.organism: config_paths}
+        dump_json(config_file, config_data)
+    base_path = Path(args.basepath) / args.organism
     if base_path.exists() and len(os.listdir(base_path)) > 0:
         raise FileExistsError(f"Directory {base_path} not empty.")
     elif not base_path.exists():
         print(f"Initializing data folder at {base_path}.")
-        base_path.mkdir()
+        base_path.mkdir(parents=True)
-    cons_url = 'https://genome-data-public-access.s3.eu-north-1.amazonaws.com/conservation.pkl'
-    cons_file = download(cons_url, base_path)
-    gtex_url = 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz'
-    gtex_file = download_and_ungzip(gtex_url, base_path)
+    if args.organism == 'hg38':
+        file_maps = {
+            'cons_url': 'https://genome-data-public-access.s3.eu-north-1.amazonaws.com/conservation.pkl',
+            'expression_url': 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz',
+            'fasta_url': 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz',
+            'ensembl_url': 'https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz'
+        }
+    elif args.organism == 'mm39':
+        file_maps = {
+            'cons_url':  'https://genome-data-public-access.s3.eu-north-1.amazonaws.com/mm39_conservation.pkl',
+            'expression_url': '',
+            'fasta_url': 'https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz',
+            'ensembl_url': 'https://ftp.ensembl.org/pub/release-112/gtf/mus_musculus/Mus_musculus.GRCm39.112.gtf.gz'
+        }
-    fasta_url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz'
-    fasta_file = download_and_ungzip(fasta_url, base_path)
+    else:
+        raise NotImplemented(f"Organism {args.organism} not supported.")
+    cons_file = download(file_maps['cons_url'], base_path)
+    if file_maps['expression_url']:
+        gtex_file = download_and_ungzip(file_maps['expression_url'], base_path)
+    else:
+        gtex_file = None
+    fasta_file = download_and_ungzip(file_maps['fasta_url'], base_path)
     fasta_build_path = base_path / f'chromosomes'
     fasta_build_path.mkdir()
     split_fasta(fasta_file, fasta_build_path)
-    clinvar_url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz'
-    clinvar_file = download_and_ungzip(clinvar_url, base_path)
-    # clinvar_build_path = base_path / f'accessory_data'
-    # clinvar_build_path.mkdir()
-    ensembl_url = 'https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz'
-    ensembl_file = download_and_ungzip(ensembl_url, base_path)
+    ensembl_file = download_and_ungzip(file_maps['ensembl_url'], base_path)
     ensembl_annotation_path = base_path / f'annotations'
     ensembl_annotation_path.mkdir()
-    retrieve_and_parse_ensembl_annotations(ensembl_annotation_path, ensembl_file, gtex_file, unload_pickle(cons_file))
+    retrieve_and_parse_ensembl_annotations(ensembl_annotation_path, ensembl_file, unload_pickle(cons_file), gtex_file=gtex_file)
     splicing_path = Path(config_paths['MISSPLICING_PATH'])
     if not splicing_path.exists():

geney/oncosplice.py CHANGED Viewed

@@ -29,6 +29,27 @@ tf.config.threading.set_inter_op_parallelism_threads(1)
 sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
 sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
+# Load models
+import torch
+from pkg_resources import resource_filename
+from pangolin.model import *
+pang_model_nums = [0, 2, 4, 6]
+pang_models = []
+for i in pang_model_nums:
+    for j in range(1, 6):
+        model = Pangolin(L, W, AR)
+        if torch.cuda.is_available():
+            model.cuda()
+            weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)))
+        else:
+            weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)),
+                                 map_location=torch.device('cpu'))
+        model.load_state_dict(weights)
+        model.eval()
+        pang_models.append(model)
 def is_monotonic(A):
     x, y = [], []
     x.extend(A)
@@ -691,7 +712,47 @@ def run_spliceai_seq(seq, indices, threshold=0):
     return acceptor_indices, donor_indices
-def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
+def pang_one_hot_encode(seq):
+    IN_MAP = np.asarray([[0, 0, 0, 0],
+                         [1, 0, 0, 0],
+                         [0, 1, 0, 0],
+                         [0, 0, 1, 0],
+                         [0, 0, 0, 1]])
+    seq = seq.upper().replace('A', '1').replace('C', '2')
+    seq = seq.replace('G', '3').replace('T', '4').replace('N', '0')
+    seq = np.asarray(list(map(int, list(seq))))
+    return IN_MAP[seq.astype('int8')]
+def pangolin_predict_probs(true_seq, models):
+    model_nums = [0, 2, 4, 6]
+    INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
+    seq = 'N'*5000 + true_seq + 'N'*5000
+    acceptor_dinucleotide = np.array([true_seq[i - 2:i] == 'AG' for i in range(len(true_seq))])
+    donor_dinucleotide = np.array([true_seq[i + 1:i + 3] == 'GT' for i in range(len(true_seq))])
+    seq = pang_one_hot_encode(seq).T
+    seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()
+    if torch.cuda.is_available():
+        seq = seq.to(torch.device("cuda"))
+    scores = []
+    for j, model_num in enumerate(model_nums):
+        score = []
+        # Average across 5 models
+        for model in models[5 * j:5 * j + 5]:
+            with torch.no_grad():
+                score.append(model(seq)[0][INDEX_MAP[model_num], :].cpu().numpy())
+        scores.append(np.mean(score, axis=0))
+    splicing_pred = np.array(scores).max(axis=0)
+    donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
+    acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
+    return donor_probs, acceptor_probs
+def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
     positions = mutations.positions
     end_positions = [m.start + len(m.ref) for m in mutations.variants]
     positions.extend(end_positions)
@@ -733,11 +794,16 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
         ref_indices = ref_indices[::-1]
         mut_indices = mut_indices[::-1]
-    ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
-    mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
+    if engine == 'spliceai':
+        ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
+        mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
+        ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
+        mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
+    elif engine == 'pangolin':
+        ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models)
+        mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models)
-    ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
-    mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
     assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
     assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
@@ -1025,6 +1091,7 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
 def find_splice_site_proximity(mut, transcript):
     for i, (ex_start, ex_end) in enumerate(transcript.exons):
         if min(ex_start, ex_end) <= mut.start <= max(ex_start, ex_end):
             return i + 1, None, abs(mut.start - ex_start), abs(mut.start - ex_end)
@@ -1033,6 +1100,7 @@ def find_splice_site_proximity(mut, transcript):
         if min(in_start, in_end) <= mut.start <= max(in_start, in_end):
             return None, i + 1, abs(mut.start - in_end), abs(mut.start - in_start)
+    return None, None, np.inf, np.inf
 def define_missplicing_events(ref, var):
     ref_introns, ref_exons = ref.introns, ref.exons

geney/power_utils.py CHANGED Viewed

@@ -60,7 +60,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
                 memory=memory_size,
                 processes=1,
                 queue=queue,
-                walltime=walltime,
+                walltime='7200',
                 scheduler_options={"dashboard_address": dashboard_address},
                 log_directory=log_directory,
                 job_script_prologue=[f"cd {config_setup['BASE']}"]

geney/utils.py CHANGED Viewed

@@ -52,9 +52,9 @@ def dump_pickle(file_path, payload):
     return None
-def find_files_by_gene_name(gene_name):
+def find_files_by_gene_name(gene_name, organism='hg38'):
     from geney import config_setup
-    mrna_path = config_setup['MRNA_PATH'] / 'protein_coding'
+    mrna_path = config_setup['MRNA_PATH'] / organism / 'protein_coding'
     matching_files = [f for f in mrna_path.glob(f'*_{gene_name}.pkl')]
     if len(matching_files) > 1:
         print(f"Multiple files available ({[f.name for f in matching_files]}).")

{geney-1.1.17.dist-info → geney-1.1.19.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.1.17
+Version: 1.1.19
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn

{geney-1.1.17.dist-info → geney-1.1.19.dist-info}/RECORD RENAMED Viewed

@@ -4,21 +4,21 @@ geney/__init__.py,sha256=r-Yvpo_Tc236DcsqsFyexT21iVoYCVl9zoJj5pFuWEE,407
 geney/benchmark_clinvar.py,sha256=LLl77e95Qbg9Kd-m2yL8ilmzubSz9SKogeARwssT4Ks,5532
 geney/compare_sets.py,sha256=TcgL57V7BUPxBoW9lv3xr8qK2Acmykn85Ev3avicQr8,2977
 geney/config_setup.py,sha256=SePeooA4RWAtR_KAT1-W1hkD3MT5tH6YMyp80t_RNPQ,385
-geney/data_setup.py,sha256=DZeksRPr2ZT7bszMo33W0r3OwmqHokVXtZ4gx5Lu_Mo,10725
+geney/data_setup.py,sha256=AdeagKvwNEwkkjXReUZ5etgBwm0x3vCqmdsfs09QeDU,12022
 geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
 geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
-geney/oncosplice.py,sha256=hqDTeuYPAh3vCrWutH3187UP9ShWqtxAolW7cvGvf3I,68971
+geney/oncosplice.py,sha256=PzeQFy8k2xCSIl07kY19rGZ6U5ljyrJ0REC_Qgf-IN0,71582
 geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
 geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
 geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
-geney/power_utils.py,sha256=6InuDm1jSrsgR-F_LmdMTbuQwty2OdYjwfGGaAPhaRI,7268
+geney/power_utils.py,sha256=GtEvKAbz34S-ILQST6tabt3g0M4L8_aa50HIAQZ7byM,7266
 geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
 geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
 geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
 geney/tcga_utils.py,sha256=uAjejr7F-XqcXS5uANGlsHLOlzMmGo4CTbWhMO0E318,15589
-geney/utils.py,sha256=YOe22gA0Oew9_QEym7ivM9sb7t3wNeHTeiSDBmvOPso,1984
+geney/utils.py,sha256=CgQQ8sy5g7g75cy-NEgYprink8a6pUreBgs-BhpyJt8,2012
 geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
 geney/analyzers/characterize_epistasis.py,sha256=MvcYQMRwZ-qqlX9mn41vmr0Uxb5dIrrcaE3oiZMTYm8,648
@@ -45,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
 geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-geney-1.1.17.dist-info/METADATA,sha256=eql82__spjwiC-hrepMkawya4c_A4ouueRzcfCr1kfo,1199
-geney-1.1.17.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
-geney-1.1.17.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
-geney-1.1.17.dist-info/RECORD,,
+geney-1.1.19.dist-info/METADATA,sha256=exY4KdtXuhuA8Bol9FN1bkprgX-EiWubjd0TPUkL7U4,1199
+geney-1.1.19.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+geney-1.1.19.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.1.19.dist-info/RECORD,,

{geney-1.1.17.dist-info → geney-1.1.19.dist-info}/WHEEL RENAMED Viewed

File without changes

{geney-1.1.17.dist-info → geney-1.1.19.dist-info}/top_level.txt RENAMED Viewed

File without changes

geney 1.1.17__py2.py3-none-any.whl → 1.1.19__py2.py3-none-any.whl

Potentially problematic release.

geney 1.1.17__py2.py3-none-any.whl → 1.1.19__py2.py3-none-any.whl