geney 1.1.17__py2.py3-none-any.whl → 1.1.19__py2.py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/data_setup.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from pathlib import Path
2
2
  import os
3
3
  from gtfparse import read_gtf
4
- from geney.utils import dump_json, dump_pickle, unload_pickle
4
+ from geney.utils import dump_json, dump_pickle, unload_pickle, unload_json
5
5
  import pandas as pd
6
6
  from tqdm import tqdm
7
7
  import requests
@@ -103,11 +103,13 @@ def process_transcript(transcript_df, rev, chrm, cons_data):
103
103
  return data
104
104
 
105
105
 
106
- def retrieve_and_parse_ensembl_annotations(local_path, annotations_file, gtex_file, cons_data, valid_biotypes=('protein_coding')):
107
-
108
- gtex_df = pd.read_csv(gtex_file, delimiter='\t', header=2)
109
- gtex_df.Name = gtex_df.apply(lambda row: row.Name.split('.')[0], axis=1)
110
- gtex_df = gtex_df.set_index('Name').drop(columns=['Description'])
106
+ def retrieve_and_parse_ensembl_annotations(local_path, annotations_file, cons_data, gtex_file='', valid_biotypes=('protein_coding')):
107
+ if gtex_file:
108
+ gtex_df = pd.read_csv(gtex_file, delimiter='\t', header=2)
109
+ gtex_df.Name = gtex_df.apply(lambda row: row.Name.split('.')[0], axis=1)
110
+ gtex_df = gtex_df.set_index('Name').drop(columns=['Description'])
111
+ else:
112
+ gtex_df = pd.DataFrame()
111
113
 
112
114
  annotations = read_gtf(annotations_file)
113
115
  temp = annotations[(annotations.gene_biotype == 'protein_coding') & (annotations.transcript_biotype == 'protein_coding')]
@@ -200,59 +202,87 @@ def write_sequence(output_directory, header, sequence):
200
202
 
201
203
 
202
204
  def main():
203
- config_dir = Path(os.path.join(os.path.expanduser('~'), '.oncosplice_setup'))
204
- if config_dir.exists():
205
- for file in config_dir.glob('*'):
206
- file.unlink()
207
- config_dir.rmdir()
208
- config_dir.mkdir()
205
+ config_dir = Path(os.path.join(os.path.expanduser('~'), '.oncosplice_setup_new'))
206
+ # if config_dir.exists():
207
+ # for file in config_dir.glob('*'):
208
+ # file.unlink()
209
+ # config_dir.rmdir()
210
+ if not config_dir.exists():
211
+ config_dir.mkdir()
209
212
 
210
213
  parser = argparse.ArgumentParser(description="Geney database location")
211
214
  parser.add_argument("-b", "--basepath", help="The location of the data we are mounting.", required=True)
212
- # parser.add_argument("-s", "--splicepath", help="The location of the data we are mounting.", required=False, default=None)
215
+ parser.add_argument("-o", "--organism", help="Which organism we are setting up for (mm39 or hg38).", required=False, default='hg38')
213
216
 
214
217
  args = parser.parse_args()
215
218
  config_file = config_dir / 'config.json'
216
219
  config_paths = {
217
- 'CHROM_SOURCE': os.path.join(args.basepath, 'chromosomes'),
218
- 'MRNA_PATH': os.path.join(args.basepath, 'annotations'),
219
- 'MISSPLICING_PATH': os.path.join(args.basepath, 'missplicing'),
220
- 'ONCOSPLICE_PATH': os.path.join(args.basepath, 'oncosplice'),
221
- 'BASE': args.basepath,
222
- 'NETCHOP': os.path.join(args.basepath, 'netchop')
220
+ 'CHROM_SOURCE': os.path.join(args.basepath, args.organism, 'chromosomes'),
221
+ 'MRNA_PATH': os.path.join(args.basepath, args.organism, 'annotations'),
222
+ 'MISSPLICING_PATH': os.path.join(args.basepath, args.organism, 'missplicing'),
223
+ 'ONCOSPLICE_PATH': os.path.join(args.basepath, args.organism, 'oncosplice'),
224
+ 'BASE': os.path.join(args.basepath, args.organism),
225
+ 'TEMP': os.path.join(args.basepath, args.organism, 'temp')
223
226
  }
224
- dump_json(config_file, config_paths)
225
227
 
226
- base_path = Path(args.basepath)
228
+ if config_file.exists():
229
+ config_data = unload_json(config_file)
230
+ overwrite = 'y'
231
+ if args.organism in config_data:
232
+ overwrite = input("Organism {args.organism} already configured... Overwrite? (y/n)")
233
+ if overwrite == 'y':
234
+ config_data[args.organism] = config_paths
235
+ dump_json(config_file, config_data)
236
+ else:
237
+ raise SystemExit("Exiting configuration.")
238
+ else:
239
+ config_data = {args.organism: config_paths}
240
+ dump_json(config_file, config_data)
241
+
242
+ base_path = Path(args.basepath) / args.organism
227
243
  if base_path.exists() and len(os.listdir(base_path)) > 0:
228
244
  raise FileExistsError(f"Directory {base_path} not empty.")
229
245
 
230
246
  elif not base_path.exists():
231
247
  print(f"Initializing data folder at {base_path}.")
232
- base_path.mkdir()
248
+ base_path.mkdir(parents=True)
233
249
 
234
- cons_url = 'https://genome-data-public-access.s3.eu-north-1.amazonaws.com/conservation.pkl'
235
- cons_file = download(cons_url, base_path)
236
250
 
237
- gtex_url = 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz'
238
- gtex_file = download_and_ungzip(gtex_url, base_path)
251
+ if args.organism == 'hg38':
252
+ file_maps = {
253
+ 'cons_url': 'https://genome-data-public-access.s3.eu-north-1.amazonaws.com/conservation.pkl',
254
+ 'expression_url': 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz',
255
+ 'fasta_url': 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz',
256
+ 'ensembl_url': 'https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz'
257
+ }
258
+
259
+ elif args.organism == 'mm39':
260
+ file_maps = {
261
+ 'cons_url': 'https://genome-data-public-access.s3.eu-north-1.amazonaws.com/mm39_conservation.pkl',
262
+ 'expression_url': '',
263
+ 'fasta_url': 'https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz',
264
+ 'ensembl_url': 'https://ftp.ensembl.org/pub/release-112/gtf/mus_musculus/Mus_musculus.GRCm39.112.gtf.gz'
265
+ }
239
266
 
240
- fasta_url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz'
241
- fasta_file = download_and_ungzip(fasta_url, base_path)
267
+ else:
268
+ raise NotImplemented(f"Organism {args.organism} not supported.")
269
+
270
+ cons_file = download(file_maps['cons_url'], base_path)
271
+
272
+ if file_maps['expression_url']:
273
+ gtex_file = download_and_ungzip(file_maps['expression_url'], base_path)
274
+ else:
275
+ gtex_file = None
276
+
277
+ fasta_file = download_and_ungzip(file_maps['fasta_url'], base_path)
242
278
  fasta_build_path = base_path / f'chromosomes'
243
279
  fasta_build_path.mkdir()
244
280
  split_fasta(fasta_file, fasta_build_path)
245
281
 
246
- clinvar_url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz'
247
- clinvar_file = download_and_ungzip(clinvar_url, base_path)
248
- # clinvar_build_path = base_path / f'accessory_data'
249
- # clinvar_build_path.mkdir()
250
-
251
- ensembl_url = 'https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz'
252
- ensembl_file = download_and_ungzip(ensembl_url, base_path)
282
+ ensembl_file = download_and_ungzip(file_maps['ensembl_url'], base_path)
253
283
  ensembl_annotation_path = base_path / f'annotations'
254
284
  ensembl_annotation_path.mkdir()
255
- retrieve_and_parse_ensembl_annotations(ensembl_annotation_path, ensembl_file, gtex_file, unload_pickle(cons_file))
285
+ retrieve_and_parse_ensembl_annotations(ensembl_annotation_path, ensembl_file, unload_pickle(cons_file), gtex_file=gtex_file)
256
286
 
257
287
  splicing_path = Path(config_paths['MISSPLICING_PATH'])
258
288
  if not splicing_path.exists():
geney/oncosplice.py CHANGED
@@ -29,6 +29,27 @@ tf.config.threading.set_inter_op_parallelism_threads(1)
29
29
  sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
30
30
  sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
31
31
 
32
+
33
+ # Load models
34
+ import torch
35
+ from pkg_resources import resource_filename
36
+ from pangolin.model import *
37
+
38
+ pang_model_nums = [0, 2, 4, 6]
39
+ pang_models = []
40
+ for i in pang_model_nums:
41
+ for j in range(1, 6):
42
+ model = Pangolin(L, W, AR)
43
+ if torch.cuda.is_available():
44
+ model.cuda()
45
+ weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)))
46
+ else:
47
+ weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)),
48
+ map_location=torch.device('cpu'))
49
+ model.load_state_dict(weights)
50
+ model.eval()
51
+ pang_models.append(model)
52
+
32
53
  def is_monotonic(A):
33
54
  x, y = [], []
34
55
  x.extend(A)
@@ -691,7 +712,47 @@ def run_spliceai_seq(seq, indices, threshold=0):
691
712
  return acceptor_indices, donor_indices
692
713
 
693
714
 
694
- def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
715
+ def pang_one_hot_encode(seq):
716
+ IN_MAP = np.asarray([[0, 0, 0, 0],
717
+ [1, 0, 0, 0],
718
+ [0, 1, 0, 0],
719
+ [0, 0, 1, 0],
720
+ [0, 0, 0, 1]])
721
+ seq = seq.upper().replace('A', '1').replace('C', '2')
722
+ seq = seq.replace('G', '3').replace('T', '4').replace('N', '0')
723
+ seq = np.asarray(list(map(int, list(seq))))
724
+ return IN_MAP[seq.astype('int8')]
725
+
726
+ def pangolin_predict_probs(true_seq, models):
727
+ model_nums = [0, 2, 4, 6]
728
+ INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
729
+
730
+ seq = 'N'*5000 + true_seq + 'N'*5000
731
+ acceptor_dinucleotide = np.array([true_seq[i - 2:i] == 'AG' for i in range(len(true_seq))])
732
+ donor_dinucleotide = np.array([true_seq[i + 1:i + 3] == 'GT' for i in range(len(true_seq))])
733
+
734
+ seq = pang_one_hot_encode(seq).T
735
+ seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()
736
+
737
+ if torch.cuda.is_available():
738
+ seq = seq.to(torch.device("cuda"))
739
+
740
+ scores = []
741
+ for j, model_num in enumerate(model_nums):
742
+ score = []
743
+ # Average across 5 models
744
+ for model in models[5 * j:5 * j + 5]:
745
+ with torch.no_grad():
746
+ score.append(model(seq)[0][INDEX_MAP[model_num], :].cpu().numpy())
747
+
748
+ scores.append(np.mean(score, axis=0))
749
+
750
+ splicing_pred = np.array(scores).max(axis=0)
751
+ donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
752
+ acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
753
+ return donor_probs, acceptor_probs
754
+
755
+ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
695
756
  positions = mutations.positions
696
757
  end_positions = [m.start + len(m.ref) for m in mutations.variants]
697
758
  positions.extend(end_positions)
@@ -733,11 +794,16 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
733
794
  ref_indices = ref_indices[::-1]
734
795
  mut_indices = mut_indices[::-1]
735
796
 
736
- ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
737
- mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
797
+ if engine == 'spliceai':
798
+ ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
799
+ mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
800
+ ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
801
+ mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
802
+
803
+ elif engine == 'pangolin':
804
+ ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models)
805
+ mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models)
738
806
 
739
- ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
740
- mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
741
807
 
742
808
  assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
743
809
  assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
@@ -1025,6 +1091,7 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
1025
1091
 
1026
1092
 
1027
1093
  def find_splice_site_proximity(mut, transcript):
1094
+
1028
1095
  for i, (ex_start, ex_end) in enumerate(transcript.exons):
1029
1096
  if min(ex_start, ex_end) <= mut.start <= max(ex_start, ex_end):
1030
1097
  return i + 1, None, abs(mut.start - ex_start), abs(mut.start - ex_end)
@@ -1033,6 +1100,7 @@ def find_splice_site_proximity(mut, transcript):
1033
1100
  if min(in_start, in_end) <= mut.start <= max(in_start, in_end):
1034
1101
  return None, i + 1, abs(mut.start - in_end), abs(mut.start - in_start)
1035
1102
 
1103
+ return None, None, np.inf, np.inf
1036
1104
 
1037
1105
  def define_missplicing_events(ref, var):
1038
1106
  ref_introns, ref_exons = ref.introns, ref.exons
geney/power_utils.py CHANGED
@@ -60,7 +60,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
60
60
  memory=memory_size,
61
61
  processes=1,
62
62
  queue=queue,
63
- walltime=walltime,
63
+ walltime='7200',
64
64
  scheduler_options={"dashboard_address": dashboard_address},
65
65
  log_directory=log_directory,
66
66
  job_script_prologue=[f"cd {config_setup['BASE']}"]
geney/utils.py CHANGED
@@ -52,9 +52,9 @@ def dump_pickle(file_path, payload):
52
52
  return None
53
53
 
54
54
 
55
- def find_files_by_gene_name(gene_name):
55
+ def find_files_by_gene_name(gene_name, organism='hg38'):
56
56
  from geney import config_setup
57
- mrna_path = config_setup['MRNA_PATH'] / 'protein_coding'
57
+ mrna_path = config_setup['MRNA_PATH'] / organism / 'protein_coding'
58
58
  matching_files = [f for f in mrna_path.glob(f'*_{gene_name}.pkl')]
59
59
  if len(matching_files) > 1:
60
60
  print(f"Multiple files available ({[f.name for f in matching_files]}).")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.1.17
3
+ Version: 1.1.19
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -4,21 +4,21 @@ geney/__init__.py,sha256=r-Yvpo_Tc236DcsqsFyexT21iVoYCVl9zoJj5pFuWEE,407
4
4
  geney/benchmark_clinvar.py,sha256=LLl77e95Qbg9Kd-m2yL8ilmzubSz9SKogeARwssT4Ks,5532
5
5
  geney/compare_sets.py,sha256=TcgL57V7BUPxBoW9lv3xr8qK2Acmykn85Ev3avicQr8,2977
6
6
  geney/config_setup.py,sha256=SePeooA4RWAtR_KAT1-W1hkD3MT5tH6YMyp80t_RNPQ,385
7
- geney/data_setup.py,sha256=DZeksRPr2ZT7bszMo33W0r3OwmqHokVXtZ4gx5Lu_Mo,10725
7
+ geney/data_setup.py,sha256=AdeagKvwNEwkkjXReUZ5etgBwm0x3vCqmdsfs09QeDU,12022
8
8
  geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
9
9
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
10
10
  geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
11
11
  geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
12
- geney/oncosplice.py,sha256=hqDTeuYPAh3vCrWutH3187UP9ShWqtxAolW7cvGvf3I,68971
12
+ geney/oncosplice.py,sha256=PzeQFy8k2xCSIl07kY19rGZ6U5ljyrJ0REC_Qgf-IN0,71582
13
13
  geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
14
14
  geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
15
15
  geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
16
- geney/power_utils.py,sha256=6InuDm1jSrsgR-F_LmdMTbuQwty2OdYjwfGGaAPhaRI,7268
16
+ geney/power_utils.py,sha256=GtEvKAbz34S-ILQST6tabt3g0M4L8_aa50HIAQZ7byM,7266
17
17
  geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
18
18
  geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
19
19
  geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
20
20
  geney/tcga_utils.py,sha256=uAjejr7F-XqcXS5uANGlsHLOlzMmGo4CTbWhMO0E318,15589
21
- geney/utils.py,sha256=YOe22gA0Oew9_QEym7ivM9sb7t3wNeHTeiSDBmvOPso,1984
21
+ geney/utils.py,sha256=CgQQ8sy5g7g75cy-NEgYprink8a6pUreBgs-BhpyJt8,2012
22
22
  geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
24
24
  geney/analyzers/characterize_epistasis.py,sha256=MvcYQMRwZ-qqlX9mn41vmr0Uxb5dIrrcaE3oiZMTYm8,648
@@ -45,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
45
45
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
46
46
  geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
47
  geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
- geney-1.1.17.dist-info/METADATA,sha256=eql82__spjwiC-hrepMkawya4c_A4ouueRzcfCr1kfo,1199
49
- geney-1.1.17.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
50
- geney-1.1.17.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
51
- geney-1.1.17.dist-info/RECORD,,
48
+ geney-1.1.19.dist-info/METADATA,sha256=exY4KdtXuhuA8Bol9FN1bkprgX-EiWubjd0TPUkL7U4,1199
49
+ geney-1.1.19.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
50
+ geney-1.1.19.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
51
+ geney-1.1.19.dist-info/RECORD,,
File without changes