geney 1.2.3__py2.py3-none-any.whl → 1.2.5__py2.py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/data_setup.py CHANGED
@@ -7,7 +7,7 @@ from tqdm import tqdm
7
7
  import requests
8
8
  import argparse
9
9
  from sh import gunzip
10
-
10
+ import shutil
11
11
 
12
12
  def download(external_url, local_path):
13
13
  print(f"Grabbing {external_url}")
@@ -94,6 +94,7 @@ def process_transcript(transcript_df, rev, chrm, cons_data):
94
94
  cds_start, cds_end = cds_start[0], cds_end[0]
95
95
  data.update({'TIS': cds_start, 'TTS': cds_end, 'protein_id': transcript.protein_id})
96
96
 
97
+ print(f"{transcript.transcript_id} in cons_data: {transcript.transcript_id in cons_data}")
97
98
  if transcript.transcript_id in cons_data:
98
99
  data.update({'cons_available': True, 'cons_vector': cons_data[transcript.transcript_id]['scores'], 'cons_seq': cons_data[transcript.transcript_id]['seq']})
99
100
 
@@ -104,6 +105,8 @@ def process_transcript(transcript_df, rev, chrm, cons_data):
104
105
 
105
106
 
106
107
  def retrieve_and_parse_ensembl_annotations(local_path, annotations_file, cons_data, gtex_file='', valid_biotypes=('protein_coding')):
108
+ print(cons_data.keys())
109
+
107
110
  if gtex_file:
108
111
  gtex_df = pd.read_csv(gtex_file, delimiter='\t', header=2)
109
112
  gtex_df.Name = gtex_df.apply(lambda row: row.Name.split('.')[0], axis=1)
@@ -225,21 +228,24 @@ def main():
225
228
  'TEMP': os.path.join(args.basepath, args.organism, 'temp')
226
229
  }
227
230
 
231
+ base_path = Path(args.basepath) / args.organism
232
+
228
233
  if config_file.exists():
229
234
  config_data = unload_json(config_file)
230
235
  overwrite = 'y'
231
236
  if args.organism in config_data:
232
237
  overwrite = input("Organism {args.organism} already configured... Overwrite? (y/n)")
238
+
233
239
  if overwrite == 'y':
234
240
  config_data[args.organism] = config_paths
235
241
  dump_json(config_file, config_data)
242
+ shutil.rmtree(base_path)
236
243
  else:
237
244
  raise SystemExit("Exiting configuration.")
238
245
  else:
239
246
  config_data = {args.organism: config_paths}
240
247
  dump_json(config_file, config_data)
241
248
 
242
- base_path = Path(args.basepath) / args.organism
243
249
  if base_path.exists() and len(os.listdir(base_path)) > 0:
244
250
  raise FileExistsError(f"Directory {base_path} not empty.")
245
251
 
geney/oncosplice.py CHANGED
@@ -35,7 +35,7 @@ import torch
35
35
  from pkg_resources import resource_filename
36
36
  from pangolin.model import *
37
37
 
38
- pang_model_nums = [0, 2, 4, 6]
38
+ pang_model_nums = [0, 1, 2, 3, 4, 5, 6]
39
39
  pang_models = []
40
40
  for i in pang_model_nums:
41
41
  for j in range(1, 6):
@@ -752,7 +752,7 @@ def pangolin_predict_probs(true_seq, models):
752
752
  splicing_pred = np.array(scores).max(axis=0)
753
753
  donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
754
754
  acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
755
- return donor_probs, acceptor_probs
755
+ return donor_probs[5000:-5000], acceptor_probs[5000:-5000]
756
756
 
757
757
  def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
758
758
  positions = mutations.positions
@@ -803,9 +803,11 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
803
803
  mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
804
804
 
805
805
  elif engine == 'pangolin':
806
- ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models)
807
- mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models)
806
+ ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models=pang_models)
807
+ mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models=pang_models)
808
808
 
809
+ else:
810
+ raise ValueError(f"{engine} not implemented")
809
811
 
810
812
  assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
811
813
  assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
@@ -1400,18 +1402,15 @@ def moving_average_conv(vector, window_size, factor=1):
1400
1402
 
1401
1403
  return np.convolve(vector, np.ones(window_size), mode='same') / window_size
1402
1404
 
1403
- def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, per_transcript_missplicing=False, window_length=13, save_spliceai_results=False, force_spliceai=False):
1405
+ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, save_spliceai_results=False, force_spliceai=False, organism='hg38'):
1404
1406
  mutation = Variations(mut_id)
1405
- try:
1406
- reference_gene = Gene(mutation.gene)
1407
- except FileNotFoundError:
1408
- return pd.DataFrame()
1407
+ # try:
1408
+ reference_gene = Gene(mutation.gene, organism=organism)
1409
+ # except FileNotFoundError:
1410
+ # return pd.DataFrame()
1409
1411
 
1410
- reference_gene_proteines = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
1411
- mutated_gene = Gene(mutation.gene, mut_id)
1412
- # if not per_transcript_missplicing:
1413
- # missplicing_obj = PredictSpliceAI(mutation, reference_gene, threshold=sai_threshold, force=True, save_results=False)
1414
- # missplicing = missplicing_obj.missplicing
1412
+ reference_gene_proteins = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
1413
+ mutated_gene = Gene(mutation.gene, mut_id, organism=organism)
1415
1414
 
1416
1415
  results = []
1417
1416
  for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
@@ -1420,10 +1419,9 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
1420
1419
  continue
1421
1420
 
1422
1421
  cons_vector = transform_conservation_vector(reference.cons_vector, window=window_length)
1423
- # if per_transcript_missplicing:
1424
1422
  missplicing_obj = PredictSpliceAI(mutation, reference, threshold=sai_threshold, force=force_spliceai, save_results=save_spliceai_results)
1425
1423
  missplicing = missplicing_obj.apply_sai_threshold_primary(threshold=sai_threshold)
1426
- # print(missplicing)
1424
+
1427
1425
  for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
1428
1426
  variant_isoform = deepcopy(variant)
1429
1427
  variant_isoform.reset_acceptors(acceptors=new_boundaries['acceptors']).reset_donors(donors=new_boundaries['donors']).organize().generate_protein()
@@ -1432,9 +1430,6 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
1432
1430
  modified_positions = find_modified_positions(len(reference.protein), deleted, inserted)
1433
1431
  temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
1434
1432
  affected_cons_scores = max(temp_cons)
1435
- # temp_cons = np.convolve(cons_vector, np.ones(window_length))
1436
- # print(temp_cons)
1437
- # print(cons_vector)
1438
1433
  percentile = (
1439
1434
  sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
1440
1435
  cons_vector))
@@ -1449,7 +1444,7 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
1449
1444
  report['isoform_prevalence'] = new_boundaries['path_weight']
1450
1445
  report['full_missplicing'] = missplicing
1451
1446
  report['missplicing'] = max(missplicing_obj)
1452
- report['reference_resemblance'] = reference_gene_proteines.get(variant_isoform.protein, None)
1447
+ report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
1453
1448
  results.append(report)
1454
1449
 
1455
1450
  report = pd.DataFrame(results)
geney/power_utils.py CHANGED
@@ -38,7 +38,7 @@ def write_executors(folder_path, script='geney.power_utils', input_file='/tamir2
38
38
 
39
39
  def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
40
40
  walltime="24:00:00", dashboard_address=":23154",
41
- log_directory="dask-logs", slurm=False):
41
+ log_directory="dask-logs", slurm=False, organism='hg38'):
42
42
  """
43
43
  Launch a Dask cluster using PBS.
44
44
 
@@ -63,7 +63,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
63
63
  walltime='7200',
64
64
  scheduler_options={"dashboard_address": dashboard_address},
65
65
  log_directory=log_directory,
66
- job_script_prologue=[f"cd {config_setup['BASE']}"]
66
+ job_script_prologue=[f"cd {config_setup[organism]['BASE']}"]
67
67
  )
68
68
 
69
69
  else:
@@ -75,7 +75,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
75
75
  walltime=walltime,
76
76
  scheduler_options={"dashboard_address": dashboard_address},
77
77
  log_directory=log_directory,
78
- job_script_prologue=[f"cd {config_setup['BASE']}"]
78
+ job_script_prologue=[f"cd {config_setup[organism]['BASE']}"]
79
79
  )
80
80
 
81
81
  dask_cluster.scale(num_workers)
geney/tcga_utils.py CHANGED
@@ -363,8 +363,8 @@ class TCGAGene:
363
363
  # return cases
364
364
  #
365
365
  #
366
- # def create_mut_id(row):
367
- # return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
366
+ def create_mut_id(row):
367
+ return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
368
368
  #
369
369
  #
370
370
  # def is_in_exon(mut_id, tid):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.2.3
3
+ Version: 1.2.5
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -17,7 +17,7 @@ Requires-Dist: numpy ==1.26.4
17
17
  Requires-Dist: pandas ==2.2.1
18
18
  Requires-Dist: networkx ==3.2.1
19
19
  Requires-Dist: viennarna ==2.6.4
20
- Requires-Dist: tqdm ==4.66.1
20
+ Requires-Dist: tqdm >=4.66.1
21
21
  Requires-Dist: spliceai ==1.3.1
22
22
  Requires-Dist: scikit-learn ==1.0.2
23
23
  Requires-Dist: biopython ==1.81
@@ -4,20 +4,20 @@ geney/__init__.py,sha256=r-Yvpo_Tc236DcsqsFyexT21iVoYCVl9zoJj5pFuWEE,407
4
4
  geney/benchmark_clinvar.py,sha256=LLl77e95Qbg9Kd-m2yL8ilmzubSz9SKogeARwssT4Ks,5532
5
5
  geney/compare_sets.py,sha256=TcgL57V7BUPxBoW9lv3xr8qK2Acmykn85Ev3avicQr8,2977
6
6
  geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
7
- geney/data_setup.py,sha256=mV_sSCMT8C41q_PD_G34MIBvsBKA0Czrpw17-DcYmT4,12052
7
+ geney/data_setup.py,sha256=LTiJMYPgv9KnIgUNw-D57Fu4nxL4OojXMpmdhE8QSYU,12228
8
8
  geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
9
9
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
10
10
  geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
11
11
  geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
12
- geney/oncosplice.py,sha256=xDzCLivFyurx-qlQo9cyrV-9KJ9VykYAb8lY9DDWl7Q,71810
12
+ geney/oncosplice.py,sha256=9oZs9W_bI6O5h3284WvatkerhSCaxMZWfs1xVc1lJO0,71524
13
13
  geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
14
14
  geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
15
15
  geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
16
- geney/power_utils.py,sha256=GtEvKAbz34S-ILQST6tabt3g0M4L8_aa50HIAQZ7byM,7266
16
+ geney/power_utils.py,sha256=nppfT1-bOC1dnvfRs55LipjoWDlRrOqWiuCMH0v1auU,7303
17
17
  geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
18
18
  geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
19
19
  geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
20
- geney/tcga_utils.py,sha256=uAjejr7F-XqcXS5uANGlsHLOlzMmGo4CTbWhMO0E318,15589
20
+ geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
21
21
  geney/utils.py,sha256=xJi7fk3g7DkR2rKOb8WePLQNM1ib83rcHecwRdwd5lA,2036
22
22
  geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
@@ -45,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
45
45
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
46
46
  geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
47
  geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
- geney-1.2.3.dist-info/METADATA,sha256=K6ufQEQw0PTbczOxkWFW26U4URgwy92Q9Aqp8BhsKIA,1198
49
- geney-1.2.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
50
- geney-1.2.3.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
51
- geney-1.2.3.dist-info/RECORD,,
48
+ geney-1.2.5.dist-info/METADATA,sha256=9UptuZVJWZvVN6Y9KgPUxrC4gnijFVW4CtkkESxrY9E,1198
49
+ geney-1.2.5.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
50
+ geney-1.2.5.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
51
+ geney-1.2.5.dist-info/RECORD,,
File without changes