geney 1.2.3__py2.py3-none-any.whl → 1.2.5__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/data_setup.py +8 -2
- geney/oncosplice.py +15 -20
- geney/power_utils.py +3 -3
- geney/tcga_utils.py +2 -2
- {geney-1.2.3.dist-info → geney-1.2.5.dist-info}/METADATA +2 -2
- {geney-1.2.3.dist-info → geney-1.2.5.dist-info}/RECORD +8 -8
- {geney-1.2.3.dist-info → geney-1.2.5.dist-info}/WHEEL +0 -0
- {geney-1.2.3.dist-info → geney-1.2.5.dist-info}/top_level.txt +0 -0
geney/data_setup.py
CHANGED
|
@@ -7,7 +7,7 @@ from tqdm import tqdm
|
|
|
7
7
|
import requests
|
|
8
8
|
import argparse
|
|
9
9
|
from sh import gunzip
|
|
10
|
-
|
|
10
|
+
import shutil
|
|
11
11
|
|
|
12
12
|
def download(external_url, local_path):
|
|
13
13
|
print(f"Grabbing {external_url}")
|
|
@@ -94,6 +94,7 @@ def process_transcript(transcript_df, rev, chrm, cons_data):
|
|
|
94
94
|
cds_start, cds_end = cds_start[0], cds_end[0]
|
|
95
95
|
data.update({'TIS': cds_start, 'TTS': cds_end, 'protein_id': transcript.protein_id})
|
|
96
96
|
|
|
97
|
+
print(f"{transcript.transcript_id} in cons_data: {transcript.transcript_id in cons_data}")
|
|
97
98
|
if transcript.transcript_id in cons_data:
|
|
98
99
|
data.update({'cons_available': True, 'cons_vector': cons_data[transcript.transcript_id]['scores'], 'cons_seq': cons_data[transcript.transcript_id]['seq']})
|
|
99
100
|
|
|
@@ -104,6 +105,8 @@ def process_transcript(transcript_df, rev, chrm, cons_data):
|
|
|
104
105
|
|
|
105
106
|
|
|
106
107
|
def retrieve_and_parse_ensembl_annotations(local_path, annotations_file, cons_data, gtex_file='', valid_biotypes=('protein_coding')):
|
|
108
|
+
print(cons_data.keys())
|
|
109
|
+
|
|
107
110
|
if gtex_file:
|
|
108
111
|
gtex_df = pd.read_csv(gtex_file, delimiter='\t', header=2)
|
|
109
112
|
gtex_df.Name = gtex_df.apply(lambda row: row.Name.split('.')[0], axis=1)
|
|
@@ -225,21 +228,24 @@ def main():
|
|
|
225
228
|
'TEMP': os.path.join(args.basepath, args.organism, 'temp')
|
|
226
229
|
}
|
|
227
230
|
|
|
231
|
+
base_path = Path(args.basepath) / args.organism
|
|
232
|
+
|
|
228
233
|
if config_file.exists():
|
|
229
234
|
config_data = unload_json(config_file)
|
|
230
235
|
overwrite = 'y'
|
|
231
236
|
if args.organism in config_data:
|
|
232
237
|
overwrite = input("Organism {args.organism} already configured... Overwrite? (y/n)")
|
|
238
|
+
|
|
233
239
|
if overwrite == 'y':
|
|
234
240
|
config_data[args.organism] = config_paths
|
|
235
241
|
dump_json(config_file, config_data)
|
|
242
|
+
shutil.rmtree(base_path)
|
|
236
243
|
else:
|
|
237
244
|
raise SystemExit("Exiting configuration.")
|
|
238
245
|
else:
|
|
239
246
|
config_data = {args.organism: config_paths}
|
|
240
247
|
dump_json(config_file, config_data)
|
|
241
248
|
|
|
242
|
-
base_path = Path(args.basepath) / args.organism
|
|
243
249
|
if base_path.exists() and len(os.listdir(base_path)) > 0:
|
|
244
250
|
raise FileExistsError(f"Directory {base_path} not empty.")
|
|
245
251
|
|
geney/oncosplice.py
CHANGED
|
@@ -35,7 +35,7 @@ import torch
|
|
|
35
35
|
from pkg_resources import resource_filename
|
|
36
36
|
from pangolin.model import *
|
|
37
37
|
|
|
38
|
-
pang_model_nums = [0, 2, 4, 6]
|
|
38
|
+
pang_model_nums = [0, 1, 2, 3, 4, 5, 6]
|
|
39
39
|
pang_models = []
|
|
40
40
|
for i in pang_model_nums:
|
|
41
41
|
for j in range(1, 6):
|
|
@@ -752,7 +752,7 @@ def pangolin_predict_probs(true_seq, models):
|
|
|
752
752
|
splicing_pred = np.array(scores).max(axis=0)
|
|
753
753
|
donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
|
|
754
754
|
acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
|
|
755
|
-
return donor_probs, acceptor_probs
|
|
755
|
+
return donor_probs[5000:-5000], acceptor_probs[5000:-5000]
|
|
756
756
|
|
|
757
757
|
def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
|
|
758
758
|
positions = mutations.positions
|
|
@@ -803,9 +803,11 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
|
|
|
803
803
|
mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
|
|
804
804
|
|
|
805
805
|
elif engine == 'pangolin':
|
|
806
|
-
ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models)
|
|
807
|
-
mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models)
|
|
806
|
+
ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models=pang_models)
|
|
807
|
+
mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models=pang_models)
|
|
808
808
|
|
|
809
|
+
else:
|
|
810
|
+
raise ValueError(f"{engine} not implemented")
|
|
809
811
|
|
|
810
812
|
assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
|
|
811
813
|
assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
|
|
@@ -1400,18 +1402,15 @@ def moving_average_conv(vector, window_size, factor=1):
|
|
|
1400
1402
|
|
|
1401
1403
|
return np.convolve(vector, np.ones(window_size), mode='same') / window_size
|
|
1402
1404
|
|
|
1403
|
-
def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False,
|
|
1405
|
+
def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, save_spliceai_results=False, force_spliceai=False, organism='hg38'):
|
|
1404
1406
|
mutation = Variations(mut_id)
|
|
1405
|
-
try:
|
|
1406
|
-
|
|
1407
|
-
except FileNotFoundError:
|
|
1408
|
-
|
|
1407
|
+
# try:
|
|
1408
|
+
reference_gene = Gene(mutation.gene, organism=organism)
|
|
1409
|
+
# except FileNotFoundError:
|
|
1410
|
+
# return pd.DataFrame()
|
|
1409
1411
|
|
|
1410
|
-
|
|
1411
|
-
mutated_gene = Gene(mutation.gene, mut_id)
|
|
1412
|
-
# if not per_transcript_missplicing:
|
|
1413
|
-
# missplicing_obj = PredictSpliceAI(mutation, reference_gene, threshold=sai_threshold, force=True, save_results=False)
|
|
1414
|
-
# missplicing = missplicing_obj.missplicing
|
|
1412
|
+
reference_gene_proteins = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
|
|
1413
|
+
mutated_gene = Gene(mutation.gene, mut_id, organism=organism)
|
|
1415
1414
|
|
|
1416
1415
|
results = []
|
|
1417
1416
|
for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
|
|
@@ -1420,10 +1419,9 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
|
|
|
1420
1419
|
continue
|
|
1421
1420
|
|
|
1422
1421
|
cons_vector = transform_conservation_vector(reference.cons_vector, window=window_length)
|
|
1423
|
-
# if per_transcript_missplicing:
|
|
1424
1422
|
missplicing_obj = PredictSpliceAI(mutation, reference, threshold=sai_threshold, force=force_spliceai, save_results=save_spliceai_results)
|
|
1425
1423
|
missplicing = missplicing_obj.apply_sai_threshold_primary(threshold=sai_threshold)
|
|
1426
|
-
|
|
1424
|
+
|
|
1427
1425
|
for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
|
|
1428
1426
|
variant_isoform = deepcopy(variant)
|
|
1429
1427
|
variant_isoform.reset_acceptors(acceptors=new_boundaries['acceptors']).reset_donors(donors=new_boundaries['donors']).organize().generate_protein()
|
|
@@ -1432,9 +1430,6 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
|
|
|
1432
1430
|
modified_positions = find_modified_positions(len(reference.protein), deleted, inserted)
|
|
1433
1431
|
temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
|
|
1434
1432
|
affected_cons_scores = max(temp_cons)
|
|
1435
|
-
# temp_cons = np.convolve(cons_vector, np.ones(window_length))
|
|
1436
|
-
# print(temp_cons)
|
|
1437
|
-
# print(cons_vector)
|
|
1438
1433
|
percentile = (
|
|
1439
1434
|
sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
|
|
1440
1435
|
cons_vector))
|
|
@@ -1449,7 +1444,7 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
|
|
|
1449
1444
|
report['isoform_prevalence'] = new_boundaries['path_weight']
|
|
1450
1445
|
report['full_missplicing'] = missplicing
|
|
1451
1446
|
report['missplicing'] = max(missplicing_obj)
|
|
1452
|
-
report['reference_resemblance'] =
|
|
1447
|
+
report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
|
|
1453
1448
|
results.append(report)
|
|
1454
1449
|
|
|
1455
1450
|
report = pd.DataFrame(results)
|
geney/power_utils.py
CHANGED
|
@@ -38,7 +38,7 @@ def write_executors(folder_path, script='geney.power_utils', input_file='/tamir2
|
|
|
38
38
|
|
|
39
39
|
def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
|
|
40
40
|
walltime="24:00:00", dashboard_address=":23154",
|
|
41
|
-
log_directory="dask-logs", slurm=False):
|
|
41
|
+
log_directory="dask-logs", slurm=False, organism='hg38'):
|
|
42
42
|
"""
|
|
43
43
|
Launch a Dask cluster using PBS.
|
|
44
44
|
|
|
@@ -63,7 +63,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
|
|
|
63
63
|
walltime='7200',
|
|
64
64
|
scheduler_options={"dashboard_address": dashboard_address},
|
|
65
65
|
log_directory=log_directory,
|
|
66
|
-
job_script_prologue=[f"cd {config_setup['BASE']}"]
|
|
66
|
+
job_script_prologue=[f"cd {config_setup[organism]['BASE']}"]
|
|
67
67
|
)
|
|
68
68
|
|
|
69
69
|
else:
|
|
@@ -75,7 +75,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
|
|
|
75
75
|
walltime=walltime,
|
|
76
76
|
scheduler_options={"dashboard_address": dashboard_address},
|
|
77
77
|
log_directory=log_directory,
|
|
78
|
-
job_script_prologue=[f"cd {config_setup['BASE']}"]
|
|
78
|
+
job_script_prologue=[f"cd {config_setup[organism]['BASE']}"]
|
|
79
79
|
)
|
|
80
80
|
|
|
81
81
|
dask_cluster.scale(num_workers)
|
geney/tcga_utils.py
CHANGED
|
@@ -363,8 +363,8 @@ class TCGAGene:
|
|
|
363
363
|
# return cases
|
|
364
364
|
#
|
|
365
365
|
#
|
|
366
|
-
|
|
367
|
-
|
|
366
|
+
def create_mut_id(row):
|
|
367
|
+
return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
|
|
368
368
|
#
|
|
369
369
|
#
|
|
370
370
|
# def is_in_exon(mut_id, tid):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: geney
|
|
3
|
-
Version: 1.2.3
|
|
3
|
+
Version: 1.2.5
|
|
4
4
|
Summary: A Python package for gene expression modeling.
|
|
5
5
|
Home-page: https://github.com/nicolaslynn/geney
|
|
6
6
|
Author: Nicolas Lynn
|
|
@@ -17,7 +17,7 @@ Requires-Dist: numpy ==1.26.4
|
|
|
17
17
|
Requires-Dist: pandas ==2.2.1
|
|
18
18
|
Requires-Dist: networkx ==3.2.1
|
|
19
19
|
Requires-Dist: viennarna ==2.6.4
|
|
20
|
-
Requires-Dist: tqdm
|
|
20
|
+
Requires-Dist: tqdm >=4.66.1
|
|
21
21
|
Requires-Dist: spliceai ==1.3.1
|
|
22
22
|
Requires-Dist: scikit-learn ==1.0.2
|
|
23
23
|
Requires-Dist: biopython ==1.81
|
|
@@ -4,20 +4,20 @@ geney/__init__.py,sha256=r-Yvpo_Tc236DcsqsFyexT21iVoYCVl9zoJj5pFuWEE,407
|
|
|
4
4
|
geney/benchmark_clinvar.py,sha256=LLl77e95Qbg9Kd-m2yL8ilmzubSz9SKogeARwssT4Ks,5532
|
|
5
5
|
geney/compare_sets.py,sha256=TcgL57V7BUPxBoW9lv3xr8qK2Acmykn85Ev3avicQr8,2977
|
|
6
6
|
geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
|
|
7
|
-
geney/data_setup.py,sha256=
|
|
7
|
+
geney/data_setup.py,sha256=LTiJMYPgv9KnIgUNw-D57Fu4nxL4OojXMpmdhE8QSYU,12228
|
|
8
8
|
geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
9
9
|
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
10
10
|
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
11
11
|
geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
|
|
12
|
-
geney/oncosplice.py,sha256=
|
|
12
|
+
geney/oncosplice.py,sha256=9oZs9W_bI6O5h3284WvatkerhSCaxMZWfs1xVc1lJO0,71524
|
|
13
13
|
geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
|
|
14
14
|
geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
|
|
15
15
|
geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
|
|
16
|
-
geney/power_utils.py,sha256=
|
|
16
|
+
geney/power_utils.py,sha256=nppfT1-bOC1dnvfRs55LipjoWDlRrOqWiuCMH0v1auU,7303
|
|
17
17
|
geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
|
|
18
18
|
geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
|
|
19
19
|
geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
|
|
20
|
-
geney/tcga_utils.py,sha256=
|
|
20
|
+
geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
|
|
21
21
|
geney/utils.py,sha256=xJi7fk3g7DkR2rKOb8WePLQNM1ib83rcHecwRdwd5lA,2036
|
|
22
22
|
geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
23
|
geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
|
|
@@ -45,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
|
|
|
45
45
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
46
46
|
geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
47
|
geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
48
|
-
geney-1.2.
|
|
49
|
-
geney-1.2.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
|
|
50
|
-
geney-1.2.3.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
51
|
-
geney-1.2.3.dist-info/RECORD,,
|
|
48
|
+
geney-1.2.5.dist-info/METADATA,sha256=9UptuZVJWZvVN6Y9KgPUxrC4gnijFVW4CtkkESxrY9E,1198
|
|
49
|
+
geney-1.2.5.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
|
|
50
|
+
geney-1.2.5.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
51
|
+
geney-1.2.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|