geney 1.2.39__py2.py3-none-any.whl → 1.2.41__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/graphic_utils.py +1 -2
- geney/oncosplice.py +10 -6
- geney/splicing_utils.py +4 -4
- geney/tcga_utils.py +66 -33
- geney/tis_utils.py +1 -1
- {geney-1.2.39.dist-info → geney-1.2.41.dist-info}/METADATA +1 -1
- {geney-1.2.39.dist-info → geney-1.2.41.dist-info}/RECORD +9 -9
- {geney-1.2.39.dist-info → geney-1.2.41.dist-info}/WHEEL +0 -0
- {geney-1.2.39.dist-info → geney-1.2.41.dist-info}/top_level.txt +0 -0
geney/graphic_utils.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
|
|
2
1
|
import matplotlib.pyplot as plt
|
|
3
2
|
from matplotlib.patches import Rectangle
|
|
4
3
|
import seaborn as sns
|
|
5
4
|
from collections import namedtuple
|
|
6
|
-
from geney.utils import
|
|
5
|
+
from geney.utils import unload_pickle, contains, unload_json, dump_json
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
### Graphical Stuff
|
geney/oncosplice.py
CHANGED
|
@@ -331,19 +331,19 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
|
331
331
|
report['primary_transcript'] = reference_transcript.primary_transcript
|
|
332
332
|
report['transcript_id'] = reference_transcript.transcript_id
|
|
333
333
|
# report['mut_id'] = mut.mut_id
|
|
334
|
-
report['cons_available'] = int(reference_transcript.cons_available)
|
|
334
|
+
# report['cons_available'] = int(reference_transcript.cons_available)
|
|
335
335
|
# report['protein_coding'] = reference_transcript.transcript_biotype
|
|
336
336
|
|
|
337
337
|
# report['reference_mrna'] = reference_transcript.transcript_seq
|
|
338
|
-
report['reference_cds_start'] = reference_transcript.TIS
|
|
338
|
+
# report['reference_cds_start'] = reference_transcript.TIS
|
|
339
339
|
# report['reference_pre_mrna'] = reference_transcript.pre_mrna
|
|
340
340
|
# report[
|
|
341
341
|
# 'reference_orf'] = reference_transcript.orf # pre_mrna[reference_transcript.transcript_indices.index(reference_transcript.TIS):reference_transcript.transcript_indices.index(reference_transcript.TTS)]
|
|
342
342
|
report['reference_protein'] = reference_transcript.protein
|
|
343
|
-
report['reference_protein_length'] = len(reference_transcript.protein)
|
|
343
|
+
# report['reference_protein_length'] = len(reference_transcript.protein)
|
|
344
344
|
|
|
345
345
|
# report['variant_mrna'] = variant_transcript.transcript_seq
|
|
346
|
-
report['variant_cds_start'] = variant_transcript.TIS
|
|
346
|
+
# report['variant_cds_start'] = variant_transcript.TIS
|
|
347
347
|
# report[
|
|
348
348
|
# 'variant_pre_mrna'] = variant_transcript.pre_mrna # pre_mrna[variant_transcript.transcript_indices.index(variant_transcript.TIS):variant_transcript.transcript_indices.index(variant_transcript.TTS)]
|
|
349
349
|
# report['variant_orf'] = variant_transcript.orf
|
|
@@ -363,6 +363,8 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
|
363
363
|
|
|
364
364
|
def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
|
|
365
365
|
gene = Gene(mut_id.split(':')[0], organism=organism)
|
|
366
|
+
reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
|
|
367
|
+
|
|
366
368
|
mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
|
|
367
369
|
|
|
368
370
|
results = []
|
|
@@ -408,7 +410,7 @@ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_tran
|
|
|
408
410
|
report['isoform_prevalence'] = new_boundaries['path_weight']
|
|
409
411
|
report['full_missplicing'] = missplicing.aberrant_splicing
|
|
410
412
|
report['missplicing'] = max(missplicing)
|
|
411
|
-
|
|
413
|
+
report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
|
|
412
414
|
results.append(report)
|
|
413
415
|
|
|
414
416
|
report = pd.DataFrame(results)
|
|
@@ -445,6 +447,8 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
|
|
|
445
447
|
index=['domain_identifier', 'score'])
|
|
446
448
|
|
|
447
449
|
gene = Gene(mut_id.split(':')[0], organism=organism)
|
|
450
|
+
reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
|
|
451
|
+
|
|
448
452
|
mutations = [get_mutation(mut_id, rev=gene.rev) for mut_id in mut_id.split('|')]
|
|
449
453
|
|
|
450
454
|
results = []
|
|
@@ -501,7 +505,7 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
|
|
|
501
505
|
report['full_missplicing'] = missplicing.aberrant_splicing
|
|
502
506
|
report['missplicing'] = max(missplicing)
|
|
503
507
|
report['domains_affected'] = domains_affected
|
|
504
|
-
|
|
508
|
+
report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
|
|
505
509
|
results.append(pd.Series(report))
|
|
506
510
|
|
|
507
511
|
report = pd.concat(results, axis=1).T
|
geney/splicing_utils.py
CHANGED
|
@@ -139,11 +139,11 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
|
|
|
139
139
|
new_dict = {v: mut_dct.get(v, 0) - ref_dct.get(v, 0) for v in
|
|
140
140
|
list(set(list(ref_dct.keys()) + list(mut_dct.keys())))}
|
|
141
141
|
|
|
142
|
-
discovered_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct[k]), 3)} for k, v in
|
|
143
|
-
new_dict.items() if v >= threshold and k not in known_splice_sites} # if (k not in known_splice_sites and v >= threshold) or (v > 0.45)}
|
|
142
|
+
discovered_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct[k]), 3), 'reference': round(ref_dct[k], 3)} for k, v in
|
|
143
|
+
new_dict.items() if v >= threshold} # and k not in known_splice_sites} # if (k not in known_splice_sites and v >= threshold) or (v > 0.45)}
|
|
144
144
|
|
|
145
|
-
deleted_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct.get(k, 0)), 3)} for k, v in
|
|
146
|
-
new_dict.items() if -v >= threshold and k in known_splice_sites} #if k in known_splice_sites and v <= -threshold}
|
|
145
|
+
deleted_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct.get(k, 0)), 3), 'reference': round(ref_dct[k], 3)} for k, v in
|
|
146
|
+
new_dict.items() if -v >= threshold} # and k in known_splice_sites} #if k in known_splice_sites and v <= -threshold}
|
|
147
147
|
|
|
148
148
|
return discovered_pos, deleted_pos
|
|
149
149
|
|
geney/tcga_utils.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
import random
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
from tqdm import tqdm
|
|
5
6
|
|
|
6
7
|
class TCGACase:
|
|
7
8
|
def __init__(self, df):
|
|
@@ -98,38 +99,61 @@ class TCGACase:
|
|
|
98
99
|
class TCGAGene:
|
|
99
100
|
def __init__(self, gene, cancer_path=Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes/'),
|
|
100
101
|
valid_cases=None, extra_cols=[], exclude_filters=None, include_filter=None):
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
|
|
105
|
-
'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type'] + extra_cols,
|
|
106
|
-
low_memory=False).sort_values('Start_Position', ascending=True)
|
|
107
|
-
|
|
108
|
-
if df.empty:
|
|
109
|
-
self.df = df
|
|
102
|
+
file_path = cancer_path / gene / 'GeneMutTble.txt'
|
|
103
|
+
if not file_path.exists():
|
|
104
|
+
self.df = pd.DataFrame()
|
|
110
105
|
|
|
111
106
|
else:
|
|
112
|
-
df =
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
107
|
+
df = pd.read_csv(file_path,
|
|
108
|
+
usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
|
|
109
|
+
'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
|
|
110
|
+
'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
|
|
111
|
+
'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type',
|
|
112
|
+
'Variant_Classification'] + extra_cols,
|
|
113
|
+
low_memory=False).sort_values('Start_Position', ascending=True)
|
|
117
114
|
|
|
118
|
-
|
|
119
|
-
for exclude_filter in exclude_filters:
|
|
120
|
-
df = df[~df.FILTER.str.contains(exclude_filter)]
|
|
115
|
+
df['attention'] = True
|
|
121
116
|
|
|
122
|
-
if
|
|
123
|
-
df = df
|
|
117
|
+
if df.empty:
|
|
118
|
+
self.df = df
|
|
124
119
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
120
|
+
else:
|
|
121
|
+
df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
|
|
122
|
+
df = df.astype({'Start_Position': int})
|
|
123
|
+
|
|
124
|
+
if include_filter is not None:
|
|
125
|
+
# df = df[df.FILTER == include_filter]
|
|
126
|
+
df.loc[~df['FILTER'].str.contains(include_filter), 'attention'] = False
|
|
127
|
+
|
|
128
|
+
elif exclude_filters is not None:
|
|
129
|
+
for exclude_filter in exclude_filters:
|
|
130
|
+
# df = df[~df.FILTER.str.contains(exclude_filter)]
|
|
131
|
+
df.loc[df['FILTER'].str.contains(exclude_filter), 'attention'] = False
|
|
132
|
+
|
|
133
|
+
if valid_cases is not None:
|
|
134
|
+
# df = df[df.case_id.isin(valid_cases)]
|
|
135
|
+
df.loc[~df.case_id.isin(valid_cases), 'attention'] = False
|
|
136
|
+
|
|
137
|
+
df['mut_id'] = df.apply(lambda
|
|
138
|
+
row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
|
|
139
|
+
axis=1)
|
|
140
|
+
df['mut_id_yoram'] = df.apply(lambda
|
|
141
|
+
row: f"{row.Gene_name}:{row.Chromosome}:{row.Variant_Classification}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
|
|
142
|
+
axis=1)
|
|
143
|
+
silent_mut_classes = ["3'Flank", "3'UTR", "Silent", "Splice_Site", "Splice_Region", "Intron", "5'Flank",
|
|
144
|
+
"3'Flank"]
|
|
145
|
+
df['silent'] = df.apply(lambda row: row.Variant_Classification in silent_mut_classes, axis=1)
|
|
146
|
+
df['ratio'] = df.t_alt_count + df.t_ref_count
|
|
147
|
+
df = df[df.ratio > 0]
|
|
148
|
+
df['ratio'] = df.t_alt_count / df.ratio
|
|
149
|
+
self.df = df
|
|
150
|
+
|
|
151
|
+
def __repr__(self):
|
|
152
|
+
return repr(self.df[self.df.attention])
|
|
153
|
+
|
|
154
|
+
@property
|
|
155
|
+
def data(self):
|
|
156
|
+
return self.df[self.df.attention]
|
|
133
157
|
|
|
134
158
|
def affected_cases(self, mut_id=None, read_ratio=0, filters=[]):
|
|
135
159
|
if mut_id is None:
|
|
@@ -164,18 +188,27 @@ class TCGAGene:
|
|
|
164
188
|
def total_prevalence(self, mut_id):
|
|
165
189
|
pass
|
|
166
190
|
|
|
167
|
-
def project_prevalence(self, mut_id):
|
|
168
|
-
|
|
191
|
+
def project_prevalence(self, mut_id, df_p_proc):
|
|
192
|
+
mut_prevalence = {}
|
|
193
|
+
for i, g in tqdm(self.data.groupby(['mut_id', 'Transcript_ID'])):
|
|
194
|
+
mut_prevalence[i] = series_to_pretty_string((df_p_proc[g.case_id].value_counts() / project_counts).dropna())
|
|
195
|
+
return pd.Series(mut_prevalence)
|
|
169
196
|
|
|
170
197
|
def project_counts(self, mut_id):
|
|
171
198
|
pass
|
|
172
199
|
|
|
200
|
+
def filter_silent_muts(self):
|
|
201
|
+
self.df.loc[self.df.silent, 'attention'] = False
|
|
202
|
+
return self
|
|
173
203
|
|
|
174
204
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
205
|
+
def series_to_pretty_string(series):
|
|
206
|
+
# Format each index-value pair, applying scientific notation to floats with 3 significant figures
|
|
207
|
+
pretty_str = "\n".join([
|
|
208
|
+
f"{index}: {value:.3e}" if isinstance(value, float) else f"{index}: {value}"
|
|
209
|
+
for index, value in series.items()
|
|
210
|
+
])
|
|
211
|
+
return pretty_str
|
|
179
212
|
|
|
180
213
|
|
|
181
214
|
# CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
|
geney/tis_utils.py
CHANGED
|
@@ -133,7 +133,7 @@ def build_titer_model(TITER_path=config['hg38']['titer_path']):
|
|
|
133
133
|
if os.path.exists(weights_path):
|
|
134
134
|
model_copy.load_weights(weights_path) # Load weights into the new model instance
|
|
135
135
|
models.append(model_copy)
|
|
136
|
-
print(f"Loaded model {i} with weights from {weights_path}")
|
|
136
|
+
# print(f"Loaded model {i} with weights from {weights_path}")
|
|
137
137
|
else:
|
|
138
138
|
print(f"Warning: Weights file {weights_path} not found")
|
|
139
139
|
|
|
@@ -2,25 +2,25 @@ geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
|
|
|
2
2
|
geney/__init__.py,sha256=eBdDl42N6UhcYeZDjOnv199Z88fI5_8Y6xW8447OKXM,755
|
|
3
3
|
geney/config_setup.py,sha256=klm_k7Ca_703DpeGBcGoDqz1XwHQhNXENPKjj_xfSQw,608
|
|
4
4
|
geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
|
|
5
|
-
geney/graphic_utils.py,sha256=
|
|
5
|
+
geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
|
|
6
6
|
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
7
7
|
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
8
8
|
geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
|
|
9
|
-
geney/oncosplice.py,sha256=
|
|
9
|
+
geney/oncosplice.py,sha256=1K8p-sytnMUKTYwO_z_YJLelLosKj8TZpM0i5lHcMFI,22941
|
|
10
10
|
geney/pangolin_utils.py,sha256=lLmnjJdJjqwWS85-1jlPLIjD2z14sWjzU87hS-8xxpQ,2873
|
|
11
11
|
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
12
12
|
geney/seqmat_utils.py,sha256=YV5DFLbfjXLIswPGvqK1-eEfwn9TUby0b2kewdGAKws,18372
|
|
13
13
|
geney/spliceai_utils.py,sha256=gIGPC8u3J15A7EQrk2Elho5PbF9MmUUNopGGH-eEV8s,1873
|
|
14
|
-
geney/splicing_utils.py,sha256=
|
|
14
|
+
geney/splicing_utils.py,sha256=lGBNknnAdKhcJ3MqPQ5c9oz_NKcL2lcFAr78StjKa6o,16151
|
|
15
15
|
geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
|
|
16
|
-
geney/tcga_utils.py,sha256=
|
|
17
|
-
geney/tis_utils.py,sha256=
|
|
16
|
+
geney/tcga_utils.py,sha256=wM52QZ1M_54CrXZ_uj05R14ycZh23gTZUI8b0ZMtPd0,17615
|
|
17
|
+
geney/tis_utils.py,sha256=vA2ci4gNfwwQZlCjPpO5ehvL2NRVeM7lHI_VyfT-_10,8049
|
|
18
18
|
geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
|
|
19
19
|
geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
20
|
geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
|
|
21
21
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
22
22
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
23
|
-
geney-1.2.
|
|
24
|
-
geney-1.2.
|
|
25
|
-
geney-1.2.
|
|
26
|
-
geney-1.2.
|
|
23
|
+
geney-1.2.41.dist-info/METADATA,sha256=e7eHu8HlNdNuNXLWxK17ok3lAetzKTJ7ie-8MRct1T8,948
|
|
24
|
+
geney-1.2.41.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
|
|
25
|
+
geney-1.2.41.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
26
|
+
geney-1.2.41.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|