geney 1.2.54__py2.py3-none-any.whl → 1.2.56__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/oncosplice.py +11 -59
- geney/pangolin_utils.py +3 -5
- geney/spliceai_utils.py +2 -2
- geney/splicing_utils.py +115 -118
- geney/tis_utils.py +5 -17
- {geney-1.2.54.dist-info → geney-1.2.56.dist-info}/METADATA +1 -1
- {geney-1.2.54.dist-info → geney-1.2.56.dist-info}/RECORD +9 -9
- {geney-1.2.54.dist-info → geney-1.2.56.dist-info}/WHEEL +0 -0
- {geney-1.2.54.dist-info → geney-1.2.56.dist-info}/top_level.txt +0 -0
geney/oncosplice.py
CHANGED
|
@@ -9,7 +9,6 @@ from .seqmat_utils import *
|
|
|
9
9
|
from .mutation_utils import *
|
|
10
10
|
from .tis_utils import find_tis
|
|
11
11
|
|
|
12
|
-
### Scoring
|
|
13
12
|
def find_continuous_gaps(sequence):
|
|
14
13
|
"""Find continuous gap sequences in an alignment."""
|
|
15
14
|
return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
|
|
@@ -121,43 +120,6 @@ def transform_conservation_vector(conservation_vector, window=13, factor=4):
|
|
|
121
120
|
return exp_factors
|
|
122
121
|
|
|
123
122
|
|
|
124
|
-
# def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
|
|
125
|
-
# """
|
|
126
|
-
# Identify unmodified positions in a sequence given deletions and insertions.
|
|
127
|
-
#
|
|
128
|
-
# :param sequence_length: Length of the sequence.
|
|
129
|
-
# :param deletions: Dictionary of deletions.
|
|
130
|
-
# :param insertions: Dictionary of insertions.
|
|
131
|
-
# :param reach_limit: Limit for considering the effect of insertions/deletions.
|
|
132
|
-
# :return: Array indicating unmodified positions.
|
|
133
|
-
# """
|
|
134
|
-
# unmodified_positions = np.zeros(sequence_length, dtype=float)
|
|
135
|
-
#
|
|
136
|
-
# for pos, insertion in insertions.items():
|
|
137
|
-
# # if pos >= sequence_length:
|
|
138
|
-
# # pos = sequence_length - 1
|
|
139
|
-
# # add_factor = 1
|
|
140
|
-
#
|
|
141
|
-
# reach = min(len(insertion) // 2, reach_limit)
|
|
142
|
-
# front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
|
|
143
|
-
# len_start, len_end = pos - front_end, back_end - pos
|
|
144
|
-
# try:
|
|
145
|
-
# gradient_front = np.linspace(0, 1, len_start, endpoint=False)
|
|
146
|
-
# gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
|
|
147
|
-
# combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
|
|
148
|
-
# unmodified_positions[front_end:back_end + 1] = combined_gradient
|
|
149
|
-
#
|
|
150
|
-
# except ValueError as e:
|
|
151
|
-
# print(
|
|
152
|
-
# f"Error: {e} | Lengths: unmodified_positions_slice={back_end - front_end}.")
|
|
153
|
-
# unmodified_positions[front_end:back_end] = np.zeros(back_end - front_end)
|
|
154
|
-
#
|
|
155
|
-
# for pos, deletion in deletions.items():
|
|
156
|
-
# deletion_length = len(deletion)
|
|
157
|
-
# unmodified_positions[pos:pos + deletion_length] = 1
|
|
158
|
-
#
|
|
159
|
-
# return unmodified_positions
|
|
160
|
-
|
|
161
123
|
def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
|
|
162
124
|
"""
|
|
163
125
|
Identify unmodified positions in a sequence given deletions and insertions.
|
|
@@ -251,12 +213,7 @@ def moving_average_conv(vector, window_size, factor=1):
|
|
|
251
213
|
|
|
252
214
|
return np.convolve(vector, np.ones(window_size), mode='same') / window_size
|
|
253
215
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
216
|
def find_splice_site_proximity(pos, transcript):
|
|
259
|
-
|
|
260
217
|
for i, (ex_start, ex_end) in enumerate(transcript.exons):
|
|
261
218
|
if min(ex_start, ex_end) <= pos <= max(ex_start, ex_end):
|
|
262
219
|
return i + 1, None, abs(pos - ex_start), abs(pos - ex_end)
|
|
@@ -323,7 +280,7 @@ def summarize_missplicing_event(pes, pir, es, ne, ir):
|
|
|
323
280
|
|
|
324
281
|
# Annotating
|
|
325
282
|
def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
326
|
-
affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut.indices[0],
|
|
283
|
+
affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(np.floor(mut.indices[0]),
|
|
327
284
|
reference_transcript)
|
|
328
285
|
|
|
329
286
|
report = {}
|
|
@@ -361,19 +318,17 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
|
361
318
|
return report
|
|
362
319
|
|
|
363
320
|
|
|
364
|
-
def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
|
|
321
|
+
def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
|
|
365
322
|
gene = Gene(mut_id.split(':')[0], organism=organism)
|
|
366
323
|
reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
|
|
367
|
-
|
|
368
324
|
mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
|
|
369
325
|
|
|
370
326
|
results = []
|
|
371
327
|
for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
|
|
372
|
-
if not transcript.cons_available:
|
|
328
|
+
if cons_required and not transcript.cons_available:
|
|
373
329
|
continue
|
|
374
330
|
|
|
375
331
|
if all(mutation not in transcript for mutation in mutations):
|
|
376
|
-
results.append({'transcript_id': transcript.transcript_id})
|
|
377
332
|
continue
|
|
378
333
|
|
|
379
334
|
transcript.generate_pre_mrna()
|
|
@@ -413,15 +368,15 @@ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_tran
|
|
|
413
368
|
report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
|
|
414
369
|
results.append(report)
|
|
415
370
|
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
371
|
+
if len(results) == 0:
|
|
372
|
+
return None
|
|
419
373
|
|
|
420
|
-
|
|
374
|
+
return pd.DataFrame(results)
|
|
421
375
|
|
|
422
376
|
|
|
377
|
+
import asyncio
|
|
423
378
|
async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False,
|
|
424
|
-
window_length=13, organism='hg38', engine='spliceai', use_cons=True):
|
|
379
|
+
window_length=13, organism='hg38', engine='spliceai', use_cons=True, require_cons=False):
|
|
425
380
|
import sys, os
|
|
426
381
|
needed_file1 = config[organism]['yoram_path'] / 'rest_api_utils.py'
|
|
427
382
|
needed_file2 = config[organism]['yoram_path'] / 'uniprot_utils.py'
|
|
@@ -452,20 +407,17 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
|
|
|
452
407
|
|
|
453
408
|
gene = Gene(mut_id.split(':')[0], organism=organism)
|
|
454
409
|
reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
|
|
455
|
-
|
|
456
410
|
mutations = [get_mutation(mut_id, rev=gene.rev) for mut_id in mut_id.split('|')]
|
|
457
|
-
|
|
458
411
|
results = []
|
|
459
412
|
for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
|
|
460
|
-
if not transcript.cons_available:
|
|
413
|
+
if require_cons and not transcript.cons_available:
|
|
461
414
|
continue
|
|
462
415
|
|
|
463
416
|
if all(mutation not in transcript for mutation in mutations):
|
|
464
|
-
results.append({'transcript_id': transcript.transcript_id})
|
|
417
|
+
# results.append({'transcript_id': transcript.transcript_id})
|
|
465
418
|
continue
|
|
466
419
|
|
|
467
420
|
task1 = asyncio.create_task(background_request(tid))
|
|
468
|
-
|
|
469
421
|
transcript.generate_pre_mrna()
|
|
470
422
|
transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
|
|
471
423
|
transcript.generate_mature_mrna().generate_protein(inplace=True)
|
|
@@ -475,7 +427,7 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
|
|
|
475
427
|
cons_vector = np.ones(len(ref_protein))
|
|
476
428
|
|
|
477
429
|
if sum(cons_vector) == 0:
|
|
478
|
-
cons_vector = np.ones(len(ref_protein))
|
|
430
|
+
cons_vector = np.ones(len(ref_protein)) #/len(ref_protein)
|
|
479
431
|
|
|
480
432
|
reference_transcript = copy.deepcopy(transcript)
|
|
481
433
|
|
geney/pangolin_utils.py
CHANGED
|
@@ -52,12 +52,10 @@ def pangolin_predict_probs(true_seq, models):
|
|
|
52
52
|
model_nums = [0, 1, 2, 3, 4, 5, 6]
|
|
53
53
|
INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
|
|
54
54
|
|
|
55
|
-
# seq = 'N'*5000 + true_seq + 'N'*5000
|
|
56
55
|
seq = true_seq
|
|
57
56
|
true_seq = true_seq[5000:-5000]
|
|
58
|
-
acceptor_dinucleotide = np.array([true_seq[i - 2:i] == 'AG' for i in range(len(true_seq))])
|
|
59
|
-
|
|
60
|
-
donor_dinucleotide = np.array([true_seq[i -2:i] == 'GT' for i in range(len(true_seq))])
|
|
57
|
+
acceptor_dinucleotide = np.array([true_seq[i - 2:i] == 'AG' for i in range(len(true_seq))]) # np.ones(len(true_seq)) #
|
|
58
|
+
donor_dinucleotide = np.array([true_seq[i+1:i+3] == 'GT' for i in range(len(true_seq))]) #np.ones(len(true_seq)) #
|
|
61
59
|
|
|
62
60
|
seq = pang_one_hot_encode(seq).T
|
|
63
61
|
seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()
|
|
@@ -78,4 +76,4 @@ def pangolin_predict_probs(true_seq, models):
|
|
|
78
76
|
splicing_pred = np.array(scores).max(axis=0)
|
|
79
77
|
donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
|
|
80
78
|
acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
|
|
81
|
-
return donor_probs, acceptor_probs
|
|
79
|
+
return donor_probs, acceptor_probs
|
geney/spliceai_utils.py
CHANGED
|
@@ -12,8 +12,8 @@ if tf.config.list_physical_devices('GPU'):
|
|
|
12
12
|
else:
|
|
13
13
|
print("Running on CPU.")
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
tf.config.threading.set_intra_op_parallelism_threads(1)
|
|
16
|
+
tf.config.threading.set_inter_op_parallelism_threads(1)
|
|
17
17
|
|
|
18
18
|
sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
|
|
19
19
|
sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
|
geney/splicing_utils.py
CHANGED
|
@@ -1,128 +1,125 @@
|
|
|
1
|
-
import networkx as nx
|
|
2
1
|
import numpy as np
|
|
3
2
|
from .mutation_utils import get_mutation
|
|
4
3
|
from .seqmat_utils import Gene
|
|
5
4
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def generate_adjacency_list(acceptors, donors, transcript_start, transcript_end, max_distance=50, rev=False):
    """
    Build a weighted adjacency list linking splice sites along a transcript.

    Nodes are ``(position, tag)`` tuples, where ``tag`` is one of
    ``'transcript_start'``, ``'donor'``, ``'acceptor'`` or ``'transcript_end'``.
    Each node maps to a list of ``(position, tag, probability)`` edges pointing
    to sites further along the transcript; the edge probabilities of a node are
    normalised to sum to (approximately) 1 and rounded to three decimals.

    :param acceptors: Iterable of ``(position, probability)`` acceptor sites.
    :param donors: Iterable of ``(position, probability)`` donor sites. The
        caller's object is not modified.
    :param transcript_start: Position used for the ``'transcript_start'`` node.
    :param transcript_end: Position used for the ``'transcript_end'`` node; it is
        treated as an extra donor with probability 1 so acceptors can reach it.
    :param max_distance: Maximum absolute distance allowed between linked sites.
    :param rev: If True, "downstream" means decreasing position.
    :return: ``defaultdict(list)`` mapping node -> list of weighted edges.
    """
    def sort_key(site):
        position, prob = site
        return position, (-prob if rev else prob)

    def is_downstream(origin, target):
        # Strictly further along the transcript in the direction of travel.
        return target < origin if rev else target > origin

    def count_between(sites, origin, target):
        # Number of sites strictly between origin and target.
        low, high = (target, origin) if rev else (origin, target)
        return sum(low < position < high for position, _ in sites)

    # Add the transcript end as a pseudo-donor on a *copy*, so the caller's
    # list is left untouched (the previous in-place append leaked into it).
    donors = sorted(list(donors) + [(transcript_end, 1)], key=sort_key, reverse=rev)
    acceptors = sorted(acceptors, key=sort_key, reverse=rev)

    adjacency_list = defaultdict(list)

    # Donor -> downstream acceptors.
    for d_pos, d_prob in donors:
        running_prob = 1
        for a_pos, a_prob in acceptors:
            if not (is_downstream(d_pos, a_pos) and abs(a_pos - d_pos) <= max_distance):
                continue
            skipped_acceptors = count_between(acceptors, d_pos, a_pos)
            skipped_donors = count_between(donors, d_pos, a_pos)
            if skipped_donors == 0 or skipped_acceptors == 0:
                # No competing site of at least one kind in between:
                # the acceptor keeps its full probability.
                adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob))
                running_prob -= a_prob
            elif running_prob > 0:
                # Otherwise scale by the probability mass not yet consumed.
                adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob * running_prob))
                running_prob -= a_prob
            else:
                break

    # Acceptor -> downstream donors (or the transcript end).
    for a_pos, a_prob in acceptors:
        running_prob = 1
        for d_pos, d_prob in donors:
            if not (is_downstream(a_pos, d_pos) and abs(d_pos - a_pos) <= max_distance):
                continue
            tag = 'donor' if d_pos != transcript_end else 'transcript_end'
            if count_between(acceptors, a_pos, d_pos) == 0:
                adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob))
                running_prob -= d_prob
            elif running_prob > 0:
                adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob * running_prob))
                running_prob -= d_prob
            else:
                break

    # Transcript start -> donors, until the accumulated probability reaches 1.
    # NOTE(review): the transcript-end pseudo-donor is tagged 'donor' here (not
    # 'transcript_end'), matching the original behaviour -- confirm intended.
    start_node = (transcript_start, 'transcript_start')
    running_prob = 1
    for d_pos, d_prob in donors:
        if is_downstream(transcript_start, d_pos) and abs(d_pos - transcript_start) <= max_distance:
            adjacency_list[start_node].append((d_pos, 'donor', d_prob))
            running_prob -= d_prob
            if running_prob <= 0:
                break

    # Normalise each node's outgoing probabilities; leave them untouched when
    # they do not sum to a positive value.
    for node, next_nodes in adjacency_list.items():
        prob_sum = sum(prob for _, _, prob in next_nodes)
        if prob_sum > 0:
            adjacency_list[node] = [(pos, tag, round(prob / prob_sum, 3)) for pos, tag, prob in next_nodes]

    return adjacency_list
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def find_all_paths(graph, start, end, path=None, probability=1.0):
    """
    Yield every path from ``start`` to ``end`` with its cumulative probability.

    :param graph: Mapping of node -> list of ``(next_position, next_tag, prob)``
        edges; the next node is the tuple ``(next_position, next_tag)``.
    :param start: Node to start from.
    :param end: Node at which a path is complete.
    :param path: Nodes already visited before ``start`` (used by the recursion);
        ``None`` means an empty prefix. The passed list is never modified.
    :param probability: Probability accumulated along ``path`` so far.
    :return: Generator of ``(path, probability)`` tuples, depth-first in edge order.
    """
    # A None sentinel replaces the former mutable ``[]`` default; a new list is
    # built at every level, so sibling branches never share state.
    path = ([] if path is None else path) + [start]

    if start == end:
        # Reached the target: emit the finished path and stop descending.
        yield path, probability
        return

    if start not in graph:
        # Dead end: node has no outgoing edges.
        return

    for next_node, node_type, prob in graph[start]:
        # Descend into each successor, multiplying in the edge probability.
        yield from find_all_paths(graph, (next_node, node_type), end, path, probability * prob)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def prepare_splice_sites(acceptors, donors, aberrant_splicing):
    """
    Combine annotated splice sites with aberrant-splicing results.

    Every annotated site starts with probability 1. Entries found under the
    ``missed_*`` and then ``discovered_*`` keys of ``aberrant_splicing`` override
    or add sites using their ``'absolute'`` value. Positions are finally cast
    to ``int``.

    :param acceptors: Iterable of annotated acceptor positions.
    :param donors: Iterable of annotated donor positions.
    :param aberrant_splicing: Mapping with the keys ``missed_donors``,
        ``discovered_donors``, ``missed_acceptors`` and ``discovered_acceptors``;
        each maps position -> mapping containing an ``'absolute'`` entry.
    :return: ``(acceptor_sites, donor_sites)``, each a list of
        ``(position, probability)`` tuples.
    :raises KeyError: If one of the expected keys is missing.
    """
    def merge_sites(annotated_positions, site_type):
        # Annotated sites default to 1; 'missed' is applied before 'discovered'
        # so a position present in both ends up with the 'discovered' value.
        site_probs = {position: 1 for position in annotated_positions}
        for category in ('missed', 'discovered'):
            for position, event in aberrant_splicing[f'{category}_{site_type}'].items():
                site_probs[position] = event['absolute']
        return {int(position): prob for position, prob in site_probs.items()}

    # Donors are processed first to keep the original key-access order.
    donor_probs = merge_sites(donors, 'donors')
    acceptor_probs = merge_sites(acceptors, 'acceptors')
    return list(acceptor_probs.items()), list(donor_probs.items())
|
|
41
110
|
|
|
42
111
|
|
|
43
112
|
def develop_aberrant_splicing(transcript, aberrant_splicing):
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
nodes = [s for s in nodes if s.prob > 0]
|
|
55
|
-
|
|
56
|
-
# Sort nodes based on position, respecting transcript direction
|
|
57
|
-
nodes.sort(key=lambda x: x.pos, reverse=transcript.rev)
|
|
58
|
-
|
|
59
|
-
# Create the directed graph
|
|
60
|
-
G = create_splice_graph(nodes, transcript.rev)
|
|
61
|
-
|
|
62
|
-
# Compute new paths and their probabilities sequentially
|
|
63
|
-
new_paths, prob_sum = compute_paths_sequential(G, transcript, exon_starts, exon_ends)
|
|
64
|
-
|
|
65
|
-
# Normalize probabilities and filter based on threshold
|
|
66
|
-
new_paths = normalize_and_filter_paths(new_paths, prob_sum)
|
|
67
|
-
|
|
68
|
-
return list(new_paths.values())
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def prepare_splice_sites(transcript_sites, transcript_boundary, aberrant_splicing, site_type):
|
|
72
|
-
"""
|
|
73
|
-
Prepare and return a dictionary of splice sites (acceptors or donors) including transcript boundaries
|
|
74
|
-
and aberrant splicing information.
|
|
75
|
-
"""
|
|
76
|
-
site_dict = {v: 1 for v in transcript_sites}
|
|
77
|
-
site_dict.update({transcript_boundary: 1})
|
|
78
|
-
site_dict.update({s: v['absolute'] for s, v in aberrant_splicing[f'missed_{site_type}'].items()})
|
|
79
|
-
site_dict.update({s: v['absolute'] for s, v in aberrant_splicing[f'discovered_{site_type}'].items()})
|
|
80
|
-
return site_dict
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def create_splice_graph(nodes, reverse_direction):
|
|
84
|
-
"""
|
|
85
|
-
Create and return a directed graph with splice sites as nodes and edges based on splice site type
|
|
86
|
-
and probability of occurrence.
|
|
87
|
-
"""
|
|
88
|
-
G = nx.DiGraph()
|
|
89
|
-
G.add_nodes_from([n.pos for n in nodes])
|
|
90
|
-
|
|
91
|
-
for i in range(len(nodes)):
|
|
92
|
-
trailing_prob = 0
|
|
93
|
-
in_between = set()
|
|
94
|
-
curr_node = nodes[i]
|
|
95
|
-
|
|
96
|
-
for j in range(i + 1, len(nodes)):
|
|
97
|
-
next_node = nodes[j]
|
|
98
|
-
in_between.add(next_node.ss_type)
|
|
99
|
-
|
|
100
|
-
if curr_node.ss_type != next_node.ss_type:
|
|
101
|
-
new_prob = next_node.prob - trailing_prob
|
|
102
|
-
if new_prob > 0:
|
|
103
|
-
G.add_edge(curr_node.pos, next_node.pos, weight=new_prob)
|
|
104
|
-
trailing_prob += next_node.prob
|
|
105
|
-
return G
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def normalize_and_filter_paths(new_paths, prob_sum):
|
|
109
|
-
"""
|
|
110
|
-
Normalize path probabilities and filter out paths with a probability less than 0.01.
|
|
111
|
-
"""
|
|
112
|
-
for i, d in new_paths.items():
|
|
113
|
-
d['path_weight'] = round(d['path_weight'] / prob_sum, 3)
|
|
114
|
-
new_paths = {k: v for k, v in new_paths.items() if v['path_weight'] > 0.00001}
|
|
115
|
-
return new_paths
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def path_weight_mult(G, path, weight):
|
|
119
|
-
"""
|
|
120
|
-
Calculate the multiplicative weight of the path.
|
|
121
|
-
"""
|
|
122
|
-
cost = 1
|
|
123
|
-
for node, nbr in zip(path[:-1], path[1:]):
|
|
124
|
-
cost *= G[node][nbr][weight]
|
|
125
|
-
return cost
|
|
113
|
+
all_acceptors, all_donors = prepare_splice_sites(transcript.acceptors, transcript.donors, aberrant_splicing)
|
|
114
|
+
adj_list = generate_adjacency_list(all_acceptors, all_donors, transcript_start=transcript.transcript_start,
|
|
115
|
+
transcript_end=transcript.transcript_end, rev=transcript.rev,
|
|
116
|
+
max_distance=100000)
|
|
117
|
+
end_node = (transcript.transcript_end, 'transcript_end')
|
|
118
|
+
start_node = (transcript.transcript_start, 'transcript_start')
|
|
119
|
+
for path, prob in find_all_paths(adj_list, start_node, end_node):
|
|
120
|
+
yield {'acceptors': [p[0] for p in path if p[1] == 'acceptor'],
|
|
121
|
+
'donors': [p[0] for p in path if p[1] == 'donor'], 'path_weight': prob}
|
|
122
|
+
|
|
126
123
|
|
|
127
124
|
|
|
128
125
|
# Missplicing Detection
|
geney/tis_utils.py
CHANGED
|
@@ -28,26 +28,13 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
|
|
|
28
28
|
right_context=right_context,
|
|
29
29
|
padding='$')
|
|
30
30
|
|
|
31
|
-
# 3. If condition 2 is not met, we perform a TIS reaquisition. If condition 2 is met, then we return the reference TIS to be used in the mutated sequence
|
|
32
31
|
if context_conserved:
|
|
33
|
-
return tis_coords[0]
|
|
34
|
-
|
|
35
|
-
# 4. Reaquisition of TIS follows:
|
|
36
|
-
#### The logic:
|
|
37
|
-
# a. We need to find all possible start codon candidates as relative indices
|
|
38
|
-
# b. We need to find what proteins each alternative start codon would create
|
|
39
|
-
# c. We need to make sure we are only looking at a region around a mutation
|
|
40
|
-
# d. We need the titer score rank relative to all titer score reference ranks and relative to the reference score
|
|
32
|
+
return [(tis_coords[0], 1, 'canonical')]
|
|
41
33
|
|
|
42
34
|
sc_table = pd.read_pickle(config['titer_path'] / 'titer_tis_scores.pickle')
|
|
43
|
-
# target_transcript = sc_table[sc_table.transcript_id == ref_id]
|
|
44
|
-
# if len(target_transcript) == 0:
|
|
45
|
-
### reaquire TIS score for ref
|
|
46
|
-
# pass
|
|
47
|
-
|
|
48
35
|
ref_seq_tis_context = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
|
|
49
36
|
right_context=right_context, padding='$')
|
|
50
|
-
|
|
37
|
+
|
|
51
38
|
ref_titer_score = retrieve_titer_score(ref_seq_tis_context)
|
|
52
39
|
ref_titer_rank = percentileofscore(sc_table['tis_score'], ref_titer_score)
|
|
53
40
|
ref_protein = ref_seq.translate(tis_coords[0])
|
|
@@ -56,7 +43,8 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
|
|
|
56
43
|
candidate_positions = np.array(
|
|
57
44
|
[p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score if candidate_positions[i] == True else 0
|
|
58
45
|
for i in range(len(ref_seq.seq))])
|
|
59
|
-
|
|
46
|
+
|
|
47
|
+
candidate_positions = candidate_positions > sorted(candidate_positions)[-5] # implement correct logic
|
|
60
48
|
candidate_positions = np.array([retrieve_titer_score(
|
|
61
49
|
mut_seq.asymmetric_subseq(tis_coords[0], left_context=left_context, right_context=right_context,
|
|
62
50
|
padding='$')) if candidate_positions[i] > 0 else False for i in
|
|
@@ -66,7 +54,7 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
|
|
|
66
54
|
in range(len(ref_seq.seq))])
|
|
67
55
|
best_position = np.where(candidate_positions == min(candidate_positions))[0][0]
|
|
68
56
|
out = mut_seq.seqmat[1, best_position]
|
|
69
|
-
return out
|
|
57
|
+
return out #output: [(genomic_coord1, probability, filter_tag), (genomic_coord2, probability, filter_tag)]
|
|
70
58
|
|
|
71
59
|
|
|
72
60
|
def seq_matrix(seq_list):
|
|
@@ -6,21 +6,21 @@ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
|
|
|
6
6
|
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
7
7
|
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
8
8
|
geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
|
|
9
|
-
geney/oncosplice.py,sha256=
|
|
10
|
-
geney/pangolin_utils.py,sha256=
|
|
9
|
+
geney/oncosplice.py,sha256=eWgY2Lcj894UBFnIVhbxiVz5oqASHg-Ot1wFbjlJbI8,21857
|
|
10
|
+
geney/pangolin_utils.py,sha256=NJEdY43L_2lielY1hZOjlak0baHqXTa1ITrvx8Tkg5o,2878
|
|
11
11
|
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
12
12
|
geney/seqmat_utils.py,sha256=2cRXT_Ox4IdzCM8x3H2HexxFZzjo5WHs0HZiUQv8fBM,18347
|
|
13
|
-
geney/spliceai_utils.py,sha256=
|
|
14
|
-
geney/splicing_utils.py,sha256=
|
|
13
|
+
geney/spliceai_utils.py,sha256=21_TaiLW3faRuPegMgsVvIf1G1a03penZSiydQ-hOTA,1869
|
|
14
|
+
geney/splicing_utils.py,sha256=t0vE5KTAdYOYJLa9wjaSJ1jqiHhsDxZs64OxrgR-Sqc,16811
|
|
15
15
|
geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
|
|
16
16
|
geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
|
|
17
|
-
geney/tis_utils.py,sha256=
|
|
17
|
+
geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
|
|
18
18
|
geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
|
|
19
19
|
geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
20
|
geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
|
|
21
21
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
22
22
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
23
|
-
geney-1.2.
|
|
24
|
-
geney-1.2.
|
|
25
|
-
geney-1.2.
|
|
26
|
-
geney-1.2.
|
|
23
|
+
geney-1.2.56.dist-info/METADATA,sha256=tHCFJyD9OKjk7GnQToKesLQZyzy0dtO9oBsr0Bjz6rI,948
|
|
24
|
+
geney-1.2.56.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
|
|
25
|
+
geney-1.2.56.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
26
|
+
geney-1.2.56.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|