geney 1.2.55__py2.py3-none-any.whl → 1.2.57__py2.py3-none-any.whl
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- geney/oncosplice.py +55 -97
- geney/pangolin_utils.py +5 -3
- geney/spliceai_utils.py +2 -2
- geney/splicing_utils.py +3 -3
- geney/tis_utils.py +5 -17
- {geney-1.2.55.dist-info → geney-1.2.57.dist-info}/METADATA +1 -1
- {geney-1.2.55.dist-info → geney-1.2.57.dist-info}/RECORD +9 -9
- {geney-1.2.55.dist-info → geney-1.2.57.dist-info}/WHEEL +0 -0
- {geney-1.2.55.dist-info → geney-1.2.57.dist-info}/top_level.txt +0 -0
geney/oncosplice.py
CHANGED
|
@@ -9,7 +9,6 @@ from .seqmat_utils import *
|
|
|
9
9
|
from .mutation_utils import *
|
|
10
10
|
from .tis_utils import find_tis
|
|
11
11
|
|
|
12
|
-
### Scoring
|
|
13
12
|
def find_continuous_gaps(sequence):
|
|
14
13
|
"""Find continuous gap sequences in an alignment."""
|
|
15
14
|
return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
|
|
@@ -121,43 +120,6 @@ def transform_conservation_vector(conservation_vector, window=13, factor=4):
|
|
|
121
120
|
return exp_factors
|
|
122
121
|
|
|
123
122
|
|
|
124
|
-
# def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
|
|
125
|
-
# """
|
|
126
|
-
# Identify unmodified positions in a sequence given deletions and insertions.
|
|
127
|
-
#
|
|
128
|
-
# :param sequence_length: Length of the sequence.
|
|
129
|
-
# :param deletions: Dictionary of deletions.
|
|
130
|
-
# :param insertions: Dictionary of insertions.
|
|
131
|
-
# :param reach_limit: Limit for considering the effect of insertions/deletions.
|
|
132
|
-
# :return: Array indicating unmodified positions.
|
|
133
|
-
# """
|
|
134
|
-
# unmodified_positions = np.zeros(sequence_length, dtype=float)
|
|
135
|
-
#
|
|
136
|
-
# for pos, insertion in insertions.items():
|
|
137
|
-
# # if pos >= sequence_length:
|
|
138
|
-
# # pos = sequence_length - 1
|
|
139
|
-
# # add_factor = 1
|
|
140
|
-
#
|
|
141
|
-
# reach = min(len(insertion) // 2, reach_limit)
|
|
142
|
-
# front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
|
|
143
|
-
# len_start, len_end = pos - front_end, back_end - pos
|
|
144
|
-
# try:
|
|
145
|
-
# gradient_front = np.linspace(0, 1, len_start, endpoint=False)
|
|
146
|
-
# gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
|
|
147
|
-
# combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
|
|
148
|
-
# unmodified_positions[front_end:back_end + 1] = combined_gradient
|
|
149
|
-
#
|
|
150
|
-
# except ValueError as e:
|
|
151
|
-
# print(
|
|
152
|
-
# f"Error: {e} | Lengths: unmodified_positions_slice={back_end - front_end}.")
|
|
153
|
-
# unmodified_positions[front_end:back_end] = np.zeros(back_end - front_end)
|
|
154
|
-
#
|
|
155
|
-
# for pos, deletion in deletions.items():
|
|
156
|
-
# deletion_length = len(deletion)
|
|
157
|
-
# unmodified_positions[pos:pos + deletion_length] = 1
|
|
158
|
-
#
|
|
159
|
-
# return unmodified_positions
|
|
160
|
-
|
|
161
123
|
def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
|
|
162
124
|
"""
|
|
163
125
|
Identify unmodified positions in a sequence given deletions and insertions.
|
|
@@ -251,12 +213,7 @@ def moving_average_conv(vector, window_size, factor=1):
|
|
|
251
213
|
|
|
252
214
|
return np.convolve(vector, np.ones(window_size), mode='same') / window_size
|
|
253
215
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
216
|
def find_splice_site_proximity(pos, transcript):
|
|
259
|
-
|
|
260
217
|
for i, (ex_start, ex_end) in enumerate(transcript.exons):
|
|
261
218
|
if min(ex_start, ex_end) <= pos <= max(ex_start, ex_end):
|
|
262
219
|
return i + 1, None, abs(pos - ex_start), abs(pos - ex_end)
|
|
@@ -323,7 +280,7 @@ def summarize_missplicing_event(pes, pir, es, ne, ir):
|
|
|
323
280
|
|
|
324
281
|
# Annotating
|
|
325
282
|
def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
326
|
-
affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut.indices[0],
|
|
283
|
+
affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(np.floor(mut.indices[0]),
|
|
327
284
|
reference_transcript)
|
|
328
285
|
|
|
329
286
|
report = {}
|
|
@@ -361,59 +318,60 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
|
361
318
|
return report
|
|
362
319
|
|
|
363
320
|
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
321
|
+
def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
|
|
322
|
+
gene = Gene(mut_id.split(':')[0], organism=organism)
|
|
323
|
+
reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
|
|
324
|
+
mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
|
|
325
|
+
|
|
326
|
+
results = []
|
|
327
|
+
for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
|
|
328
|
+
if cons_required and not transcript.cons_available:
|
|
329
|
+
continue
|
|
330
|
+
|
|
331
|
+
if all(mutation not in transcript for mutation in mutations):
|
|
332
|
+
continue
|
|
333
|
+
|
|
334
|
+
transcript.generate_pre_mrna()
|
|
335
|
+
transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
|
|
336
|
+
transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
|
|
337
|
+
ref_protein, cons_vector = transcript.protein, transcript.cons_vector
|
|
338
|
+
reference_transcript = copy.deepcopy(transcript)
|
|
339
|
+
|
|
340
|
+
assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)}) must be same length. {ref_protein}, \n>{cons_vector}\n>{transcript.cons_seq}"
|
|
341
|
+
|
|
342
|
+
missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold), threshold=splicing_threshold)
|
|
343
|
+
for mutation in mutations:
|
|
344
|
+
transcript.pre_mrna += mutation
|
|
345
|
+
|
|
346
|
+
for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
|
|
347
|
+
transcript.acceptors = new_boundaries['acceptors']
|
|
348
|
+
transcript.donors = new_boundaries['donors']
|
|
349
|
+
transcript.generate_mature_mrna().generate_protein()
|
|
350
|
+
|
|
351
|
+
alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
|
|
352
|
+
deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
|
|
353
|
+
modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
|
|
354
|
+
temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
|
|
355
|
+
affected_cons_scores = max(temp_cons)
|
|
356
|
+
percentile = (
|
|
357
|
+
sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
|
|
358
|
+
cons_vector))
|
|
359
|
+
|
|
360
|
+
report = OncospliceAnnotator(reference_transcript, transcript, mutation)
|
|
361
|
+
report['mut_id'] = mut_id
|
|
362
|
+
report['oncosplice_score'] = affected_cons_scores
|
|
363
|
+
report['percentile'] = percentile
|
|
364
|
+
report['isoform_id'] = i
|
|
365
|
+
report['isoform_prevalence'] = new_boundaries['path_weight']
|
|
366
|
+
report['full_missplicing'] = missplicing.aberrant_splicing
|
|
367
|
+
report['missplicing'] = max(missplicing)
|
|
368
|
+
report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
|
|
369
|
+
results.append(report)
|
|
370
|
+
|
|
371
|
+
if len(results) == 0:
|
|
372
|
+
return None
|
|
373
|
+
|
|
374
|
+
return pd.DataFrame(results)
|
|
417
375
|
|
|
418
376
|
|
|
419
377
|
import asyncio
|
geney/pangolin_utils.py
CHANGED
|
@@ -46,10 +46,12 @@ def pang_one_hot_encode(seq):
|
|
|
46
46
|
|
|
47
47
|
|
|
48
48
|
|
|
49
|
-
def pangolin_predict_probs(true_seq, models):
|
|
49
|
+
def pangolin_predict_probs(true_seq, models, just_ss=False):
|
|
50
50
|
# print(f"Running pangolin on: {true_seq}")
|
|
51
|
-
|
|
52
|
-
|
|
51
|
+
if just_ss:
|
|
52
|
+
model_nums = [0, 2, 4, 6]
|
|
53
|
+
else:
|
|
54
|
+
model_nums = [0, 1, 2, 3, 4, 5, 6, 7]
|
|
53
55
|
INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
|
|
54
56
|
|
|
55
57
|
seq = true_seq
|
geney/spliceai_utils.py
CHANGED
|
@@ -12,8 +12,8 @@ if tf.config.list_physical_devices('GPU'):
|
|
|
12
12
|
else:
|
|
13
13
|
print("Running on CPU.")
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
tf.config.threading.set_intra_op_parallelism_threads(1)
|
|
16
|
+
tf.config.threading.set_inter_op_parallelism_threads(1)
|
|
17
17
|
|
|
18
18
|
sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
|
|
19
19
|
sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
|
geney/splicing_utils.py
CHANGED
|
@@ -145,7 +145,7 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
|
|
|
145
145
|
return discovered_pos, deleted_pos
|
|
146
146
|
|
|
147
147
|
|
|
148
|
-
def find_transcript_missplicing(transcript, mutations, context=5000, window=2500, threshold=0.5, engine='spliceai'):
|
|
148
|
+
def find_transcript_missplicing(transcript, mutations, context=5000, window=2500, threshold=0.5, engine='spliceai', just_ss=False):
|
|
149
149
|
from functools import reduce
|
|
150
150
|
ref = transcript.pre_mrna
|
|
151
151
|
var = reduce(lambda acc, mutation: acc + mutation, mutations, ref)
|
|
@@ -182,8 +182,8 @@ def find_transcript_missplicing(transcript, mutations, context=5000, window=2500
|
|
|
182
182
|
|
|
183
183
|
elif engine == 'pangolin':
|
|
184
184
|
from .pangolin_utils import pangolin_predict_probs, pang_models
|
|
185
|
-
ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
|
|
186
|
-
mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models)
|
|
185
|
+
ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models, just_ss=just_ss)
|
|
186
|
+
mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models, just_ss=just_ss)
|
|
187
187
|
|
|
188
188
|
else:
|
|
189
189
|
raise ValueError(f"{engine} not implemented")
|
geney/tis_utils.py
CHANGED
|
@@ -28,26 +28,13 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
|
|
|
28
28
|
right_context=right_context,
|
|
29
29
|
padding='$')
|
|
30
30
|
|
|
31
|
-
# 3. If condition 2 is not met, we perform a TIS reaquisition. If condition 2 is met, then we return the reference TIS to be used in the mutated sequence
|
|
32
31
|
if context_conserved:
|
|
33
|
-
return tis_coords[0]
|
|
34
|
-
|
|
35
|
-
# 4. Reaquisition of TIS follows:
|
|
36
|
-
#### The logic:
|
|
37
|
-
# a. We need to find all possible start codon candidates as relative indices
|
|
38
|
-
# b. We need to find what proteins each alternative start codon would create
|
|
39
|
-
# c. We need to make sure we are only looking at a region around a mutation
|
|
40
|
-
# d. We need the titer score rank relative to all titer score reference ranks and relative to the reference score
|
|
32
|
+
return [(tis_coords[0], 1, 'canonical')]
|
|
41
33
|
|
|
42
34
|
sc_table = pd.read_pickle(config['titer_path'] / 'titer_tis_scores.pickle')
|
|
43
|
-
# target_transcript = sc_table[sc_table.transcript_id == ref_id]
|
|
44
|
-
# if len(target_transcript) == 0:
|
|
45
|
-
### reaquire TIS score for ref
|
|
46
|
-
# pass
|
|
47
|
-
|
|
48
35
|
ref_seq_tis_context = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
|
|
49
36
|
right_context=right_context, padding='$')
|
|
50
|
-
|
|
37
|
+
|
|
51
38
|
ref_titer_score = retrieve_titer_score(ref_seq_tis_context)
|
|
52
39
|
ref_titer_rank = percentileofscore(sc_table['tis_score'], ref_titer_score)
|
|
53
40
|
ref_protein = ref_seq.translate(tis_coords[0])
|
|
@@ -56,7 +43,8 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
|
|
|
56
43
|
candidate_positions = np.array(
|
|
57
44
|
[p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score if candidate_positions[i] == True else 0
|
|
58
45
|
for i in range(len(ref_seq.seq))])
|
|
59
|
-
|
|
46
|
+
|
|
47
|
+
candidate_positions = candidate_positions > sorted(candidate_positions)[-5] # implement correct logic
|
|
60
48
|
candidate_positions = np.array([retrieve_titer_score(
|
|
61
49
|
mut_seq.asymmetric_subseq(tis_coords[0], left_context=left_context, right_context=right_context,
|
|
62
50
|
padding='$')) if candidate_positions[i] > 0 else False for i in
|
|
@@ -66,7 +54,7 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
|
|
|
66
54
|
in range(len(ref_seq.seq))])
|
|
67
55
|
best_position = np.where(candidate_positions == min(candidate_positions))[0][0]
|
|
68
56
|
out = mut_seq.seqmat[1, best_position]
|
|
69
|
-
return out
|
|
57
|
+
return out #output: [(genomic_coord1, probability, filter_tag), (genomic_coord2, probability, filter_tag)]
|
|
70
58
|
|
|
71
59
|
|
|
72
60
|
def seq_matrix(seq_list):
|
|
@@ -6,21 +6,21 @@ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
|
|
|
6
6
|
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
7
7
|
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
8
8
|
geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
|
|
9
|
-
geney/oncosplice.py,sha256
|
|
10
|
-
geney/pangolin_utils.py,sha256=
|
|
9
|
+
geney/oncosplice.py,sha256=eWgY2Lcj894UBFnIVhbxiVz5oqASHg-Ot1wFbjlJbI8,21857
|
|
10
|
+
geney/pangolin_utils.py,sha256=HvXfdLhHWTDXNmYtc8K3p64iTvDtsBq6-Jml5tpg7JI,2930
|
|
11
11
|
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
12
12
|
geney/seqmat_utils.py,sha256=2cRXT_Ox4IdzCM8x3H2HexxFZzjo5WHs0HZiUQv8fBM,18347
|
|
13
|
-
geney/spliceai_utils.py,sha256=
|
|
14
|
-
geney/splicing_utils.py,sha256=
|
|
13
|
+
geney/spliceai_utils.py,sha256=21_TaiLW3faRuPegMgsVvIf1G1a03penZSiydQ-hOTA,1869
|
|
14
|
+
geney/splicing_utils.py,sha256=34xdarFpTHsHZkhi7VrHby9DaIBZ2xCLqPMrTmasEgE,16860
|
|
15
15
|
geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
|
|
16
16
|
geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
|
|
17
|
-
geney/tis_utils.py,sha256=
|
|
17
|
+
geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
|
|
18
18
|
geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
|
|
19
19
|
geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
20
|
geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
|
|
21
21
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
22
22
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
23
|
-
geney-1.2.
|
|
24
|
-
geney-1.2.
|
|
25
|
-
geney-1.2.
|
|
26
|
-
geney-1.2.
|
|
23
|
+
geney-1.2.57.dist-info/METADATA,sha256=UFirGNGhFN_aJnqSO8WHagJCmEfKoHdfRZojLfKymsE,948
|
|
24
|
+
geney-1.2.57.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
|
|
25
|
+
geney-1.2.57.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
26
|
+
geney-1.2.57.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|