geney 1.3.3__py2.py3-none-any.whl → 1.3.5__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic.
- geney/oncosplice.py +109 -105
- geney/splicing_utils.py +314 -235
- {geney-1.3.3.dist-info → geney-1.3.5.dist-info}/METADATA +1 -1
- {geney-1.3.3.dist-info → geney-1.3.5.dist-info}/RECORD +6 -6
- {geney-1.3.3.dist-info → geney-1.3.5.dist-info}/WHEEL +0 -0
- {geney-1.3.3.dist-info → geney-1.3.5.dist-info}/top_level.txt +0 -0
geney/oncosplice.py
CHANGED
@@ -6,6 +6,10 @@ import pandas as pd
 import numpy as np
 from .SeqMats import SeqMat, MutSeqMat
 from .splicing_utils import find_transcript_missplicing_seqs, develop_aberrant_splicing
+from .Gene import Gene
+import copy
+from . import config
+
 from .tis_utils import find_tis

 def short_hash_of_list(numbers, length=5):
@@ -301,7 +305,7 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut, ref_attri


 def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False,
-window_length=13, organism='hg38', engine='spliceai'
+window_length=13, organism='hg38', engine='spliceai'):
 gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
 reference_gene_proteins = {
 transcript.generate_pre_mrna().generate_mature_mrna().generate_protein().protein: transcript.transcript_id for
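Note: the change in this hunk restores the closing "):" that the 1.3.3 signature lacked, terminating the def statement. A minimal usage sketch of this entry point follows; it is not taken from package documentation, and the mutation identifier is hypothetical (its format is only inferred from how mut_id is split on ':' and '|' elsewhere in this module).

from geney.oncosplice import oncosplice

mut_id = 'KRAS:12:25245350:C:T'  # hypothetical gene:chrom:pos:ref:alt identifier
report = oncosplice(mut_id, splicing_threshold=0.5, window_length=13,
                    organism='hg38', engine='spliceai')
print(report)  # assumed to be a per-isoform annotation table (pandas DataFrame)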
@@ -374,110 +378,110 @@ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_require
 'mutation_distance_from_3', 'engine', 'reference_resemblance', 'oncosplice_score', 'percentile',
 'isoform_prevalence', 'reference_protein', 'variant_protein']]

-
-import asyncio
-async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False,
-[old lines 380-480: remainder of the removed oncosplice_prototype body; content not captured in this rendering]
+#
+# import asyncio
+# async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False,
+# window_length=13, organism='hg38', engine='spliceai', use_cons=True, require_cons=False):
+# import sys, os
+# needed_file1 = config[organism]['yoram_path'] / 'rest_api_utils.py'
+# needed_file2 = config[organism]['yoram_path'] / 'uniprot_utils.py'
+#
+# if sys.platform == 'linux' and (needed_file1.is_file() and os.access(needed_file1, os.R_OK)) and (
+# needed_file2.is_file() and os.access(needed_file2, os.R_OK)):
+# sys.path.append(str(config[organism]['yoram_path']))
+# import uniprot_utils as uput
+#
+# else:
+# raise SystemError(
+# "Oncosplice Prototype can only be run on Power with access to the /tamir2/yoramzar/Projects/Cancer_mut/Utils folder.")
+#
+# from .tis_utils import find_tis
+#
+# # Define async functions
+# async def background_request(ensb_id, Uniprot_features=["Topological domain", "Transmembrane", "Domain"]):
+# return uput.retrieve_protein_data_features_subset(uput.ensembl_id2uniprot_id(ensb_id), Uniprot_features)
+#
+# def inspect_domain(row, modified_vector, conservation_vector):
+# v1, v2 = modified_vector[row.start:row.end], conservation_vector[row.start:row.end]
+# if sum(v2) == 0:
+# return pd.Series([f'{row.type}|{row.start}|{row.end}|{row.description}', 0],
+# index=['domain_identifier', 'score'])
+#
+# return pd.Series([f'{row.type}|{row.start}|{row.end}|{row.description}', sum(v1 * v2) / sum(v2)],
+# index=['domain_identifier', 'score'])
+#
+# gene = Gene(mut_id.split(':')[0], organism=organism)
+# reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
+# mutations = [get_mutation(mut_id, rev=gene.rev) for mut_id in mut_id.split('|')]
+# results = []
+# for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
+# if require_cons and not transcript.cons_available:
+# continue
+#
+# if all(mutation not in transcript for mutation in mutations):
+# # results.append({'transcript_id': transcript.transcript_id})
+# continue
+#
+# task1 = asyncio.create_task(background_request(tid))
+# transcript.generate_pre_mrna()
+# transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
+# transcript.generate_mature_mrna().generate_protein(inplace=True)
+# ref_protein, cons_vector = transcript.protein, transcript.cons_vector
+#
+# if not use_cons:
+# cons_vector = np.ones(len(ref_protein))
+#
+# if sum(cons_vector) == 0:
+# cons_vector = np.ones(len(ref_protein)) #/len(ref_protein)
+#
+# reference_transcript = copy.deepcopy(transcript)
+#
+# assert len(ref_protein) == len(
+# cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)} must be same length."
+#
+# missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold),
+# threshold=splicing_threshold)
+# for mutation in mutations:
+# transcript.pre_mrna += mutation
+#
+# domains_df = await task1
+# for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
+# transcript.acceptors = new_boundaries['acceptors']
+# transcript.donors = new_boundaries['donors']
+# transcript.generate_mature_mrna()
+# transcript.TIS = find_tis(ref_seq=reference_transcript, mut_seq=transcript)
+# transcript.generate_protein()
+#
+# alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
+# deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
+# modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
+# temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
+# affected_cons_scores = max(temp_cons)
+# percentile = (
+# sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
+# cons_vector))
+#
+# out = domains_df.apply(lambda row: inspect_domain(row, modified_positions, cons_vector), axis=1)
+# domains_affected = '+'.join([f'{a}:{round(b, 3)}' for a, b in list(zip(out.domain_identifier, out.score))])
+#
+# report = OncospliceAnnotator(reference_transcript, transcript, mutation)
+# report['mut_id'] = mut_id
+# report['oncosplice_score'] = affected_cons_scores
+# report['cons_available'] = transcript.cons_available
+# report['transcript_id'] = transcript.transcript_id
+# report['percentile'] = percentile
+# report['isoform_id'] = i
+# report['isoform_prevalence'] = new_boundaries['path_weight']
+# report['full_missplicing'] = missplicing.aberrant_splicing
+# report['missplicing'] = max(missplicing)
+# report['domains'] = domains_affected
+# report['max_domain_score'] = out.score.max()
+#
+# report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
+# results.append(pd.Series(report))
+#
+# report = pd.concat(results, axis=1).T
+# return report


 if __name__ == '__main__':
geney/splicing_utils.py
CHANGED
@@ -1,9 +1,8 @@
 import numpy as np
-from ._mutation_utils import get_mutation
 from .Gene import Gene
-
+from .SeqMats import MutSeqMat
 from collections import defaultdict
-
+from . import config

 def generate_adjacency_list(acceptors, donors, transcript_start, transcript_end, max_distance=50, rev=False):
 # Append the transcript end to donors to allow connection to the end point
@@ -145,124 +144,180 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
 return discovered_pos, deleted_pos


-
-from geney.Gene import Gene
-transcript = Gene(mut_id.split(':')[0]).transcript().generate_mature_mrna()
-out = find_transcript_missplicing(transcript, [get_mutation(mut_id, rev=transcript.rev)], context=5000, window=2500, threshold=0.5, engine='spliceai', just_ss=True)
-best_delta = 0
-for k, v in out.items():
-for k1, v1 in v.items():
-if abs(v1['delta']) > abs(best_delta):
-best_delta = v1['delta']
-return out, best_delta
-
-def find_transcript_missplicing(transcript, mutations, context=5000, window=2500, threshold=0.5, engine='spliceai', just_ss=False):
-from functools import reduce
-ref = transcript.pre_mrna
-mutations = [mutation for mutation in mutations if mutation.position in ref.indices]
-if len(mutations) == 0:
-return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}
-
-var = reduce(lambda acc, mutation: acc + mutation, mutations, ref)
-center = int(np.mean([mutation.position for mutation in mutations]) // 1)
-
-total_context = context + window
-length = ref.seqmat.shape[-1]
-center_index = ref.rel_pos(center)
-ref_start_pad = max(0, total_context - center_index)
-ref_end_pad = max(0, total_context - (length - center_index))
+from typing import Tuple, Dict

-
-
-
-
+def run_splicing_engine(seq, engine='spliceai'):
+match engine:
+case 'spliceai':
+from .spliceai_utils import sai_predict_probs, sai_models
+donor_probs, acceptor_probs = sai_predict_probs(seq, models=sai_models)

-
-
+case 'pangolin':
+from .pangolin_utils import pangolin_predict_probs, pang_models
+donor_probs, acceptor_probs = pangolin_predict_probs(seq, models=pang_models)

-
-
-
-ref_indices = np.concatenate([np.zeros(ref_start_pad), ref.indices, np.zeros(ref_end_pad)])
-mut_indices = np.concatenate([np.zeros(var_start_pad), var.indices, np.zeros(var_end_pad)])
-
-ref_indices = ref_indices[context:-context]
-mut_indices = mut_indices[context:-context]
-
-ref_seq = 'N'*ref_start_pad + ref.seq + 'N'*ref_end_pad
-var_seq = 'N'*var_start_pad + var.seq + 'N'*var_end_pad
+case _:
+raise ValueError(f"{engine} not implemented")
+return donor_probs, acceptor_probs

-if engine == 'spliceai':
-from .spliceai_utils import sai_predict_probs, sai_models
-ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(ref_seq, models=sai_models)
-mut_seq_acceptor_probs, mut_seq_donor_probs = sai_predict_probs(var_seq, models=sai_models)

-
-
-
-
+def find_transcript_splicing(transcript, engine: str = 'spliceai') -> Tuple[Dict[int, float], Dict[int, float]]:
+"""
+Predict splice site probabilities for a given transcript using the specified engine.
+This function uses a padding of 5000 'N's on each side of the transcript sequence
+to align with the model's required context length.
+
+Args:
+transcript: An object representing a transcript, expected to have:
+- an `indices` attribute that returns a sequence of positions.
+- a `seq` attribute that returns the sequence string.
+engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
+
+Returns:
+(donor_probs, acceptor_probs) as two dictionaries keyed by position with probability values.
+
+Raises:
+ValueError: If an unsupported engine is provided.
+AssertionError: If the length of predicted probabilities does not match the length of indices.
+"""
+# Prepare reference sequence with padding
+ref_indices = transcript.indices
+ref_seq = 'N' * 5000 + transcript.seq + 'N' * 5000
+ref_seq_acceptor_probs, ref_seq_donor_probs = run_splicing_engine(ref_seq, engine)
+
+# Verify lengths
+assert len(ref_seq_donor_probs) == len(ref_indices), (
+f"Donor probabilities length ({len(ref_seq_donor_probs)}) does not match "
+f"indices length ({len(ref_indices)})."
+)
+assert len(ref_seq_acceptor_probs) == len(ref_indices), (
+f"Acceptor probabilities length ({len(ref_seq_acceptor_probs)}) does not match "
+f"indices length ({len(ref_indices)})."
+)
+
+# Create dictionaries and sort them by probability in descending order
+donor_probs = dict(sorted((i, p) for i, p in zip(ref_indices, ref_seq_donor_probs)),
+key=lambda item: item[1], reverse=True)
+acceptor_probs = dict(sorted((i, p) for i, p in zip(ref_indices, ref_seq_acceptor_probs)),
+key=lambda item: item[1], reverse=True)

-
-raise ValueError(f"{engine} not implemented")
+return donor_probs, acceptor_probs

-visible_donors = np.intersect1d(transcript.donors, ref_indices)
-visible_acceptors = np.intersect1d(transcript.acceptors, ref_indices)

-
-
+def find_transcript_missplicing(mut_id, transcript='primary', threshold=0.5, engine='spliceai', organism='hg38'):
+gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
+reference_transcript = gene.transcript(transcript) if transcript is not None else gene.transcript()
+variant_transcript = reference_transcript.clone()
+mutations = [MutSeqMat.from_mutid(m) for m in mut_id.split('|')]
+mutations = [m for m in mutations if m in reference_transcript]
+if len(mutations) == 0:
+return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}

-
-
-
-threshold=threshold)
+center = np.mean([m.indices[0] for m in mutations]) // 1
+for mutation in mutations:
+variant_transcript.mutate(mutation, inplace=True)

-
-assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
+return find_transcript_missplicing_seqs(reference_transcript.get_context(center, 7500), variant_transcript.get_context(center, 7500), reference_transcript.donors, reference_transcript.acceptors, threshold=threshold, engine=engine)

-idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
-{p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
-visible_donors,
-threshold=threshold)

-[old lines 227-239 removed; content not captured in this rendering]
+# from functools import reduce
+# ref = transcript.pre_mrna
+# mutations = [mutation for mutation in mutations if mutation.position in ref.indices]
+# if len(mutations) == 0:
+# return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}
+#
+# var = reduce(lambda acc, mutation: acc + mutation, mutations, ref)
+# center = int(np.mean([mutation.position for mutation in mutations]) // 1)
+#
+# total_context = context + window
+# length = ref.seqmat.shape[-1]
+# center_index = ref.rel_pos(center)
+# ref_start_pad = max(0, total_context - center_index)
+# ref_end_pad = max(0, total_context - (length - center_index))
+#
+# length = var.seqmat.shape[-1]
+# center_index = var.rel_pos(center)
+# if center_index is None:
+# raise IndexError("Center index must not be none... Issue with mutations... They must not be within the transcript.")
+#
+# var_start_pad = max(0, total_context - center_index)
+# var_end_pad = max(0, total_context - (length - center_index))
+#
+# ref = ref.inspect(center, context=total_context)
+# var = var.inspect(center, context=total_context)
+#
+# ref_indices = np.concatenate([np.zeros(ref_start_pad), ref.indices, np.zeros(ref_end_pad)])
+# mut_indices = np.concatenate([np.zeros(var_start_pad), var.indices, np.zeros(var_end_pad)])
+#
+# ref_indices = ref_indices[context:-context]
+# mut_indices = mut_indices[context:-context]
+#
+# ref_seq = 'N'*ref_start_pad + ref.seq + 'N'*ref_end_pad
+# var_seq = 'N'*var_start_pad + var.seq + 'N'*var_end_pad
+#
+# if engine == 'spliceai':
+# from .spliceai_utils import sai_predict_probs, sai_models
+# ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(ref_seq, models=sai_models)
+# mut_seq_acceptor_probs, mut_seq_donor_probs = sai_predict_probs(var_seq, models=sai_models)
+#
+# elif engine == 'pangolin':
+# from .pangolin_utils import pangolin_predict_probs, pang_models
+# ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models, just_ss=just_ss)
+# mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models, just_ss=just_ss)
+#
+# else:
+# raise ValueError(f"{engine} not implemented")
+#
+# visible_donors = np.intersect1d(transcript.donors, ref_indices)
+# visible_acceptors = np.intersect1d(transcript.acceptors, ref_indices)
+#
+# assert len(ref_indices) == len(ref_seq_acceptor_probs), f'Reference pos ({len(ref_indices)}) not the same as probs ({len(ref_seq_acceptor_probs)})'
+# assert len(mut_indices) == len(mut_seq_acceptor_probs), f'Mut pos ({len(mut_indices)}) not the same as probs ({len(mut_seq_acceptor_probs)})'
+#
+# iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
+# {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
+# visible_acceptors,
+# threshold=threshold)
+#
+# assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
+# assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
+#
+# idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
+# {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
+# visible_donors,
+# threshold=threshold)
+#
+# ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
+# ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
+#
+# lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
+# visible_acceptors if p not in mut_indices and p not in dap}
+# lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors
+# if p not in mut_indices and p not in ddp}
+# dap.update(lost_acceptors)
+# ddp.update(lost_donors)
+#
+# missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
+# missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
+# temp = {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
 return temp

-
+
+def find_transcript_missplicing_seqs(ref_seq, var_seq, donors, acceptors, threshold=0.5, engine='spliceai'):
 if ref_seq.seq == var_seq.seq:
 return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}

-
-
-ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(ref_seq.seq, models=sai_models)
-mut_seq_acceptor_probs, mut_seq_donor_probs = sai_predict_probs(var_seq.seq, models=sai_models)
-
-elif engine == 'pangolin':
-from .pangolin_utils import pangolin_predict_probs, pang_models
-ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq.seq , models=pang_models, just_ss=just_ss)
-mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq.seq, models=pang_models, just_ss=just_ss)
-
-else:
-raise ValueError(f"{engine} not implemented")
-
+ref_seq_acceptor_probs, ref_seq_donor_probs = run_splicing_engine(ref_seq.seq, engine)
+mut_seq_acceptor_probs, mut_seq_donor_probs = run_splicing_engine(var_seq.seq, engine)
 ref_indices = ref_seq.indices[5000:-5000]
 mut_indices = var_seq.indices[5000:-5000]
 visible_donors = np.intersect1d(donors, ref_indices)
 visible_acceptors = np.intersect1d(acceptors, ref_indices)

-assert len(ref_indices) == len(
-
+assert len(ref_indices) == len(
+ref_seq_acceptor_probs), f'Reference pos ({len(ref_indices)}) not the same as probs ({len(ref_seq_acceptor_probs)})'
+assert len(mut_indices) == len(
+mut_seq_acceptor_probs), f'Mut pos ({len(mut_indices)}) not the same as probs ({len(mut_seq_acceptor_probs)})'

 iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
 {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
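Note: 1.3.5 centralizes the engine dispatch in run_splicing_engine() via a match statement (which requires Python 3.10 or newer), and find_transcript_missplicing() now accepts a mutation-ID string instead of transcript/mutation objects. A hedged usage sketch follows; the identifier format is only inferred from the mut_id.split(':') / split('|') calls shown above and may differ.

from geney.splicing_utils import find_transcript_missplicing

# Hypothetical gene:chrom:pos:ref:alt identifier
events = find_transcript_missplicing('KRAS:12:25245350:C:T', transcript='primary',
                                     threshold=0.5, engine='spliceai', organism='hg38')
print(events)  # a Missplicing object, or an empty event dict if no mutation falls in the transcript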
@@ -282,106 +337,140 @@ def find_transcript_missplicing_seqs(ref_seq, var_seq, donors, acceptors, thresh

 lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
 visible_acceptors if p not in mut_indices and p not in dap}
-lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in
+lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in
+visible_donors
 if p not in mut_indices and p not in ddp}
 dap.update(lost_acceptors)
 ddp.update(lost_donors)

-missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap,
+missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap,
+'discovered_donors': idp}
 missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
-temp =
+temp = {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in
+missplicing.items()}
 return Missplicing(temp, threshold=threshold)


 class Missplicing:
 def __init__(self, splicing_dict, threshold=0.5):
+"""
+Initialize a Missplicing object.
+
+Args:
+splicing_dict (dict): Dictionary containing splicing events and their details.
+Example:
+{
+"missed_acceptors": {100: {"absolute": 0.0, "delta": -0.3}, ...},
+"missed_donors": { ... },
+"discovered_acceptors": { ... },
+"discovered_donors": { ... }
+}
+threshold (float): The threshold above which a delta is considered significant.
+"""
 self.missplicing = splicing_dict
 self.threshold = threshold

-# def __repr__(self):
-# return f'Missplicing({self.modification.mut_id}) --> {self.missplicing}'
-
 def __str__(self):
-
+"""String representation displays the filtered splicing events passing the threshold."""
+return str(self.significant_events)

 def __bool__(self):
-
-
-
+"""
+Boolean evaluation: True if any event surpasses the threshold, False otherwise.
+"""
+return self.first_significant_event() is not None

 def __iter__(self):
-
-
-
-
-
-
-
-
-# return not flag
+"""
+Iterate over all delta values from all events. The first yielded value is 0 (for compatibility),
+followed by all deltas in self.missplicing.
+"""
+yield 0
+for details in self.missplicing.values():
+for d in details.values():
+yield d['delta']

 @property
-def
-
+def significant_events(self):
+"""
+Returns a filtered version of missplicing events that meet or exceed the current threshold.
+"""
+return self.filter_by_threshold(self.threshold)
+
+def filter_by_threshold(self, threshold=None):
+"""
+Filter self.missplicing to only include events where abs(delta) >= threshold.
+
+Args:
+threshold (float, optional): The threshold to apply. Defaults to self.threshold.
+
+Returns:
+dict: A new dictionary with filtered events.
+"""
+if threshold is None:
+threshold = self.threshold

-
-
-
+return {
+event: {
+pos: detail for pos, detail in details.items()
+if abs(detail['delta']) >= threshold
+}
+for event, details in self.missplicing.items()
+}
+
+def first_significant_event(self, splicing_dict=None, threshold=None):
+"""
+Check if there is any event surpassing a given threshold and return the dictionary if found.
+
+Args:
+splicing_dict (dict, optional): Dictionary to check. Defaults to self.missplicing.
+threshold (float, optional): Threshold to apply. Defaults to self.threshold.
+
+Returns:
+dict or None: Returns the dictionary if a delta surpasses the threshold, otherwise None.
+"""
+if splicing_dict is None:
+splicing_dict = self.missplicing
+if threshold is None:
 threshold = self.threshold

-
-for
-
-for e, d in details.items():
-if abs(d['delta']) >= threshold:
-in_dict[e] = d
-# return splicing_dict
-new_dict[event] = in_dict
-return new_dict
-
-def apply_sai_threshold_alt(self, splicing_dict=None, threshold=None):
-splicing_dict = self.missplicing if not splicing_dict else splicing_dict
-threshold = self.threshold if not threshold else threshold
-for event, details in splicing_dict.items():
-for e, d in details.items():
-if abs(d['delta']) >= threshold:
-return splicing_dict
+# Check if any event meets the threshold
+if any(abs(detail['delta']) >= threshold for details in splicing_dict.values() for detail in details.values()):
+return splicing_dict
 return None

-def
-
-
-for e, d in details.items():
-if abs(d['delta']) > max_delta:
-max_delta = abs(d['delta'])
-return max_delta
-
-
-def find_transcript_splicing(transcript, engine='spliceai'):
-ref = transcript.pre_mrna
-ref_start_pad = 5000
-ref_end_pad = 5000
-
-ref_indices = ref.indices
-ref_seq = 'N' * ref_start_pad + ref.seq + 'N' * ref_end_pad
-if engine == 'spliceai':
-from .spliceai_utils import sai_predict_probs, sai_models
-ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(ref_seq, sai_models)
-
-elif engine == 'pangolin':
-from .pangolin_utils import pangolin_predict_probs, pang_models
-ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
+def max_delta(self):
+"""
+Returns the maximum absolute delta found in all events.

-
-
+Returns:
+float: The maximum absolute delta, or 0 if no events.
+"""
+deltas = [detail['delta'] for details in self.missplicing.values() for detail in details.values()]
+return max(deltas, key=abs, default=0.0)

-assert len(ref_seq_donor_probs) == len(ref_indices), f'{len(ref_seq_donor_probs)} vs. {len(ref_indices)}'
-donor_probs = {i: p for i, p in list(zip(ref_indices, ref_seq_donor_probs))}
-donor_probs = dict(sorted(donor_probs.items(), key=lambda item: item[1], reverse=True))

-
-
-
+# def find_transcript_splicing(transcript, engine='spliceai'):
+# ref_indices = transcript.indices
+# ref_seq = 'N' * 5000 + transcript.seq + 'N' * 5000
+# if engine == 'spliceai':
+# from .spliceai_utils import sai_predict_probs, sai_models
+# ref_seq_acceptor_probs, ref_seq_donor_probs = sai_predict_probs(ref_seq, sai_models)
+#
+# elif engine == 'pangolin':
+# from .pangolin_utils import pangolin_predict_probs, pang_models
+# ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
+#
+# else:
+# raise ValueError(f"{engine} not implemented")
+#
+# assert len(ref_seq_donor_probs) == len(ref_indices), f'{len(ref_seq_donor_probs)} vs. {len(ref_indices)}'
+# donor_probs = {i: p for i, p in list(zip(ref_indices, ref_seq_donor_probs))}
+# donor_probs = dict(sorted(donor_probs.items(), key=lambda item: item[1], reverse=True))
+#
+# acceptor_probs = {i: p for i, p in list(zip(ref_indices, ref_seq_acceptor_probs))}
+# acceptor_probs = dict(sorted(acceptor_probs.items(), key=lambda item: item[1], reverse=True))
+# return donor_probs, acceptor_probs


 def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
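Note: the refactored Missplicing helpers (significant_events, filter_by_threshold, first_significant_event, max_delta) operate purely on the stored event dictionary, so they can be exercised without a prediction engine. A small sketch using a hand-written dict and only behavior visible in this hunk:

from geney.splicing_utils import Missplicing

events = {
    'missed_acceptors': {1043: {'absolute': 0.0, 'delta': -0.62}},
    'missed_donors': {},
    'discovered_acceptors': {},
    'discovered_donors': {988: {'absolute': 0.71, 'delta': 0.71}},
}
ms = Missplicing(events, threshold=0.5)
print(bool(ms))               # True: at least one |delta| >= 0.5
print(ms.significant_events)  # only the events passing the threshold
print(ms.max_delta())         # 0.71, the largest |delta|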
@@ -391,7 +480,7 @@ def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
 return None, None

 transcript.generate_pre_mrna()
-predicted_donor_sites, predicted_acceptor_sites = find_transcript_splicing(transcript, engine=engine)
+predicted_donor_sites, predicted_acceptor_sites = find_transcript_splicing(transcript.pre_mrna, engine=engine)
 num_introns = len(transcript.introns)
 predicted_donors = list(predicted_donor_sites.keys())[:num_introns]
 predicted_acceptors = list(predicted_acceptor_sites.keys())[:num_introns]
@@ -400,68 +489,58 @@ def benchmark_splicing(gene, organism='hg38', engine='spliceai'):
 return len(correct_donor_preds) / num_introns, len(correct_acceptor_preds) / num_introns, len(transcript.introns)


-def missplicing(mut_id, splicing_threshold=0.5, primary_transcript=True, organism='hg38', engine='spliceai'):
-gene = Gene(mut_id.split(':')[0], organism=organism)
-mutation = get_mutation(mut_id, rev=gene.rev)
-results = {}
-
-for tid, transcript in gene.run_transcripts():
-# if not transcript.primary_transcript and primary_transcript:
-# continue
-#
-if mutation not in transcript:
-continue
-
-good_tid = tid
-
-transcript.generate_pre_mrna()
-results[tid] = Missplicing(find_transcript_missplicing(transcript, mutation, engine=engine),
-threshold=splicing_threshold)
-
-# if len(results) == 0:
-# return None
-#
-# if primary_transcript and good_tid in results:
-# return results[good_tid]
-# else:
-# return None
-
-return results
-
-
 import sqlite3
 import json
-[old lines 434-467 removed; content largely not captured in this rendering]
+import os
+
+# Global connection and cursor (adjust to your architecture)
+# Ideally, initialize this once in your application startup code.
+DB_PATH = os.path.join(config['splicing_db'], 'mutation_data.db')
+conn = sqlite3.connect(DB_PATH, isolation_level=None)  # autocommit mode
+cursor = conn.cursor()
+
+# Create table once at startup, not in the function
+cursor.execute('''
+CREATE TABLE IF NOT EXISTS mutations (
+tool TEXT,
+gene TEXT,
+mutation_id TEXT,
+transcript_id TEXT,
+data TEXT,
+PRIMARY KEY (tool, gene, mutation_id, transcript_id)
+)''')
+
+def get_or_compute_splicing(tool, gene, mutation_id, transcript_id, force_recompute=False):
+"""
+Retrieve computed splicing data for a given mutation from a database,
+or compute and store it if not found or if force_recompute is True.
+
+Args:
+tool (str): Name of the tool used for computation.
+gene (str): Gene name or identifier.
+mutation_id (str): A unique identifier for the mutation.
+transcript_id (str): ID for the transcript.
+force_recompute (bool): If True, ignore cached value and recompute.
+
+Returns:
+dict: The computed splicing data.
+"""
+
+# Lookup in the database
+cursor.execute('SELECT data FROM mutations WHERE tool=? AND gene=? AND mutation_id=? AND transcript_id=?',
+(tool, gene, mutation_id, transcript_id))
+row = cursor.fetchone()
+
+# If found and no force recompute, return cached data
+if row and not force_recompute:
+return json.loads(row[0])
+
+# Otherwise, compute the data
+computed_data = find_transcript_missplicing(mutation_id, transcript_id=transcript_id, engine=tool)  # Replace with your actual function
+
+# Store computed data in DB
+data_json = json.dumps(computed_data)
+cursor.execute('REPLACE INTO mutations (tool, gene, mutation_id, transcript_id, data) VALUES (?, ?, ?, ?, ?)',
+(tool, gene, mutation_id, transcript_id, data_json))
+
+return computed_data
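Note: 1.3.5 drops the old missplicing() helper and adds a module-level SQLite cache keyed by (tool, gene, mutation_id, transcript_id). The cached path calls find_transcript_missplicing() with a transcript_id= keyword that does not appear in the new signature shown earlier, so treat it with care. Below is a self-contained sketch of the same cache-or-compute pattern, using an in-memory database and a stub compute function instead of the real engine call; all names here are illustrative.

import json
import sqlite3

conn = sqlite3.connect(':memory:', isolation_level=None)
cur = conn.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS mutations ('
            'tool TEXT, gene TEXT, mutation_id TEXT, transcript_id TEXT, data TEXT, '
            'PRIMARY KEY (tool, gene, mutation_id, transcript_id))')

def compute(mutation_id):  # stand-in for the real splicing prediction
    return {'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}}

def get_or_compute(tool, gene, mutation_id, transcript_id, force=False):
    cur.execute('SELECT data FROM mutations WHERE tool=? AND gene=? AND mutation_id=? AND transcript_id=?',
                (tool, gene, mutation_id, transcript_id))
    row = cur.fetchone()
    if row and not force:
        return json.loads(row[0])   # cache hit
    data = compute(mutation_id)     # cache miss: compute and store
    cur.execute('REPLACE INTO mutations VALUES (?, ?, ?, ?, ?)',
                (tool, gene, mutation_id, transcript_id, json.dumps(data)))
    return data

print(get_or_compute('spliceai', 'KRAS', 'KRAS:12:25245350:C:T', 'ENST00000311936'))  # hypothetical IDs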
{geney-1.3.3.dist-info → geney-1.3.5.dist-info}/RECORD
CHANGED
@@ -10,12 +10,12 @@ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
 geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
 geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
 geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
-geney/oncosplice.py,sha256=
+geney/oncosplice.py,sha256=1xphL2LeAObwUKBXgcyyKbNO9bAryKDZesK7OpUpFfA,22336
 geney/pangolin_utils.py,sha256=i5j5vEMCWOTIa1mRP2377BAhlUFZjHBzTQBips4lA_4,2934
 geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
 geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
 geney/spliceai_utils.py,sha256=PFIhTK8Ihrj-cv5tgRN0UFPYEmC4uxtqXSP9bBLnZRM,3077
-geney/splicing_utils.py,sha256=
+geney/splicing_utils.py,sha256=Y-yqRSlP7aRaYP9mpHLOI_1fL8nEEkRmgpfqQPslD8I,26358
 geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
 geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
 geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
@@ -24,7 +24,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
 geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
-geney-1.3.
-geney-1.3.
-geney-1.3.
-geney-1.3.
+geney-1.3.5.dist-info/METADATA,sha256=XnLZYFIOc0OInDhB0TQOrp0rM07OatxiD8QcNNWMISg,994
+geney-1.3.5.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
+geney-1.3.5.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.3.5.dist-info/RECORD,,
{geney-1.3.3.dist-info → geney-1.3.5.dist-info}/WHEEL
File without changes

{geney-1.3.3.dist-info → geney-1.3.5.dist-info}/top_level.txt
File without changes