geney 1.2.20__py2.py3-none-any.whl → 1.2.22__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of geney may be problematic. Click here for more details.
- geney/oncosplice.py +1 -1
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/METADATA +1 -1
- geney-1.2.22.dist-info/RECORD +19 -0
- geney/Gene.py +0 -258
- geney/analyzers/__init__.py +0 -0
- geney/analyzers/benchmark_clinvar.py +0 -158
- geney/analyzers/characterize_epistasis.py +0 -15
- geney/analyzers/compare_sets.py +0 -91
- geney/analyzers/group_comparison.py +0 -81
- geney/analyzers/survival.py +0 -144
- geney/analyzers/tcga_annotations.py +0 -194
- geney/analyzers/visualize_protein_conservation.py +0 -398
- geney/benchmark_clinvar.py +0 -158
- geney/compare_sets.py +0 -91
- geney/data_parsers/__init__.py +0 -0
- geney/data_parsers/gtex.py +0 -68
- geney/gtex.py +0 -68
- geney/immunotherapy/__init__.py +0 -0
- geney/immunotherapy/netchop.py +0 -78
- geney/mutations/__init__.py +0 -0
- geney/mutations/variant_utils.py +0 -125
- geney/netchop.py +0 -79
- geney/oncosplice/__init__.py +0 -0
- geney/oncosplice_mouse.py +0 -277
- geney/oncosplice_pipeline.py +0 -1588
- geney/performance_utils.py +0 -138
- geney/pipelines/__init__.py +0 -0
- geney/pipelines/dask_utils.py +0 -153
- geney/splicing/__init__.py +0 -2
- geney/splicing/spliceai_utils.py +0 -253
- geney/splicing/splicing_isoform_utils.py +0 -0
- geney/splicing/splicing_utils.py +0 -366
- geney/survival.py +0 -124
- geney/tcga_annotations.py +0 -352
- geney/translation_termination/__init__.py +0 -0
- geney/translation_termination/tts_utils.py +0 -0
- geney-1.2.20.dist-info/RECORD +0 -52
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/WHEEL +0 -0
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/top_level.txt +0 -0
geney/oncosplice_pipeline.py
DELETED
|
@@ -1,1588 +0,0 @@
|
|
|
1
|
-
import networkx as nx
|
|
2
|
-
import random
|
|
3
|
-
from Bio.Seq import Seq
|
|
4
|
-
from Bio import pairwise2
|
|
5
|
-
from dataclasses import dataclass
|
|
6
|
-
from copy import deepcopy
|
|
7
|
-
import re
|
|
8
|
-
import pandas as pd
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
|
|
11
|
-
from geney import config_setup
|
|
12
|
-
from geney.splicing.splicing_utils import *
|
|
13
|
-
from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle
|
|
14
|
-
from geney.Fasta_segment import Fasta_segment
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def sai_predict_probs(seq: str, models: list) -> list:
    """Predict per-nucleotide splice-junction probabilities with SpliceAI.

    Let m := 2*sai_mrg_context + L be the input length; the sequence is
    assumed to be structured as

        seq = |<sai_mrg_context NTs><L NTs><sai_mrg_context NTs>|

    i.e. the central L nucleotides flanked by margin context on both sides.

    Returns a 2xL matrix: row 0 holds acceptor probabilities and row 1 holds
    donor probabilities, each aligned to the middle L nucleotides.
    """
    encoded = one_hot_encode(seq)[None, :]
    # Ensemble prediction: average the outputs of the five SpliceAI models.
    averaged = np.mean([models[i].predict(encoded, verbose=0) for i in range(5)], axis=0)
    # Drop the "no site" class (column 0) and transpose to 2 x L.
    return averaged[0, :, 1:].T
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class Mutation:
    """A single variant parsed from an ID of the form 'GENE:CHROM:POS:REF:ALT'.

    A '-' in REF or ALT denotes an empty allele (pure insertion / deletion).
    Instances sort by genomic start coordinate.
    """

    def __init__(self, mid):
        self.mut_id = mid

        gene, chrom, pos, ref, alt = mid.split(':')
        self.gene = gene
        # Bug fix: the original used chrom.strip('chr'), but str.strip removes
        # ANY of the characters 'c', 'h', 'r' from BOTH ends of the string,
        # which can corrupt unusual contig names. Only a literal leading
        # 'chr' prefix should be dropped.
        self.chrom = chrom[3:] if chrom.startswith('chr') else chrom
        self.start = int(pos)

        # File-system friendly identifiers (':' is not portable in filenames).
        self.file_identifier = self.mut_id.replace(':', '_')
        self.file_identifier_short = f'{self.start}_{ref}_{alt}'

        self.ref = ref if ref != '-' else ''
        self.alt = alt if alt != '-' else ''

        # Classify the variant by allele lengths.
        if len(self.ref) == len(self.alt) == 1:
            self.vartype = 'SNP'
        elif len(self.ref) == len(self.alt) > 1:
            self.vartype = 'SUB'
        elif self.ref and not self.alt:
            self.vartype = 'DEL'
        elif self.alt and not self.ref:
            self.vartype = 'INS'
        else:
            self.vartype = 'INDEL'

    def __str__(self):
        return self.mut_id

    def __repr__(self):
        return f"Mutation({self.mut_id})"

    def __lt__(self, other):
        # Order variants by genomic start coordinate.
        return self.start < other.start
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
class Variations:
    """An ordered set of Mutation objects parsed from a '|'-delimited epistatic-set id."""

    def __init__(self, epistatic_set):
        # Variants are kept sorted by genomic start (Mutation.__lt__).
        self.variants = sorted(Mutation(token) for token in epistatic_set.split('|'))
        self.mut_id = epistatic_set
        self.start = self.variants[0].start
        self.positions = [variant.start for variant in self.variants]
        self.gene = self.variants[0].gene
        self.chrom = self.variants[0].chrom.strip('chr')
        self.file_identifier = f'{self.gene}_{self.chrom}' + '_' + '_'.join(
            [variant.file_identifier_short for variant in self.variants])

    def __str__(self):
        return '|'.join([m.mut_id for m in self.variants])

    def __repr__(self):
        return f"Variation({', '.join([m.mut_id for m in self.variants])})"

    def __iter__(self):
        # Stateful iteration: reset the cursor, then hand back self.
        self.current_index = 0
        return self

    def __next__(self):
        if self.current_index >= len(self.variants):
            raise StopIteration
        variant = self.variants[self.current_index]
        self.current_index += 1
        return variant

    @property
    def file_identifier_json(self):
        # The file identifier with a .json suffix, wrapped as a Path.
        return Path(self.file_identifier + '.json')
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def generate_mut_variant(seq: str, indices: list, mut: Mutation):
    """Splice a single mutation into a (sequence, genomic-index) pair.

    Returns (new_seq, new_indices, applied, consensus_allele).  `applied` is
    False when the mutation lies outside the index range; `consensus_allele`
    is False when the reference allele does not match the genome sequence.
    Inserted bases receive fractional indices (pos + k/1000) so the index
    list stays strictly ordered without colliding with real coordinates.
    """
    # Pure insertions (empty ref) anchor one base to the right of mut.start.
    offset = 1 if not mut.ref else 0

    required_positions = list(range(mut.start, mut.start + len(mut.ref) + offset))
    if not all(position in indices for position in required_positions):
        print(
            f"Mutation {mut} not within transcript bounds: {min(list(filter((-1).__ne__, indices)))} - {max(indices)}.")
        return seq, indices, False, False

    rel_start = indices.index(mut.start) + offset
    rel_end = rel_start + len(mut.ref)

    # Verify the genome actually carries the expected reference allele.
    observed_allele = seq[rel_start:rel_end]
    consensus_allele = observed_allele == mut.ref
    if not consensus_allele:
        print(f'Reference allele does not match genome_build allele. {observed_allele}, {mut.ref}, {mut.start}')

    if len(mut.ref) == len(mut.alt) > 0:
        # Same-length substitution: genomic coordinates carry over directly.
        replacement_indices = list(range(mut.start, mut.start + len(mut.ref)))
    else:
        # Indel: assign fractional sub-indices to the inserted bases.
        replacement_indices = [indices[indices.index(mut.start)] + v / 1000 for v in list(range(1, len(mut.alt) + 1))]

    new_indices = indices[:rel_start] + replacement_indices + indices[rel_end:]
    new_seq = seq[:rel_start] + mut.alt + seq[rel_end:]

    assert len(new_seq) == len(new_indices), f'Error in variant modification: {mut}, {len(new_seq)}, {len(new_indices)}'
    assert is_monotonic(list(filter((-1).__ne__, new_indices))), f'Mut indices are not monotonic.'
    return new_seq, new_indices, True, consensus_allele
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def is_monotonic(A):
    """Return True if A is entirely non-decreasing or entirely non-increasing.

    Empty and single-element sequences are trivially monotonic.  Replaces the
    previous implementation, which copied the list twice and sorted both
    copies (O(n log n), extra memory), with a single O(n) pairwise pass.
    """
    non_decreasing = all(a <= b for a, b in zip(A, A[1:]))
    non_increasing = all(a >= b for a, b in zip(A, A[1:]))
    return non_decreasing or non_increasing
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
class Gene:
    """Gene-level annotation container; loads its transcript dictionary from disk.

    Attributes are populated by load_from_file at construction time via a
    pickled annotation file located by find_files_by_gene_name.
    """

    def __init__(self, gene_name, variation=None):
        self.gene_name = gene_name
        self.gene_id = ''
        self.rev = None
        self.chrm = ''
        self.gene_start = 0
        self.gene_end = 0
        self.transcripts = {}
        self.load_from_file(find_files_by_gene_name(gene_name))
        self.variations = variation

    def __repr__(self):
        return f'Gene(gene_name={self.gene_name})'

    def __len__(self):
        # The number of annotated transcripts.
        return len(self.transcripts)

    def __str__(self):
        return '{gname}, {ntranscripts} transcripts'.format(gname=self.gene_name, ntranscripts=self.__len__())

    def __copy__(self):
        clone = self.__class__.__new__(self.__class__)
        clone.__dict__.update(self.__dict__)
        return clone

    def __deepcopy__(self, memo):
        clone = self.__class__.__new__(self.__class__)
        memo[id(self)] = clone
        for attr, value in self.__dict__.items():
            setattr(clone, attr, deepcopy(value, memo))
        return clone

    def __getitem__(self, index):
        # Positional access into the transcript dictionary.
        return Transcript(list(self.transcripts.values())[index])

    def load_from_file(self, file_name):
        """Populate attributes from a pickled annotation file; raises if missing."""
        if not file_name.exists():
            raise FileNotFoundError(f"File '{file_name}' not found.")
        self.load_from_dict(dict_data=unload_pickle(file_name))
        return self

    def load_from_dict(self, dict_data=None):
        """Copy every key/value pair of dict_data onto this instance."""
        for attr, value in dict_data.items():
            setattr(self, attr, value)
        return self

    def transcript(self, tid):
        """Return a Transcript wrapper for transcript id `tid`."""
        if tid not in self.transcripts:
            raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")
        return Transcript(self.transcripts[tid])

    def run_transcripts(self, primary_transcript=False, protein_coding=False):
        """Yield Transcript objects, optionally filtered to primary / protein-coding ones."""
        for tid, annotations in self.transcripts.items():
            if primary_transcript and not annotations['primary_transcript']:
                continue
            if protein_coding and annotations['transcript_biotype'] != 'protein_coding':
                continue
            yield Transcript(self.transcripts[tid], variations=self.variations)
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
class Transcript:
    """A single transcript isoform: splice structure, sequences, and protein.

    Constructing from a dict immediately arranges the exon boundaries, builds
    the mature mRNA and, for protein-coding transcripts, the ORF and protein.

    Fixes relative to the original:
      * the `introns` property was defined twice (the first definition was
        dead code, silently shadowed by the second) — only one remains;
      * `generate_protein` raised TypeError (`orf[:None + 3]`) when no
        in-frame stop codon exists — that case now keeps the full ORF.
    """

    def __init__(self, d=None, variations=None):
        self.transcript_id = None
        self.transcript_start = None    # transcription start (genomic coordinate)
        self.transcript_end = None      # transcription end (genomic coordinate)
        self.transcript_biotype = None  # metadata, e.g. 'protein_coding'
        self.acceptors, self.donors = [], []  # splice-site boundaries
        self.TIS, self.TTS = None, None       # translation initiation / termination sites
        self.transcript_seq, self.transcript_indices = '', []  # mature mRNA + genomic indices
        self.rev = None   # presumably True on the negative strand — affects sort/complement
        self.chrm = ''    # chromosome name (without 'chr' prefix, per pull_pre_mrna_pos usage)
        self.pre_mrna = ''
        self.orf = ''
        self.protein = ''
        self.log = ''
        self.primary_transcript = None
        self.cons_available = False  # conservation data present and consistent
        self.cons_seq = ''
        self.cons_vector = ''
        self.variations = None
        if variations:
            self.variations = Variations(variations)

        if d:
            self.load_from_dict(d)

        # Reconcile the conservation sequence with its score vector; the stored
        # sequence may carry a trailing '*' stop marker.  Any other mismatch
        # disables conservation-based analysis.
        if self.cons_available:
            if '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector):
                self.cons_seq = self.cons_seq.replace('*', '')
                self.cons_vector = self.cons_vector[:-1]
            elif '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector) + 1:
                self.cons_seq = self.cons_seq.replace('*', '')
            else:
                self.cons_available = False

        if self.transcript_biotype == 'protein_coding':
            self.generate_protein()

    def __repr__(self):
        return 'Transcript(transcript_id={tid})'.format(tid=self.transcript_id)

    def __len__(self):
        return len(self.transcript_seq)

    def __str__(self):
        return 'Transcript {tid}, Transcript Type: ' \
               '{protein_coding}, Primary: {primary}'.format(
            tid=self.transcript_id, protein_coding=self.transcript_biotype.replace('_', ' ').title(),
            primary=self.primary_transcript)

    def __eq__(self, other):
        # Transcripts compare equal when their mature mRNA sequences match.
        return self.transcript_seq == other.transcript_seq

    def __contains__(self, subvalue):
        # Strings test against the mature mRNA; ints test against the indices.
        if isinstance(subvalue, str):
            return subvalue in self.transcript_seq
        elif isinstance(subvalue, int):
            return subvalue in self.transcript_indices
        else:
            print(
                "Pass an integer to check against the span of the gene's coordinates or a string to check against the "
                "pre-mRNA sequence.")
            return False

    def __copy__(self):
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

    def load_from_dict(self, data):
        """Copy annotation keys onto self, then arrange boundaries and build the mRNA."""
        for k, v in data.items():
            setattr(self, k, v)
        self.__arrange_boundaries()
        self.generate_mature_mrna(inplace=True)
        return self

    @property
    def exons(self):
        # (acceptor, donor) pairs in transcript order.
        return list(zip(self.acceptors, self.donors))

    @property
    def introns(self):
        # (donor, acceptor) pairs between consecutive exons; the transcript
        # endpoints themselves are not splice sites and are excluded.
        return list(zip([v for v in self.donors if v != self.transcript_end],
                        [v for v in self.acceptors if v != self.transcript_start]))

    def set_intron_boundaries(self, acceptors=None, donors=None):
        """Replace the splice-site lists and re-arrange boundaries."""
        if acceptors:
            self.acceptors = acceptors
        if donors:
            self.donors = donors
        self.__arrange_boundaries()
        return self

    def __exon_coverage_check(self):
        # True when the summed exon lengths equal the mature mRNA length.
        if sum([abs(a - b) + 1 for a, b in self.exons]) == len(self):
            return True
        else:
            return False

    @property
    def exons_pos(self):
        # Exons expressed on the positive strand, ascending.
        temp = self.exons
        if self.rev:
            temp = [(b, a) for a, b in temp[::-1]]
        return temp

    @property
    def mrna_indices(self):
        # All exonic genomic positions, ordered in transcript direction.
        temp = [lst for lsts in [list(range(a, b + 1)) for a, b in self.exons_pos] for lst in lsts]
        return sorted(temp, reverse=self.rev)

    @property
    def exonic_indices(self):
        # All exonic genomic positions in positive-strand order.
        return [lst for lsts in [list(range(a, b + 1)) for a, b in self.exons_pos] for lst in lsts]

    def __arrange_boundaries(self):
        # Ensure the transcript endpoints are included, dedupe, and sort in
        # transcript direction (descending for negative-strand transcripts).
        self.acceptors.append(self.transcript_start)
        self.donors.append(self.transcript_end)
        self.acceptors = list(set(self.acceptors))
        self.donors = list(set(self.donors))
        self.acceptors.sort(reverse=self.rev)
        self.donors.sort(reverse=self.rev)
        return self

    def positive_strand(self):
        """Return the mature mRNA as a positive-strand sequence."""
        if self.rev:
            return reverse_complement(self.transcript_seq)
        else:
            return self.transcript_seq

    def __pos2sense(self, mrna, indices):
        # Convert a positive-strand (seq, indices) pair to sense orientation.
        if self.rev:
            mrna = reverse_complement(mrna)
            indices = indices[::-1]
        return mrna, indices

    def pull_pre_mrna_pos(self):
        """Read the positive-strand pre-mRNA segment from the chromosome FASTA."""
        fasta_obj = Fasta_segment()
        if self.rev:
            return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta',
                                                    self.transcript_end,
                                                    self.transcript_start)
        else:
            return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta',
                                                    self.transcript_start,
                                                    self.transcript_end)

    def generate_pre_mrna_pos(self):
        """Positive-strand pre-mRNA with any variations applied."""
        seq, indices = self.pull_pre_mrna_pos()
        if self.variations:
            for mutation in self.variations.variants:
                seq, indices, _, _ = generate_mut_variant(seq, indices, mut=mutation)
        self.pre_mrna, _ = self.__pos2sense(seq, indices)
        return seq, indices

    def generate_pre_mrna(self, inplace=True):
        """Build the sense-orientation pre-mRNA; returns self or (seq, indices)."""
        pre_mrna, pre_indices = self.__pos2sense(*self.generate_pre_mrna_pos())
        self.pre_mrna = pre_mrna
        if inplace:
            return self
        return pre_mrna, pre_indices

    def generate_mature_mrna_pos(self):
        """Concatenate the exonic segments of the positive-strand pre-mRNA."""
        mature_mrna, mature_indices = '', []
        pre_seq, pre_indices = self.generate_pre_mrna_pos()
        for i, j in self.exons_pos:
            rel_start, rel_end = pre_indices.index(i), pre_indices.index(j)
            mature_mrna += pre_seq[rel_start:rel_end + 1]
            mature_indices.extend(pre_indices[rel_start:rel_end + 1])
        return mature_mrna, mature_indices

    def generate_mature_mrna(self, inplace=True):
        """Build the sense-orientation mature mRNA; returns self or (seq, indices)."""
        if inplace:
            self.transcript_seq, self.transcript_indices = self.__pos2sense(*self.generate_mature_mrna_pos())
            return self
        return self.__pos2sense(*self.generate_mature_mrna_pos())

    def generate_protein(self, inplace=True, regenerate_mrna=True):
        """Derive the ORF and protein from the mature mRNA.

        Returns self (inplace=True) or the protein string; returns '' when the
        TIS is missing or absent from the transcript (e.g. spliced out).
        """
        if regenerate_mrna:
            self.generate_mature_mrna()

        if not self.TIS or self.TIS not in self.transcript_indices:
            return ''

        rel_start = self.transcript_indices.index(self.TIS)
        orf = self.transcript_seq[rel_start:]
        first_stop_index = next((i for i in range(0, len(orf) - 2, 3) if orf[i:i + 3] in {"TAG", "TAA", "TGA"}), None)
        # Bug fix: when no in-frame stop codon exists, first_stop_index is
        # None and the original `orf[:first_stop_index + 3]` raised TypeError.
        # Keep the full open reading frame in that case.
        if first_stop_index is not None:
            orf = orf[:first_stop_index + 3]
        protein = str(Seq(orf).translate()).replace('*', '')
        if inplace:
            self.orf = orf
            self.protein = protein
            # A regenerated protein that no longer matches the stored
            # conservation sequence invalidates the conservation data.
            if self.protein != self.cons_seq:
                self.cons_available = False
            return self
        return protein
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
def develop_aberrant_splicing(transcript, aberrant_splicing):
    """Enumerate plausible exon chains given predicted splice-site changes.

    Builds a DAG whose nodes are splice sites (annotated sites at probability
    1, plus missed/discovered sites at their predicted probability) and whose
    edge weights approximate the chance of joining two compatible sites.
    Every simple path from transcript start to end is weighed by the product
    of its edge weights; paths carrying more than 1% of the normalized
    probability mass are returned as {'acceptors', 'donors', 'path_weight'}.
    """
    # Candidate exon starts: annotated acceptors plus missed/discovered ones.
    exon_starts = {site: 1 for site in transcript.acceptors}
    exon_starts.update({transcript.transcript_start: 1})
    exon_starts.update({s: v['absolute'] for s, v in aberrant_splicing['missed_acceptors'].items()})
    exon_starts.update({s: v['absolute'] for s, v in aberrant_splicing['discovered_acceptors'].items()})

    # Candidate exon ends: annotated donors plus missed/discovered ones.
    exon_ends = {site: 1 for site in transcript.donors}
    exon_ends.update({transcript.transcript_end: 1})
    exon_ends.update({s: v['absolute'] for s, v in aberrant_splicing['missed_donors'].items()})
    exon_ends.update({s: v['absolute'] for s, v in aberrant_splicing['discovered_donors'].items()})

    # ss_type 0 = exon end, ss_type 1 = exon start.
    nodes = [SpliceSite(pos=pos, ss_type=0, prob=prob) for pos, prob in exon_ends.items()] + \
            [SpliceSite(pos=pos, ss_type=1, prob=prob) for pos, prob in exon_starts.items()]
    nodes = [site for site in nodes if site.prob > 0]
    nodes.sort(key=lambda site: site.pos, reverse=transcript.rev)

    G = nx.DiGraph()
    G.add_nodes_from([site.pos for site in nodes])

    # Connect each site to compatible (opposite-type) downstream sites,
    # discounting each successive candidate by the probability already
    # consumed by nearer candidates once the type pattern "spreads".
    for i in range(len(nodes)):
        trailing_prob, types_seen = 0, []
        for j in range(i + 1, len(nodes)):
            curr_node, next_node = nodes[i], nodes[j]
            spread = curr_node.ss_type in types_seen
            types_seen.append(next_node.ss_type)
            if curr_node.ss_type != next_node.ss_type:
                if spread:
                    new_prob = next_node.prob - trailing_prob
                    if new_prob <= 0:
                        break
                    G.add_edge(curr_node.pos, next_node.pos)
                    G.edges[curr_node.pos, next_node.pos]['weight'] = new_prob
                    trailing_prob += next_node.prob
                else:
                    G.add_edge(curr_node.pos, next_node.pos)
                    G.edges[curr_node.pos, next_node.pos]['weight'] = next_node.prob
                    trailing_prob += next_node.prob

    # Weigh every start-to-end path by the product of its edge weights.
    new_paths, prob_sum = {}, 0
    for i, path in enumerate(nx.all_simple_paths(G, transcript.transcript_start, transcript.transcript_end)):
        curr_prob = path_weight_mult(G, path, 'weight')
        prob_sum += curr_prob
        new_paths[i] = {
            'acceptors': sorted([p for p in path if p in exon_starts.keys() and p != transcript.transcript_start],
                                reverse=transcript.rev),
            'donors': sorted([p for p in path if p in exon_ends.keys() and p != transcript.transcript_end],
                             reverse=transcript.rev),
            'path_weight': curr_prob}

    # Normalize and keep only paths above the 1% mass threshold.
    for i, d in new_paths.items():
        d['path_weight'] = round(d['path_weight'] / prob_sum, 3)
    new_paths = {k: v for k, v in new_paths.items() if v['path_weight'] > 0.01}
    return list(new_paths.values())
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
def path_weight_mult(G, path, weight):
    """Multiply the `weight` attribute along `path` in G.

    Multiplicative analogue of networkx's additive path weighing; for
    multigraphs the cheapest parallel edge is used at each hop.  Raises
    nx.NetworkXNoPath when `path` is not a valid path in G.
    """
    is_multi = G.is_multigraph()
    cost = 1
    if not nx.is_path(G, path):
        raise nx.NetworkXNoPath("path does not exist")
    for node, nbr in nx.utils.pairwise(path):
        edge_factor = (min(v[weight] for v in G[node][nbr].values())
                       if is_multi else G[node][nbr][weight])
        cost *= edge_factor
    return cost
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
@dataclass
class SpliceSite(object):
    """A candidate splice site.

    pos     -- genomic position of the site
    ss_type -- 0 for exon ends, 1 for exon starts (see develop_aberrant_splicing)
    prob    -- probability/confidence assigned to the site
    """
    pos: int
    ss_type: int
    prob: float

    def __post_init__(self):
        pass

    def __lt__(self, other):
        # Order splice sites by genomic position.
        return self.pos < other.pos

    def __str__(self):
        # Bug fix: the original __str__ print()ed the text and implicitly
        # returned None, so str(site) raised TypeError.  Return the string.
        return f"({self.ss_type}, {self.pos}, {self.prob})"
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
def run_spliceai_seq(seq, indices, rev):
    """Score a raw sequence with SpliceAI and keep high-confidence sites.

    The sequence is padded with 5000 'N's on each side (the SpliceAI margin
    context).  Returns two dicts mapping index -> probability for acceptor
    and donor sites whose probability exceeds 0.75.
    NOTE(review): `rev` is accepted but unused — kept for interface
    compatibility with callers.
    """
    padded = 'N' * 5000 + seq + 'N' * 5000

    probs = sai_predict_probs(padded, sai_models)
    acceptor_probs, donor_probs = probs[0, :], probs[1, :]

    acceptor_indices = {pos: p for pos, p in list(zip(indices, acceptor_probs)) if p > 0.75}
    donor_indices = {pos: p for pos, p in list(zip(indices, donor_probs)) if p > 0.75}
    return acceptor_indices, donor_indices
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
def run_spliceai_transcript(mutations, gene_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
    """Compare SpliceAI probabilities between reference and mutated sequence.

    Scores a genomic window around the mutation cluster on both the reference
    and the variant sequence, then reports splice sites whose probability
    changed beyond `sai_threshold`.  Returns a dict with keys
    'missed_acceptors', 'missed_donors', 'discovered_acceptors' and
    'discovered_donors', each mapping genomic position -> change record.
    """
    positions = mutations.positions
    seq_start_pos = min(positions) - sai_mrg_context - min_coverage
    seq_end_pos = max(positions) + sai_mrg_context + min_coverage

    # Pull the reference window around the mutation cluster.
    fasta_obj = Fasta_segment()
    ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
        config_setup['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
        seq_start_pos,
        seq_end_pos)

    gene_start, gene_end, rev = gene_data.transcript_start, gene_data.transcript_end, gene_data.rev

    mrna_acceptors = sorted(gene_data.acceptors)
    mrna_donors = sorted(gene_data.donors)

    # Annotated sites that fall inside the pulled window.
    visible_donors = np.intersect1d(mrna_donors, ref_indices)
    visible_acceptors = np.intersect1d(mrna_acceptors, ref_indices)

    # Mask everything outside the transcript body with 'N' / index -1 so the
    # model does not call sites beyond the gene boundaries.
    start_pad = ref_indices.index(gene_start) if gene_start in ref_indices else 0
    end_cutoff = ref_indices.index(gene_end) if gene_end in ref_indices else len(ref_indices)
    end_pad = len(ref_indices) - end_cutoff
    ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
    ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad

    # Apply each mutation to derive the variant sequence.
    mut_seq, mut_indices = ref_seq, ref_indices
    for mut in mutations:
        mut_seq, mut_indices, _, _ = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)

    # Trim the margin context from the index lists; the model consumes the
    # corresponding sequence margin itself.
    ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
    mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]

    if rev:
        # Negative strand: score the reverse complement, flip the indices.
        ref_seq = reverse_complement(ref_seq)
        mut_seq = reverse_complement(mut_seq)
        ref_indices = ref_indices[::-1]
        mut_indices = mut_indices[::-1]

    ref_probs = sai_predict_probs(ref_seq, sai_models)
    mut_probs = sai_predict_probs(mut_seq, sai_models)

    ref_acceptor_probs, ref_donor_probs = ref_probs[0, :], ref_probs[1, :]
    mut_acceptor_probs, mut_donor_probs = mut_probs[0, :], mut_probs[1, :]

    assert len(ref_indices) == len(ref_acceptor_probs), 'Reference pos not the same'
    assert len(mut_indices) == len(mut_acceptor_probs), 'Mut pos not the same'

    iap, dap = find_ss_changes(dict(zip(ref_indices, ref_acceptor_probs)),
                               dict(zip(mut_indices, mut_acceptor_probs)),
                               visible_acceptors,
                               threshold=sai_threshold)

    assert len(ref_indices) == len(ref_donor_probs), 'Reference pos not the same'
    assert len(mut_indices) == len(mut_donor_probs), 'Mut pos not the same'

    idp, ddp = find_ss_changes(dict(zip(ref_indices, ref_donor_probs)),
                               dict(zip(mut_indices, mut_donor_probs)),
                               visible_donors,
                               threshold=sai_threshold)

    missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
    # Normalize keys to floats, then collapse whole-number keys back to ints.
    missplicing = {outer: {float(k): v for k, v in inner.items()} for outer, inner in missplicing.items()}
    return {outer: {int(k) if k.is_integer() else k: v for k, v in inner.items()}
            for outer, inner in missplicing.items()}
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
    """Assemble a flat report comparing a reference transcript with its variant.

    Combines transcript metadata, reference/variant sequences, splice-site
    proximity of the mutation, and a summary of mis-splicing events into a
    single dict suitable for tabulation.
    """
    affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(
        mut, reference_transcript)

    # Dict-literal assembly preserves the original key insertion order.
    report = {
        'primary_transcript': reference_transcript.primary_transcript,
        'transcript_id': reference_transcript.transcript_id,
        'mut_id': mut.mut_id,
        'cons_available': int(reference_transcript.cons_available),
        'protein_coding': reference_transcript.transcript_biotype,
        'reference_mrna': reference_transcript.transcript_seq,
        'reference_cds_start': reference_transcript.TIS,
        'reference_pre_mrna': reference_transcript.pre_mrna,
        'reference_orf': reference_transcript.orf,
        'reference_protein': reference_transcript.protein,
        'reference_protein_length': len(reference_transcript.protein),
        'variant_mrna': variant_transcript.transcript_seq,
        'variant_cds_start': variant_transcript.TIS,
        'variant_pre_mrna': variant_transcript.pre_mrna,
        'variant_orf': variant_transcript.orf,
        'variant_protein': variant_transcript.protein,
        'variant_protein_length': len(variant_transcript.protein),
    }

    descriptions = define_missplicing_events(reference_transcript.exons, variant_transcript.exons,
                                             reference_transcript.rev)
    report['exon_changes'] = '|'.join([v for v in descriptions if v])
    report['splicing_codes'] = summarize_missplicing_event(*descriptions)
    report['affected_exon'] = affected_exon
    report['affected_intron'] = affected_intron
    report['mutation_distance_from_5'] = distance_from_5
    report['mutation_distance_from_3'] = distance_from_3
    return report
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
def find_splice_site_proximity(mut, transcript):
    """Locate the exon/intron containing mut.start and its boundary distances.

    Returns (affected_exon, affected_intron, distance_from_5, distance_from_3)
    with 1-based exon/intron numbering; each element is None when the mutation
    lies outside the corresponding features.
    """
    affected_exon = affected_intron = distance_from_5 = distance_from_3 = None

    for number, (ex_start, ex_end) in enumerate(transcript.exons, start=1):
        lo, hi = min(ex_start, ex_end), max(ex_start, ex_end)
        if lo <= mut.start <= hi:
            affected_exon = number
            distance_from_5 = abs(mut.start - ex_start)
            distance_from_3 = abs(mut.start - ex_end)

    for number, (in_start, in_end) in enumerate(transcript.introns, start=1):
        lo, hi = min(in_start, in_end), max(in_start, in_end)
        if lo <= mut.start <= hi:
            affected_intron = number
            # For introns the 5' distance is measured from the downstream end.
            distance_from_5 = abs(mut.start - in_end)
            distance_from_3 = abs(mut.start - in_start)

    return affected_exon, affected_intron, distance_from_5, distance_from_3
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
def define_missplicing_events(ref_exons, var_exons, rev):
    """Describe how a variant exon layout deviates from the reference.

    Compares reference and variant exon boundary lists (introns are derived
    as the gaps between consecutive exons) and produces five comma-joined
    human-readable description strings.

    :param ref_exons: list of (start, end) exon pairs for the reference.
    :param var_exons: list of (start, end) exon pairs for the variant.
    :param rev: True when the transcript is on the reverse strand, which
        flips the direction of the truncation/retention comparisons.
    :return: tuple of five strings: (partial_exon_skipping,
        partial_intron_retention, exon_skipping, novel_exons,
        intron_retention); each is '' when no such event is found.
    """
    ref_introns = [(ref_exons[i][1], ref_exons[i + 1][0]) for i in range(len(ref_exons) - 1)]
    var_introns = [(var_exons[i][1], var_exons[i + 1][0]) for i in range(len(var_exons) - 1)]
    num_ref_exons = len(ref_exons)
    num_ref_introns = len(ref_introns)

    # One boundary matches exactly while the other has moved "inwards";
    # the inequality direction depends on the strand.
    if rev:
        def _is_partial(s1, s2, t1, t2):
            return (s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2)
    else:
        def _is_partial(s1, s2, t1, t2):
            return (s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)

    pes_parts = []
    for s1, s2 in var_exons:
        for exon_count, (t1, t2) in enumerate(ref_exons):
            if _is_partial(s1, s2, t1, t2):
                pes_parts.append(
                    f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}')
    partial_exon_skipping = ','.join(pes_parts)

    pir_parts = []
    for s1, s2 in var_introns:
        for intron_count, (t1, t2) in enumerate(ref_introns):
            if _is_partial(s1, s2, t1, t2):
                pir_parts.append(
                    f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}')
    partial_intron_retention = ','.join(pir_parts)

    # Full events: neither boundary of the reference feature survives.
    var_exon_starts = [s1 for s1, s2 in var_exons]
    var_exon_ends = [s2 for s1, s2 in var_exons]
    exon_skipping = ','.join(
        f'Exon {exon_count + 1}/{num_ref_exons} skipped: {(t1, t2)}'
        for exon_count, (t1, t2) in enumerate(ref_exons)
        if t1 not in var_exon_starts and t2 not in var_exon_ends)

    ref_exon_starts = [s1 for s1, s2 in ref_exons]
    ref_exon_ends = [s2 for s1, s2 in ref_exons]
    novel_exons = ','.join(
        f'Novel Exon: {(t1, t2)}'
        for (t1, t2) in var_exons
        if t1 not in ref_exon_starts and t2 not in ref_exon_ends)

    var_intron_starts = [s1 for s1, s2 in var_introns]
    var_intron_ends = [s2 for s1, s2 in var_introns]
    intron_retention = ','.join(
        f'Intron {intron_count + 1}/{num_ref_introns} retained: {(t1, t2)}'
        for intron_count, (t1, t2) in enumerate(ref_introns)
        if t1 not in var_intron_starts and t2 not in var_intron_ends)

    return partial_exon_skipping, partial_intron_retention, exon_skipping, novel_exons, intron_retention
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
def summarize_missplicing_event(pes, pir, es, ne, ir):
    """Collapse the five mis-splicing description strings into short codes.

    Each truthy (non-empty) description contributes its code, collected in
    the fixed order PES, ES, PIR, IR, NE.

    :return: a list of codes when more than one event applies, a single
        code string when exactly one applies, and '-' when none do.
    """
    codes = [code
             for description, code in ((pes, 'PES'), (es, 'ES'), (pir, 'PIR'), (ir, 'IR'), (ne, 'NE'))
             if description]
    if len(codes) > 1:
        return codes
    if codes:
        return codes[0]
    return '-'
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
def find_continuous_gaps(sequence):
    """Return (start, end) spans of every maximal run of '-' in *sequence*."""
    return [match.span() for match in re.finditer(r'-+', sequence)]
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
def get_logical_alignment(ref_prot, var_prot):
    """
    Globally align two protein sequences and pick the alignment whose total
    gapped length is smallest (ties broken by alignment order).

    Parameters:
    ref_prot (str): Reference protein sequence.
    var_prot (str): Variant protein sequence.

    Returns:
    The optimal pairwise2 alignment object.
    """
    # Global alignment: match +1, mismatch -1, gap open -3, gap extend 0.
    candidates = pairwise2.align.globalms(ref_prot, var_prot, 1, -1, -3, 0,
                                          penalize_end_gaps=(True, True))

    if len(candidates) <= 1:
        return candidates[0]

    def _total_gap_length(alignment):
        # Sum of the lengths of all continuous gap runs in both rows.
        spans = find_continuous_gaps(alignment.seqA) + find_continuous_gaps(alignment.seqB)
        return sum(end - start for start, end in spans)

    # min() returns the first candidate with the smallest gap total,
    # matching the original index-of-min selection.
    return min(candidates, key=_total_gap_length)
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
def find_indels_with_mismatches_as_deletions(seqA, seqB):
    """
    Identify insertions and deletions in aligned sequences, treating
    mismatched columns as deletions from seqA.

    Parameters:
    seqA, seqB (str): Aligned, equal-length (gapped) sequences.

    Returns:
    tuple: (deletions, insertions) — dicts mapping a 1-based residue
    position (in seqA / seqB respectively) to the deleted / inserted
    subsequence.

    Raises:
    ValueError: If the two sequences differ in length.
    """
    if len(seqA) != len(seqB):
        raise ValueError("Sequences must be of the same length")

    def _gap_spans(s):
        # Maximal runs of '-' as (start, end) spans (inlined gap scan).
        return [(m.start(), m.end()) for m in re.finditer(r'-+', s)]

    # Map each alignment column to the 1-based count of residues seen so
    # far in the respective sequence (columns on gaps keep the last count).
    residue_pos_A = {}
    seen = 0
    for col, ch in enumerate(seqA):
        if ch != '-':
            seen += 1
        residue_pos_A[col] = seen

    residue_pos_B = {}
    seen = 0
    for col, ch in enumerate(seqB):
        if ch != '-':
            seen += 1
        residue_pos_B[col] = seen

    arr_a = np.array(list(seqA))
    arr_b = np.array(list(seqB))

    # Columns where both rows hold residues that disagree are recast as
    # gaps in seqB, so they register as deletions from seqA.
    mismatch_mask = (arr_a != arr_b) & (arr_a != '-') & (arr_b != '-')
    arr_b[mismatch_mask] = '-'
    masked_seqB = ''.join(arr_b)

    insertions = {residue_pos_B[start]: masked_seqB[start:end].replace('-', '')
                  for start, end in _gap_spans(seqA)
                  if seqB[start:end].strip('-')}
    deletions = {residue_pos_A[start]: seqA[start:end].replace('-', '')
                 for start, end in _gap_spans(masked_seqB)
                 if seqA[start:end].strip('-')}
    return deletions, insertions
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
def parabolic_window(window_size):
    """Create a parabolic window with a peak at the center.

    Values range from 0.1 at the edges up to 1.0 at the midpoint.
    """
    samples = np.linspace(-1.0, 1.0, window_size)
    return 0.1 + 0.9 * (1.0 - samples ** 2)
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
# def calculate_window_size(conservation_vector_length):
|
|
786
|
-
# return int(9 + (51 - 9) * (1 - np.exp(-0.0005 * conservation_vector_length)))
|
|
787
|
-
#
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
def transform_conservation_vector(conservation_vector):
    """
    Smooth a 1D conservation vector and map it through a decaying
    exponential.

    The vector is convolved with a parabolic window (inlined here; same
    shape as ``parabolic_window``), normalized by the kernel mass, and
    then transformed as exp(-smoothed * factor), so highly conserved
    (large) inputs map to values near 0.

    Args:
        conservation_vector (numpy.ndarray): Input 1D vector of
            conservation values.

    Returns:
        numpy.ndarray: The transformed vector (same length as the input).
    """
    window = 13
    factor = 4
    # Parabolic kernel: 1.0 at the center tapering to 0.1 at the edges.
    x = np.linspace(-1, 1, window)
    kernel = 0.9 * (1 - x ** 2) + 0.1
    smoothed = np.convolve(conservation_vector, kernel, mode='same') / np.sum(kernel)
    return np.exp(-smoothed * factor)
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
    """
    Build a per-residue modification-weight vector for a sequence given
    deletions and insertions.

    Deleted stretches are marked with weight 1.0.  Each insertion point
    contributes a triangular gradient peaking at 1.0 at the insertion site
    and decaying linearly over at most ``reach_limit`` residues per side.

    :param sequence_length: Length of the (reference) sequence.
    :param deletions: Dict mapping 1-based position -> deleted subsequence.
    :param insertions: Dict mapping 1-based position -> inserted subsequence.
    :param reach_limit: Cap on how many flanking residues an insertion affects.
    :return: numpy array of length ``sequence_length`` with weights in [0, 1].
        NOTE(review): despite the historical name "unmodified_positions",
        nonzero entries mark *modified* positions — verify against callers.
    """
    modified_positions = np.zeros(sequence_length, dtype=float)

    for pos, insertion in insertions.items():
        # Bug fix: an insertion recorded at/after the end of the sequence
        # previously made the backward gradient length negative, so
        # np.linspace raised ValueError and the old except-handler then
        # crashed itself (unbound `combined_gradient`, np.zeros(<0)).
        # Clamping the position keeps every length non-negative.
        pos = min(pos, sequence_length - 1)

        reach = min(len(insertion) // 2, reach_limit)
        front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
        len_start, len_end = pos - front_end, back_end - pos
        # Rising ramp up to the insertion point, 1.0 at the point itself,
        # then a falling ramp; total length matches the target slice.
        gradient_front = np.linspace(0, 1, len_start, endpoint=False)
        gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
        combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
        modified_positions[front_end:back_end + 1] = combined_gradient

    for pos, deletion in deletions.items():
        # Deleted residues are fully modified.
        modified_positions[pos:pos + len(deletion)] = 1

    return modified_positions
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
def calculate_penalty(domains, cons_scores, W, is_insertion=False):
    """
    Score the impact of mutations (insertions or deletions) against a
    conservation-score vector.

    :param domains: Dict mapping position -> mutated (inserted/deleted)
        subsequence.
    :param cons_scores: Conservation scores (array-like of floats).
    :param W: Window size used to weight long mutations.
    :param is_insertion: True to score insertions, False for deletions.
    :return: numpy array of penalties, same length as ``cons_scores``.
    """
    penalty = np.zeros(len(cons_scores))
    for position, subsequence in domains.items():
        length = len(subsequence)
        # Mutations longer than the window are up-weighted proportionally.
        weight = max(1.0, length / W)

        if is_insertion:
            # Insertions penalize a symmetric window around the insertion
            # point, capped at half the window size.
            half = min(W // 2, length // 2)
            region = slice(position - half, position + half)
        else:
            # Deletions penalize exactly the deleted span.
            region = slice(position, position + length)
        penalty[region] = weight * cons_scores[region]

    return penalty
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
def calculate_legacy_oncosplice_score(deletions, insertions, cons_vec, W):
    """
    Legacy Oncosplice score: the worst W-wide window of combined
    insertion/deletion penalties over a smoothed conservation vector.

    :param deletions: Dict mapping position -> deleted subsequence.
    :param insertions: Dict mapping position -> inserted subsequence.
    :param cons_vec: Conservation vector.
    :param W: Window size.
    :return: Legacy Oncosplice score (float).
    """
    # Smooth, then invert: highly conserved stretches get small weights.
    smoothed = np.exp(np.negative(moving_average_conv(cons_vec, W, 2)))
    total_penalty = (calculate_penalty(deletions, smoothed, W, is_insertion=False)
                     + calculate_penalty(insertions, smoothed, W, is_insertion=True))
    # Maximum penalty mass found in any W-wide sliding window.
    return np.max(np.convolve(total_penalty, np.ones(W), mode='same'))
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
def moving_average_conv(vector, window_size, factor=1):
    """
    Calculate the moving average of a vector via convolution.

    Parameters:
        vector (iterable): Input vector (list, tuple, numpy array).
        window_size (int): Size of the convolution window; must be a
            positive integer no larger than len(vector).
        factor (float): NOTE(review) — validated (must be non-zero) but
            not used in the computation; kept for interface compatibility.

    Returns:
        numpy.ndarray: Moving average, same length as the input.

    Raises:
        TypeError: If ``vector`` is not a list, tuple, or numpy array.
        ValueError: If ``window_size`` or ``factor`` is invalid.
    """
    if not isinstance(vector, (list, tuple, np.ndarray)):
        raise TypeError("vector must be a list, tuple, or numpy array")
    if not isinstance(window_size, int) or window_size <= 0:
        raise ValueError("window_size must be a positive integer")
    if len(vector) < window_size:
        raise ValueError("window_size must not be greater than the length of vector")
    if factor == 0:
        raise ValueError("factor must not be zero")

    kernel = np.ones(window_size)
    return np.convolve(vector, kernel, mode='same') / window_size
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False):
    """Run the Oncosplice pipeline for a single mutation.

    For every transcript of the mutated gene, predicts aberrant splicing,
    builds each candidate variant isoform, aligns its protein to the
    reference protein, and scores the conservation impact of the
    resulting indels.

    :param mut_id: mutation identifier string understood by ``Variations``
        (presumably 'GENE:chrom:pos:ref:alt' — confirm against callers).
    :param sai_threshold: SpliceAI score threshold passed to
        ``run_spliceai_transcript``.
    :param protein_coding: restrict to protein-coding transcripts.
    :param primary_transcript: restrict to the primary transcript.
    :return: pandas DataFrame with one row per (transcript, isoform).
    """
    mutation = Variations(mut_id)
    # Same gene twice: once unmodified, once with the mutation applied.
    reference_gene = Gene(mutation.gene)
    mutated_gene = Gene(mutation.gene, mut_id)

    results = []
    for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
        reference = reference_gene.transcript(variant.transcript_id)
        # NOTE(review): cons_vector is only assigned when conservation data
        # is available; the loop below uses it unconditionally, so a
        # transcript without conservation would raise (or silently reuse
        # the previous transcript's vector) — confirm intended behavior.
        if reference.cons_available:
            cons_vector = transform_conservation_vector(reference.cons_vector)

        missplicing = run_spliceai_transcript(mutation, reference, sai_threshold=sai_threshold)
        # Each predicted splice-site combination yields one candidate isoform.
        for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
            variant_isoform = deepcopy(variant)
            variant_isoform.set_intron_boundaries(acceptors=new_boundaries['acceptors'],
                                                  donors=new_boundaries['donors']).generate_protein()
            # Align reference vs isoform protein and extract indels.
            alignment = get_logical_alignment(reference.protein, variant_isoform.protein)
            deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
            modified_positions = find_modified_positions(len(cons_vector), deleted, inserted)
            # Score: max 11-wide window of conservation mass at modified sites.
            temp_cons = np.convolve(cons_vector * modified_positions, np.ones(11))
            affected_cons_scores = max(temp_cons)
            # Rank that score against all windows of the unmasked vector.
            temp_cons = np.convolve(cons_vector, np.ones(11))
            percentile = (
                sorted(temp_cons).index(next(x for x in sorted(temp_cons) if x >= affected_cons_scores)) / len(
                    temp_cons))

            # OncospliceAnnotator is assumed to return a dict-like report
            # that the keys below extend — confirm against its definition.
            report = OncospliceAnnotator(reference, variant_isoform, mutation)
            report['original_cons'] = reference.cons_vector
            report['oncosplice_score'] = affected_cons_scores
            report['percentile'] = percentile
            report['modified_positions'] = modified_positions
            report['cons_vector'] = cons_vector
            report['isoform_id'] = i
            report['isoform_prevalence'] = new_boundaries['path_weight']
            report['full_missplicing'] = missplicing
            results.append(report)

    report = pd.DataFrame(results)
    return report
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
# import numpy as np
|
|
957
|
-
# import pandas as pd
|
|
958
|
-
# from Bio import pairwise2
|
|
959
|
-
# import re
|
|
960
|
-
# from copy import deepcopy
|
|
961
|
-
# from geney.splicing import PredictSpliceAI
|
|
962
|
-
# from .Gene import Gene, Transcript
|
|
963
|
-
# from geney.mutations.variant_utils import Variations, develop_aberrant_splicing
|
|
964
|
-
#
|
|
965
|
-
# sample_mut_id = 'KRAS:12:25227343:G:T'
|
|
966
|
-
# sample_epistasis_id = 'KRAS:12:25227343:G:T|KRAS:13:25227344:A:T'
|
|
967
|
-
#
|
|
968
|
-
# def oncosplice(mutation: str, sai_threshold=0.25, annotate=False) -> pd.DataFrame:
|
|
969
|
-
# '''
|
|
970
|
-
# :param mutation: str
|
|
971
|
-
# the genomic variation
|
|
972
|
-
# :param sai_threshold: float
|
|
973
|
-
# the threshold for including missplicing predictions in gene builds
|
|
974
|
-
# :param prevalence_threshold: float
|
|
975
|
-
# the minimum threshold needed to consider a predicted isoform as valid
|
|
976
|
-
# :param target_directory: pathlib.Path
|
|
977
|
-
# the directory on the machine where the mrna annotation files are stored
|
|
978
|
-
# :return: a dataframe object
|
|
979
|
-
# will contain columns pertinant to assessing mutation pathogenicity including pipelines score, GOF score, legacy pipelines score, missplicing,
|
|
980
|
-
# '''
|
|
981
|
-
#
|
|
982
|
-
# mutation = Variations(mutation) # Generate mutation object
|
|
983
|
-
# # Gene annotations should be available in the target directory under the file name mrna_gene.json
|
|
984
|
-
# gene = Gene(mutation.gene) # We obtain the annotation file and convert it into a Gene object
|
|
985
|
-
# # aberrant_splicing = PredictSpliceAI(mutation, gene, threshold=sai_threshold) # SpliceAI predictions are processed and obtained for each mutation
|
|
986
|
-
# # Oncosplice obtains predictions for each transcript in the annotation file
|
|
987
|
-
#
|
|
988
|
-
# results = []
|
|
989
|
-
# for reference_transcript in gene:
|
|
990
|
-
# aberrant_splicing = PredictSpliceAI(mutation, reference_transcript, threshold=sai_threshold)
|
|
991
|
-
# for i, new_boundaries in develop_aberrant_splicing(reference_transcript, aberrant_splicing.aberrant_splicing):
|
|
992
|
-
# res_in = oncosplice_transcript(reference_transcript=reference_transcript.generate_protein(), mutation=mutation, aberrant_splicing=aberrant_splicing, annotate=annotate, plot_term=plot_term)
|
|
993
|
-
# results.append(res_in)
|
|
994
|
-
#
|
|
995
|
-
# if len(results) > 0:
|
|
996
|
-
# results = pd.concat(results)
|
|
997
|
-
# else:
|
|
998
|
-
# return None
|
|
999
|
-
#
|
|
1000
|
-
# # Append some additional, uniform information to the results dataframe
|
|
1001
|
-
# results['mut_id'] = mutation.mut_id
|
|
1002
|
-
# results['missplicing'] = aberrant_splicing.get_max_missplicing_delta()
|
|
1003
|
-
# results['gene'] = mutation.gene
|
|
1004
|
-
# return results
|
|
1005
|
-
#
|
|
1006
|
-
#
|
|
1007
|
-
# def oncosplice_transcript(reference_transcript: Transcript, mutation: Variations, aberrant_splicing: PredictSpliceAI, annotate=False, plot_term=False) -> pd.DataFrame:
|
|
1008
|
-
# reports = []
|
|
1009
|
-
# if reference_transcript.cons_available:
|
|
1010
|
-
# cons_available, cons_array, cons_vector = True, transform_conservation_vector(reference_transcript.cons_vector), reference_transcript.cons_vector
|
|
1011
|
-
#
|
|
1012
|
-
# else:
|
|
1013
|
-
# cons_available, cons_array, cons_vector = False, transform_conservation_vector(np.ones(len(reference_transcript.protein), dtype=float)), np.ones(len(reference_transcript.protein), dtype=float)
|
|
1014
|
-
#
|
|
1015
|
-
# # For each transcript, we generate a series of isoforms based on the splice site predictions; each isoform is assigned a prevalence score
|
|
1016
|
-
# # obtained using simple graph theory where the probability of the edges taken to generate the isoform are multiplied together
|
|
1017
|
-
# for i, new_boundaries in enumerate(develop_aberrant_splicing(reference_transcript, aberrant_splicing.aberrant_splicing)):
|
|
1018
|
-
#
|
|
1019
|
-
# # The variant transcript is duplicated from the reference transcript and all needed modifications are performed
|
|
1020
|
-
# variant_transcript = Transcript(deepcopy(reference_transcript).__dict__).set_exons(new_boundaries).generate_mature_mrna(mutations=mutation.mut_id.split('|'), inplace=True).generate_translational_boundaries().generate_protein()
|
|
1021
|
-
#
|
|
1022
|
-
# # The optimal alignment that minimizes gaps between the trnascripts is obtained
|
|
1023
|
-
# alignment = get_logical_alignment(reference_transcript.protein, variant_transcript.protein)
|
|
1024
|
-
#
|
|
1025
|
-
# # Based on the optimal alignment, we can generate the relative locations of insertions and deletions
|
|
1026
|
-
# deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
|
|
1027
|
-
#
|
|
1028
|
-
# report = {
|
|
1029
|
-
# 'log': variant_transcript.log,
|
|
1030
|
-
# 'isoform': i,
|
|
1031
|
-
# 'isoform_prevalence': new_boundaries['path_weight'],
|
|
1032
|
-
# 'legacy_oncosplice_score_long': calculate_legacy_oncosplice_score(deleted, inserted, cons_vector,
|
|
1033
|
-
# min(76, len(reference_transcript.protein))),
|
|
1034
|
-
# 'legacy_oncosplice_score_short': calculate_legacy_oncosplice_score(deleted, inserted, cons_vector,
|
|
1035
|
-
# min(10,
|
|
1036
|
-
# len(reference_transcript.protein))),
|
|
1037
|
-
# 'variant_length': len(variant_transcript.protein.replace('*', '')),
|
|
1038
|
-
# }
|
|
1039
|
-
#
|
|
1040
|
-
# modified_positions = find_modified_positions(len(cons_vector), deleted, inserted)
|
|
1041
|
-
# # print(list(modified_positions))
|
|
1042
|
-
# # print(list(cons_array))
|
|
1043
|
-
# affected_cons_scores = cons_array.transpose() @ modified_positions[:, None]
|
|
1044
|
-
# # print(list(affected_cons_scores)) #[:, 0]))
|
|
1045
|
-
# # affected_cons_scores = sg.convolve2d(affected_cons_scores, np.ones(21), mode='same') #/ 21
|
|
1046
|
-
# max_score = affected_cons_scores #np.max(affected_cons_scores, axis=0)
|
|
1047
|
-
# report.update({'oncosplice_score': max_score, 'preserved_ratio': sum(modified_positions) / len(modified_positions)})
|
|
1048
|
-
#
|
|
1049
|
-
# if annotate:
|
|
1050
|
-
# report.update(OncospliceAnnotator(reference_transcript, variant_transcript, mutation))
|
|
1051
|
-
# report['insertions'] = inserted
|
|
1052
|
-
# report['deletions'] = deleted
|
|
1053
|
-
# report['full_missplicing'] = aberrant_splicing.missplicing
|
|
1054
|
-
# reports.append(report)
|
|
1055
|
-
#
|
|
1056
|
-
# reports = pd.DataFrame(reports)
|
|
1057
|
-
# reports['cons_available'] = int(cons_available)
|
|
1058
|
-
# reports['transcript_id'] = reference_transcript.transcript_id
|
|
1059
|
-
# reports['cons_sum'] = np.sum(np.exp(np.negative(cons_vector)))
|
|
1060
|
-
# reports['transcript_length'] = len(reference_transcript.protein)
|
|
1061
|
-
# reports['primary_transcript'] = reference_transcript.primary_transcript
|
|
1062
|
-
# return reports
|
|
1063
|
-
#
|
|
1064
|
-
#
|
|
1065
|
-
# def oncosplice_reduced(df):
|
|
1066
|
-
# target_columns = [c for c in df.columns if 'oncosplice' in c or 'cons' in c]
|
|
1067
|
-
# if len(target_columns) == 0:
|
|
1068
|
-
# print("No oncosplice scores to reduce.")
|
|
1069
|
-
# return None
|
|
1070
|
-
# scores = [df[['mut_id', 'missplicing']].drop_duplicates().set_index('mut_id')]
|
|
1071
|
-
# for score in target_columns:
|
|
1072
|
-
# scores.append(df.groupby(['mut_id', 'transcript_id'])[score].mean().groupby('mut_id').max())
|
|
1073
|
-
# scores.append(df.groupby(['mut_id', 'transcript_id'])[score].mean().groupby('mut_id').min())
|
|
1074
|
-
# scores = pd.concat(scores, axis=1)
|
|
1075
|
-
# return scores
|
|
1076
|
-
#
|
|
1077
|
-
#
|
|
1078
|
-
# def find_continuous_gaps(sequence):
|
|
1079
|
-
# """Find continuous gap sequences in an alignment."""
|
|
1080
|
-
# return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
|
|
1081
|
-
#
|
|
1082
|
-
#
|
|
1083
|
-
# def get_logical_alignment(ref_prot, var_prot):
|
|
1084
|
-
# """
|
|
1085
|
-
# Aligns two protein sequences and finds the optimal alignment with the least number of gaps.
|
|
1086
|
-
#
|
|
1087
|
-
# Parameters:
|
|
1088
|
-
# ref_prot (str): Reference protein sequence.
|
|
1089
|
-
# var_prot (str): Variant protein sequence.
|
|
1090
|
-
#
|
|
1091
|
-
# Returns:
|
|
1092
|
-
# tuple: Optimal alignment, number of insertions, and number of deletions.
|
|
1093
|
-
# """
|
|
1094
|
-
#
|
|
1095
|
-
# # Perform global alignment
|
|
1096
|
-
# alignments = pairwise2.align.globalms(ref_prot, var_prot, 1, -1, -3, 0, penalize_end_gaps=(True, True))
|
|
1097
|
-
#
|
|
1098
|
-
# # Selecting the optimal alignment
|
|
1099
|
-
# if len(alignments) > 1:
|
|
1100
|
-
# # Calculate continuous gaps for each alignment and sum their lengths
|
|
1101
|
-
# gap_lengths = [sum(end - start for start, end in find_continuous_gaps(al.seqA) + find_continuous_gaps(al.seqB)) for al in alignments]
|
|
1102
|
-
# optimal_alignment = alignments[gap_lengths.index(min(gap_lengths))]
|
|
1103
|
-
# else:
|
|
1104
|
-
# optimal_alignment = alignments[0]
|
|
1105
|
-
#
|
|
1106
|
-
# return optimal_alignment
|
|
1107
|
-
#
|
|
1108
|
-
#
|
|
1109
|
-
# def find_indels_with_mismatches_as_deletions(seqA, seqB):
|
|
1110
|
-
# """
|
|
1111
|
-
# Identify insertions and deletions in aligned sequences, treating mismatches as deletions.
|
|
1112
|
-
#
|
|
1113
|
-
# Parameters:
|
|
1114
|
-
# seqA, seqB (str): Aligned sequences.
|
|
1115
|
-
#
|
|
1116
|
-
# Returns:
|
|
1117
|
-
# tuple: Two dictionaries containing deletions and insertions.
|
|
1118
|
-
# """
|
|
1119
|
-
# if len(seqA) != len(seqB):
|
|
1120
|
-
# raise ValueError("Sequences must be of the same length")
|
|
1121
|
-
#
|
|
1122
|
-
# mapperA, counter = {}, 0
|
|
1123
|
-
# for i, c in enumerate(list(seqA)):
|
|
1124
|
-
# if c != '-':
|
|
1125
|
-
# counter += 1
|
|
1126
|
-
# mapperA[i] = counter
|
|
1127
|
-
#
|
|
1128
|
-
# mapperB, counter = {}, 0
|
|
1129
|
-
# for i, (c1, c2) in enumerate(list(zip(seqA, seqB))):
|
|
1130
|
-
# if c2 != '-':
|
|
1131
|
-
# counter += 1
|
|
1132
|
-
# mapperB[i] = counter
|
|
1133
|
-
#
|
|
1134
|
-
# seqA_array, seqB_array = np.array(list(seqA)), np.array(list(seqB))
|
|
1135
|
-
#
|
|
1136
|
-
# # Find and mark mismatch positions in seqB
|
|
1137
|
-
# mismatches = (seqA_array != seqB_array) & (seqA_array != '-') & (seqB_array != '-')
|
|
1138
|
-
# seqB_array[mismatches] = '-'
|
|
1139
|
-
# modified_seqB = ''.join(seqB_array)
|
|
1140
|
-
#
|
|
1141
|
-
# gaps_in_A = find_continuous_gaps(seqA)
|
|
1142
|
-
# gaps_in_B = find_continuous_gaps(modified_seqB)
|
|
1143
|
-
#
|
|
1144
|
-
# insertions = {mapperB[start]: modified_seqB[start:end].replace('-', '') for start, end in gaps_in_A if
|
|
1145
|
-
# seqB[start:end].strip('-')}
|
|
1146
|
-
# deletions = {mapperA[start]: seqA[start:end].replace('-', '') for start, end in gaps_in_B if
|
|
1147
|
-
# seqA[start:end].strip('-')}
|
|
1148
|
-
# return deletions, insertions
|
|
1149
|
-
#
|
|
1150
|
-
#
|
|
1151
|
-
#
|
|
1152
|
-
# def parabolic_window(window_size):
|
|
1153
|
-
# """Create a parabolic window function with a peak at the center."""
|
|
1154
|
-
# x = np.linspace(-1, 1, window_size)
|
|
1155
|
-
# return 0.9 * (1 - x**2) + 0.1
|
|
1156
|
-
#
|
|
1157
|
-
#
|
|
1158
|
-
# # def calculate_window_size(conservation_vector_length):
|
|
1159
|
-
# # return int(9 + (51 - 9) * (1 - np.exp(-0.0005 * conservation_vector_length)))
|
|
1160
|
-
# #
|
|
1161
|
-
#
|
|
1162
|
-
#
|
|
1163
|
-
# def transform_conservation_vector(conservation_vector):
|
|
1164
|
-
# """
|
|
1165
|
-
# Transforms a 1D conservation vector using different parameters.
|
|
1166
|
-
#
|
|
1167
|
-
# Args:
|
|
1168
|
-
# conservation_vector (numpy.ndarray): Input 1D vector of conservation values.
|
|
1169
|
-
#
|
|
1170
|
-
# Returns:
|
|
1171
|
-
# numpy.ndarray: A matrix containing transformed vectors.
|
|
1172
|
-
# """
|
|
1173
|
-
# window = 21
|
|
1174
|
-
# factor = 0.5
|
|
1175
|
-
# convolving_window = parabolic_window(window)
|
|
1176
|
-
# transformed_vector = np.convolve(conservation_vector, convolving_window, mode='same') / np.sum(convolving_window)
|
|
1177
|
-
# # Compute exponential factors
|
|
1178
|
-
# exp_factors = np.exp(-transformed_vector * factor)
|
|
1179
|
-
#
|
|
1180
|
-
# # Normalize and scale exponential factors
|
|
1181
|
-
# exp_factors /= exp_factors.sum()
|
|
1182
|
-
# return exp_factors
|
|
1183
|
-
#
|
|
1184
|
-
#
|
|
1185
|
-
#
|
|
1186
|
-
# def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
|
|
1187
|
-
# """
|
|
1188
|
-
# Identify unmodified positions in a sequence given deletions and insertions.
|
|
1189
|
-
#
|
|
1190
|
-
# :param sequence_length: Length of the sequence.
|
|
1191
|
-
# :param deletions: Dictionary of deletions.
|
|
1192
|
-
# :param insertions: Dictionary of insertions.
|
|
1193
|
-
# :param reach_limit: Limit for considering the effect of insertions/deletions.
|
|
1194
|
-
# :return: Array indicating unmodified positions.
|
|
1195
|
-
# """
|
|
1196
|
-
# unmodified_positions = np.zeros(sequence_length, dtype=float)
|
|
1197
|
-
#
|
|
1198
|
-
# for pos, insertion in insertions.items():
|
|
1199
|
-
# # if pos >= sequence_length:
|
|
1200
|
-
# # pos = sequence_length - 1
|
|
1201
|
-
# # add_factor = 1
|
|
1202
|
-
#
|
|
1203
|
-
# reach = min(len(insertion) // 2, reach_limit)
|
|
1204
|
-
# front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
|
|
1205
|
-
# len_start, len_end = pos - front_end, back_end - pos
|
|
1206
|
-
# try:
|
|
1207
|
-
# gradient_front = np.linspace(0, 1, len_start, endpoint=False)
|
|
1208
|
-
# gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
|
|
1209
|
-
# combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
|
|
1210
|
-
# unmodified_positions[front_end:back_end + 1] = combined_gradient
|
|
1211
|
-
#
|
|
1212
|
-
# except ValueError as e:
|
|
1213
|
-
# print(
|
|
1214
|
-
# f"Error: {e} | Lengths: unmodified_positions_slice={back_end - front_end}, combined_gradient={len(combined_gradient)}")
|
|
1215
|
-
# unmodified_positions[front_end:back_end] = np.zeros(back_end - front_end)
|
|
1216
|
-
#
|
|
1217
|
-
# for pos, deletion in deletions.items():
|
|
1218
|
-
# deletion_length = len(deletion)
|
|
1219
|
-
# unmodified_positions[pos:pos + deletion_length] = 1
|
|
1220
|
-
#
|
|
1221
|
-
# return unmodified_positions
|
|
1222
|
-
#
|
|
1223
|
-
#
|
|
1224
|
-
#
|
|
1225
|
-
# def calculate_penalty(domains, cons_scores, W, is_insertion=False):
|
|
1226
|
-
# """
|
|
1227
|
-
# Calculate the penalty for mutations (either insertions or deletions) on conservation scores.
|
|
1228
|
-
#
|
|
1229
|
-
# :param domains: Dictionary of mutations (inserted or deleted domains).
|
|
1230
|
-
# :param cons_scores: Conservation scores.
|
|
1231
|
-
# :param W: Window size.
|
|
1232
|
-
# :param is_insertion: Boolean flag to indicate if the mutation is an insertion.
|
|
1233
|
-
# :return: Penalty array.
|
|
1234
|
-
# """
|
|
1235
|
-
# penalty = np.zeros(len(cons_scores))
|
|
1236
|
-
# for pos, seq in domains.items():
|
|
1237
|
-
# mutation_length = len(seq)
|
|
1238
|
-
# weight = max(1.0, mutation_length / W)
|
|
1239
|
-
#
|
|
1240
|
-
# if is_insertion:
|
|
1241
|
-
# reach = min(W // 2, mutation_length // 2)
|
|
1242
|
-
# penalty[pos - reach:pos + reach] = weight * cons_scores[pos - reach:pos + reach]
|
|
1243
|
-
# else: # For deletion
|
|
1244
|
-
# penalty[pos:pos + mutation_length] = cons_scores[pos:pos + mutation_length] * weight
|
|
1245
|
-
#
|
|
1246
|
-
# return penalty
|
|
1247
|
-
#
|
|
1248
|
-
#
|
|
1249
|
-
# def calculate_legacy_oncosplice_score(deletions, insertions, cons_vec, W):
|
|
1250
|
-
# """
|
|
1251
|
-
# Calculate the legacy Oncosplice score based on deletions, insertions, and conservation vector.
|
|
1252
|
-
#
|
|
1253
|
-
# :param deletions: Dictionary of deletions.
|
|
1254
|
-
# :param insertions: Dictionary of insertions.
|
|
1255
|
-
# :param cons_vec: Conservation vector.
|
|
1256
|
-
# :param W: Window size.
|
|
1257
|
-
# :return: Legacy Oncosplice score.
|
|
1258
|
-
# """
|
|
1259
|
-
# smoothed_conservation_vector = np.exp(np.negative(moving_average_conv(cons_vec, W, 2)))
|
|
1260
|
-
# del_penalty = calculate_penalty(deletions, smoothed_conservation_vector, W, is_insertion=False)
|
|
1261
|
-
# ins_penalty = calculate_penalty(insertions, smoothed_conservation_vector, W, is_insertion=True)
|
|
1262
|
-
# combined_scores = del_penalty + ins_penalty
|
|
1263
|
-
# return np.max(np.convolve(combined_scores, np.ones(W), mode='same'))
|
|
1264
|
-
#
|
|
1265
|
-
#
|
|
1266
|
-
# def moving_average_conv(vector, window_size, factor=1):
|
|
1267
|
-
# """
|
|
1268
|
-
# Calculate the moving average convolution of a vector.
|
|
1269
|
-
#
|
|
1270
|
-
# Parameters:
|
|
1271
|
-
# vector (iterable): Input vector (list, tuple, numpy array).
|
|
1272
|
-
# window_size (int): Size of the convolution window. Must be a positive integer.
|
|
1273
|
-
# factor (float): Scaling factor for the average. Default is 1.
|
|
1274
|
-
#
|
|
1275
|
-
# Returns:
|
|
1276
|
-
# numpy.ndarray: Convolved vector as a numpy array.
|
|
1277
|
-
# """
|
|
1278
|
-
# if not isinstance(vector, (list, tuple, np.ndarray)):
|
|
1279
|
-
# raise TypeError("vector must be a list, tuple, or numpy array")
|
|
1280
|
-
# if not isinstance(window_size, int) or window_size <= 0:
|
|
1281
|
-
# raise ValueError("window_size must be a positive integer")
|
|
1282
|
-
# if len(vector) < window_size:
|
|
1283
|
-
# raise ValueError("window_size must not be greater than the length of vector")
|
|
1284
|
-
# if factor == 0:
|
|
1285
|
-
# raise ValueError("factor must not be zero")
|
|
1286
|
-
#
|
|
1287
|
-
# return np.convolve(vector, np.ones(window_size), mode='same') / window_size
|
|
1288
|
-
#
|
|
1289
|
-
#
|
|
1290
|
-
# def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
1291
|
-
# affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut, reference_transcript)
|
|
1292
|
-
#
|
|
1293
|
-
# report = {}
|
|
1294
|
-
# report['reference_mRNA'] = reference_transcript.transcript_seq
|
|
1295
|
-
# report['reference_CDS_start'] = reference_transcript.TIS
|
|
1296
|
-
# report['reference_pre_mrna'] = reference_transcript.pre_mrna
|
|
1297
|
-
# report['reference_ORF'] = reference_transcript.orf #pre_mrna[reference_transcript.transcript_indices.index(reference_transcript.TIS):reference_transcript.transcript_indices.index(reference_transcript.TTS)]
|
|
1298
|
-
# report['reference_protein'] = reference_transcript.protein
|
|
1299
|
-
#
|
|
1300
|
-
# report['variant_mRNA'] = variant_transcript.transcript_seq
|
|
1301
|
-
# report['variant_CDS_start'] = variant_transcript.TIS
|
|
1302
|
-
# report['variant_pre_mrna'] = variant_transcript.pre_mrna #pre_mrna[variant_transcript.transcript_indices.index(variant_transcript.TIS):variant_transcript.transcript_indices.index(variant_transcript.TTS)]
|
|
1303
|
-
# report['variant_ORF'] = variant_transcript.orf
|
|
1304
|
-
# report['variant_protein'] = variant_transcript.protein
|
|
1305
|
-
#
|
|
1306
|
-
# descriptions = define_missplicing_events(reference_transcript.exons, variant_transcript.exons,
|
|
1307
|
-
# reference_transcript.rev)
|
|
1308
|
-
# report['exon_changes'] = '|'.join([v for v in descriptions if v])
|
|
1309
|
-
# report['splicing_codes'] = summarize_missplicing_event(*descriptions)
|
|
1310
|
-
# report['affected_exon'] = affected_exon
|
|
1311
|
-
# report['affected_intron'] = affected_intron
|
|
1312
|
-
# report['mutation_distance_from_5'] = distance_from_5
|
|
1313
|
-
# report['mutation_distance_from_3'] = distance_from_3
|
|
1314
|
-
# return report
|
|
1315
|
-
#
|
|
1316
|
-
#
|
|
1317
|
-
# def find_splice_site_proximity(mut, transcript):
|
|
1318
|
-
# affected_exon, affected_intron, distance_from_5, distance_from_3 = None, None, None, None
|
|
1319
|
-
# for i, (ex_start, ex_end) in enumerate(transcript.exons):
|
|
1320
|
-
# if min(ex_start, ex_end) <= mut.start <= max(ex_start, ex_end):
|
|
1321
|
-
# affected_exon = i + 1
|
|
1322
|
-
# distance_from_5 = abs(mut.start - ex_start)
|
|
1323
|
-
# distance_from_3 = abs(mut.start - ex_end)
|
|
1324
|
-
#
|
|
1325
|
-
# for i, (in_start, in_end) in enumerate(transcript.introns):
|
|
1326
|
-
# if min(in_start, in_end) <= mut.start <= max(in_start, in_end):
|
|
1327
|
-
# affected_intron = i + 1
|
|
1328
|
-
# distance_from_5 = abs(mut.start - in_end)
|
|
1329
|
-
# distance_from_3 = abs(mut.start - in_start)
|
|
1330
|
-
#
|
|
1331
|
-
# return affected_exon, affected_intron, distance_from_5, distance_from_3
|
|
1332
|
-
#
|
|
1333
|
-
#
|
|
1334
|
-
# def define_missplicing_events(ref_exons, var_exons, rev):
|
|
1335
|
-
# ref_introns = [(ref_exons[i][1], ref_exons[i + 1][0]) for i in range(len(ref_exons) - 1)]
|
|
1336
|
-
# var_introns = [(var_exons[i][1], var_exons[i + 1][0]) for i in range(len(var_exons) - 1)]
|
|
1337
|
-
# num_ref_exons = len(ref_exons)
|
|
1338
|
-
# num_ref_introns = len(ref_introns)
|
|
1339
|
-
# if not rev:
|
|
1340
|
-
# partial_exon_skipping = ','.join(
|
|
1341
|
-
# [f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}' for (s1, s2) in var_exons for
|
|
1342
|
-
# exon_count, (t1, t2) in enumerate(ref_exons) if (s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)])
|
|
1343
|
-
# partial_intron_retention = ','.join(
|
|
1344
|
-
# [f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}' for (s1, s2)
|
|
1345
|
-
# in var_introns for intron_count, (t1, t2) in enumerate(ref_introns) if
|
|
1346
|
-
# (s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)])
|
|
1347
|
-
#
|
|
1348
|
-
# else:
|
|
1349
|
-
# partial_exon_skipping = ','.join(
|
|
1350
|
-
# [f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}' for (s1, s2) in var_exons for
|
|
1351
|
-
# exon_count, (t1, t2) in enumerate(ref_exons) if (s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2)])
|
|
1352
|
-
# partial_intron_retention = ','.join(
|
|
1353
|
-
# [f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}' for (s1, s2)
|
|
1354
|
-
# in var_introns for intron_count, (t1, t2) in enumerate(ref_introns) if
|
|
1355
|
-
# (s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2)])
|
|
1356
|
-
#
|
|
1357
|
-
# exon_skipping = ','.join(
|
|
1358
|
-
# [f'Exon {exon_count + 1}/{num_ref_exons} skipped: {(t1, t2)}' for exon_count, (t1, t2) in enumerate(ref_exons)
|
|
1359
|
-
# if
|
|
1360
|
-
# t1 not in [s1 for s1, s2 in var_exons] and t2 not in [s2 for s1, s2 in var_exons]])
|
|
1361
|
-
# novel_exons = ','.join([f'Novel Exon: {(t1, t2)}' for (t1, t2) in var_exons if
|
|
1362
|
-
# t1 not in [s1 for s1, s2 in ref_exons] and t2 not in [s2 for s1, s2 in ref_exons]])
|
|
1363
|
-
# intron_retention = ','.join(
|
|
1364
|
-
# [f'Intron {intron_count + 1}/{num_ref_introns} retained: {(t1, t2)}' for intron_count, (t1, t2) in
|
|
1365
|
-
# enumerate(ref_introns) if
|
|
1366
|
-
# t1 not in [s1 for s1, s2 in var_introns] and t2 not in [s2 for s1, s2 in var_introns]])
|
|
1367
|
-
#
|
|
1368
|
-
# return partial_exon_skipping, partial_intron_retention, exon_skipping, novel_exons, intron_retention
|
|
1369
|
-
#
|
|
1370
|
-
#
|
|
1371
|
-
# def summarize_missplicing_event(pes, pir, es, ne, ir):
|
|
1372
|
-
# event = []
|
|
1373
|
-
# if pes:
|
|
1374
|
-
# event.append('PES')
|
|
1375
|
-
# if es:
|
|
1376
|
-
# event.append('ES')
|
|
1377
|
-
# if pir:
|
|
1378
|
-
# event.append('PIR')
|
|
1379
|
-
# if ir:
|
|
1380
|
-
# event.append('IR')
|
|
1381
|
-
# if ne:
|
|
1382
|
-
# event.append('NE')
|
|
1383
|
-
# if len(event) > 1:
|
|
1384
|
-
# return event
|
|
1385
|
-
# elif len(event) == 1:
|
|
1386
|
-
# return event[0]
|
|
1387
|
-
# else:
|
|
1388
|
-
# return '-'
|
|
1389
|
-
#
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
# def find_indels_with_mismatches_as_deletions(seqA, seqB):
|
|
1394
|
-
# # Convert sequences to numpy arrays for element-wise comparison
|
|
1395
|
-
# ta, tb = np.array(list(seqA)), np.array(list(seqB))
|
|
1396
|
-
#
|
|
1397
|
-
# # Find mismatch positions
|
|
1398
|
-
# mismatch_positions = (ta != tb) & (ta != '-') & (tb != '-')
|
|
1399
|
-
#
|
|
1400
|
-
# # Replace mismatch positions in seqB with '-'
|
|
1401
|
-
# tb[mismatch_positions] = '-'
|
|
1402
|
-
# modified_seqB = ''.join(tb)
|
|
1403
|
-
#
|
|
1404
|
-
# # Function to find continuous gaps using regex
|
|
1405
|
-
# def find_continuous_gaps(sequence):
|
|
1406
|
-
# return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
|
|
1407
|
-
#
|
|
1408
|
-
# # Find gaps in both sequences
|
|
1409
|
-
# gaps_in_A = find_continuous_gaps(seqA)
|
|
1410
|
-
# gaps_in_B = find_continuous_gaps(modified_seqB)
|
|
1411
|
-
#
|
|
1412
|
-
# # Identify insertions and deletions
|
|
1413
|
-
# insertions = {start: modified_seqB[start:end].replace('-', '') for start, end in gaps_in_A if
|
|
1414
|
-
# seqB[start:end].strip('-')}
|
|
1415
|
-
# deletions = {start: seqA[start:end].replace('-', '') for start, end in gaps_in_B if seqA[start:end].strip('-')}
|
|
1416
|
-
#
|
|
1417
|
-
# return deletions, insertions
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
# def moving_average_conv(vector, window_size, factor=1):
|
|
1422
|
-
# """
|
|
1423
|
-
# Calculate the moving average convolution of a vector.
|
|
1424
|
-
#
|
|
1425
|
-
# :param vector: Input vector.
|
|
1426
|
-
# :param window_size: Size of the convolution window.
|
|
1427
|
-
# :return: Convolved vector as a numpy array.
|
|
1428
|
-
# """
|
|
1429
|
-
# convolving_length = np.array([min(len(vector) + window_size - i, window_size, i)
|
|
1430
|
-
# for i in range(window_size // 2, len(vector) + window_size // 2)], dtype=float)
|
|
1431
|
-
#
|
|
1432
|
-
# return np.convolve(vector, np.ones(window_size), mode='same') / (convolving_length / factor)
|
|
1433
|
-
#
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
# def get_logical_alignment(ref_prot, var_prot):
|
|
1437
|
-
# '''
|
|
1438
|
-
# :param ref_prot:
|
|
1439
|
-
# :param var_prot:
|
|
1440
|
-
# :return:
|
|
1441
|
-
# '''
|
|
1442
|
-
#
|
|
1443
|
-
# alignments = pairwise2.align.globalms(ref_prot, var_prot, 1, -1, -3, 0, penalize_end_gaps=(True, False))
|
|
1444
|
-
# if len(alignments) == 1:
|
|
1445
|
-
# optimal_alignment = alignments[0]
|
|
1446
|
-
# else:
|
|
1447
|
-
# # This calculates the number of gaps in each alignment.
|
|
1448
|
-
# number_of_gaps = [re.sub('-+', '-', al.seqA).count('-') + re.sub('-+', '-', al.seqB).count('-') for al in
|
|
1449
|
-
# alignments]
|
|
1450
|
-
#
|
|
1451
|
-
# optimal_alignment = alignments[number_of_gaps.index(min(number_of_gaps))]
|
|
1452
|
-
#
|
|
1453
|
-
# num_insertions = re.sub('-+', '-', optimal_alignment.seqA).count('-')
|
|
1454
|
-
# num_deletions = re.sub('-+', '-', optimal_alignment.seqB).count('-')
|
|
1455
|
-
# return optimal_alignment
|
|
1456
|
-
#
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
# def transform_conservation_vector(conservation_vector, window_size=10, verbose=False):
|
|
1460
|
-
# """
|
|
1461
|
-
# Transforms a conservation vector by applying a moving average convolution and scaling.
|
|
1462
|
-
#
|
|
1463
|
-
# :param conservation_vector: Array of conservation scores.
|
|
1464
|
-
# :param window_size: Window size for the moving average convolution. Defaults to 10, the average binding site length.
|
|
1465
|
-
# :return: Transformed conservation vector.
|
|
1466
|
-
# """
|
|
1467
|
-
# factor = 100 / window_size
|
|
1468
|
-
# conservation_vector = moving_average_conv(conservation_vector, window_size)
|
|
1469
|
-
# transformed_vector = np.exp(-conservation_vector*factor)
|
|
1470
|
-
# transformed_vector = transformed_vector / max(transformed_vector)
|
|
1471
|
-
#
|
|
1472
|
-
# if verbose:
|
|
1473
|
-
# import asciiplotlib as apl
|
|
1474
|
-
# fig = apl.figure()
|
|
1475
|
-
# fig.plot(list(range(len(transformed_vector))), transformed_vector, width=50, height=15, title="Conservation Vector")
|
|
1476
|
-
# fig.plot(list(range(len(conservation_vector))), transformed_vector, width=50, height=15, title="Entropy Vector")
|
|
1477
|
-
# fig.show()
|
|
1478
|
-
#
|
|
1479
|
-
# return transformed_vector
|
|
1480
|
-
|
|
1481
|
-
# def oncosplice_report(modified_positions, cons_matrix, tplot=False):
|
|
1482
|
-
# """
|
|
1483
|
-
# Calculate pipelines scores based on conservation vectors and detected sequence modifications.
|
|
1484
|
-
#
|
|
1485
|
-
# :param deletions: Dictionary of deletions in the sequence.
|
|
1486
|
-
# :param insertions: Dictionary of insertions in the sequence.
|
|
1487
|
-
# :param cons_vector: Conservation vector.
|
|
1488
|
-
# :param window_size: Window size for calculations.
|
|
1489
|
-
# :return: Dictionary of pipelines scores.
|
|
1490
|
-
# """
|
|
1491
|
-
# window_size = calculate_window_size(cons_matrix.shape[0])
|
|
1492
|
-
# # cons_vec_one, cons_vec_two, cons_vec_three = transform_conservation_vector(cons_matrix, tplot=tplot)
|
|
1493
|
-
# # results = {}
|
|
1494
|
-
#
|
|
1495
|
-
# # for i, cons_vec in enumerate([cons_vec_one, cons_vec_two, cons_vec_three]):
|
|
1496
|
-
# affected_cons_scores = cons_matrix * modified_positions
|
|
1497
|
-
# # affected_sum = np.sum(affected_cons_scores)
|
|
1498
|
-
# modified_cons_vector = np.convolve(affected_cons_scores, np.ones(window_size), mode='same') / window_size
|
|
1499
|
-
#
|
|
1500
|
-
# # obtaining scores
|
|
1501
|
-
# max_score = np.max(modified_cons_vector)
|
|
1502
|
-
# results = np.where(modified_cons_vector == max_score)[0]
|
|
1503
|
-
#
|
|
1504
|
-
# # # Exclude windows within one window_size of the max scoring window
|
|
1505
|
-
# # exclusion_zone = set().union(*(range(max(i - window_size, 0), min(i + window_size, len(modified_cons_vector))) for i in max_score_indices))
|
|
1506
|
-
# # viable_secondary_scores = [score for i, score in enumerate(modified_cons_vector) if i not in exclusion_zone]
|
|
1507
|
-
# #
|
|
1508
|
-
# # if len(viable_secondary_scores) == 0:
|
|
1509
|
-
# # gof_prob = 0
|
|
1510
|
-
# #
|
|
1511
|
-
# # else:
|
|
1512
|
-
# # second_highest_score = np.max(viable_secondary_scores)
|
|
1513
|
-
# # gof_prob = (max_score - second_highest_score) / max_score
|
|
1514
|
-
# # temp = {f'gof_{i}': gof_prob, f'oncosplice_score_{i}': max_score, f'affected_cons_sum_{i}': affected_sum}
|
|
1515
|
-
# # results.update(temp)
|
|
1516
|
-
# return results
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
|
|
1520
|
-
# def transform_conservation_vector(conservation_vector, plot=False, tplot=False, tid=''):
|
|
1521
|
-
# # all_ones = np.all(conservation_vector == 1)
|
|
1522
|
-
# # if all_ones:
|
|
1523
|
-
# # return conservation_vector, conservation_vector, conservation_vector
|
|
1524
|
-
#
|
|
1525
|
-
# # Calculate dynamic window size
|
|
1526
|
-
# window_size = calculate_window_size(len(conservation_vector))
|
|
1527
|
-
#
|
|
1528
|
-
# if window_size > len(conservation_vector):
|
|
1529
|
-
# window_size = int(len(conservation_vector) / 2)
|
|
1530
|
-
#
|
|
1531
|
-
# # Create convolution window and transform vector
|
|
1532
|
-
# convolving_window = parabolic_window(window_size)
|
|
1533
|
-
# factor = int(100 / window_size)
|
|
1534
|
-
# transformed_vector = np.convolve(conservation_vector, convolving_window, mode='same') / sum(convolving_window)
|
|
1535
|
-
# transformed_vector = np.exp(-transformed_vector * factor)
|
|
1536
|
-
# transformed_vector_one = transformed_vector.copy()
|
|
1537
|
-
#
|
|
1538
|
-
# transformed_vector -= np.percentile(transformed_vector, 75)
|
|
1539
|
-
# transformed_vector_two = transformed_vector.copy()
|
|
1540
|
-
#
|
|
1541
|
-
# max_val = max(transformed_vector)
|
|
1542
|
-
# transformed_vector /= max_val
|
|
1543
|
-
#
|
|
1544
|
-
# # Balancing negative values
|
|
1545
|
-
# negative_values = transformed_vector[transformed_vector < 0]
|
|
1546
|
-
# if negative_values.size > 0:
|
|
1547
|
-
# balance_factor = -np.sum(transformed_vector[transformed_vector >= 0]) / np.sum(negative_values)
|
|
1548
|
-
# transformed_vector[transformed_vector < 0] *= balance_factor
|
|
1549
|
-
#
|
|
1550
|
-
# current_sum = np.sum(transformed_vector)
|
|
1551
|
-
# additional_amount_needed = len(transformed_vector) - current_sum
|
|
1552
|
-
# sum_positives = np.sum(transformed_vector[transformed_vector > 0])
|
|
1553
|
-
# if sum_positives == 0:
|
|
1554
|
-
# raise ValueError("Array contains no positive values to scale.")
|
|
1555
|
-
# scale_factor = 1 + (additional_amount_needed / sum_positives)
|
|
1556
|
-
# # Apply the scaling factor only to positive values
|
|
1557
|
-
# transformed_vector[transformed_vector > 0] *= scale_factor
|
|
1558
|
-
#
|
|
1559
|
-
#
|
|
1560
|
-
# # if plot:
|
|
1561
|
-
# # # Plotting the two vectors
|
|
1562
|
-
# # fig, ax1 = plt.subplots(figsize=(8, 4))
|
|
1563
|
-
# # color = 'tab:blue'
|
|
1564
|
-
# # ax1.set_xlabel('Position')
|
|
1565
|
-
# # ax1.set_ylabel('Conservation Vector', color=color, alpha=0.5)
|
|
1566
|
-
# # ax1.plot(conservation_vector, color=color)
|
|
1567
|
-
# # ax1.tick_params(axis='y', labelcolor=color)
|
|
1568
|
-
# #
|
|
1569
|
-
# # ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis
|
|
1570
|
-
# # color = 'tab:red'
|
|
1571
|
-
# # ax2.set_ylabel('Transformed Vector', color=color) # we already handled the x-label with ax1
|
|
1572
|
-
# # ax2.plot(transformed_vector, color=color)
|
|
1573
|
-
# # ax2.tick_params(axis='y', labelcolor=color)
|
|
1574
|
-
# # plt.axhline(0)
|
|
1575
|
-
# # plt.title(tid)
|
|
1576
|
-
# # fig.tight_layout() # otherwise the right y-label is slightly clipped
|
|
1577
|
-
# # plt.show()
|
|
1578
|
-
# #
|
|
1579
|
-
# # if tplot:
|
|
1580
|
-
# # import termplotlib as tpl
|
|
1581
|
-
# # fig = tpl.figure()
|
|
1582
|
-
# # fig.plot(list(range(len(conservation_vector))), conservation_vector, width=100, height=15)
|
|
1583
|
-
# # fig.plot(list(range(len(transformed_vector))), transformed_vector, width=100, height=15)
|
|
1584
|
-
# # fig.show()
|
|
1585
|
-
#
|
|
1586
|
-
# return transformed_vector_one, transformed_vector_two, transformed_vector
|
|
1587
|
-
|
|
1588
|
-
|