geney 1.2.20-py2.py3-none-any.whl → 1.2.22-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of geney might be problematic.

Files changed (39)
  1. geney/oncosplice.py +1 -1
  2. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/METADATA +1 -1
  3. geney-1.2.22.dist-info/RECORD +19 -0
  4. geney/Gene.py +0 -258
  5. geney/analyzers/__init__.py +0 -0
  6. geney/analyzers/benchmark_clinvar.py +0 -158
  7. geney/analyzers/characterize_epistasis.py +0 -15
  8. geney/analyzers/compare_sets.py +0 -91
  9. geney/analyzers/group_comparison.py +0 -81
  10. geney/analyzers/survival.py +0 -144
  11. geney/analyzers/tcga_annotations.py +0 -194
  12. geney/analyzers/visualize_protein_conservation.py +0 -398
  13. geney/benchmark_clinvar.py +0 -158
  14. geney/compare_sets.py +0 -91
  15. geney/data_parsers/__init__.py +0 -0
  16. geney/data_parsers/gtex.py +0 -68
  17. geney/gtex.py +0 -68
  18. geney/immunotherapy/__init__.py +0 -0
  19. geney/immunotherapy/netchop.py +0 -78
  20. geney/mutations/__init__.py +0 -0
  21. geney/mutations/variant_utils.py +0 -125
  22. geney/netchop.py +0 -79
  23. geney/oncosplice/__init__.py +0 -0
  24. geney/oncosplice_mouse.py +0 -277
  25. geney/oncosplice_pipeline.py +0 -1588
  26. geney/performance_utils.py +0 -138
  27. geney/pipelines/__init__.py +0 -0
  28. geney/pipelines/dask_utils.py +0 -153
  29. geney/splicing/__init__.py +0 -2
  30. geney/splicing/spliceai_utils.py +0 -253
  31. geney/splicing/splicing_isoform_utils.py +0 -0
  32. geney/splicing/splicing_utils.py +0 -366
  33. geney/survival.py +0 -124
  34. geney/tcga_annotations.py +0 -352
  35. geney/translation_termination/__init__.py +0 -0
  36. geney/translation_termination/tts_utils.py +0 -0
  37. geney-1.2.20.dist-info/RECORD +0 -52
  38. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/WHEEL +0 -0
  39. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/top_level.txt +0 -0
geney/oncosplice_pipeline.py
@@ -1,1588 +0,0 @@
- import networkx as nx
- import random
- from Bio.Seq import Seq
- from Bio import pairwise2
- from dataclasses import dataclass
- from copy import deepcopy
- import re
- import pandas as pd
- from pathlib import Path
-
- from geney import config_setup
- from geney.splicing.splicing_utils import *
- from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle
- from geney.Fasta_segment import Fasta_segment
-
-
- def sai_predict_probs(seq: str, models: list) -> list:
-     '''
-     Predicts the donor and acceptor junction probability of each
-     NT in seq using SpliceAI.
-
-     Let m := 2*sai_mrg_context + L be the input seq length. It is assumed
-     that the input seq has the following structure:
-
-     seq = |<sai_mrg_context NTs><L NTs><sai_mrg_context NTs>|
-
-     The returned probability matrix is of size 2xL, where
-     the first row is the acceptor probability and the second row
-     is the donor probability. These probabilities correspond to the
-     middle <L NTs> of the input seq.
-     '''
-     x = one_hot_encode(seq)[None, :]
-     y = np.mean([models[m].predict(x, verbose=0) for m in range(5)], axis=0)
-     return y[0, :, 1:].T
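For reference, the padding convention in the docstring above means the caller flanks the L nucleotides of interest with sai_mrg_context N's on each side. A minimal sketch of the expected call pattern (here sai_models is assumed to be the module-level list of five SpliceAI Keras models that this file uses elsewhere):

    sai_mrg_context = 5000
    core = 'ACGT' * 25                          # L = 100 NTs of interest
    seq = 'N' * sai_mrg_context + core + 'N' * sai_mrg_context
    probs = sai_predict_probs(seq, sai_models)  # averaged over the five models
    assert probs.shape == (2, len(core))        # row 0: acceptor, row 1: donor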
-
-
- class Mutation:
-     def __init__(self, mid):
-         self.mut_id = mid
-
-         gene, chrom, pos, ref, alt = mid.split(':')
-         self.gene = gene
-         self.chrom = chrom.strip('chr')
-         self.start = int(pos)
-
-         self.file_identifier = self.mut_id.replace(':', '_')
-         self.file_identifier_short = f'{self.start}_{ref}_{alt}'
-
-         self.ref = ref if ref != '-' else ''
-         self.alt = alt if alt != '-' else ''
-
-         if len(self.ref) == len(self.alt) == 1:
-             self.vartype = 'SNP'
-         elif len(self.ref) == len(self.alt) > 1:
-             self.vartype = 'SUB'
-         elif self.ref and not self.alt:
-             self.vartype = 'DEL'
-         elif self.alt and not self.ref:
-             self.vartype = 'INS'
-         else:
-             self.vartype = 'INDEL'
-
-     def __str__(self):
-         return self.mut_id
-
-     def __repr__(self):
-         return f"Mutation({self.mut_id})"
-
-     def __lt__(self, other):
-         return self.start < other.start
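The mut_id format is GENE:chromosome:position:ref:alt, with '-' standing in for an empty allele; for example (using the sample id that appears in the commented-out block near the end of this file):

    m = Mutation('KRAS:12:25227343:G:T')
    m.vartype                                  # 'SNP'
    m.file_identifier                          # 'KRAS_12_25227343_G_T'
    Mutation('KRAS:12:25227343:G:-').vartype   # 'DEL' (a '-' allele becomes '')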
-
-
- class Variations:
-     def __init__(self, epistatic_set):
-         self.variants = sorted([Mutation(m) for m in epistatic_set.split('|')])
-         self.mut_id = epistatic_set
-         self.start = self.variants[0].start
-         self.positions = [v.start for v in self.variants]
-         # self.ref = ','.join([m.ref for m in self.variants])
-         # self.alt = ','.join([m.alt for m in self.variants])
-         self.gene = self.variants[0].gene
-         self.chrom = self.variants[0].chrom.strip('chr')
-         self.file_identifier = f'{self.gene}_{self.chrom}' + '_' + '_'.join(
-             [v.file_identifier_short for v in self.variants])
-
-     def __str__(self):
-         return '|'.join([m.mut_id for m in self.variants])
-
-     def __repr__(self):
-         return f"Variation({', '.join([m.mut_id for m in self.variants])})"
-
-     def __iter__(self):
-         self.current_index = 0
-         return self
-
-     def __next__(self):
-         if self.current_index < len(self.variants):
-             x = self.variants[self.current_index]
-             self.current_index += 1
-             return x
-         raise StopIteration
-
-     @property
-     def file_identifier_json(self):
-         return Path(self.file_identifier + '.json')
-
-
- def generate_mut_variant(seq: str, indices: list, mut: Mutation):
-     offset = 1 if not mut.ref else 0
-
-     check_indices = list(range(mut.start, mut.start + len(mut.ref) + offset))
-     check1 = all([m in indices for m in check_indices])
-
-     if not check1:
-         print(
-             f"Mutation {mut} not within transcript bounds: {min(list(filter((-1).__ne__, indices)))} - {max(indices)}.")
-         return seq, indices, False, False
-
-     rel_start, rel_end = indices.index(mut.start) + offset, indices.index(mut.start) + offset + len(mut.ref)
-     acquired_seq = seq[rel_start:rel_end]
-     check2 = acquired_seq == mut.ref
-     if not check2:
-         print(f'Reference allele does not match genome_build allele. {acquired_seq}, {mut.ref}, {mut.start}')
-         consensus_allele = False
-     else:
-         consensus_allele = True
-     if len(mut.ref) == len(mut.alt) > 0:
-         temp_indices = list(range(mut.start, mut.start + len(mut.ref)))
-     else:
-         temp_indices = [indices[indices.index(mut.start)] + v / 1000 for v in list(range(1, len(mut.alt) + 1))]
-
-     new_indices = indices[:rel_start] + temp_indices + indices[rel_end:]
-     new_seq = seq[:rel_start] + mut.alt + seq[rel_end:]
-
-     assert len(new_seq) == len(new_indices), f'Error in variant modification: {mut}, {len(new_seq)}, {len(new_indices)}'
-     assert is_monotonic(list(filter((-1).__ne__, new_indices))), f'Mut indices are not monotonic.'
-     return new_seq, new_indices, True, consensus_allele
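Note that insertions receive fractional indices (position + k/1000) so coordinates stay monotonic without renumbering the rest of the transcript. A worked example of the convention (the mut_id is hypothetical):

    seq, idx = 'ACGT', [100, 101, 102, 103]
    ins = Mutation('GENE:1:101:-:TT')   # insertion immediately after position 101
    new_seq, new_idx, in_bounds, consensus = generate_mut_variant(seq, idx, ins)
    # new_seq == 'ACTTGT'
    # new_idx == [100, 101, 101.001, 101.002, 102, 103]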
-
-
- def is_monotonic(A):
-     x, y = [], []
-     x.extend(A)
-     y.extend(A)
-     x.sort()
-     y.sort(reverse=True)
-     if (x == A or y == A):
-         return True
-     return False
-
-
- class Gene:
-     def __init__(self, gene_name, variation=None):
-         self.gene_name = gene_name
-         self.gene_id = ''
-         self.rev = None
-         self.chrm = ''
-         self.gene_start = 0
-         self.gene_end = 0
-         self.transcripts = {}
-         self.load_from_file(find_files_by_gene_name(gene_name))
-         self.variations = variation
-
-     def __repr__(self):
-         return f'Gene(gene_name={self.gene_name})'
-
-     def __len__(self):
-         return len(self.transcripts)
-
-     def __str__(self):
-         return '{gname}, {ntranscripts} transcripts'.format(gname=self.gene_name, ntranscripts=self.__len__())
-
-     def __copy__(self):
-         cls = self.__class__
-         result = cls.__new__(cls)
-         result.__dict__.update(self.__dict__)
-         return result
-
-     def __deepcopy__(self, memo):
-         cls = self.__class__
-         result = cls.__new__(cls)
-         memo[id(self)] = result
-         for k, v in self.__dict__.items():
-             setattr(result, k, deepcopy(v, memo))
-         return result
-
-     def __getitem__(self, index):
-         return Transcript(list(self.transcripts.values())[index])
-
-     def load_from_file(self, file_name):
-         if not file_name.exists():
-             raise FileNotFoundError(f"File '{file_name}' not found.")
-         self.load_from_dict(dict_data=unload_pickle(file_name))
-         return self
-
-     def load_from_dict(self, dict_data=None):
-         for k, v in dict_data.items():
-             setattr(self, k, v)
-         return self
-
-     def transcript(self, tid):
-         if tid not in self.transcripts:
-             raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")
-         return Transcript(self.transcripts[tid])
-
-     def run_transcripts(self, primary_transcript=False, protein_coding=False):
-         for tid, annotations in self.transcripts.items():
-             if primary_transcript and not annotations['primary_transcript']:
-                 continue
-             if protein_coding and annotations['transcript_biotype'] != 'protein_coding':
-                 continue
-
-             yield Transcript(self.transcripts[tid], variations=self.variations)
-
-
- class Transcript:
-     def __init__(self, d=None, variations=None):
-         self.transcript_id = None
-         self.transcript_start = None  # transcription
-         self.transcript_end = None  # transcription
-         self.transcript_biotype = None  # metadata
-         self.acceptors, self.donors = [], []  # splicing
-         self.TIS, self.TTS = None, None  # translation
-         self.transcript_seq, self.transcript_indices = '', []  # sequence data
-         self.rev = None  # sequence data
-         self.chrm = ''  # sequence data
-         self.pre_mrna = ''  # sequence data
-         self.orf = ''  # sequence data
-         self.protein = ''  # sequence data
-         self.log = ''  # sequence data
-         self.primary_transcript = None  # sequence data
-         self.cons_available = False  # metadata
-         self.cons_seq = ''
-         self.cons_vector = ''
-         self.variations = None
-         if variations:
-             self.variations = Variations(variations)
-
-         if d:
-             self.load_from_dict(d)
-
-         if self.cons_available:
-             if '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector):
-                 self.cons_seq = self.cons_seq.replace('*', '')
-                 self.cons_vector = self.cons_vector[:-1]
-
-             elif '*' in self.cons_seq and len(self.cons_seq) == len(self.cons_vector) + 1:
-                 self.cons_seq = self.cons_seq.replace('*', '')
-
-             else:
-                 self.cons_available = False
-
-         if self.transcript_biotype == 'protein_coding':
-             self.generate_protein()
-
-     def __repr__(self):
-         return 'Transcript(transcript_id={tid})'.format(tid=self.transcript_id)
-
-     def __len__(self):
-         return len(self.transcript_seq)
-
-     def __str__(self):
-         return 'Transcript {tid}, Transcript Type: ' \
-                '{protein_coding}, Primary: {primary}'.format(
-                    tid=self.transcript_id, protein_coding=self.transcript_biotype.replace('_', ' ').title(),
-                    primary=self.primary_transcript)
-
-     def __eq__(self, other):
-         return self.transcript_seq == other.transcript_seq
-
-     def __contains__(self, subvalue):
-         if isinstance(subvalue, str):
-             return subvalue in self.transcript_seq
-         elif isinstance(subvalue, int):
-             return subvalue in self.transcript_indices
-         else:
-             print(
-                 "Pass an integer to check against the span of the gene's coordinates or a string to check against the "
-                 "pre-mRNA sequence.")
-             return False
-
-     def __copy__(self):
-         cls = self.__class__
-         result = cls.__new__(cls)
-         result.__dict__.update(self.__dict__)
-         return result
-
-     def __deepcopy__(self, memo):
-         cls = self.__class__
-         result = cls.__new__(cls)
-         memo[id(self)] = result
-         for k, v in self.__dict__.items():
-             setattr(result, k, deepcopy(v, memo))
-         return result
-
-     def load_from_dict(self, data):
-         for k, v in data.items():
-             setattr(self, k, v)
-         self.__arrange_boundaries()
-         self.generate_mature_mrna(inplace=True)
-         return self
-
-     @property
-     def exons(self):
-         return list(zip(self.acceptors, self.donors))
-
-     @property
-     def introns(self):
-         return list(zip([v for v in self.donors if v != self.transcript_end],
-                         [v for v in self.acceptors if v != self.transcript_start]))
-
-     def set_intron_boundaries(self, acceptors=None, donors=None):
-         if acceptors:
-             self.acceptors = acceptors
-         if donors:
-             self.donors = donors
-         self.__arrange_boundaries()
-         return self
-
-     @property
-     def introns(self):
-         return list(zip([v for v in self.donors if v != self.transcript_end],
-                         [v for v in self.acceptors if v != self.transcript_start]))
-
-     def __exon_coverage_check(self):
-         if sum([abs(a - b) + 1 for a, b in self.exons]) == len(self):
-             return True
-         else:
-             return False
-
-     @property
-     def exons_pos(self):
-         temp = self.exons
-         if self.rev:
-             temp = [(b, a) for a, b in temp[::-1]]
-         return temp
-
-     @property
-     def mrna_indices(self):
-         temp = [lst for lsts in [list(range(a, b + 1)) for a, b in self.exons_pos] for lst in lsts]
-         return sorted(temp, reverse=self.rev)
-
-     @property
-     def exonic_indices(self):
-         return [lst for lsts in [list(range(a, b + 1)) for a, b in self.exons_pos] for lst in lsts]
-
-     def __arrange_boundaries(self):
-         self.acceptors.append(self.transcript_start)
-         self.donors.append(self.transcript_end)
-         self.acceptors = list(set(self.acceptors))
-         self.donors = list(set(self.donors))
-         self.acceptors.sort(reverse=self.rev)
-         self.donors.sort(reverse=self.rev)
-         return self
-
-     def positive_strand(self):
-         if self.rev:
-             return reverse_complement(self.transcript_seq)
-         else:
-             return self.transcript_seq
-
-     def __pos2sense(self, mrna, indices):
-         if self.rev:
-             mrna = reverse_complement(mrna)
-             indices = indices[::-1]
-         return mrna, indices
-
-     def pull_pre_mrna_pos(self):
-         fasta_obj = Fasta_segment()
-         if self.rev:
-             return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta',
-                                                     self.transcript_end,
-                                                     self.transcript_start)
-         else:
-             return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta',
-                                                     self.transcript_start,
-                                                     self.transcript_end)
-
-     def generate_pre_mrna_pos(self):
-         seq, indices = self.pull_pre_mrna_pos()
-         if self.variations:
-             for mutation in self.variations.variants:
-                 seq, indices, _, _ = generate_mut_variant(seq, indices, mut=mutation)
-         self.pre_mrna, _ = self.__pos2sense(seq, indices)
-         return seq, indices
-
-     def generate_pre_mrna(self, inplace=True):
-         pre_mrna, pre_indices = self.__pos2sense(*self.generate_pre_mrna_pos())
-         self.pre_mrna = pre_mrna
-         if inplace:
-             return self
-         return pre_mrna, pre_indices
-
-     def generate_mature_mrna_pos(self):
-         mature_mrna, mature_indices = '', []
-         pre_seq, pre_indices = self.generate_pre_mrna_pos()
-         for i, j in self.exons_pos:
-             rel_start, rel_end = pre_indices.index(i), pre_indices.index(j)
-             mature_mrna += pre_seq[rel_start:rel_end + 1]
-             mature_indices.extend(pre_indices[rel_start:rel_end + 1])
-         return mature_mrna, mature_indices
-
-     def generate_mature_mrna(self, inplace=True):
-         if inplace:
-             self.transcript_seq, self.transcript_indices = self.__pos2sense(*self.generate_mature_mrna_pos())
-             return self
-         return self.__pos2sense(*self.generate_mature_mrna_pos())
-
-     def generate_protein(self, inplace=True, regenerate_mrna=True):
-         if regenerate_mrna:
-             self.generate_mature_mrna()
-
-         if not self.TIS or self.TIS not in self.transcript_indices:
-             return ''
-
-         rel_start = self.transcript_indices.index(self.TIS)
-         orf = self.transcript_seq[rel_start:]
-         first_stop_index = next((i for i in range(0, len(orf) - 2, 3) if orf[i:i + 3] in {"TAG", "TAA", "TGA"}), None)
-         orf = orf[:first_stop_index + 3]
-         protein = str(Seq(orf).translate()).replace('*', '')
-         if inplace:
-             self.orf = orf
-             self.protein = protein
-             if self.protein != self.cons_seq:
-                 self.cons_available = False
-             return self
-         return protein
-
-
- def develop_aberrant_splicing(transcript, aberrant_splicing):
-     exon_starts = {v: 1 for v in transcript.acceptors}
-     exon_starts.update({transcript.transcript_start: 1})
-     exon_starts.update({s: v['absolute'] for s, v in aberrant_splicing['missed_acceptors'].items()})
-     exon_starts.update({s: v['absolute'] for s, v in aberrant_splicing['discovered_acceptors'].items()})
-
-     exon_ends = {v: 1 for v in transcript.donors}
-     exon_ends.update({transcript.transcript_end: 1})
-     exon_ends.update({s: v['absolute'] for s, v in aberrant_splicing['missed_donors'].items()})
-     exon_ends.update({s: v['absolute'] for s, v in aberrant_splicing['discovered_donors'].items()})
-
-     nodes = [SpliceSite(pos=pos, ss_type=0, prob=prob) for pos, prob in exon_ends.items()] + \
-             [SpliceSite(pos=pos, ss_type=1, prob=prob) for pos, prob in exon_starts.items()]
-
-     nodes = [s for s in nodes if s.prob > 0]
-     nodes.sort(key=lambda x: x.pos, reverse=transcript.rev)
-
-     G = nx.DiGraph()
-     G.add_nodes_from([n.pos for n in nodes])
-
-     for i in range(len(nodes)):
-         trailing_prob, in_between = 0, []
-         for j in range(i + 1, len(nodes)):
-             curr_node, next_node = nodes[i], nodes[j]
-             spread = curr_node.ss_type in in_between
-             in_between.append(next_node.ss_type)
-             if curr_node.ss_type != next_node.ss_type:
-                 if spread:
-                     new_prob = next_node.prob - trailing_prob
-                     if new_prob <= 0:
-                         break
-                     G.add_edge(curr_node.pos, next_node.pos)
-                     G.edges[curr_node.pos, next_node.pos]['weight'] = new_prob
-                     trailing_prob += next_node.prob
-                 else:
-                     G.add_edge(curr_node.pos, next_node.pos)
-                     G.edges[curr_node.pos, next_node.pos]['weight'] = next_node.prob
-                     trailing_prob += next_node.prob
-
-     new_paths, prob_sum = {}, 0
-     for i, path in enumerate(nx.all_simple_paths(G, transcript.transcript_start, transcript.transcript_end)):
-         curr_prob = path_weight_mult(G, path, 'weight')
-         prob_sum += curr_prob
-         new_paths[i] = {
-             'acceptors': sorted([p for p in path if p in exon_starts.keys() and p != transcript.transcript_start],
-                                 reverse=transcript.rev),
-             'donors': sorted([p for p in path if p in exon_ends.keys() and p != transcript.transcript_end],
-                              reverse=transcript.rev),
-             'path_weight': curr_prob}
-
-     for i, d in new_paths.items():
-         d['path_weight'] = round(d['path_weight'] / prob_sum, 3)
-     new_paths = {k: v for k, v in new_paths.items() if v['path_weight'] > 0.01}
-     return list(new_paths.values())
-
-
- def path_weight_mult(G, path, weight):
-     multigraph = G.is_multigraph()
-     cost = 1
-     if not nx.is_path(G, path):
-         raise nx.NetworkXNoPath("path does not exist")
-     for node, nbr in nx.utils.pairwise(path):
-         if multigraph:
-             cost *= min(v[weight] for v in G[node][nbr].values())
-         else:
-             cost *= G[node][nbr][weight]
-     return cost
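An isoform's prevalence is thus the product of the junction probabilities along its path through the splice graph; a minimal sketch with hypothetical splice-site nodes:

    import networkx as nx
    G = nx.DiGraph()
    G.add_edge('tx_start', 'donor_1', weight=0.9)
    G.add_edge('donor_1', 'acceptor_2', weight=0.8)
    path_weight_mult(G, ['tx_start', 'donor_1', 'acceptor_2'], 'weight')  # ~0.72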
-
-
- @dataclass
- class SpliceSite(object):
-     pos: int
-     ss_type: int
-     prob: float
-
-     def __post_init__(self):
-         pass
-
-     def __lt__(self, other):
-         return self.pos < other.pos
-
-     def __str__(self):
-         print(f"({self.ss_type}, {self.pos}, {self.prob})")
-
-
- def run_spliceai_seq(seq, indices, rev):
-     seq = 'N' * 5000 + seq + 'N' * 5000
-     # indices = [-1] * 5000 + indices + [-1] * 5000
-
-     ref_seq_probs_temp = sai_predict_probs(seq, sai_models)
-     ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
-
-     acceptor_indices = {a: b for a, b in list(zip(indices, ref_seq_acceptor_probs)) if b > 0.75}
-     donor_indices = {a: b for a, b in list(zip(indices, ref_seq_donor_probs)) if b > 0.75}
-     return acceptor_indices, donor_indices
-
-
- def run_spliceai_transcript(mutations, gene_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
-     positions = mutations.positions  # [m.start for m in mutations]
-     seq_start_pos = min(positions) - sai_mrg_context - min_coverage
-     seq_end_pos = max(positions) + sai_mrg_context + min_coverage  # + 1
-
-     # ref_seq, ref_indices = pull_fasta_seq_endpoints(mutations.chrom, seq_start_pos, seq_end_pos)
-     fasta_obj = Fasta_segment()
-     ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
-         config_setup['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
-         seq_start_pos,
-         seq_end_pos)
-
-     gene_start, gene_end, rev = gene_data.transcript_start, gene_data.transcript_end, gene_data.rev
-
-     mrna_acceptors = sorted(gene_data.acceptors)
-     mrna_donors = sorted(gene_data.donors)
-
-     visible_donors = np.intersect1d(mrna_donors, ref_indices)
-     visible_acceptors = np.intersect1d(mrna_acceptors, ref_indices)
-
-     start_pad = ref_indices.index(gene_start) if gene_start in ref_indices else 0
-     end_cutoff = ref_indices.index(gene_end) if gene_end in ref_indices else len(ref_indices)  # - 1
-     end_pad = len(ref_indices) - end_cutoff
-     ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
-     ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
-     mut_seq, mut_indices = ref_seq, ref_indices
-
-     for mut in mutations:
-         mut_seq, mut_indices, _, _ = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
-
-     ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
-     mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
-
-     if rev:
-         ref_seq = reverse_complement(ref_seq)
-         mut_seq = reverse_complement(mut_seq)
-         ref_indices = ref_indices[::-1]
-         mut_indices = mut_indices[::-1]
-
-     ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
-     mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
-
-     ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
-     mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
-
-     assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
-     assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
-
-     iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
-                                {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
-                                visible_acceptors,
-                                threshold=sai_threshold)
-
-     assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
-     assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
-
-     idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
-                                {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
-                                visible_donors,
-                                threshold=sai_threshold)
-
-     missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
-     missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
-     return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
-
-
- def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
-     affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut,
-                                                                                                   reference_transcript)
-
-     report = {}
-     report['primary_transcript'] = reference_transcript.primary_transcript
-     report['transcript_id'] = reference_transcript.transcript_id
-     report['mut_id'] = mut.mut_id
-     report['cons_available'] = int(reference_transcript.cons_available)
-     report['protein_coding'] = reference_transcript.transcript_biotype
-
-     report['reference_mrna'] = reference_transcript.transcript_seq
-     report['reference_cds_start'] = reference_transcript.TIS
-     report['reference_pre_mrna'] = reference_transcript.pre_mrna
-     report[
-         'reference_orf'] = reference_transcript.orf  # pre_mrna[reference_transcript.transcript_indices.index(reference_transcript.TIS):reference_transcript.transcript_indices.index(reference_transcript.TTS)]
-     report['reference_protein'] = reference_transcript.protein
-     report['reference_protein_length'] = len(reference_transcript.protein)
-
-     report['variant_mrna'] = variant_transcript.transcript_seq
-     report['variant_cds_start'] = variant_transcript.TIS
-     report[
-         'variant_pre_mrna'] = variant_transcript.pre_mrna  # pre_mrna[variant_transcript.transcript_indices.index(variant_transcript.TIS):variant_transcript.transcript_indices.index(variant_transcript.TTS)]
-     report['variant_orf'] = variant_transcript.orf
-     report['variant_protein'] = variant_transcript.protein
-     report['variant_protein_length'] = len(variant_transcript.protein)
-
-     descriptions = define_missplicing_events(reference_transcript.exons, variant_transcript.exons,
-                                              reference_transcript.rev)
-     # print(descriptions)
-     report['exon_changes'] = '|'.join([v for v in descriptions if v])
-     report['splicing_codes'] = summarize_missplicing_event(*descriptions)
-     report['affected_exon'] = affected_exon
-     report['affected_intron'] = affected_intron
-     report['mutation_distance_from_5'] = distance_from_5
-     report['mutation_distance_from_3'] = distance_from_3
-     return report
-
-
- def find_splice_site_proximity(mut, transcript):
-     affected_exon, affected_intron, distance_from_5, distance_from_3 = None, None, None, None
-     for i, (ex_start, ex_end) in enumerate(transcript.exons):
-         if min(ex_start, ex_end) <= mut.start <= max(ex_start, ex_end):
-             affected_exon = i + 1
-             distance_from_5 = abs(mut.start - ex_start)
-             distance_from_3 = abs(mut.start - ex_end)
-
-     for i, (in_start, in_end) in enumerate(transcript.introns):
-         if min(in_start, in_end) <= mut.start <= max(in_start, in_end):
-             affected_intron = i + 1
-             distance_from_5 = abs(mut.start - in_end)
-             distance_from_3 = abs(mut.start - in_start)
-
-     return affected_exon, affected_intron, distance_from_5, distance_from_3
-
-
- def define_missplicing_events(ref_exons, var_exons, rev):
-     ref_introns = [(ref_exons[i][1], ref_exons[i + 1][0]) for i in range(len(ref_exons) - 1)]
-     var_introns = [(var_exons[i][1], var_exons[i + 1][0]) for i in range(len(var_exons) - 1)]
-     num_ref_exons = len(ref_exons)
-     num_ref_introns = len(ref_introns)
-     if not rev:
-         partial_exon_skipping = ','.join(
-             [f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}' for (s1, s2) in var_exons for
-              exon_count, (t1, t2) in enumerate(ref_exons) if (s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)])
-         partial_intron_retention = ','.join(
-             [f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}' for (s1, s2)
-              in var_introns for intron_count, (t1, t2) in enumerate(ref_introns) if
-              (s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)])
-
-     else:
-         partial_exon_skipping = ','.join(
-             [f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}' for (s1, s2) in var_exons for
-              exon_count, (t1, t2) in enumerate(ref_exons) if (s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2)])
-         partial_intron_retention = ','.join(
-             [f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}' for (s1, s2)
-              in var_introns for intron_count, (t1, t2) in enumerate(ref_introns) if
-              (s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2)])
-
-     exon_skipping = ','.join(
-         [f'Exon {exon_count + 1}/{num_ref_exons} skipped: {(t1, t2)}' for exon_count, (t1, t2) in enumerate(ref_exons)
-          if
-          t1 not in [s1 for s1, s2 in var_exons] and t2 not in [s2 for s1, s2 in var_exons]])
-     novel_exons = ','.join([f'Novel Exon: {(t1, t2)}' for (t1, t2) in var_exons if
-                             t1 not in [s1 for s1, s2 in ref_exons] and t2 not in [s2 for s1, s2 in ref_exons]])
-     intron_retention = ','.join(
-         [f'Intron {intron_count + 1}/{num_ref_introns} retained: {(t1, t2)}' for intron_count, (t1, t2) in
-          enumerate(ref_introns) if
-          t1 not in [s1 for s1, s2 in var_introns] and t2 not in [s2 for s1, s2 in var_introns]])
-
-     return partial_exon_skipping, partial_intron_retention, exon_skipping, novel_exons, intron_retention
-
-
- def summarize_missplicing_event(pes, pir, es, ne, ir):
-     event = []
-     if pes:
-         event.append('PES')
-     if es:
-         event.append('ES')
-     if pir:
-         event.append('PIR')
-     if ir:
-         event.append('IR')
-     if ne:
-         event.append('NE')
-     if len(event) > 1:
-         return event
-     elif len(event) == 1:
-         return event[0]
-     else:
-         return '-'
-
-
- def find_continuous_gaps(sequence):
-     """Find continuous gap sequences in an alignment."""
-     return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
-
-
- def get_logical_alignment(ref_prot, var_prot):
-     """
-     Aligns two protein sequences and finds the optimal alignment with the least number of gaps.
-
-     Parameters:
-     ref_prot (str): Reference protein sequence.
-     var_prot (str): Variant protein sequence.
-
-     Returns:
-     tuple: Optimal alignment, number of insertions, and number of deletions.
-     """
-
-     # Perform global alignment
-     alignments = pairwise2.align.globalms(ref_prot, var_prot, 1, -1, -3, 0, penalize_end_gaps=(True, True))
-
-     # Selecting the optimal alignment
-     if len(alignments) > 1:
-         # Calculate continuous gaps for each alignment and sum their lengths
-         gap_lengths = [sum(end - start for start, end in find_continuous_gaps(al.seqA) + find_continuous_gaps(al.seqB))
-                        for al in alignments]
-         optimal_alignment = alignments[gap_lengths.index(min(gap_lengths))]
-     else:
-         optimal_alignment = alignments[0]
-
-     return optimal_alignment
-
-
- def find_indels_with_mismatches_as_deletions(seqA, seqB):
-     """
-     Identify insertions and deletions in aligned sequences, treating mismatches as deletions.
-
-     Parameters:
-     seqA, seqB (str): Aligned sequences.
-
-     Returns:
-     tuple: Two dictionaries containing deletions and insertions.
-     """
-     if len(seqA) != len(seqB):
-         raise ValueError("Sequences must be of the same length")
-
-     mapperA, counter = {}, 0
-     for i, c in enumerate(list(seqA)):
-         if c != '-':
-             counter += 1
-         mapperA[i] = counter
-
-     mapperB, counter = {}, 0
-     for i, (c1, c2) in enumerate(list(zip(seqA, seqB))):
-         if c2 != '-':
-             counter += 1
-         mapperB[i] = counter
-
-     seqA_array, seqB_array = np.array(list(seqA)), np.array(list(seqB))
-
-     # Find and mark mismatch positions in seqB
-     mismatches = (seqA_array != seqB_array) & (seqA_array != '-') & (seqB_array != '-')
-     seqB_array[mismatches] = '-'
-     modified_seqB = ''.join(seqB_array)
-
-     gaps_in_A = find_continuous_gaps(seqA)
-     gaps_in_B = find_continuous_gaps(modified_seqB)
-
-     insertions = {mapperB[start]: modified_seqB[start:end].replace('-', '') for start, end in gaps_in_A if
-                   seqB[start:end].strip('-')}
-     deletions = {mapperA[start]: seqA[start:end].replace('-', '') for start, end in gaps_in_B if
-                  seqA[start:end].strip('-')}
-     return deletions, insertions
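A worked example of the convention: keys are 1-based residue positions (reference coordinates for deletions, variant coordinates for insertions), and a substituted residue is reported as a deletion only:

    find_indels_with_mismatches_as_deletions('MKT-LV', 'MK-ALV')
    # -> ({3: 'T'}, {3: 'A'})  'T' deleted at reference position 3; 'A' inserted
    find_indels_with_mismatches_as_deletions('MKTLV', 'MKALV')
    # -> ({3: 'T'}, {})        the T->A mismatch is treated as a deletion of 'T'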
-
-
- def parabolic_window(window_size):
-     """Create a parabolic window function with a peak at the center."""
-     x = np.linspace(-1, 1, window_size)
-     return 0.9 * (1 - x ** 2) + 0.1
-
-
- # def calculate_window_size(conservation_vector_length):
- #     return int(9 + (51 - 9) * (1 - np.exp(-0.0005 * conservation_vector_length)))
- #
-
-
- def transform_conservation_vector(conservation_vector):
-     """
-     Transforms a 1D conservation vector using different parameters.
-
-     Args:
-         conservation_vector (numpy.ndarray): Input 1D vector of conservation values.
-
-     Returns:
-         numpy.ndarray: A matrix containing transformed vectors.
-     """
-     window = 13
-     factor = 4
-     convolving_window = parabolic_window(window)
-     transformed_vector = np.convolve(conservation_vector, convolving_window, mode='same') / np.sum(convolving_window)
-     # Compute exponential factors
-     exp_factors = np.exp(-transformed_vector * factor)
-
-     # Normalize and scale exponential factors
-     # exp_factors /= exp_factors.sum()
-     return exp_factors
-
-
- def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
-     """
-     Identify unmodified positions in a sequence given deletions and insertions.
-
-     :param sequence_length: Length of the sequence.
-     :param deletions: Dictionary of deletions.
-     :param insertions: Dictionary of insertions.
-     :param reach_limit: Limit for considering the effect of insertions/deletions.
-     :return: Array indicating unmodified positions.
-     """
-     unmodified_positions = np.zeros(sequence_length, dtype=float)
-
-     for pos, insertion in insertions.items():
-         # if pos >= sequence_length:
-         #     pos = sequence_length - 1
-         #     add_factor = 1
-
-         reach = min(len(insertion) // 2, reach_limit)
-         front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
-         len_start, len_end = pos - front_end, back_end - pos
-         try:
-             gradient_front = np.linspace(0, 1, len_start, endpoint=False)
-             gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
-             combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
-             unmodified_positions[front_end:back_end + 1] = combined_gradient
-
-         except ValueError as e:
-             print(
-                 f"Error: {e} | Lengths: unmodified_positions_slice={back_end - front_end}, combined_gradient={len(combined_gradient)}")
-             unmodified_positions[front_end:back_end] = np.zeros(back_end - front_end)
-
-     for pos, deletion in deletions.items():
-         deletion_length = len(deletion)
-         unmodified_positions[pos:pos + deletion_length] = 1
-
-     return unmodified_positions
-
-
- def calculate_penalty(domains, cons_scores, W, is_insertion=False):
-     """
-     Calculate the penalty for mutations (either insertions or deletions) on conservation scores.
-
-     :param domains: Dictionary of mutations (inserted or deleted domains).
-     :param cons_scores: Conservation scores.
-     :param W: Window size.
-     :param is_insertion: Boolean flag to indicate if the mutation is an insertion.
-     :return: Penalty array.
-     """
-     penalty = np.zeros(len(cons_scores))
-     for pos, seq in domains.items():
-         mutation_length = len(seq)
-         weight = max(1.0, mutation_length / W)
-
-         if is_insertion:
-             reach = min(W // 2, mutation_length // 2)
-             penalty[pos - reach:pos + reach] = weight * cons_scores[pos - reach:pos + reach]
-         else:  # For deletion
-             penalty[pos:pos + mutation_length] = cons_scores[pos:pos + mutation_length] * weight
-
-     return penalty
-
-
- def calculate_legacy_oncosplice_score(deletions, insertions, cons_vec, W):
-     """
-     Calculate the legacy Oncosplice score based on deletions, insertions, and conservation vector.
-
-     :param deletions: Dictionary of deletions.
-     :param insertions: Dictionary of insertions.
-     :param cons_vec: Conservation vector.
-     :param W: Window size.
-     :return: Legacy Oncosplice score.
-     """
-     smoothed_conservation_vector = np.exp(np.negative(moving_average_conv(cons_vec, W, 2)))
-     del_penalty = calculate_penalty(deletions, smoothed_conservation_vector, W, is_insertion=False)
-     ins_penalty = calculate_penalty(insertions, smoothed_conservation_vector, W, is_insertion=True)
-     combined_scores = del_penalty + ins_penalty
-     return np.max(np.convolve(combined_scores, np.ones(W), mode='same'))
-
-
- def moving_average_conv(vector, window_size, factor=1):
-     """
-     Calculate the moving average convolution of a vector.
-
-     Parameters:
-     vector (iterable): Input vector (list, tuple, numpy array).
-     window_size (int): Size of the convolution window. Must be a positive integer.
-     factor (float): Scaling factor for the average. Default is 1.
-
-     Returns:
-     numpy.ndarray: Convolved vector as a numpy array.
-     """
-     if not isinstance(vector, (list, tuple, np.ndarray)):
-         raise TypeError("vector must be a list, tuple, or numpy array")
-     if not isinstance(window_size, int) or window_size <= 0:
-         raise ValueError("window_size must be a positive integer")
-     if len(vector) < window_size:
-         raise ValueError("window_size must not be greater than the length of vector")
-     if factor == 0:
-         raise ValueError("factor must not be zero")
-
-     return np.convolve(vector, np.ones(window_size), mode='same') / window_size
-
-
- def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False):
-     mutation = Variations(mut_id)
-     reference_gene = Gene(mutation.gene)
-     mutated_gene = Gene(mutation.gene, mut_id)
-
-     results = []
-     for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
-         reference = reference_gene.transcript(variant.transcript_id)
-         if reference.cons_available:
-             cons_vector = transform_conservation_vector(reference.cons_vector)
-
-         missplicing = run_spliceai_transcript(mutation, reference, sai_threshold=sai_threshold)
-         for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
-             variant_isoform = deepcopy(variant)
-             variant_isoform.set_intron_boundaries(acceptors=new_boundaries['acceptors'],
-                                                   donors=new_boundaries['donors']).generate_protein()
-             alignment = get_logical_alignment(reference.protein, variant_isoform.protein)
-             deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
-             modified_positions = find_modified_positions(len(cons_vector), deleted, inserted)
-             temp_cons = np.convolve(cons_vector * modified_positions, np.ones(11))
-             affected_cons_scores = max(temp_cons)
-             temp_cons = np.convolve(cons_vector, np.ones(11))
-             percentile = (
-                 sorted(temp_cons).index(next(x for x in sorted(temp_cons) if x >= affected_cons_scores)) / len(
-                     temp_cons))
-
-             report = OncospliceAnnotator(reference, variant_isoform, mutation)
-             report['original_cons'] = reference.cons_vector
-             report['oncosplice_score'] = affected_cons_scores
-             report['percentile'] = percentile
-             report['modified_positions'] = modified_positions
-             report['cons_vector'] = cons_vector
-             report['isoform_id'] = i
-             report['isoform_prevalence'] = new_boundaries['path_weight']
-             report['full_missplicing'] = missplicing
-             results.append(report)
-
-     report = pd.DataFrame(results)
-     return report
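A hedged usage sketch built from the sample ids in the commented-out legacy block below (it assumes the gene annotation pickles and chromosome FASTA files referenced via config_setup are installed locally):

    df = oncosplice('KRAS:12:25227343:G:T', sai_threshold=0.5)
    df[['transcript_id', 'oncosplice_score', 'isoform_prevalence']].head()
    # epistatic sets are pipe-delimited:
    # oncosplice('KRAS:12:25227343:G:T|KRAS:13:25227344:A:T')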
-
-
- # import numpy as np
- # import pandas as pd
- # from Bio import pairwise2
- # import re
- # from copy import deepcopy
- # from geney.splicing import PredictSpliceAI
- # from .Gene import Gene, Transcript
- # from geney.mutations.variant_utils import Variations, develop_aberrant_splicing
- #
- # sample_mut_id = 'KRAS:12:25227343:G:T'
- # sample_epistasis_id = 'KRAS:12:25227343:G:T|KRAS:13:25227344:A:T'
- #
- # def oncosplice(mutation: str, sai_threshold=0.25, annotate=False) -> pd.DataFrame:
- #     '''
- #     :param mutation: str
- #         the genomic variation
- #     :param sai_threshold: float
- #         the threshold for including missplicing predictions in gene builds
- #     :param prevalence_threshold: float
- #         the minimum threshold needed to consider a predicted isoform as valid
- #     :param target_directory: pathlib.Path
- #         the directory on the machine where the mrna annotation files are stored
- #     :return: a dataframe object
- #         will contain columns pertinent to assessing mutation pathogenicity including pipelines score, GOF score, legacy pipelines score, missplicing,
- #     '''
- #
- #     mutation = Variations(mutation)  # Generate mutation object
- #     # Gene annotations should be available in the target directory under the file name mrna_gene.json
- #     gene = Gene(mutation.gene)  # We obtain the annotation file and convert it into a Gene object
- #     # aberrant_splicing = PredictSpliceAI(mutation, gene, threshold=sai_threshold)  # SpliceAI predictions are processed and obtained for each mutation
- #     # Oncosplice obtains predictions for each transcript in the annotation file
- #
- #     results = []
- #     for reference_transcript in gene:
- #         aberrant_splicing = PredictSpliceAI(mutation, reference_transcript, threshold=sai_threshold)
- #         for i, new_boundaries in develop_aberrant_splicing(reference_transcript, aberrant_splicing.aberrant_splicing):
- #             res_in = oncosplice_transcript(reference_transcript=reference_transcript.generate_protein(), mutation=mutation, aberrant_splicing=aberrant_splicing, annotate=annotate, plot_term=plot_term)
- #             results.append(res_in)
- #
- #     if len(results) > 0:
- #         results = pd.concat(results)
- #     else:
- #         return None
- #
- #     # Append some additional, uniform information to the results dataframe
- #     results['mut_id'] = mutation.mut_id
- #     results['missplicing'] = aberrant_splicing.get_max_missplicing_delta()
- #     results['gene'] = mutation.gene
- #     return results
- #
- #
- # def oncosplice_transcript(reference_transcript: Transcript, mutation: Variations, aberrant_splicing: PredictSpliceAI, annotate=False, plot_term=False) -> pd.DataFrame:
- #     reports = []
- #     if reference_transcript.cons_available:
- #         cons_available, cons_array, cons_vector = True, transform_conservation_vector(reference_transcript.cons_vector), reference_transcript.cons_vector
- #
- #     else:
- #         cons_available, cons_array, cons_vector = False, transform_conservation_vector(np.ones(len(reference_transcript.protein), dtype=float)), np.ones(len(reference_transcript.protein), dtype=float)
- #
- #     # For each transcript, we generate a series of isoforms based on the splice site predictions; each isoform is assigned a prevalence score
- #     # obtained using simple graph theory where the probability of the edges taken to generate the isoform are multiplied together
- #     for i, new_boundaries in enumerate(develop_aberrant_splicing(reference_transcript, aberrant_splicing.aberrant_splicing)):
- #
- #         # The variant transcript is duplicated from the reference transcript and all needed modifications are performed
- #         variant_transcript = Transcript(deepcopy(reference_transcript).__dict__).set_exons(new_boundaries).generate_mature_mrna(mutations=mutation.mut_id.split('|'), inplace=True).generate_translational_boundaries().generate_protein()
- #
- #         # The optimal alignment that minimizes gaps between the transcripts is obtained
- #         alignment = get_logical_alignment(reference_transcript.protein, variant_transcript.protein)
- #
- #         # Based on the optimal alignment, we can generate the relative locations of insertions and deletions
- #         deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
- #
- #         report = {
- #             'log': variant_transcript.log,
- #             'isoform': i,
- #             'isoform_prevalence': new_boundaries['path_weight'],
- #             'legacy_oncosplice_score_long': calculate_legacy_oncosplice_score(deleted, inserted, cons_vector,
- #                                                                               min(76, len(reference_transcript.protein))),
- #             'legacy_oncosplice_score_short': calculate_legacy_oncosplice_score(deleted, inserted, cons_vector,
- #                                                                                min(10, len(reference_transcript.protein))),
- #             'variant_length': len(variant_transcript.protein.replace('*', '')),
- #         }
- #
- #         modified_positions = find_modified_positions(len(cons_vector), deleted, inserted)
- #         # print(list(modified_positions))
- #         # print(list(cons_array))
- #         affected_cons_scores = cons_array.transpose() @ modified_positions[:, None]
- #         # print(list(affected_cons_scores))  # [:, 0]))
- #         # affected_cons_scores = sg.convolve2d(affected_cons_scores, np.ones(21), mode='same')  # / 21
- #         max_score = affected_cons_scores  # np.max(affected_cons_scores, axis=0)
- #         report.update({'oncosplice_score': max_score, 'preserved_ratio': sum(modified_positions) / len(modified_positions)})
- #
- #         if annotate:
- #             report.update(OncospliceAnnotator(reference_transcript, variant_transcript, mutation))
- #             report['insertions'] = inserted
- #             report['deletions'] = deleted
- #             report['full_missplicing'] = aberrant_splicing.missplicing
- #         reports.append(report)
- #
- #     reports = pd.DataFrame(reports)
- #     reports['cons_available'] = int(cons_available)
- #     reports['transcript_id'] = reference_transcript.transcript_id
- #     reports['cons_sum'] = np.sum(np.exp(np.negative(cons_vector)))
- #     reports['transcript_length'] = len(reference_transcript.protein)
- #     reports['primary_transcript'] = reference_transcript.primary_transcript
- #     return reports
- #
- #
- # def oncosplice_reduced(df):
- #     target_columns = [c for c in df.columns if 'oncosplice' in c or 'cons' in c]
- #     if len(target_columns) == 0:
- #         print("No oncosplice scores to reduce.")
- #         return None
- #     scores = [df[['mut_id', 'missplicing']].drop_duplicates().set_index('mut_id')]
- #     for score in target_columns:
- #         scores.append(df.groupby(['mut_id', 'transcript_id'])[score].mean().groupby('mut_id').max())
- #         scores.append(df.groupby(['mut_id', 'transcript_id'])[score].mean().groupby('mut_id').min())
- #     scores = pd.concat(scores, axis=1)
- #     return scores
- #
- #
- # def find_continuous_gaps(sequence):
- #     """Find continuous gap sequences in an alignment."""
- #     return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
- #
- #
- # def get_logical_alignment(ref_prot, var_prot):
- #     """
- #     Aligns two protein sequences and finds the optimal alignment with the least number of gaps.
- #
- #     Parameters:
- #     ref_prot (str): Reference protein sequence.
- #     var_prot (str): Variant protein sequence.
- #
- #     Returns:
- #     tuple: Optimal alignment, number of insertions, and number of deletions.
- #     """
- #
- #     # Perform global alignment
- #     alignments = pairwise2.align.globalms(ref_prot, var_prot, 1, -1, -3, 0, penalize_end_gaps=(True, True))
- #
- #     # Selecting the optimal alignment
- #     if len(alignments) > 1:
- #         # Calculate continuous gaps for each alignment and sum their lengths
- #         gap_lengths = [sum(end - start for start, end in find_continuous_gaps(al.seqA) + find_continuous_gaps(al.seqB)) for al in alignments]
- #         optimal_alignment = alignments[gap_lengths.index(min(gap_lengths))]
- #     else:
- #         optimal_alignment = alignments[0]
- #
- #     return optimal_alignment
- #
- #
- # def find_indels_with_mismatches_as_deletions(seqA, seqB):
- #     """
- #     Identify insertions and deletions in aligned sequences, treating mismatches as deletions.
- #
- #     Parameters:
- #     seqA, seqB (str): Aligned sequences.
- #
- #     Returns:
- #     tuple: Two dictionaries containing deletions and insertions.
- #     """
- #     if len(seqA) != len(seqB):
- #         raise ValueError("Sequences must be of the same length")
- #
- #     mapperA, counter = {}, 0
- #     for i, c in enumerate(list(seqA)):
- #         if c != '-':
- #             counter += 1
- #         mapperA[i] = counter
- #
- #     mapperB, counter = {}, 0
- #     for i, (c1, c2) in enumerate(list(zip(seqA, seqB))):
- #         if c2 != '-':
- #             counter += 1
- #         mapperB[i] = counter
- #
- #     seqA_array, seqB_array = np.array(list(seqA)), np.array(list(seqB))
- #
- #     # Find and mark mismatch positions in seqB
- #     mismatches = (seqA_array != seqB_array) & (seqA_array != '-') & (seqB_array != '-')
- #     seqB_array[mismatches] = '-'
- #     modified_seqB = ''.join(seqB_array)
- #
- #     gaps_in_A = find_continuous_gaps(seqA)
- #     gaps_in_B = find_continuous_gaps(modified_seqB)
- #
- #     insertions = {mapperB[start]: modified_seqB[start:end].replace('-', '') for start, end in gaps_in_A if
- #                   seqB[start:end].strip('-')}
- #     deletions = {mapperA[start]: seqA[start:end].replace('-', '') for start, end in gaps_in_B if
- #                  seqA[start:end].strip('-')}
- #     return deletions, insertions
1149
- #
1150
- #
1151
- #
1152
- # def parabolic_window(window_size):
1153
- # """Create a parabolic window function with a peak at the center."""
1154
- # x = np.linspace(-1, 1, window_size)
1155
- # return 0.9 * (1 - x**2) + 0.1
1156
- #
1157
- #
1158
- # # def calculate_window_size(conservation_vector_length):
1159
- # # return int(9 + (51 - 9) * (1 - np.exp(-0.0005 * conservation_vector_length)))
1160
- # #
1161
- #
1162
- #
1163
- # def transform_conservation_vector(conservation_vector):
1164
- # """
1165
- # Transforms a 1D conservation vector using different parameters.
1166
- #
1167
- # Args:
1168
- # conservation_vector (numpy.ndarray): Input 1D vector of conservation values.
1169
- #
1170
- # Returns:
1171
- # numpy.ndarray: A matrix containing transformed vectors.
1172
- # """
1173
- # window = 21
1174
- # factor = 0.5
1175
- # convolving_window = parabolic_window(window)
1176
- # transformed_vector = np.convolve(conservation_vector, convolving_window, mode='same') / np.sum(convolving_window)
1177
- # # Compute exponential factors
1178
- # exp_factors = np.exp(-transformed_vector * factor)
1179
- #
1180
- # # Normalize and scale exponential factors
1181
- # exp_factors /= exp_factors.sum()
1182
- # return exp_factors
1183
- #
1184
- #
1185
- #
1186
- # def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
1187
- # """
1188
- # Identify unmodified positions in a sequence given deletions and insertions.
1189
- #
1190
- # :param sequence_length: Length of the sequence.
1191
- # :param deletions: Dictionary of deletions.
1192
- # :param insertions: Dictionary of insertions.
1193
- # :param reach_limit: Limit for considering the effect of insertions/deletions.
1194
- # :return: Array indicating unmodified positions.
1195
- # """
1196
- # unmodified_positions = np.zeros(sequence_length, dtype=float)
1197
- #
1198
- # for pos, insertion in insertions.items():
1199
- # # if pos >= sequence_length:
1200
- # # pos = sequence_length - 1
1201
- # # add_factor = 1
1202
- #
1203
- # reach = min(len(insertion) // 2, reach_limit)
1204
- # front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
1205
- # len_start, len_end = pos - front_end, back_end - pos
1206
- # try:
1207
- # gradient_front = np.linspace(0, 1, len_start, endpoint=False)
1208
- # gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
1209
- # combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
1210
- # unmodified_positions[front_end:back_end + 1] = combined_gradient
1211
- #
1212
- # except ValueError as e:
1213
- # print(
1214
- # f"Error: {e} | Lengths: unmodified_positions_slice={back_end - front_end}, combined_gradient={len(combined_gradient)}")
1215
- # unmodified_positions[front_end:back_end] = np.zeros(back_end - front_end)
1216
- #
1217
- # for pos, deletion in deletions.items():
1218
- # deletion_length = len(deletion)
1219
- # unmodified_positions[pos:pos + deletion_length] = 1
1220
- #
1221
- # return unmodified_positions
1222
- #
1223
- #
1224
- #
1225
- # def calculate_penalty(domains, cons_scores, W, is_insertion=False):
1226
- # """
1227
- # Calculate the penalty for mutations (either insertions or deletions) on conservation scores.
1228
- #
1229
- # :param domains: Dictionary of mutations (inserted or deleted domains).
1230
- # :param cons_scores: Conservation scores.
1231
- # :param W: Window size.
1232
- # :param is_insertion: Boolean flag to indicate if the mutation is an insertion.
1233
- # :return: Penalty array.
1234
- # """
1235
- # penalty = np.zeros(len(cons_scores))
1236
- # for pos, seq in domains.items():
1237
- # mutation_length = len(seq)
1238
- # weight = max(1.0, mutation_length / W)
1239
- #
1240
- # if is_insertion:
1241
- # reach = min(W // 2, mutation_length // 2)
1242
- # penalty[pos - reach:pos + reach] = weight * cons_scores[pos - reach:pos + reach]
1243
- # else: # For deletion
1244
- # penalty[pos:pos + mutation_length] = cons_scores[pos:pos + mutation_length] * weight
1245
- #
1246
- # return penalty
1247
- #
1248
- #
1249
- # def calculate_legacy_oncosplice_score(deletions, insertions, cons_vec, W):
1250
- # """
1251
- # Calculate the legacy Oncosplice score based on deletions, insertions, and conservation vector.
1252
- #
1253
- # :param deletions: Dictionary of deletions.
1254
- # :param insertions: Dictionary of insertions.
1255
- # :param cons_vec: Conservation vector.
1256
- # :param W: Window size.
1257
- # :return: Legacy Oncosplice score.
1258
- # """
1259
- # smoothed_conservation_vector = np.exp(np.negative(moving_average_conv(cons_vec, W, 2)))
1260
- # del_penalty = calculate_penalty(deletions, smoothed_conservation_vector, W, is_insertion=False)
1261
- # ins_penalty = calculate_penalty(insertions, smoothed_conservation_vector, W, is_insertion=True)
1262
- # combined_scores = del_penalty + ins_penalty
1263
- # return np.max(np.convolve(combined_scores, np.ones(W), mode='same'))
1264
- #
1265
- #
1266
- # def moving_average_conv(vector, window_size, factor=1):
1267
- # """
1268
- # Calculate the moving average convolution of a vector.
1269
- #
1270
- # Parameters:
1271
- # vector (iterable): Input vector (list, tuple, numpy array).
1272
- # window_size (int): Size of the convolution window. Must be a positive integer.
1273
- # factor (float): Scaling factor for the average. Default is 1.
1274
- #
1275
- # Returns:
1276
- # numpy.ndarray: Convolved vector as a numpy array.
1277
- # """
1278
- # if not isinstance(vector, (list, tuple, np.ndarray)):
1279
- # raise TypeError("vector must be a list, tuple, or numpy array")
1280
- # if not isinstance(window_size, int) or window_size <= 0:
1281
- # raise ValueError("window_size must be a positive integer")
1282
- # if len(vector) < window_size:
1283
- # raise ValueError("window_size must not be greater than the length of vector")
1284
- # if factor == 0:
1285
- # raise ValueError("factor must not be zero")
1286
- #
1287
- # return np.convolve(vector, np.ones(window_size), mode='same') / window_size
1288
- #
1289
- #
1290
- # def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
- #     affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut, reference_transcript)
- #
- #     report = {}
- #     report['reference_mRNA'] = reference_transcript.transcript_seq
- #     report['reference_CDS_start'] = reference_transcript.TIS
- #     report['reference_pre_mrna'] = reference_transcript.pre_mrna
- #     report['reference_ORF'] = reference_transcript.orf
- #     report['reference_protein'] = reference_transcript.protein
- #
- #     report['variant_mRNA'] = variant_transcript.transcript_seq
- #     report['variant_CDS_start'] = variant_transcript.TIS
- #     report['variant_pre_mrna'] = variant_transcript.pre_mrna
- #     report['variant_ORF'] = variant_transcript.orf
- #     report['variant_protein'] = variant_transcript.protein
- #
- #     descriptions = define_missplicing_events(reference_transcript.exons, variant_transcript.exons,
- #                                              reference_transcript.rev)
- #     report['exon_changes'] = '|'.join([v for v in descriptions if v])
- #     report['splicing_codes'] = summarize_missplicing_event(*descriptions)
- #     report['affected_exon'] = affected_exon
- #     report['affected_intron'] = affected_intron
- #     report['mutation_distance_from_5'] = distance_from_5
- #     report['mutation_distance_from_3'] = distance_from_3
- #     return report
- #
- #
- # def find_splice_site_proximity(mut, transcript):
- #     affected_exon, affected_intron, distance_from_5, distance_from_3 = None, None, None, None
- #     for i, (ex_start, ex_end) in enumerate(transcript.exons):
- #         if min(ex_start, ex_end) <= mut.start <= max(ex_start, ex_end):
- #             affected_exon = i + 1
- #             distance_from_5 = abs(mut.start - ex_start)
- #             distance_from_3 = abs(mut.start - ex_end)
- #
- #     for i, (in_start, in_end) in enumerate(transcript.introns):
- #         if min(in_start, in_end) <= mut.start <= max(in_start, in_end):
- #             affected_intron = i + 1
- #             distance_from_5 = abs(mut.start - in_end)
- #             distance_from_3 = abs(mut.start - in_start)
- #
- #     return affected_exon, affected_intron, distance_from_5, distance_from_3
- #
- #
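A toy run of the proximity logic, using hypothetical SimpleNamespace stand-ins for the mutation and transcript objects (the real classes are defined elsewhere in the deleted module):

    from types import SimpleNamespace

    mut = SimpleNamespace(start=130)
    tx = SimpleNamespace(exons=[(100, 200), (300, 400)], introns=[(201, 299)])

    for i, (ex_start, ex_end) in enumerate(tx.exons):
        if min(ex_start, ex_end) <= mut.start <= max(ex_start, ex_end):
            print(f'exon {i + 1}: {abs(mut.start - ex_start)} nt from the 5-prime end, '
                  f'{abs(mut.start - ex_end)} nt from the 3-prime end')
    # exon 1: 30 nt from the 5-prime end, 70 nt from the 3-prime end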
- # def define_missplicing_events(ref_exons, var_exons, rev):
- #     ref_introns = [(ref_exons[i][1], ref_exons[i + 1][0]) for i in range(len(ref_exons) - 1)]
- #     var_introns = [(var_exons[i][1], var_exons[i + 1][0]) for i in range(len(var_exons) - 1)]
- #     num_ref_exons = len(ref_exons)
- #     num_ref_introns = len(ref_introns)
- #     if not rev:
- #         partial_exon_skipping = ','.join(
- #             [f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}' for (s1, s2) in var_exons for
- #              exon_count, (t1, t2) in enumerate(ref_exons) if (s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)])
- #         partial_intron_retention = ','.join(
- #             [f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}' for (s1, s2)
- #              in var_introns for intron_count, (t1, t2) in enumerate(ref_introns) if
- #              (s1 == t1 and s2 < t2) or (s1 > t1 and s2 == t2)])
- #
- #     else:
- #         partial_exon_skipping = ','.join(
- #             [f'Exon {exon_count + 1}/{num_ref_exons} truncated: {(t1, t2)} --> {(s1, s2)}' for (s1, s2) in var_exons for
- #              exon_count, (t1, t2) in enumerate(ref_exons) if (s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2)])
- #         partial_intron_retention = ','.join(
- #             [f'Intron {intron_count + 1}/{num_ref_introns} partially retained: {(t1, t2)} --> {(s1, s2)}' for (s1, s2)
- #              in var_introns for intron_count, (t1, t2) in enumerate(ref_introns) if
- #              (s1 == t1 and s2 > t2) or (s1 < t1 and s2 == t2)])
- #
- #     exon_skipping = ','.join(
- #         [f'Exon {exon_count + 1}/{num_ref_exons} skipped: {(t1, t2)}' for exon_count, (t1, t2) in enumerate(ref_exons)
- #          if t1 not in [s1 for s1, s2 in var_exons] and t2 not in [s2 for s1, s2 in var_exons]])
- #     novel_exons = ','.join([f'Novel Exon: {(t1, t2)}' for (t1, t2) in var_exons if
- #                             t1 not in [s1 for s1, s2 in ref_exons] and t2 not in [s2 for s1, s2 in ref_exons]])
- #     intron_retention = ','.join(
- #         [f'Intron {intron_count + 1}/{num_ref_introns} retained: {(t1, t2)}' for intron_count, (t1, t2) in
- #          enumerate(ref_introns) if
- #          t1 not in [s1 for s1, s2 in var_introns] and t2 not in [s2 for s1, s2 in var_introns]])
- #
- #     return partial_exon_skipping, partial_intron_retention, exon_skipping, novel_exons, intron_retention
- #
- #
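The event comprehensions above all reduce to boundary comparisons between reference and variant coordinate pairs. A self-contained sketch of just the exon-skipping test on toy coordinates (an exon whose boundaries appear nowhere in the variant exon set counts as skipped):

    ref_exons = [(100, 200), (300, 400), (500, 600)]
    var_exons = [(100, 200), (500, 600)]   # the middle exon is lost

    skipped = [f'Exon {i + 1}/{len(ref_exons)} skipped: {(t1, t2)}'
               for i, (t1, t2) in enumerate(ref_exons)
               if t1 not in [s1 for s1, _ in var_exons]
               and t2 not in [s2 for _, s2 in var_exons]]
    print(','.join(skipped))
    # Exon 2/3 skipped: (300, 400)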
- # def summarize_missplicing_event(pes, pir, es, ne, ir):
- #     event = []
- #     if pes:
- #         event.append('PES')
- #     if es:
- #         event.append('ES')
- #     if pir:
- #         event.append('PIR')
- #     if ir:
- #         event.append('IR')
- #     if ne:
- #         event.append('NE')
- #     if len(event) > 1:
- #         return event
- #     elif len(event) == 1:
- #         return event[0]
- #     else:
- #         return '-'
- #
-
-
-
- # def find_indels_with_mismatches_as_deletions(seqA, seqB):
- #     # Convert sequences to numpy arrays for element-wise comparison
- #     ta, tb = np.array(list(seqA)), np.array(list(seqB))
- #
- #     # Find mismatch positions
- #     mismatch_positions = (ta != tb) & (ta != '-') & (tb != '-')
- #
- #     # Replace mismatch positions in seqB with '-'
- #     tb[mismatch_positions] = '-'
- #     modified_seqB = ''.join(tb)
- #
- #     # Function to find continuous gaps using regex
- #     def find_continuous_gaps(sequence):
- #         return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
- #
- #     # Find gaps in both sequences
- #     gaps_in_A = find_continuous_gaps(seqA)
- #     gaps_in_B = find_continuous_gaps(modified_seqB)
- #
- #     # Identify insertions and deletions
- #     insertions = {start: modified_seqB[start:end].replace('-', '') for start, end in gaps_in_A if
- #                   seqB[start:end].strip('-')}
- #     deletions = {start: seqA[start:end].replace('-', '') for start, end in gaps_in_B if seqA[start:end].strip('-')}
- #
- #     return deletions, insertions
-
-
-
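The gap-scanning trick is easiest to see on concrete aligned strings. A self-contained sketch (in the full helper above, mismatch columns are first masked to '-' in the variant row so they count as deletions):

    import re

    seqA = 'MKT-LVR'   # reference row of an alignment; '-' marks an insertion point
    seqB = 'MKTQL-R'   # variant row; '-' marks a deleted residue

    gaps = lambda s: [(m.start(), m.end()) for m in re.finditer(r'-+', s)]
    insertions = {start: seqB[start:end] for start, end in gaps(seqA)}
    deletions = {start: seqA[start:end] for start, end in gaps(seqB)}
    print(insertions, deletions)
    # {3: 'Q'} {5: 'V'}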
- # def moving_average_conv(vector, window_size, factor=1):
- #     """
- #     Calculate the moving average convolution of a vector.
- #
- #     :param vector: Input vector.
- #     :param window_size: Size of the convolution window.
- #     :param factor: Scaling factor applied to the average. Default is 1.
- #     :return: Convolved vector as a numpy array.
- #     """
- #     # Approximate number of in-range elements covered by the window at each position,
- #     # so edge positions are averaged over their true overlap rather than window_size.
- #     convolving_length = np.array([min(len(vector) + window_size - i, window_size, i)
- #                                   for i in range(window_size // 2, len(vector) + window_size // 2)], dtype=float)
- #
- #     return np.convolve(vector, np.ones(window_size), mode='same') / (convolving_length / factor)
- #
-
-
- # def get_logical_alignment(ref_prot, var_prot):
- #     '''
- #     :param ref_prot: Reference protein sequence.
- #     :param var_prot: Variant protein sequence.
- #     :return: The alignment with the fewest gap openings among the optimal-scoring alignments.
- #     '''
- #     alignments = pairwise2.align.globalms(ref_prot, var_prot, 1, -1, -3, 0, penalize_end_gaps=(True, False))
- #     if len(alignments) == 1:
- #         optimal_alignment = alignments[0]
- #     else:
- #         # This calculates the number of gap openings in each alignment.
- #         number_of_gaps = [re.sub('-+', '-', al.seqA).count('-') + re.sub('-+', '-', al.seqB).count('-') for al in
- #                           alignments]
- #         optimal_alignment = alignments[number_of_gaps.index(min(number_of_gaps))]
- #
- #     num_insertions = re.sub('-+', '-', optimal_alignment.seqA).count('-')
- #     num_deletions = re.sub('-+', '-', optimal_alignment.seqB).count('-')
- #     return optimal_alignment
- #
-
-
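The alignment call above relies on Biopython's pairwise2, which still works but is deprecated in recent Biopython releases in favor of Bio.Align.PairwiseAligner. A self-contained sketch with the same scoring scheme (toy sequences; the exact alignment returned can vary between Biopython versions):

    from Bio import pairwise2

    # match +1, mismatch -1, gap open -3, gap extend 0;
    # end gaps are penalized in the reference but free in the variant.
    alignments = pairwise2.align.globalms('MKTAYIA', 'MKTIA', 1, -1, -3, 0,
                                          penalize_end_gaps=(True, False))
    best = alignments[0]
    print(best.seqA)
    print(best.seqB)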
- # def transform_conservation_vector(conservation_vector, window_size=10, verbose=False):
- #     """
- #     Transforms a conservation vector by applying a moving average convolution and scaling.
- #
- #     :param conservation_vector: Array of conservation scores.
- #     :param window_size: Window size for the moving average convolution. Defaults to 10, the average binding site length.
- #     :param verbose: If True, draw terminal plots of the smoothed and transformed vectors.
- #     :return: Transformed conservation vector.
- #     """
- #     factor = 100 / window_size
- #     conservation_vector = moving_average_conv(conservation_vector, window_size)
- #     transformed_vector = np.exp(-conservation_vector * factor)
- #     transformed_vector = transformed_vector / max(transformed_vector)
- #
- #     if verbose:
- #         import asciiplotlib as apl
- #         fig = apl.figure()
- #         fig.plot(list(range(len(conservation_vector))), conservation_vector, width=50, height=15, title="Conservation Vector")
- #         fig.plot(list(range(len(transformed_vector))), transformed_vector, width=50, height=15, title="Transformed Vector")
- #         fig.show()
- #
- #     return transformed_vector
-
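The transform itself is two lines once smoothing is done; on a toy smoothed vector (numpy only), higher conservation maps to exponentially lower weight:

    import numpy as np

    factor = 100 / 10                            # window_size = 10, as in the default above
    smoothed = np.array([0.05, 0.2, 0.5, 0.9])   # stand-in moving-average output
    t = np.exp(-smoothed * factor)
    print(np.round(t / t.max(), 4))
    # [1.     0.2231 0.0111 0.0002] -- conserved positions are strongly suppressed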
- # def oncosplice_report(modified_positions, cons_matrix, tplot=False):
- #     """
- #     Calculate pipeline scores based on conservation scores and detected sequence modifications.
- #
- #     :param modified_positions: Binary mask marking modified residue positions.
- #     :param cons_matrix: Conservation scores.
- #     :param tplot: If True, draw terminal plots (only used by the commented-out code below).
- #     :return: Indices of the maximum-scoring window.
- #     """
- #     window_size = calculate_window_size(cons_matrix.shape[0])
- #     # cons_vec_one, cons_vec_two, cons_vec_three = transform_conservation_vector(cons_matrix, tplot=tplot)
- #     # results = {}
- #
- #     # for i, cons_vec in enumerate([cons_vec_one, cons_vec_two, cons_vec_three]):
- #     affected_cons_scores = cons_matrix * modified_positions
- #     # affected_sum = np.sum(affected_cons_scores)
- #     modified_cons_vector = np.convolve(affected_cons_scores, np.ones(window_size), mode='same') / window_size
- #
- #     # obtaining scores
- #     max_score = np.max(modified_cons_vector)
- #     results = np.where(modified_cons_vector == max_score)[0]
- #
- #     # # Exclude windows within one window_size of the max scoring window
- #     # exclusion_zone = set().union(*(range(max(i - window_size, 0), min(i + window_size, len(modified_cons_vector))) for i in max_score_indices))
- #     # viable_secondary_scores = [score for i, score in enumerate(modified_cons_vector) if i not in exclusion_zone]
- #     #
- #     # if len(viable_secondary_scores) == 0:
- #     #     gof_prob = 0
- #     #
- #     # else:
- #     #     second_highest_score = np.max(viable_secondary_scores)
- #     #     gof_prob = (max_score - second_highest_score) / max_score
- #     #     temp = {f'gof_{i}': gof_prob, f'oncosplice_score_{i}': max_score, f'affected_cons_sum_{i}': affected_sum}
- #     #     results.update(temp)
- #     return results
-
-
-
- # def transform_conservation_vector(conservation_vector, plot=False, tplot=False, tid=''):
- #     # all_ones = np.all(conservation_vector == 1)
- #     # if all_ones:
- #     #     return conservation_vector, conservation_vector, conservation_vector
- #
- #     # Calculate dynamic window size
- #     window_size = calculate_window_size(len(conservation_vector))
- #
- #     if window_size > len(conservation_vector):
- #         window_size = int(len(conservation_vector) / 2)
- #
- #     # Create convolution window and transform vector
- #     convolving_window = parabolic_window(window_size)
- #     factor = int(100 / window_size)
- #     transformed_vector = np.convolve(conservation_vector, convolving_window, mode='same') / sum(convolving_window)
- #     transformed_vector = np.exp(-transformed_vector * factor)
- #     transformed_vector_one = transformed_vector.copy()
- #
- #     transformed_vector -= np.percentile(transformed_vector, 75)
- #     transformed_vector_two = transformed_vector.copy()
- #
- #     max_val = max(transformed_vector)
- #     transformed_vector /= max_val
- #
- #     # Balancing negative values
- #     negative_values = transformed_vector[transformed_vector < 0]
- #     if negative_values.size > 0:
- #         balance_factor = -np.sum(transformed_vector[transformed_vector >= 0]) / np.sum(negative_values)
- #         transformed_vector[transformed_vector < 0] *= balance_factor
- #
- #     # Rescale positives so the final vector sums to its own length
- #     current_sum = np.sum(transformed_vector)
- #     additional_amount_needed = len(transformed_vector) - current_sum
- #     sum_positives = np.sum(transformed_vector[transformed_vector > 0])
- #     if sum_positives == 0:
- #         raise ValueError("Array contains no positive values to scale.")
- #     scale_factor = 1 + (additional_amount_needed / sum_positives)
- #     # Apply the scaling factor only to positive values
- #     transformed_vector[transformed_vector > 0] *= scale_factor
- #
- #     # if plot:
- #     #     # Plotting the two vectors
- #     #     fig, ax1 = plt.subplots(figsize=(8, 4))
- #     #     color = 'tab:blue'
- #     #     ax1.set_xlabel('Position')
- #     #     ax1.set_ylabel('Conservation Vector', color=color, alpha=0.5)
- #     #     ax1.plot(conservation_vector, color=color)
- #     #     ax1.tick_params(axis='y', labelcolor=color)
- #     #
- #     #     ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
- #     #     color = 'tab:red'
- #     #     ax2.set_ylabel('Transformed Vector', color=color)  # we already handled the x-label with ax1
- #     #     ax2.plot(transformed_vector, color=color)
- #     #     ax2.tick_params(axis='y', labelcolor=color)
- #     #     plt.axhline(0)
- #     #     plt.title(tid)
- #     #     fig.tight_layout()  # otherwise the right y-label is slightly clipped
- #     #     plt.show()
- #     #
- #     # if tplot:
- #     #     import termplotlib as tpl
- #     #     fig = tpl.figure()
- #     #     fig.plot(list(range(len(conservation_vector))), conservation_vector, width=100, height=15)
- #     #     fig.plot(list(range(len(transformed_vector))), transformed_vector, width=100, height=15)
- #     #     fig.show()
- #
- #     return transformed_vector_one, transformed_vector_two, transformed_vector
-
-
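The balancing and rescaling steps in this last variant enforce a concrete invariant: after shifting by the 75th percentile, negatives are scaled to exactly cancel the positives, and positives are then rescaled so the final vector sums to its own length. A compressed numpy sketch of just those steps (the parabolic smoothing and helper functions are assumed away):

    import numpy as np

    t = np.exp(-np.linspace(0.0, 3.0, 8))   # stand-in for the transformed vector
    t -= np.percentile(t, 75)
    t /= t.max()

    neg = t < 0
    if neg.any():                            # negatives rescaled to cancel the positives
        t[neg] *= -t[~neg].sum() / t[neg].sum()

    t[t > 0] *= 1 + (len(t) - t.sum()) / t[t > 0].sum()
    print(round(t.sum(), 6), len(t))         # 8.0 8 -- the vector sums to its length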