geney 1.1.11__py2.py3-none-any.whl → 1.1.13__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic.

geney/immune_utils.py CHANGED
@@ -99,87 +99,29 @@ class NetChop(object):
99
99
  return pd.DataFrame(cut_sequences)
100
100
 
101
101
 
102
+ import re
103
+ from io import StringIO
104
+ import pandas as pd
102
105
 
106
+ def run_mhc(sequences):
107
+ with tempfile.NamedTemporaryFile(dir='/tamir2/nicolaslynn/temp', suffix=".pep", mode="w") as input_fd:
108
+ for (i, sequence) in enumerate(sequences):
109
+ _ = input_fd.write(sequence)
110
+ _ = input_fd.write("\n")
111
+ input_fd.flush()
112
+ try:
113
+ out = subprocess.check_output(
114
+ ["netMHCpan", "-p", "-BA", str(input_fd.name)])
115
+ except subprocess.CalledProcessError as e:
116
+ logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
117
+ raise
118
+ out = out.decode('utf-8')
119
+ out = out.split(
120
+ '\n---------------------------------------------------------------------------------------------------------------------------\n')
121
+ out = out[1] + '\n' + out[2]
122
+ out = re.sub(r'[ ]+', ',', out)
123
+ out = out.replace('\n,', '\n')
124
+ return pd.read_csv(StringIO(out)).drop(columns=['Unnamed: 0'])
103
125
 
104
- from .base_commandline_predictor import BaseCommandlinePredictor
105
- from .parsing import parse_netmhc41_stdout
106
- from functools import partial
107
-
108
-
109
- class NetMHCpan41(BaseCommandlinePredictor):
110
- def __init__(
111
- self,
112
- alleles,
113
- default_peptide_lengths=[9],
114
- program_name="netMHCpan",
115
- process_limit=-1,
116
- mode="binding_affinity",
117
- extra_flags=[]):
118
- """
119
- Wrapper for NetMHCpan4.1.
120
-
121
- The mode argument should be one of "binding_affinity" (default) or
122
- "elution_score".
123
- """
124
-
125
- # The -BA flag is required to predict binding affinity
126
- if mode == "binding_affinity":
127
- flags = ["-BA"]
128
- elif mode == "elution_score":
129
- flags = []
130
- else:
131
- raise ValueError("Unsupported mode", mode)
132
-
133
- BaseCommandlinePredictor.__init__(
134
- self,
135
- program_name=program_name,
136
- alleles=alleles,
137
- default_peptide_lengths=default_peptide_lengths,
138
- parse_output_fn=partial(parse_netmhc41_stdout, mode=mode),
139
- supported_alleles_flag="-listMHC",
140
- input_file_flag="-f",
141
- length_flag="-l",
142
- allele_flag="-a",
143
- extra_flags=flags + extra_flags,
144
- process_limit=process_limit)
145
-
146
- class NetMHCpan41_EL(NetMHCpan41):
147
- """
148
- Wrapper for NetMHCpan4 when the preferred mode is elution score
149
- """
150
- def __init__(
151
- self,
152
- alleles,
153
- default_peptide_lengths=[9],
154
- program_name="netMHCpan",
155
- process_limit=-1,
156
- extra_flags=[]):
157
- NetMHCpan41.__init__(
158
- self,
159
- alleles=alleles,
160
- default_peptide_lengths=default_peptide_lengths,
161
- program_name=program_name,
162
- process_limit=process_limit,
163
- mode="elution_score",
164
- extra_flags=extra_flags)
165
126
 
166
127
 
167
- class NetMHCpan41_BA(NetMHCpan41):
168
- """
169
- Wrapper for NetMHCpan4 when the preferred mode is binding affinity
170
- """
171
- def __init__(
172
- self,
173
- alleles,
174
- default_peptide_lengths=[9],
175
- program_name="netMHCpan",
176
- process_limit=-1,
177
- extra_flags=[]):
178
- NetMHCpan41.__init__(
179
- self,
180
- alleles=alleles,
181
- default_peptide_lengths=default_peptide_lengths,
182
- program_name=program_name,
183
- process_limit=process_limit,
184
- mode="binding_affinity",
185
- extra_flags=extra_flags)
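The new run_mhc helper above replaces the removed NetMHCpan wrapper classes: it writes the peptides to a temporary .pep file, shells out to netMHCpan -p -BA, and converts the fixed-width result table to CSV before loading it with pandas. A minimal, self-contained sketch of that whitespace-to-CSV step, applied to a toy table (column names and scores here are placeholders, not real netMHCpan output):

import re
from io import StringIO
import pandas as pd

# Toy fixed-width table standing in for the netMHCpan result block.
toy = "Pos MHC Peptide Score\n1 HLA-A*02:01 SIINFEKL 0.853\n2 HLA-A*02:01 GILGFVFTL 0.412\n"
csv_text = re.sub(r'[ ]+', ',', toy)   # collapse runs of spaces into commas
df = pd.read_csv(StringIO(csv_text))   # columns: Pos, MHC, Peptide, Score
print(df)

run_mhc itself additionally has to strip the comma left by each line's leading whitespace (the replace('\n,', '\n') step) and drop the resulting 'Unnamed: 0' column.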
geney/oncosplice.py CHANGED
@@ -261,6 +261,1034 @@ class Gene:
261
261
  yield Transcript(self.transcripts[tid], variations=self.variations)
262
262
 
263
263
 
264
+ class Transcript:
265
+ def __init__(self, d=None, variations=None):
266
+ self.transcript_id = None
267
+ self.transcript_start = None # transcription
268
+ self.transcript_end = None # transcription
269
+ self.transcript_upper = None
270
+ self.transcript_lower = None
271
+ self.transcript_biotype = None # metadata
272
+ self.acceptors, self.donors = [], [] # splicing
273
+ self.TIS, self.TTS = None, None # translation
274
+ self.transcript_seq, self.transcript_indices = '', [] # sequence data
275
+ self.rev = None # sequence data
276
+ self.chrm = '' # sequence data
277
+ self.pre_mrna = '' # sequence data
278
+ self.orf = '' # sequence data
279
+ self.protein = '' # sequence data
280
+ self.log = '' # sequence data
281
+ self.primary_transcript = None # sequence data
282
+ self.cons_available = False # metadata
283
+ self.cons_seq = ''
284
+ self.cons_vector = ''
285
+ self.variations = None
286
+ if variations:
287
+ self.variations = Variations(variations)
288
+
289
+ if d:
290
+ self.load_from_dict(d)
291
+
292
+
293
+ if self.transcript_biotype == 'protein_coding' and variations is None:
294
+ self.generate_protein()
295
+
296
+ else:
297
+ self.generate_pre_mrna()
298
+
299
+ if '*' in self.cons_seq:
300
+ self.cons_seq = self.cons_seq.replace('*', '')
301
+ self.cons_vector = np.array(self.cons_vector[:-1])
302
+
303
+ if self.cons_seq == self.protein and len(self.cons_vector) == len(self.cons_seq):
304
+ self.cons_available = True
305
+
306
+ if self.cons_available == False:
307
+ self.cons_vector = np.ones(len(self.protein))
308
+
309
+
310
+ def __repr__(self):
311
+ return 'Transcript(transcript_id={tid})'.format(tid=self.transcript_id)
312
+
313
+ def __len__(self):
314
+ return len(self.transcript_seq)
315
+
316
+ def __str__(self):
317
+ return 'Transcript {tid}, Transcript Type: ' \
318
+ '{protein_coding}, Primary: {primary}'.format(
319
+ tid=self.transcript_id, protein_coding=self.transcript_biotype.replace('_', ' ').title(),
320
+ primary=self.primary_transcript)
321
+
322
+ def __eq__(self, other):
323
+ return self.transcript_seq == other.transcript_seq
324
+
325
+ def __contains__(self, subvalue):
326
+ '''
327
+ :param subvalue: the substring to search for in the mature mrna transcript
328
+ :return: whether or not the substring is found in the mature transcript
329
+ '''
330
+ if isinstance(subvalue, str):
331
+ return subvalue in self.transcript_seq
332
+ elif isinstance(subvalue, int):
333
+ return subvalue in self.transcript_indices
334
+ elif isinstance(subvalue, Variations):
335
+ return all([self.transcript_lower <= p <= self.transcript_upper for p in subvalue.positions])
336
+
337
+ else:
338
+ print(
339
+ "Pass an integer to check against the span of the gene's coordinates or a string to check against the "
340
+ "pre-mRNA sequence.")
341
+ return False
342
+
343
+
344
+ def __deepcopy__(self, memo):
345
+ cls = self.__class__
346
+ result = cls.__new__(cls)
347
+ memo[id(self)] = result
348
+ for k, v in self.__dict__.items():
349
+ setattr(result, k, deepcopy(v, memo))
350
+ return result
351
+
352
+ def load_from_dict(self, data):
353
+ '''
354
+ :param data: data is a dictionary containing the needed data to construct the transcript
355
+ :return: itself
356
+ '''
357
+ for k, v in data.items(): # add a line here that ensures the dictionary key is a valid item
358
+ setattr(self, k, v)
359
+
360
+ self.transcript_upper, self.transcript_lower = max(self.transcript_start, self.transcript_end), min(self.transcript_start, self.transcript_end)
361
+ self.__arrange_boundaries()#.generate_mature_mrna(inplace=True)
362
+ return self
363
+
364
+ @property
365
+ def exons(self):
366
+ '''
367
+ :return: a list of tuples where the first position is the acceptor and the second position is the donor
368
+ '''
369
+ return list(zip([self.transcript_start] + self.acceptors, self.donors + [self.transcript_end]))
370
+
371
+ @property
372
+ def exons_pos(self):
373
+ temp = self.exons
374
+ if self.rev:
375
+ temp = [(b, a) for a, b in temp[::-1]]
376
+ return temp
377
+
378
+ @property
379
+ def introns(self):
380
+ '''
381
+ :return: a list of tuples where the first position is the boundary at the start of an intron and the second position is the boundary at its end
382
+ '''
383
+ return list(zip([v for v in self.donors if v != self.transcript_end],
384
+ [v for v in self.acceptors if v != self.transcript_start]))
385
+
386
+ @property
387
+ def introns_pos(self):
388
+ temp = self.introns
389
+ if self.rev:
390
+ temp = [(b, a) for a, b in temp[::-1]]
391
+ return temp
392
+
393
+
394
+ def reset_acceptors(self, acceptors):
395
+ '''
396
+ :param acceptors: the new list of acceptor positions
397
+ :return: itself
398
+ '''
399
+ self.acceptors = acceptors
400
+ return self
401
+
402
+ def reset_donors(self, donors):
403
+ '''
404
+ :param donors: the new list of donor positions
405
+ :return: itself
406
+ '''
407
+ self.donors = donors
408
+ return self
409
+
410
+ def reset_transcription_start(self, pos):
411
+ '''
412
+ :param pos: the new transcription start position
413
+ :return: itself
414
+ '''
415
+ self.transcript_start = pos
416
+ return self
417
+
418
+
419
+ def reset_transcription_end(self, pos):
420
+ '''
421
+ :param pos: the new transcription end position
422
+ :return: itself
423
+ '''
424
+ self.transcript_end = pos
425
+ return self
426
+
427
+ def organize(self):
428
+ '''
429
+ In the case that transcript boundaries or exon boundaries are changed, this needs to be run to ensure the blueprints are ordered and the mRNA is regenerated.
430
+ :return:
431
+ '''
432
+ self.__arrange_boundaries().generate_mature_mrna(inplace=True)
433
+ self.transcript_upper, self.transcript_lower = max(self.transcript_start, self.transcript_end), min(self.transcript_start, self.transcript_end)
434
+
435
+ # if self.__exon_coverage_flag():
436
+ # raise ValueError(f"Length of exon coverage does not match transcript length.")
437
+ if self.__exon_intron_matchup_flag():
438
+ raise ValueError(f"Unequal number of acceptors and donors.")
439
+ if self.__exon_intron_order_flag():
440
+ raise ValueError(f"Exons / intron order out of position.")
441
+ if self.__transcript_boundary_flag():
442
+ raise ValueError(f"Transcript boundaries must straddle acceptors and donors.")
443
+ return self
444
+
445
+ def __arrange_boundaries(self):
446
+ # self.acceptors.append(self.transcript_start)
447
+ # self.donors.append(self.transcript_end)
448
+ self.acceptors = list(set(self.acceptors))
449
+ self.donors = list(set(self.donors))
450
+ self.acceptors.sort(reverse=self.rev)
451
+ self.donors.sort(reverse=self.rev)
452
+ return self
453
+
454
+
455
+ def __exon_coverage_flag(self):
456
+ if sum([abs(a - b) + 1 for a, b in self.exons]) != len(self):
457
+ return True
458
+ else:
459
+ return False
460
+
461
+ def __exon_intron_matchup_flag(self):
462
+ if len(self.acceptors) != len(self.donors):
463
+ return True
464
+ else:
465
+ return False
466
+ def __exon_intron_order_flag(self):
467
+ for b in self.exons_pos:
468
+ if b[0] > b[1]:
469
+ return True
470
+ else:
471
+ return False
472
+ def __transcript_boundary_flag(self):
473
+ if len(self.acceptors) == 0 and len(self.donors) == 0:
474
+ return False
475
+
476
+ if self.transcript_lower > min(self.acceptors + self.donors) or self.transcript_upper < max(self.acceptors + self.donors):
477
+ return True
478
+ else:
479
+ return False
480
+
481
+
482
+ @property
483
+ def exonic_indices(self):
484
+ return [lst for lsts in [list(range(a, b + 1)) for a, b in self.exons_pos] for lst in lsts]
485
+
486
+
487
+ # Related to transcript seq generation
488
+ def pull_pre_mrna_pos(self):
489
+ fasta_obj = Fasta_segment()
490
+ return fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{self.chrm}.fasta',
491
+ self.transcript_lower,
492
+ self.transcript_upper)
493
+
494
+ def generate_pre_mrna_pos(self):
495
+ # *_pos functions do not set values into the object.
496
+ seq, indices = self.pull_pre_mrna_pos()
497
+ if self.variations:
498
+ for mutation in self.variations.variants:
499
+ seq, indices = generate_mut_variant(seq, indices, mut=mutation)
500
+ return seq, indices
501
+
502
+ def generate_pre_mrna(self, inplace=True):
503
+ pre_mrna, pre_indices = self.__pos2sense(*self.generate_pre_mrna_pos())
504
+ self.pre_mrna = pre_mrna
505
+ self.pre_indices = pre_indices
506
+ if inplace:
507
+ return self
508
+ return pre_mrna, pre_indices
509
+
510
+ def __pos2sense(self, mrna, indices):
511
+ if self.rev:
512
+ mrna = reverse_complement(mrna)
513
+ indices = indices[::-1]
514
+ return mrna, indices
515
+
516
+ def __sense2pos(self, mrna, indices):
517
+ if self.rev:
518
+ mrna = reverse_complement(mrna)
519
+ indices = indices[::-1]
520
+ return mrna, indices
521
+
522
+ def generate_mature_mrna_pos(self, reset=True):
523
+ mature_mrna_pos, mature_indices_pos = '', []
524
+ if reset:
525
+ pre_seq_pos, pre_indices_pos = self.generate_pre_mrna_pos()
526
+ self.pre_mrna, _ = self.__pos2sense(pre_seq_pos, pre_indices_pos)
527
+ else:
528
+ pre_seq_pos, pre_indices_pos = self.__sense2pos(self.pre_mrna, self.pre_indices)
529
+
530
+ for i, j in self.exons_pos:
531
+ rel_start, rel_end = pre_indices_pos.index(i), pre_indices_pos.index(j)
532
+ mature_mrna_pos += pre_seq_pos[rel_start:rel_end + 1]
533
+ mature_indices_pos.extend(pre_indices_pos[rel_start:rel_end + 1])
534
+ return mature_mrna_pos, mature_indices_pos
535
+
536
+ def generate_mature_mrna(self, inplace=True):
537
+ if inplace:
538
+ self.transcript_seq, self.transcript_indices = self.__pos2sense(*self.generate_mature_mrna_pos())
539
+ return self
540
+ return self.__pos2sense(*self.generate_mature_mrna_pos())
541
+
542
+ def generate_protein(self, inplace=True, reset=True):
543
+ if reset:
544
+ self.generate_mature_mrna()
545
+
546
+ if not self.TIS or self.TIS not in self.transcript_indices:
547
+ return ''
548
+
549
+ rel_start = self.transcript_indices.index(self.TIS)
550
+ orf = self.transcript_seq[rel_start:]
551
+ first_stop_index = next((i for i in range(0, len(orf) - 2, 3) if orf[i:i + 3] in {"TAG", "TAA", "TGA"}), len(orf)-3)
552
+ while first_stop_index % 3 != 0:
553
+ first_stop_index -= 1
554
+
555
+ orf = orf[:first_stop_index + 3]
556
+ protein = str(Seq(orf).translate()).replace('*', '')
557
+ if inplace:
558
+ self.orf = orf
559
+ self.protein = protein
560
+ if self.protein != self.cons_seq:
561
+ self.cons_available = False
562
+ return self
563
+ return protein
564
+
565
+
566
+
567
+ ## Missplicing construction
568
+ def develop_aberrant_splicing(transcript, aberrant_splicing):
569
+ exon_starts = {v: 1 for v in transcript.acceptors}
570
+ exon_starts.update({transcript.transcript_start: 1})
571
+ exon_starts.update({s: v['absolute'] for s, v in aberrant_splicing['missed_acceptors'].items()})
572
+ exon_starts.update({s: v['absolute'] for s, v in aberrant_splicing['discovered_acceptors'].items()})
573
+
574
+ exon_ends = {v: 1 for v in transcript.donors}
575
+ exon_ends.update({transcript.transcript_end: 1})
576
+ exon_ends.update({s: v['absolute'] for s, v in aberrant_splicing['missed_donors'].items()})
577
+ exon_ends.update({s: v['absolute'] for s, v in aberrant_splicing['discovered_donors'].items()})
578
+
579
+ nodes = [SpliceSite(pos=pos, ss_type=0, prob=prob) for pos, prob in exon_ends.items()] + \
580
+ [SpliceSite(pos=pos, ss_type=1, prob=prob) for pos, prob in exon_starts.items()]
581
+
582
+ nodes = [s for s in nodes if s.prob > 0]
583
+ nodes.sort(key=lambda x: x.pos, reverse=transcript.rev)
584
+
585
+ G = nx.DiGraph()
586
+ G.add_nodes_from([n.pos for n in nodes])
587
+
588
+ for i in range(len(nodes)):
589
+ trailing_prob, in_between = 0, []
590
+ for j in range(i + 1, len(nodes)):
591
+ curr_node, next_node = nodes[i], nodes[j]
592
+ spread = curr_node.ss_type in in_between
593
+ in_between.append(next_node.ss_type)
594
+ if curr_node.ss_type != next_node.ss_type:
595
+ if spread:
596
+ new_prob = next_node.prob - trailing_prob
597
+ if new_prob <= 0:
598
+ break
599
+ G.add_edge(curr_node.pos, next_node.pos)
600
+ G.edges[curr_node.pos, next_node.pos]['weight'] = new_prob
601
+ trailing_prob += next_node.prob
602
+ else:
603
+ G.add_edge(curr_node.pos, next_node.pos)
604
+ G.edges[curr_node.pos, next_node.pos]['weight'] = next_node.prob
605
+ trailing_prob += next_node.prob
606
+
607
+ new_paths, prob_sum = {}, 0
608
+ for i, path in enumerate(nx.all_simple_paths(G, transcript.transcript_start, transcript.transcript_end)):
609
+ curr_prob = path_weight_mult(G, path, 'weight')
610
+ prob_sum += curr_prob
611
+ new_paths[i] = {
612
+ 'acceptors': sorted([p for p in path if p in exon_starts.keys() and p != transcript.transcript_start],
613
+ reverse=transcript.rev),
614
+ 'donors': sorted([p for p in path if p in exon_ends.keys() and p != transcript.transcript_end],
615
+ reverse=transcript.rev),
616
+ 'path_weight': curr_prob}
617
+
618
+ for i, path in enumerate(nx.all_simple_paths(G, transcript.transcript_end, transcript.transcript_start)):
619
+ curr_prob = path_weight_mult(G, path, 'weight')
620
+ prob_sum += curr_prob
621
+ new_paths[i] = {
622
+ 'acceptors': sorted([p for p in path if p in exon_starts.keys() and p != transcript.transcript_start],
623
+ reverse=transcript.rev),
624
+ 'donors': sorted([p for p in path if p in exon_ends.keys() and p != transcript.transcript_end],
625
+ reverse=transcript.rev),
626
+ 'path_weight': curr_prob}
627
+
628
+
629
+ for i, d in new_paths.items():
630
+ d['path_weight'] = round(d['path_weight'] / prob_sum, 3)
631
+ new_paths = {k: v for k, v in new_paths.items() if v['path_weight'] > 0.01}
632
+ return list(new_paths.values())
633
+
634
+
635
+ def path_weight_mult(G, path, weight):
636
+ multigraph = G.is_multigraph()
637
+ cost = 1
638
+ if not nx.is_path(G, path):
639
+ raise nx.NetworkXNoPath("path does not exist")
640
+ for node, nbr in nx.utils.pairwise(path):
641
+ if multigraph:
642
+ cost *= min(v[weight] for v in G[node][nbr].values())
643
+ else:
644
+ cost *= G[node][nbr][weight]
645
+ return cost
646
+
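path_weight_mult is essentially a multiplicative version of networkx's path_weight: the returned cost is the product of the edge weights along the path, i.e. the joint probability of the splice junctions it traverses. A toy check with made-up nodes and weights (assumes geney and its SpliceAI dependencies are installed so that geney.oncosplice imports cleanly):

import networkx as nx
from geney.oncosplice import path_weight_mult

G = nx.DiGraph()
G.add_edge('a', 'b', weight=0.5)   # hypothetical junction probabilities
G.add_edge('b', 'c', weight=0.4)
print(path_weight_mult(G, ['a', 'b', 'c'], 'weight'))   # 0.5 * 0.4 = 0.2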
647
+ @dataclass
648
+ class SpliceSite(object):
649
+ pos: int
650
+ ss_type: int
651
+ prob: float
652
+
653
+ def __post_init__(self):
654
+ pass
655
+
656
+ def __lt__(self, other):
657
+ return self.pos < other.pos
658
+
659
+ def __str__(self):
660
+ print(f"({self.ss_type}, {self.pos}, {self.prob})")
661
+
662
+
663
+ # Missplicing Detection
664
+ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
665
+ '''
666
+ :param ref_dct: the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the reference sequence
667
+ :param mut_dct: the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the mutated sequence
668
+ :param known_splice_sites: the indices (by genomic position) that serve as known splice sites
669
+ :param threshold: the threshold for detection (difference between reference and mutated probabilities)
670
+ :return: two dictionaries; discovered_pos contains all the positions that meet the threshold for discovery (novel splice sites)
671
+ and deleted_pos contains all the known splice-site positions whose probability drops by at least the threshold (missed splice sites)
672
+ '''
673
+
674
+ new_dict = {v: mut_dct.get(v, 0) - ref_dct.get(v, 0) for v in
675
+ list(set(list(ref_dct.keys()) + list(mut_dct.keys())))}
676
+
677
+ discovered_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct[k]), 3)} for k, v in
678
+ new_dict.items() if v >= threshold and k not in known_splice_sites} # if (k not in known_splice_sites and v >= threshold) or (v > 0.45)}
679
+
680
+ deleted_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct.get(k, 0)), 3)} for k, v in
681
+ new_dict.items() if -v >= threshold and k in known_splice_sites} #if k in known_splice_sites and v <= -threshold}
682
+
683
+ return discovered_pos, deleted_pos
684
+
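A toy illustration of find_ss_changes with made-up probabilities: position 101 gains a splice signal, while known site 300 loses one (assumes geney.oncosplice imports cleanly):

from geney.oncosplice import find_ss_changes

ref = {100: 0.02, 101: 0.05, 300: 0.95}
mut = {100: 0.03, 101: 0.80, 300: 0.10}
gained, lost = find_ss_changes(ref, mut, known_splice_sites=[300], threshold=0.5)
# gained -> {101: {'delta': 0.75, 'absolute': 0.8}}
# lost   -> {300: {'delta': -0.85, 'absolute': 0.1}}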
685
+ def run_spliceai_seq(seq, indices, threshold=0):
686
+ seq = 'N' * 5000 + seq + 'N' * 5000
687
+ ref_seq_probs_temp = sai_predict_probs(seq, sai_models)
688
+ ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
689
+ acceptor_indices = {a: b for a, b in list(zip(indices, ref_seq_acceptor_probs)) if b >= threshold}
690
+ donor_indices = {a: b for a, b in list(zip(indices, ref_seq_donor_probs)) if b >= threshold}
691
+ return acceptor_indices, donor_indices
692
+
693
+
694
+ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
695
+ positions = mutations.positions
696
+ end_positions = [m.start + len(m.ref) for m in mutations.variants]
697
+ positions.extend(end_positions)
698
+
699
+ seq_start_pos = min(positions) - sai_mrg_context - min_coverage
700
+ seq_end_pos = max(positions) + sai_mrg_context + min_coverage
701
+
702
+ fasta_obj = Fasta_segment()
703
+ ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
704
+ config_setup['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
705
+ seq_start_pos,
706
+ seq_end_pos)
707
+
708
+ transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
709
+
710
+ # visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
711
+ # visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
712
+
713
+ start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
714
+ end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
715
+ end_pad = len(ref_indices) - end_cutoff
716
+ ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
717
+ ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
718
+ mut_seq, mut_indices = ref_seq, ref_indices
719
+
720
+ for mut in mutations:
721
+ mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
722
+
723
+ if mut_seq == ref_seq:
724
+ print("Even in SpliceAI?!")
725
+
726
+ ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
727
+ mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
728
+ copy_mut_indices = mut_indices.copy()
729
+
730
+ visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
731
+ visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
732
+
733
+ if rev:
734
+ ref_seq = reverse_complement(ref_seq)
735
+ mut_seq = reverse_complement(mut_seq)
736
+ ref_indices = ref_indices[::-1]
737
+ mut_indices = mut_indices[::-1]
738
+
739
+ ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
740
+ mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
741
+
742
+ ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
743
+ mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
744
+
745
+ assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
746
+ assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
747
+
748
+ iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
749
+ {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
750
+ visible_acceptors,
751
+ threshold=sai_threshold)
752
+
753
+ assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
754
+ assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
755
+
756
+ idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
757
+ {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
758
+ visible_donors,
759
+ threshold=sai_threshold)
760
+
761
+ ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
762
+ ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
763
+
764
+ lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in visible_acceptors if p not in mut_indices and p not in dap}
765
+ lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors if p not in mut_indices and p not in ddp}
766
+ dap.update(lost_acceptors)
767
+ ddp.update(lost_donors)
768
+
769
+ missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
770
+ missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
771
+ return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
772
+
773
+
774
+ # def run_spliceai(mutations, gene_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
775
+ # positions = mutations.positions
776
+ # seq_start_pos = min(positions) - sai_mrg_context - min_coverage
777
+ # seq_end_pos = max(positions) + sai_mrg_context + min_coverage
778
+ #
779
+ # fasta_obj = Fasta_segment()
780
+ # ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
781
+ # config_setup['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
782
+ # seq_start_pos,
783
+ # seq_end_pos)
784
+ #
785
+ # gene_start, gene_end, rev = gene_data.gene_start, gene_data.gene_end, gene_data.rev
786
+ #
787
+ # mrna_acceptors = sorted(list(set([lst for lsts in
788
+ # [mrna.get('acceptors', []) for mrna in gene_data.transcripts.values() if
789
+ # mrna['transcript_biotype'] == 'protein_coding'] for lst in lsts])))
790
+ #
791
+ # mrna_donors = sorted(list(set([lst for lsts in
792
+ # [mrna.get('donors', []) for mrna in gene_data.transcripts.values() if
793
+ # mrna['transcript_biotype'] == 'protein_coding'] for lst in lsts])))
794
+ #
795
+ # visible_donors = np.intersect1d(mrna_donors, ref_indices)
796
+ # visible_acceptors = np.intersect1d(mrna_acceptors, ref_indices)
797
+ #
798
+ # start_pad = ref_indices.index(gene_start) if gene_start in ref_indices else 0
799
+ # end_cutoff = ref_indices.index(gene_end) if gene_end in ref_indices else len(ref_indices) # - 1
800
+ # end_pad = len(ref_indices) - end_cutoff
801
+ # ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
802
+ # ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
803
+ # mut_seq, mut_indices = ref_seq, ref_indices
804
+ #
805
+ # for mut in mutations:
806
+ # mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
807
+ #
808
+ # ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
809
+ # mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
810
+ #
811
+ # copy_mut_indices = mut_indices.copy()
812
+ # if rev:
813
+ # ref_seq = reverse_complement(ref_seq)
814
+ # mut_seq = reverse_complement(mut_seq)
815
+ # ref_indices = ref_indices[::-1]
816
+ # mut_indices = mut_indices[::-1]
817
+ #
818
+ # ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
819
+ # mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
820
+ #
821
+ # ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
822
+ # mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
823
+ #
824
+ # assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
825
+ # assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
826
+ #
827
+ # iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
828
+ # {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
829
+ # visible_acceptors,
830
+ # threshold=sai_threshold)
831
+ #
832
+ # assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
833
+ # assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
834
+ #
835
+ # idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
836
+ # {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
837
+ # visible_donors,
838
+ # threshold=sai_threshold)
839
+ #
840
+ # # lost_acceptors = {p: {'absolute': 0, 'delta': -1} for p in gene_data.acceptors if not contains(copy_mut_indices, p)}
841
+ # # lost_donors = {p: {'absolute': 0, 'delta': -1} for p in gene_data.donors if not contains(copy_mut_indices, p)}
842
+ # # dap.update(lost_acceptors)
843
+ # # ddp.update(lost_donors)
844
+ # missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
845
+ # missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
846
+ #
847
+ # return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
848
+ #
849
+
850
+ class PredictSpliceAI:
851
+ def __init__(self, mutation, gene_data,
852
+ threshold=0.5, force=False, save_results=False, sai_mrg_context=5000, min_coverage=2500):
853
+ self.modification = mutation
854
+ self.threshold = threshold
855
+ self.transcript_id = gene_data.transcript_id
856
+ self.spliceai_db = config_setup['MISSPLICING_PATH'] / f'spliceai_epistatic'
857
+ self.missplicing = {}
858
+
859
+ if self.prediction_file_exists() and not force: # need to do a check for the filename length
860
+ self.missplicing = self.load_sai_predictions()
861
+
862
+ if not self.missplicing:
863
+ # else:
864
+ # if isinstance(gene_data, Gene):
865
+ # self.missplicing = run_spliceai(self.modification, gene_data=gene_data, sai_mrg_context=sai_mrg_context, min_coverage=min_coverage, sai_threshold=0.1)
866
+ # if save_results:
867
+ # self.save_sai_predictions()
868
+ #
869
+ # elif isinstance(gene_data, Transcript):
870
+ self.missplicing = run_spliceai_transcript(self.modification, transcript_data=gene_data, sai_mrg_context=sai_mrg_context, min_coverage=min_coverage, sai_threshold=0.1)
871
+ if save_results:
872
+ self.save_sai_predictions()
873
+
874
+
875
+ def __repr__(self):
876
+ return f'Missplicing({self.modification.mut_id}) --> {self.missplicing}'
877
+
878
+ def __str__(self):
879
+ return str(self.aberrant_splicing)
880
+ def __bool__(self):
881
+ for event, details in self.aberrant_splicing.items():
882
+ if details:
883
+ return True
884
+ return False
885
+
886
+ def __eq__(self, alt_splicing):
887
+ flag, _ = check_splicing_difference(self.missplicing, alt_splicing, self.threshold)
888
+ return not flag
889
+
890
+ def __iter__(self):
891
+ penetrances = [abs(d_in['delta']) for d in self.missplicing.values() for d_in in d.values()] + [0]
892
+ return iter(penetrances)
893
+
894
+ @property
895
+ def aberrant_splicing(self):
896
+ return self.apply_sai_threshold(self.missplicing, self.threshold)
897
+
898
+ @property
899
+ def prediction_file(self):
900
+ return self.spliceai_db / self.modification.gene / self.modification.file_identifier_json
901
+
902
+ def prediction_file_exists(self):
903
+ return self.prediction_file.exists()
904
+
905
+ def load_sai_predictions(self):
906
+ missplicing = unload_json(self.prediction_file)
907
+ if self.transcript_id in missplicing:
908
+ missplicing = missplicing[self.transcript_id]
909
+ else:
910
+ return {}
911
+
912
+ missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
913
+ missplicing = {outk: {int(k) if k.is_integer() or 'missed' in outk else k: v for k, v in outv.items()} for
914
+ outk, outv in
915
+ missplicing.items()}
916
+ return missplicing
917
+
918
+ def save_sai_predictions(self):
919
+ self.prediction_file.parent.mkdir(parents=True, exist_ok=True)
920
+ if self.prediction_file_exists():
921
+ missplicing = unload_json(self.prediction_file)
922
+ missplicing[self.transcript_id] = self.missplicing
923
+
924
+ else:
925
+ missplicing = {self.transcript_id: self.missplicing}
926
+
927
+ # print(missplicing)
928
+ dump_json(self.prediction_file, missplicing)
929
+
930
+ def apply_sai_threshold(self, splicing_dict=None, threshold=None):
931
+ splicing_dict = self.missplicing if not splicing_dict else splicing_dict
932
+ threshold = self.threshold if not threshold else threshold
933
+ new_dict = {}
934
+ for event, details in splicing_dict.items():
935
+ for e, d in details.items():
936
+ if abs(d['delta']) >= threshold:
937
+ return splicing_dict
938
+ # new_dict[event] = {} #{k: v for k, v in details.items() if abs(v['delta']) >= threshold}
939
+ return new_dict
940
+
941
+
942
+ def apply_sai_threshold_primary(self, splicing_dict=None, threshold=None):
943
+ splicing_dict = self.missplicing if not splicing_dict else splicing_dict
944
+ threshold = self.threshold if not threshold else threshold
945
+ new_dict = {}
946
+ for event, details in splicing_dict.items():
947
+ new_dict_in = {}
948
+ for e, d in details.items():
949
+ if abs(d['delta']) >= threshold:
950
+ new_dict_in[e] = d
951
+ new_dict[event] = new_dict_in
952
+ return new_dict
953
+
954
+ def get_max_missplicing_delta(self):
955
+ max_delta = 0
956
+ for event, details in self.missplicing.items():
957
+ for e, d in details.items():
958
+ if abs(d['delta']) > max_delta:
959
+ max_delta = abs(d['delta'])
960
+ return max_delta
961
+
962
+
963
+ def check_splicing_difference(missplicing1, missplicing2, threshold=None):
964
+ flag = False
965
+ true_differences = {}
966
+ for event in ['missed_acceptors', 'missed_donors']:
967
+ td = {}
968
+ dct1 = missplicing1[event]
969
+ dct2 = missplicing2[event]
970
+ for k in list(set(list(dct1.keys()) + list(dct2.keys()))):
971
+ diff = abs(dct1.get(k, {'delta': 0})['delta']) - abs(dct2.get(k, {'delta': 0})['delta'])
972
+ if abs(diff) >= threshold:
973
+ flag = True
974
+ td[k] = diff
975
+ true_differences[event] = td
976
+
977
+ for event in ['discovered_acceptors', 'discovered_donors']:
978
+ td = {}
979
+ dct1 = missplicing1[event]
980
+ dct2 = missplicing2[event]
981
+ for k in list(set(list(dct1.keys()) + list(dct2.keys()))):
982
+ diff = abs(dct1.get(k, {'delta': 0})['delta']) - abs(dct2.get(k, {'delta': 0})['delta'])
983
+ if abs(diff) >= threshold:
984
+ flag = True
985
+ td[k] = diff
986
+ true_differences[event] = td
987
+
988
+ return flag, true_differences
989
+
990
+
991
+ # Annotating
992
+ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
993
+ affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut,
994
+ reference_transcript)
995
+
996
+ report = {}
997
+ report['primary_transcript'] = reference_transcript.primary_transcript
998
+ report['transcript_id'] = reference_transcript.transcript_id
999
+ report['mut_id'] = mut.mut_id
1000
+ report['cons_available'] = int(reference_transcript.cons_available)
1001
+ report['protein_coding'] = reference_transcript.transcript_biotype
1002
+
1003
+ report['reference_mrna'] = reference_transcript.transcript_seq
1004
+ report['reference_cds_start'] = reference_transcript.TIS
1005
+ report['reference_pre_mrna'] = reference_transcript.pre_mrna
1006
+ report[
1007
+ 'reference_orf'] = reference_transcript.orf # pre_mrna[reference_transcript.transcript_indices.index(reference_transcript.TIS):reference_transcript.transcript_indices.index(reference_transcript.TTS)]
1008
+ report['reference_protein'] = reference_transcript.protein
1009
+ report['reference_protein_length'] = len(reference_transcript.protein)
1010
+
1011
+ report['variant_mrna'] = variant_transcript.transcript_seq
1012
+ report['variant_cds_start'] = variant_transcript.TIS
1013
+ report[
1014
+ 'variant_pre_mrna'] = variant_transcript.pre_mrna # pre_mrna[variant_transcript.transcript_indices.index(variant_transcript.TIS):variant_transcript.transcript_indices.index(variant_transcript.TTS)]
1015
+ report['variant_orf'] = variant_transcript.orf
1016
+ report['variant_protein'] = variant_transcript.protein
1017
+ report['variant_protein_length'] = len(variant_transcript.protein)
1018
+
1019
+ descriptions = define_missplicing_events(reference_transcript, variant_transcript)
1020
+ # print(descriptions)
1021
+ report['exon_changes'] = '|'.join([v for v in descriptions if v])
1022
+ report['splicing_codes'] = summarize_missplicing_event(*descriptions)
1023
+ report['affected_exon'] = affected_exon
1024
+ report['affected_intron'] = affected_intron
1025
+ report['mutation_distance_from_5'] = distance_from_5
1026
+ report['mutation_distance_from_3'] = distance_from_3
1027
+ return report
1028
+
1029
+ from Bio.Seq import Seq
1030
+ from Bio import pairwise2
1031
+ from dataclasses import dataclass
1032
+ from copy import deepcopy
1033
+ import re
1034
+ import pandas as pd
1035
+ from pathlib import Path
1036
+ import numpy as np
1037
+ from geney import config_setup
1038
+ import networkx as nx
1039
+ import matplotlib.pyplot as plt
1040
+ from matplotlib.patches import Rectangle
1041
+ import seaborn as sns
1042
+ from collections import namedtuple
1043
+
1044
+
1045
+ from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle, contains, unload_json, dump_json #, is_monotonic
1046
+ from geney.Fasta_segment import Fasta_segment
1047
+
1048
+ #### SpliceAI Modules
1049
+ import tensorflow as tf
1050
+ from keras.models import load_model
1051
+ from pkg_resources import resource_filename
1052
+ from spliceai.utils import one_hot_encode
1053
+
1054
+ tf.config.threading.set_intra_op_parallelism_threads(1)
1055
+ tf.config.threading.set_inter_op_parallelism_threads(1)
1056
+
1057
+ sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
1058
+ sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
1059
+
1060
+ def is_monotonic(A):
1061
+ x, y = [], []
1062
+ x.extend(A)
1063
+ y.extend(A)
1064
+ x.sort()
1065
+ y.sort(reverse=True)
1066
+ if (x == A or y == A):
1067
+ return True
1068
+ return False
1069
+
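is_monotonic just checks whether a list is already sorted in either direction; it is used further down to assert that nucleotide indices stay ordered after a variant is applied. For example (assuming the module imports cleanly):

from geney.oncosplice import is_monotonic

assert is_monotonic([1, 2, 2, 5]) is True
assert is_monotonic([3, 1, 2]) is False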
1070
+ def sai_predict_probs(seq: str, models: list) -> list:
1071
+ '''
1072
+ Predicts the donor and acceptor junction probability of each
1073
+ NT in seq using SpliceAI.
1074
+
1075
+ Let m:=2*sai_mrg_context + L be the input seq length. It is assumed
1076
+ that the input seq has the following structure:
1077
+
1078
+ seq = |<sai_mrg_context NTs><L NTs><sai_mrg_context NTs>|
1079
+
1080
+ The returned probability matrix is of size 2XL, where
1081
+ the first row is the acceptor probability and the second row
1082
+ is the donor probability. These probabilities correspond to the
1083
+ middle <L NTs> NTs of the input seq.
1084
+ '''
1085
+ x = one_hot_encode(seq)[None, :]
1086
+ y = np.mean([models[m].predict(x, verbose=0) for m in range(5)], axis=0)
1087
+ return y[0, :, 1:].T
1088
+
1089
+
1090
+ ### Variant Modules
1091
+ class Mutation:
1092
+ def __init__(self, mid):
1093
+ '''
1094
+
1095
+ :param mid: mutation id in the format of gene:chrom:pos:ref:alt
1096
+ Needs only to store the following properties for a given mutation
1097
+ gene: the name of the gene
1098
+ chrom: the chromosome reference
1099
+ start: the position of the mutation
1100
+ file_identifier: some filename that can be used to store related data
1101
+ vartype: the variant type
1102
+
1103
+ We want to be able to compare mutations based on location.
1104
+ '''
1105
+
1106
+ self.mut_id = mid
1107
+
1108
+ gene, chrom, pos, ref, alt = mid.split(':')
1109
+ self.gene = gene
1110
+ self.chrom = chrom.strip('chr')
1111
+ self.start = int(pos)
1112
+
1113
+ self.file_identifier = self.mut_id.replace(':', '_')
1114
+ self.file_identifier_short = f'{self.start}_{ref[:6]}_{alt[:6]}'
1115
+
1116
+ self.ref = ref if ref != '-' else ''
1117
+ self.alt = alt if alt != '-' else ''
1118
+
1119
+ if len(self.ref) == len(self.alt) == 1:
1120
+ self.vartype = 'SNP'
1121
+
1122
+ elif len(self.ref) == len(self.alt) > 1:
1123
+ self.vartype = 'SUB'
1124
+ elif self.ref and not self.alt:
1125
+ self.vartype = 'DEL'
1126
+ elif self.alt and not self.ref:
1127
+ self.vartype = 'INS'
1128
+ else:
1129
+ self.vartype = 'INDEL'
1130
+
1131
+ def __str__(self):
1132
+ return self.mut_id
1133
+
1134
+ def __repr__(self):
1135
+ return f"Mutation({self.mut_id})"
1136
+
1137
+ def __lt__(self, other):
1138
+ return self.start < other.start
1139
+
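The Mutation constructor simply splits the gene:chrom:pos:ref:alt identifier and classifies the variant type. A quick sketch with a hypothetical identifier (gene and coordinates made up for illustration):

from geney.oncosplice import Mutation

m = Mutation("TP53:chr17:7579312:C:T")
# m.gene == 'TP53', m.chrom == '17', m.start == 7579312, m.vartype == 'SNP'
# '-' denotes an empty allele, so Mutation("TP53:chr17:7579312:CA:-").vartype == 'DEL'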
1140
+ class Variations:
1141
+ '''
1142
+ Unlike a single Mutation, here we have an epistatic set: a series of mutations separated by '|' characters.
1143
+ For such events we want to store all of the constituent variants together.
1144
+ '''
1145
+ def __init__(self, epistatic_set):
1146
+ self.variants = sorted([Mutation(m) for m in epistatic_set.split('|')])
1147
+ self.mut_id = epistatic_set
1148
+ self.start = self.variants[0].start
1149
+ self.positions = [v.start for v in self.variants]
1150
+ self.gene = self.variants[0].gene
1151
+ self.chrom = self.variants[0].chrom.strip('chr')
1152
+ self.file_identifier = f'{self.gene}_{self.chrom}' + '_' + '_'.join(
1153
+ [v.file_identifier_short for v in self.variants])
1154
+ self.range = max(self.positions) - min(self.positions)
1155
+
1156
+ def __str__(self):
1157
+ return '|'.join([m.mut_id for m in self.variants])
1158
+
1159
+ def __repr__(self):
1160
+ return f"Variation({', '.join([m.mut_id for m in self.variants])})"
1161
+
1162
+ def __iter__(self):
1163
+ self.current_index = 0
1164
+ return self
1165
+
1166
+ def __next__(self):
1167
+ if self.current_index < len(self.variants):
1168
+ x = self.variants[self.current_index]
1169
+ self.current_index += 1
1170
+ return x
1171
+ raise StopIteration
1172
+
1173
+ @property
1174
+ def file_identifier_json(self):
1175
+ return Path(self.file_identifier + '.json')
1176
+
1177
+ @property
1178
+ def as_dict(self):
1179
+ return {m.start: m.alt for m in self.variants}
1180
+
1181
+ def verify(self):
1182
+ if len(set(self.positions)) != len(self.variants):
1183
+ return False
1184
+ return True
1185
+
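Variations wraps an epistatic set of '|'-separated mutation identifiers. A sketch with hypothetical identifiers:

from geney.oncosplice import Variations

vs = Variations("GENE:chr1:100:A:T|GENE:chr1:250:G:-")
# vs.positions == [100, 250], vs.range == 150, vs.as_dict == {100: 'T', 250: ''}
# iterating over vs yields the individual Mutation objects in positional order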
1186
+
1187
+ def generate_mut_variant(seq: str, indices: list, mut: Mutation):
1188
+ offset = 1 if not mut.ref else 0
1189
+ check_indices = list(range(mut.start, mut.start + len(mut.ref) + offset))
1190
+ check1 = all([contains(list(filter((-1).__ne__, indices)), m) for m in check_indices])
1191
+ if not check1:
1192
+ print(
1193
+ f"Mutation {mut} not within transcript bounds: {min(list(filter((-1).__ne__, indices)))} - {max(indices)}.")
1194
+
1195
+ return seq, indices
1196
+
1197
+ rel_start, rel_end = indices.index(mut.start) + offset, indices.index(mut.start) + offset + len(mut.ref)
1198
+ acquired_seq = seq[rel_start:rel_end]
1199
+ check2 = acquired_seq == mut.ref
1200
+ if not check2:
1201
+ print(f'Reference allele ({mut.ref}) does not match genome_build allele ({acquired_seq}).')
1202
+
1203
+ if len(mut.ref) == len(mut.alt) > 0:
1204
+ temp_indices = list(range(mut.start, mut.start + len(mut.ref)))
1205
+ # elif len(mut.ref) > 0 and len(mut.alt) > 0:
1206
+ # temp_indices = [indices[indices.index(mut.start)] + v / 1000 for v in list(range(0, len(mut.alt)))]
1207
+ else:
1208
+ temp_indices = [indices[indices.index(mut.start)] + v / 1000 for v in list(range(1, len(mut.alt) + 1))]
1209
+
1210
+ new_indices = indices[:rel_start] + temp_indices + indices[rel_end:]
1211
+ new_seq = seq[:rel_start] + mut.alt + seq[rel_end:]
1212
+
1213
+ assert len(new_seq) == len(new_indices), f'Error in preserving sequence lengths during variant modification: {mut}, {len(new_seq)}, {len(new_indices)}'
1214
+ assert is_monotonic(list(filter((-1).__ne__, new_indices))), f'Modified nucleotide indices are not monotonic.'
1215
+ return new_seq, new_indices
1216
+
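generate_mut_variant splices a single variant into a sequence while keeping the parallel list of genomic indices aligned: same-length substitutions keep their indices, insertions get fractional indices, and deleted positions drop theirs. A toy substitution with made-up coordinates:

from geney.oncosplice import Mutation, generate_mut_variant

seq, indices = "ACGTACGT", list(range(100, 108))
mut = Mutation("GENE:chr1:103:T:G")   # hypothetical identifier
new_seq, new_indices = generate_mut_variant(seq, indices, mut)
# new_seq == 'ACGGACGT'; the indices are unchanged for a same-length substitution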
1217
+
1218
+
1219
+ class Gene:
1220
+ def __init__(self, gene_name, variation=None):
1221
+ self.gene_name = gene_name
1222
+ self.gene_id = ''
1223
+ self.rev = None
1224
+ self.chrm = ''
1225
+ self.gene_start = 0
1226
+ self.gene_end = 0
1227
+ self.transcripts = {}
1228
+ self.load_from_file(find_files_by_gene_name(gene_name))
1229
+ self.variations = variation
1230
+ self.primary_tid = None
1231
+ tids = [k for k, v in self.transcripts.items() if v['primary_transcript'] and v['transcript_biotype'] == 'protein_coding']
1232
+ if tids:
1233
+ self.primary_tid = tids[0]
1234
+ else:
1235
+ self.primary_tid = list(self.transcripts.keys())[0]
1236
+
1237
+ def __repr__(self):
1238
+ return f'Gene(gene_name={self.gene_name})'
1239
+
1240
+ def __len__(self):
1241
+ return len(self.transcripts)
1242
+
1243
+ def __str__(self):
1244
+ return '{gname}, {ntranscripts} transcripts'.format(gname=self.gene_name, ntranscripts=self.__len__())
1245
+
1246
+ def __copy__(self):
1247
+ cls = self.__class__
1248
+ result = cls.__new__(cls)
1249
+ result.__dict__.update(self.__dict__)
1250
+ return result
1251
+
1252
+ def __deepcopy__(self, memo):
1253
+ cls = self.__class__
1254
+ result = cls.__new__(cls)
1255
+ memo[id(self)] = result
1256
+ for k, v in self.__dict__.items():
1257
+ setattr(result, k, deepcopy(v, memo))
1258
+ return result
1259
+
1260
+ def __getitem__(self, index):
1261
+ return Transcript(list(self.transcripts.values())[index])
1262
+
1263
+ def load_from_file(self, file_name):
1264
+ if not file_name.exists():
1265
+ raise FileNotFoundError(f"File '{file_name}' not found.")
1266
+ self.load_from_dict(dict_data=unload_pickle(file_name))
1267
+ return self
1268
+
1269
+ def load_from_dict(self, dict_data=None):
1270
+ for k, v in dict_data.items():
1271
+ setattr(self, k, v)
1272
+ return self
1273
+
1274
+ def transcript(self, tid=None):
1275
+ if tid is None:
1276
+ tid = self.primary_tid
1277
+
1278
+ if tid not in self.transcripts:
1279
+ raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")
1280
+ return Transcript(self.transcripts[tid])
1281
+
1282
+ def run_transcripts(self, primary_transcript=False, protein_coding=False):
1283
+ for tid, annotations in self.transcripts.items():
1284
+ if primary_transcript and not annotations['primary_transcript']:
1285
+ continue
1286
+ if protein_coding and annotations['transcript_biotype'] != 'protein_coding':
1287
+ continue
1288
+
1289
+ yield Transcript(self.transcripts[tid], variations=self.variations)
1290
+
1291
+
264
1292
  class Transcript:
265
1293
  def __init__(self, d=None, variations=None):
266
1294
  self.transcript_id = None
@@ -678,7 +1706,7 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
678
1706
  new_dict.items() if v >= threshold and k not in known_splice_sites} # if (k not in known_splice_sites and v >= threshold) or (v > 0.45)}
679
1707
 
680
1708
  deleted_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct.get(k, 0)), 3)} for k, v in
681
- new_dict.items() if v >= threshold and k in known_splice_sites} #if k in known_splice_sites and v <= -threshold}
1709
+ new_dict.items() if -v >= threshold and k in known_splice_sites} #if k in known_splice_sites and v <= -threshold}
682
1710
 
683
1711
  return discovered_pos, deleted_pos
684
1712
 
@@ -720,6 +1748,9 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
720
1748
  for mut in mutations:
721
1749
  mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
722
1750
 
1751
+ if mut_seq == ref_seq:
1752
+ print("Even in SpliceAI?!")
1753
+
723
1754
  ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
724
1755
  mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
725
1756
  copy_mut_indices = mut_indices.copy()
@@ -1024,6 +2055,11 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
1024
2055
  return report
1025
2056
 
1026
2057
 
2058
+ # def find_splice_site_proximity(mut, transcript):
2059
+ # for i, (ex_start, ex_end) in enumerate(transcript.exons):
2060
+ # if min(ex_start, ex_end) <= mut.start <= max(ex_start, ex_end):
2061
+ #
2062
+
1027
2063
  def find_splice_site_proximity(mut, transcript):
1028
2064
  for i, (ex_start, ex_end) in enumerate(transcript.exons):
1029
2065
  if min(ex_start, ex_end) <= mut.start <= max(ex_start, ex_end):
@@ -1347,8 +2383,12 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
1347
2383
  for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
1348
2384
  reference = reference_gene.transcript(variant.transcript_id)
1349
2385
  if mutation not in reference or reference.protein == '' or len(reference.protein) < window_length:
2386
+ print("exit flag 1")
1350
2387
  continue
1351
2388
 
2389
+ if reference.pre_mrna == variant.pre_mrna:
2390
+ print("WHAT THE FUCK?")
2391
+
1352
2392
  cons_vector = transform_conservation_vector(reference.cons_vector, window=window_length)
1353
2393
  # if per_transcript_missplicing:
1354
2394
  missplicing_obj = PredictSpliceAI(mutation, reference, threshold=sai_threshold, force=force_spliceai, save_results=save_spliceai_results)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.1.11
3
+ Version: 1.1.13
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -7,9 +7,9 @@ geney/config_setup.py,sha256=SePeooA4RWAtR_KAT1-W1hkD3MT5tH6YMyp80t_RNPQ,385
7
7
  geney/data_setup.py,sha256=DZeksRPr2ZT7bszMo33W0r3OwmqHokVXtZ4gx5Lu_Mo,10725
8
8
  geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
9
9
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
10
- geney/immune_utils.py,sha256=0udmTxqF9jCYeUOgP7bGLWMEBH3KBikKu8pPQnE9Rfo,6881
10
+ geney/immune_utils.py,sha256=elxjQyB52lYXrrt3sX6vtYlr_pTFEeCFzmEMP2qlPwA,5300
11
11
  geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
12
- geney/oncosplice.py,sha256=Fyc_UtAhV3Pv0vk8V55rO_jnb2Dwj5sW98KVwP3PHwU,68964
12
+ geney/oncosplice.py,sha256=cQYqBPeNqbTnSqJSKZxIjekWXVlgnqIoa7UXVFQ2PtE,112111
13
13
  geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
14
14
  geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
15
15
  geney/power_utils.py,sha256=6InuDm1jSrsgR-F_LmdMTbuQwty2OdYjwfGGaAPhaRI,7268
@@ -44,7 +44,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
44
44
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
45
45
  geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
46
  geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
- geney-1.1.11.dist-info/METADATA,sha256=eKUG3cuHIC37_E6QJg5TyDjBC6NXoine75FZWLxCK6A,1131
48
- geney-1.1.11.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
49
- geney-1.1.11.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
50
- geney-1.1.11.dist-info/RECORD,,
47
+ geney-1.1.13.dist-info/METADATA,sha256=JLpUYMSjNwVBkmZYRCfzoW4KdVyFz3oL1o0qw5hIFfA,1131
48
+ geney-1.1.13.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
49
+ geney-1.1.13.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
50
+ geney-1.1.13.dist-info/RECORD,,
File without changes