supremo-lite 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3098 @@
+ """
+ Personalized sequence generation for supremo_lite.
+
+ This module provides functions for creating personalized genomes by applying
+ variants to a reference genome and generating sequence windows around variants.
+ """
+
+ import bisect
+ import re
+ import warnings
+ import os
+ from typing import Dict, List, Tuple, Union, NamedTuple
+ import pandas as pd
+ import numpy as np
+ from pyfaidx import Fasta
+ from .variant_utils import read_vcf, classify_variant_type, parse_vcf_info
+ from .chromosome_utils import match_chromosomes_with_report, apply_chromosome_mapping
+ from .sequence_utils import encode_seq
+ from .core import TORCH_AVAILABLE
+
+ try:
+     import torch
+ except ImportError:
+     pass  # Already handled in core
+
+
+ # IUPAC degenerate nucleotide codes for PAM pattern matching
+ IUPAC_CODES = {
+     "A": "A",
+     "C": "C",
+     "G": "G",
+     "T": "T",
+     "U": "U",
+     "W": "[AT]",
+     "S": "[CG]",
+     "M": "[AC]",
+     "K": "[GT]",
+     "R": "[AG]",
+     "Y": "[CT]",
+     "B": "[CGT]",
+     "D": "[AGT]",
+     "H": "[ACT]",
+     "V": "[ACG]",
+     "N": "[ACGT]",
+ }
+
+
+ class ChromosomeOffsetTracker:
+     """
+     Tracks cumulative coordinate offsets per chromosome from applied variants.
+
+     When standard variants (INS/DEL) change chromosome lengths, the original VCF
+     coordinates for later BND variants become invalid. This class tracks the
+     cumulative offset at each position to enable coordinate transformation.
+     """
+
+     def __init__(self):
+         """Initialize empty offset tracker."""
+         self.chromosome_offsets: Dict[str, List[Tuple[int, int]]] = (
+             {}
+         )  # chrom -> [(pos, cumulative_offset)]
+
+     def add_offset(self, chrom: str, pos: int, offset: int) -> None:
+         """
+         Add an offset at a specific position on a chromosome.
+
+         Args:
+             chrom: Chromosome name
+             pos: Genomic position (0-based) where offset occurs
+             offset: Length change (+/- bases) from the variant
+         """
+         if chrom not in self.chromosome_offsets:
+             self.chromosome_offsets[chrom] = []
+
+         # Find insertion point and update cumulative offsets
+         offset_list = self.chromosome_offsets[chrom]
+
+         # Calculate the cumulative offset at this position: this variant's
+         # offset plus the cumulative offset already in effect here, i.e. the
+         # most recent entry at or before pos (entries are sorted by position)
+         cumulative_offset = offset
+         for existing_pos, existing_cumulative in offset_list:
+             if existing_pos <= pos:
+                 cumulative_offset = offset + existing_cumulative
+             else:
+                 break
+
+         # Insert new offset entry, maintaining sorted order by position
+         inserted = False
+         for i, (existing_pos, existing_cumulative) in enumerate(offset_list):
+             if pos < existing_pos:
+                 offset_list.insert(i, (pos, cumulative_offset))
+                 inserted = True
+                 # Update all downstream offsets
+                 for j in range(i + 1, len(offset_list)):
+                     old_pos, old_cumulative = offset_list[j]
+                     offset_list[j] = (old_pos, old_cumulative + offset)
+                 break
+
+         if not inserted:
+             offset_list.append((pos, cumulative_offset))
+
+     def get_offset_at_position(self, chrom: str, pos: int) -> int:
+         """
+         Get the cumulative offset at a specific position.
+
+         Args:
+             chrom: Chromosome name
+             pos: Genomic position (0-based) to query
+
+         Returns:
+             Cumulative offset at this position
+         """
+         if chrom not in self.chromosome_offsets:
+             return 0
+
+         offset_list = self.chromosome_offsets[chrom]
+         cumulative_offset = 0
+
+         for offset_pos, offset_cumulative in offset_list:
+             if offset_pos <= pos:
+                 cumulative_offset = offset_cumulative
+             else:
+                 break
+
+         return cumulative_offset
+
+     def transform_coordinate(self, chrom: str, pos: int) -> int:
+         """
+         Transform a VCF coordinate to account for applied variants.
+
+         Args:
+             chrom: Chromosome name
+             pos: Original VCF position (1-based)
+
+         Returns:
+             Transformed position (1-based) in the modified sequence
+         """
+         # Convert to 0-based, apply offset, convert back to 1-based
+         pos_0based = pos - 1
+         offset = self.get_offset_at_position(chrom, pos_0based)
+         return pos + offset
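+ # Usage sketch for ChromosomeOffsetTracker (illustrative values, not part of
+ # the library API): a 5 bp insertion at 0-based position 100 shifts every
+ # later chr1 coordinate by +5.
+ #
+ #     tracker = ChromosomeOffsetTracker()
+ #     tracker.add_offset("chr1", 100, 5)
+ #     tracker.get_offset_at_position("chr1", 50)   # upstream of the edit -> 0
+ #     tracker.transform_coordinate("chr1", 150)    # 1-based 150 -> 155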
+
+
+ class SequenceSegment(NamedTuple):
+     """Represents a segment within a sequence with its source and position."""
+
+     source_type: str  # 'reference', 'novel', 'rc_reference'
+     source_chrom: str  # chromosome name or 'NOVEL'
+     start_pos: int  # start position in the final sequence
+     end_pos: int  # end position in the final sequence
+     length: int  # segment length
+     orientation: str  # 'forward', 'reverse', 'novel'
+
+
+ class FrozenRegionTracker:
+     """
+     Efficiently tracks genomic regions that are 'frozen' due to applied variants.
+
+     Frozen regions prevent overlapping variants from being applied to the same
+     genomic coordinates. Uses a sorted list of non-overlapping intervals with
+     binary search for lookup.
+     """
+
+     def __init__(self):
+         """Initialize empty interval tracker."""
+         self.intervals: List[Tuple[int, int]] = []  # sorted list of (start, end) tuples
+
+     def is_frozen(self, pos: int) -> bool:
+         """
+         Check if a genomic position is within any frozen region.
+
+         Args:
+             pos: Genomic position (0-based)
+
+         Returns:
+             True if position is frozen, False otherwise
+         """
+         if not self.intervals:
+             return False
+
+         # Binary search for interval that could contain pos
+         idx = bisect.bisect_right(self.intervals, (pos, float("inf"))) - 1
+
+         if idx >= 0:
+             start, end = self.intervals[idx]
+             return start <= pos <= end
+
+         return False
+
+     def add_range(self, start: int, end: int) -> None:
+         """
+         Add a new frozen region, merging with existing overlapping intervals.
+
+         Args:
+             start: Start position of region (0-based, inclusive)
+             end: End position of region (0-based, inclusive)
+         """
+         if start > end:
+             return
+
+         # Find insertion point and overlapping intervals
+         left_idx = bisect.bisect_left(self.intervals, (start, start))
+         right_idx = bisect.bisect_right(self.intervals, (end, end))
+
+         # Check for overlap with interval before insertion point
+         if left_idx > 0:
+             prev_start, prev_end = self.intervals[left_idx - 1]
+             if prev_end >= start - 1:  # Adjacent or overlapping
+                 left_idx -= 1
+                 start = min(start, prev_start)
+                 end = max(end, prev_end)
+
+         # Merge with all overlapping intervals
+         for i in range(left_idx, min(right_idx, len(self.intervals))):
+             interval_start, interval_end = self.intervals[i]
+             if interval_start <= end + 1:  # Adjacent or overlapping
+                 start = min(start, interval_start)
+                 end = max(end, interval_end)
+
+         # Remove old intervals and insert merged interval
+         del self.intervals[left_idx:right_idx]
+         self.intervals.insert(left_idx, (start, end))
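+ # Sketch of the merge behavior (assumed coordinates): adjacent ranges collapse
+ # into one interval, so later overlap checks stay a single binary search.
+ #
+ #     ft = FrozenRegionTracker()
+ #     ft.add_range(10, 20)
+ #     ft.add_range(21, 30)      # adjacent -> intervals == [(10, 30)]
+ #     ft.is_frozen(25)          # True
+ #     ft.is_frozen(31)          # False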
+
+
+ class ChromosomeSegmentTracker:
+     """Track which segments of each chromosome are used by fusions."""
+
+     def __init__(self, ref_sequences: Dict[str, str]):
+         self.ref_sequences = ref_sequences
+         self.used_segments = {chrom: [] for chrom in ref_sequences.keys()}
+
+     def add_used_segment(self, chrom: str, start: int, end: int, verbose: bool = False):
+         """Add a used segment (0-based coordinates)."""
+         if chrom in self.used_segments:
+             self.used_segments[chrom].append((start, end))
+             if verbose:
+                 print(
+                     f" 🔍 Tracking used segment: {chrom}[{start}:{end}] = {end-start}bp"
+                 )
+
+     def get_leftover_sequences(self, verbose: bool = False) -> Dict[str, str]:
+         """Calculate leftover sequences not used by any fusion."""
+         leftover_sequences = {}
+
+         for chrom, ref_seq in self.ref_sequences.items():
+             segments = sorted(self.used_segments[chrom])
+             leftover_parts = []
+
+             if not segments:
+                 # No segments used - entire chromosome is leftover
+                 leftover_parts = [ref_seq]
+             else:
+                 # Find gaps between used segments
+                 current_pos = 0
+
+                 for start, end in segments:
+                     # Add leftover before this segment
+                     if current_pos < start:
+                         leftover_parts.append(ref_seq[current_pos:start])
+                     current_pos = max(current_pos, end)
+
+                 # Add leftover after last segment
+                 if current_pos < len(ref_seq):
+                     leftover_parts.append(ref_seq[current_pos:])
+
+             # Combine leftover parts
+             if leftover_parts:
+                 leftover_seq = "".join(leftover_parts)
+                 if leftover_seq:  # Only add non-empty leftovers
+                     leftover_sequences[chrom] = leftover_seq
+                     if verbose:
+                         print(f" ✂️ Created leftover {chrom}: {len(leftover_seq)} bp")
+
+         return leftover_sequences
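+ # Leftover computation sketch (assumed 10 bp chromosome): consuming bases 2..5
+ # leaves the two flanks, which are concatenated into one leftover record.
+ #
+ #     st = ChromosomeSegmentTracker({"chr1": "ACGTACGTAC"})
+ #     st.add_used_segment("chr1", 2, 6)
+ #     st.get_leftover_sequences()   # {"chr1": "AC" + "GTAC"}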
+
+
+ class VariantApplicator:
+     """
+     Applies VCF variants to a reference sequence in memory.
+
+     Handles coordinate system transformations, frozen region tracking,
+     and sequence modifications for SNVs, insertions, and deletions.
+     """
+
+     def __init__(
+         self,
+         sequence_str: str,
+         variants_df: pd.DataFrame,
+         frozen_tracker: FrozenRegionTracker = None,
+         offset_tracker: ChromosomeOffsetTracker = None,
+         chrom: str = None,
+     ):
+         """
+         Initialize variant applicator for a single chromosome.
+
+         Args:
+             sequence_str: Reference sequence as string
+             variants_df: DataFrame containing variants for this chromosome
+             frozen_tracker: Optional existing FrozenRegionTracker to preserve
+                 overlap state across chunks
+             offset_tracker: Optional ChromosomeOffsetTracker to track coordinate offsets
+             chrom: Chromosome name (required if offset_tracker is provided)
+         """
+         self.sequence = bytearray(sequence_str.encode())  # Mutable sequence
+         self.variants = variants_df.sort_values("pos1").reset_index(drop=True)
+         self.frozen_tracker = (
+             frozen_tracker if frozen_tracker is not None else FrozenRegionTracker()
+         )
+         self.offset_tracker = offset_tracker
+         self.chrom = chrom
+         self.cumulative_offset = 0  # Track length changes from applied variants
+         self.applied_count = 0
+         self.skipped_count = 0
+         self.skipped_variants = (
+             []
+         )  # List of (vcf_line, chrom, pos1, ref, alt, reason) tuples
+
+     def apply_variants(self) -> Tuple[str, Dict[str, int]]:
+         """
+         Apply all variants to the sequence.
+
+         Returns:
+             Tuple of (modified_sequence, statistics_dict)
+         """
+         for _, variant in self.variants.iterrows():
+             try:
+                 self._apply_single_variant(variant)
+             except Exception as e:
+                 # Extract concise error message and context
+                 vcf_line = variant.get("vcf_line", "?")
+                 chrom = variant.get("chrom", self.chrom)
+                 error_msg = str(e).split(":")[0] if ":" in str(e) else str(e)
+                 warnings.warn(
+                     f"Skipped variant at {chrom}:{variant.pos1} (VCF line {vcf_line}): {error_msg}"
+                 )
+                 self.skipped_count += 1
+                 # Record skip details for reporting
+                 self.skipped_variants.append(
+                     (
+                         vcf_line,
+                         chrom,
+                         variant.pos1,
+                         variant.ref,
+                         variant.alt,
+                         "validation_error",
+                     )
+                 )
+
+         stats = {
+             "applied": self.applied_count,
+             "skipped": self.skipped_count,
+             "total": len(self.variants),
+             "skipped_variants": self.skipped_variants,
+         }
+
+         return self.sequence.decode(), stats
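+     # Minimal end-to-end sketch (assumed inputs): one SNV applied to an 8 bp
+     # "chromosome". The DataFrame needs chrom/pos1/ref/alt plus a
+     # variant_type column (added automatically by _load_variants when absent).
+     #
+     #     variants = pd.DataFrame(
+     #         {"chrom": ["chr1"], "pos1": [3], "ref": ["G"], "alt": ["T"],
+     #          "variant_type": ["SNV"]}
+     #     )
+     #     applicator = VariantApplicator("ACGTACGT", variants, chrom="chr1")
+     #     seq, stats = applicator.apply_variants()
+     #     # seq == "ACTTACGT", stats["applied"] == 1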
+
+     def apply_single_variant_to_window(
+         self, variant: pd.Series, window_start: int, window_end: int
+     ) -> str:
+         """
+         Apply a single variant to a sequence window.
+
+         Args:
+             variant: Series containing variant information (pos, ref, alt)
+             window_start: Start position of window (0-based)
+             window_end: End position of window (0-based, exclusive)
+
+         Returns:
+             Modified sequence string
+         """
+         # Create a copy of the window sequence
+         window_seq = self.sequence[window_start:window_end].copy()
+
+         # Handle multiple ALT alleles - take first one
+         alt_allele = variant.alt.split(",")[0]
+
+         # Calculate variant position relative to window
+         genomic_pos = variant.pos1 - 1  # Convert VCF 1-based to 0-based
+         var_pos_in_window = genomic_pos - window_start
+
+         # Check if variant is within window
+         if var_pos_in_window < 0 or var_pos_in_window >= len(window_seq):
+             return window_seq.decode()
+
+         # Check if entire variant fits in window
+         ref_end = var_pos_in_window + len(variant.ref)
+         if ref_end > len(window_seq):
+             return window_seq.decode()
+
+         # Validate reference matches
+         expected_ref = window_seq[var_pos_in_window:ref_end].decode()
+         if expected_ref.upper() != variant.ref.upper():
+             warnings.warn(
+                 f"Reference mismatch at position {variant.pos1}: "
+                 f"expected '{variant.ref}', found '{expected_ref}'"
+             )
+             return window_seq.decode()
+
+         # Apply variant
+         if len(alt_allele) == len(variant.ref):
+             # SNV: Direct substitution
+             window_seq[var_pos_in_window:ref_end] = alt_allele.encode()
+         elif len(alt_allele) < len(variant.ref):
+             # Deletion
+             window_seq[var_pos_in_window : var_pos_in_window + len(alt_allele)] = (
+                 alt_allele.encode()
+             )
+             del window_seq[var_pos_in_window + len(alt_allele) : ref_end]
+         else:
+             # Insertion
+             window_seq[var_pos_in_window:ref_end] = alt_allele.encode()
+
+         return window_seq.decode()
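+     # Window arithmetic sketch (assumed numbers): with window_start=100, a
+     # variant at VCF pos1=121 lands at index 20 of the returned window, and a
+     # REF allele running past window_end leaves the window unmodified.
+     # Continuing the SNV example above, on a fresh applicator:
+     #
+     #     fresh = VariantApplicator("ACGTACGT", variants, chrom="chr1")
+     #     fresh.apply_single_variant_to_window(variants.iloc[0], 0, 8)
+     #     # -> "ACTTACGT", without mutating fresh.sequence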
+
+     def _apply_single_variant(self, variant: pd.Series) -> None:
+         """
+         Apply a single variant to the sequence using variant type classifications.
+
+         Args:
+             variant: Series containing variant information (pos, ref, alt, variant_type)
+         """
+         # 1. VARIANT TYPE VALIDATION
+         variant_type = variant.get("variant_type", "unknown")
+
+         # Define supported variant types for standard variant processing
+         supported_types = {
+             "SNV",
+             "MNV",
+             "INS",
+             "DEL",
+             "complex",
+             "SV_DUP",
+             "SV_INV",
+             "SV_BND_DUP",
+             "SV_BND_INV",
+         }
+
+         # Handle variants that should be processed elsewhere or are unsupported
+         # (SV_BND, SV_DEL, SV_INS, SV_CNV, missing/unknown, and anything else
+         # outside supported_types): warn, record the skip reason, and return
+         vcf_line = variant.get("vcf_line", "?")
+         chrom = variant.get("chrom", self.chrom)
+
+         if variant_type not in supported_types:
+             warnings.warn(
+                 f"Skipped variant at {chrom}:{variant.pos1} (VCF line {vcf_line}): "
+                 f"type '{variant_type}' not supported"
+             )
+             self.skipped_count += 1
+             reason = (
+                 "missing_type"
+                 if variant_type in {"missing", "unknown"}
+                 else "unsupported_type"
+             )
+             self.skipped_variants.append(
+                 (vcf_line, chrom, variant.pos1, variant.ref, variant.alt, reason)
+             )
+             return
+
+         # 2. STRUCTURAL VARIANT INFO PARSING
+         info_dict = {}
+         if variant_type in ["SV_DUP", "SV_INV"] and "info" in variant:
+             info_dict = parse_vcf_info(variant["info"])
+
+         # 3. BASIC VALIDATION CHECKS
+         if variant.alt == variant.ref:
+             self.skipped_count += 1
+             return  # Skip ref-only variants
+
+         # Handle multiple ALT alleles - take first one
+         alt_allele = variant.alt.split(",")[0]
+
+         # 4. COORDINATE CALCULATION
+         genomic_pos = variant.pos1 - 1  # Convert VCF 1-based to 0-based
+         buffer_pos = genomic_pos + self.cumulative_offset
+
+         # For structural variants, calculate affected region from INFO fields
+         if variant_type in ["SV_DUP", "SV_INV"]:
+             end_pos = info_dict.get("END", None)
+             svlen = info_dict.get("SVLEN", None)
+
+             # Calculate END position if not provided
+             if end_pos is None and svlen is not None:
+                 end_pos = variant.pos1 + abs(svlen) - 1
+             elif end_pos is None:
+                 # Fallback to REF length for structural variants
+                 end_pos = variant.pos1 + len(variant.ref) - 1
+                 warnings.warn(
+                     f"Cannot determine structural variant end position for {variant.get('id', 'unknown')} at {variant.pos1}"
+                 )
+
+             ref_length = end_pos - variant.pos1 + 1  # Total affected region length
+         else:
+             ref_length = len(variant.ref)
+
+         # 5. FROZEN REGION CHECK
+         ref_start = genomic_pos
+         ref_end = genomic_pos + ref_length - 1
+
+         if self.frozen_tracker.is_frozen(ref_start) or self.frozen_tracker.is_frozen(
+             ref_end
+         ):
+             self.skipped_count += 1
+             # Record skip details for reporting
+             self.skipped_variants.append(
+                 (vcf_line, chrom, variant.pos1, variant.ref, variant.alt, "overlap")
+             )
+             return  # Skip overlapping variants
+
+         # 6. BOUNDS CHECK
+         if buffer_pos < 0 or buffer_pos + ref_length > len(self.sequence):
+             raise ValueError(f"Variant position {variant.pos1} out of sequence bounds")
+
+         # 7. REFERENCE VALIDATION (skip for symbolic structural variants)
+         if variant_type not in ["SV_DUP", "SV_INV"]:
+             expected_ref = self.sequence[
+                 buffer_pos : buffer_pos + len(variant.ref)
+             ].decode()
+             if expected_ref.upper() != variant.ref.upper():
+                 raise ValueError(
+                     f"Reference mismatch at position {variant.pos1}: "
+                     f"expected '{variant.ref}', found '{expected_ref}'"
+                 )
+
+         # 8. SEQUENCE MODIFICATION
+         self._modify_sequence(
+             buffer_pos, variant.ref, alt_allele, variant_type, info_dict
+         )
+
+         # 9. UPDATE TRACKING
+         if variant_type in ["SV_DUP", "SV_INV"]:
+             # For structural variants, calculate length difference based on variant type
+             if variant_type == "SV_DUP":
+                 # Duplication adds the duplicated region length
+                 length_diff = ref_length
+             elif variant_type == "SV_INV":
+                 # Inversion doesn't change sequence length
+                 length_diff = 0
+         else:
+             length_diff = len(alt_allele) - len(variant.ref)
+         self.cumulative_offset += length_diff
+         self.frozen_tracker.add_range(ref_start, ref_end)
+
+         # Record offset for coordinate transformation if tracker is provided
+         if self.offset_tracker and self.chrom and length_diff != 0:
+             self.offset_tracker.add_offset(self.chrom, ref_start, length_diff)
+
+         self.applied_count += 1
+
+     def _modify_sequence(
+         self,
+         pos: int,
+         ref_allele: str,
+         alt_allele: str,
+         variant_type: str,
+         info_dict: dict = None,
+     ) -> None:
+         """
+         Modify sequence at specified position using variant type classification.
+
+         Args:
+             pos: Buffer position (0-based)
+             ref_allele: Reference allele sequence
+             alt_allele: Alternate allele sequence
+             variant_type: Classified variant type (SNV, MNV, INS, DEL, complex, SV_DUP, SV_INV)
+             info_dict: Parsed INFO field for structural variants (optional)
+         """
+         # Dispatch based on variant type classification
+         if variant_type in ["SNV", "MNV"]:
+             # Single or multi-nucleotide substitution
+             self.sequence[pos : pos + len(ref_allele)] = alt_allele.encode()
+
+         elif variant_type == "INS":
+             # Insertion: replace reference with longer alternate sequence
+             self.sequence[pos : pos + len(ref_allele)] = alt_allele.encode()
+
+         elif variant_type == "DEL":
+             # Deletion: replace reference with shorter alternate sequence
+             self.sequence[pos : pos + len(alt_allele)] = alt_allele.encode()
+             del self.sequence[pos + len(alt_allele) : pos + len(ref_allele)]
+
+         elif variant_type == "complex":
+             # Complex variant: use length-based logic as fallback
+             ref_len = len(ref_allele)
+             alt_len = len(alt_allele)
+
+             if alt_len == ref_len:
+                 # Same length substitution
+                 self.sequence[pos : pos + ref_len] = alt_allele.encode()
+             elif alt_len < ref_len:
+                 # Deletion-like complex variant
+                 self.sequence[pos : pos + alt_len] = alt_allele.encode()
+                 del self.sequence[pos + alt_len : pos + ref_len]
+             else:
+                 # Insertion-like complex variant
+                 self.sequence[pos : pos + ref_len] = alt_allele.encode()
+
+         elif variant_type == "SV_DUP":
+             # Tandem duplication: insert duplicated region after original
+             if not info_dict:
+                 raise ValueError("INFO field required for SV_DUP variant")
+
+             end_pos = info_dict.get("END")
+
+             # Calculate duplication region using END field only
+             if end_pos is None:
+                 raise ValueError("END field required for SV_DUP variant")
+
+             # Calculate from buffer position (already offset-adjusted)
+             genomic_start = (
+                 pos - self.cumulative_offset + len(ref_allele)
+             )  # Position after current cumulative changes
+             genomic_end = end_pos - 1  # Convert VCF 1-based to 0-based
+             dup_length = genomic_end - genomic_start + 1
+
+             # Extract the region to duplicate from current sequence
+             duplicated_region = self.sequence[pos : pos + dup_length]
+
+             # Insert duplicated region after original (tandem duplication)
+             self.sequence[pos + dup_length : pos + dup_length] = duplicated_region
+
+         elif variant_type == "SV_INV":
+             # Inversion: reverse complement the affected region
+             if not info_dict:
+                 raise ValueError("INFO field required for SV_INV variant")
+
+             end_pos = info_dict.get("END")
+
+             # Calculate inversion region using END field only
+             if end_pos is None:
+                 raise ValueError("END field required for SV_INV variant")
+
+             # pos is already the correct buffer position (0-based) where the
+             # inversion starts; the END field is 1-based, so convert it to a
+             # 0-based buffer position
+             buffer_start = pos
+             buffer_end = end_pos - 1  # Convert 1-based END to 0-based
+             inv_length = buffer_end - buffer_start + 1
+
+             # Extract region to invert
+             region_to_invert = self.sequence[
+                 buffer_start : buffer_start + inv_length
+             ].decode()
+
+             # Apply reverse complement
+             from .sequence_utils import rc_str
+
+             inverted_region = rc_str(region_to_invert)
+
+             # Replace with inverted sequence
+             self.sequence[buffer_start : buffer_start + inv_length] = (
+                 inverted_region.encode()
+             )
+
+         elif variant_type in ("SV_BND_DUP", "SV_BND_INV"):
+             # BND-derived duplication/inversion: individual records should not
+             # reach this point, because _preprocess_bnd_derived_variants()
+             # rewrites them into synthetic SV_DUP/SV_INV variants
+             genomic_pos = (
+                 pos + self.cumulative_offset + 1
+             )  # Convert back to 1-based genomic position
+             synthetic_type = "SV_DUP" if variant_type == "SV_BND_DUP" else "SV_INV"
+             raise ValueError(
+                 f"{variant_type} variants should be preprocessed into "
+                 f"{synthetic_type} variants. Position: {genomic_pos}"
+             )
+
+         else:
+             # This should not happen due to validation in _apply_single_variant
+             raise ValueError(
+                 f"Unsupported variant type in sequence modification: {variant_type}"
+             )
+
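+ # The three in-place edit shapes above reduce to bytearray slice operations.
+ # Standalone sketch (values assumed):
+ #
+ #     seq = bytearray(b"AACCGG")
+ #     seq[2:4] = b"TT"     # SNV/MNV: same-length substitution -> AATTGG
+ #     seq[2:2] = b"NNN"    # insertion grows the buffer        -> AANNNTTGG
+ #     del seq[2:5]         # deletion shrinks it back          -> AATTGG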
+
+ class ChimericSequenceBuilder:
+     """Builds chimeric sequences from BND rearrangements."""
+
+     def __init__(self, reference_sequences: Dict[str, str]):
+         self.reference_sequences = reference_sequences
+         self.chimeric_sequences = {}
+         self.sequence_segments = {}  # Store segment metadata for each sequence
+
+     def create_fusion_from_pair(self, breakend_pair: Tuple) -> Tuple[str, str]:
+         """
+         Create fusion sequence from a pair of breakends.
+
+         Returns:
+             Tuple of (fusion_name, fusion_sequence)
+         """
+         bnd1, bnd2 = breakend_pair
+
+         # Generate fusion name
+         fusion_name = f"{bnd1.chrom}_{bnd2.chrom}_fusion_{bnd1.id}_{bnd2.id}"
+
+         # Get sequences
+         seq1 = self.reference_sequences[bnd1.chrom]
+         seq2 = self.reference_sequences[bnd2.chrom]
+
+         # Convert VCF 1-based coordinates to 0-based array indices
+         pos1_0 = bnd1.pos - 1  # VCF 1-based -> 0-based array index
+         pos2_0 = bnd2.pos - 1  # VCF 1-based -> 0-based array index
+
+         # Create fusion based on orientation
+         fusion_seq, segments = self._build_oriented_fusion(
+             seq1,
+             pos1_0,
+             bnd1.orientation,
+             seq2,
+             pos2_0,
+             bnd2.orientation,
+             bnd1.inserted_seq + bnd2.inserted_seq,
+             bnd1.chrom,
+             bnd2.chrom,
+             bnd1,
+             bnd2,  # Pass original breakends for VCF positions
+         )
+
+         # Store segment metadata
+         self.sequence_segments[fusion_name] = segments
+
+         return fusion_name, fusion_seq
+
+     def _build_oriented_fusion(
+         self,
+         seq1: str,
+         pos1_0: int,
+         orient1: str,
+         seq2: str,
+         pos2_0: int,
+         orient2: str,
+         novel_seq: str,
+         seq1_chrom: str,
+         seq2_chrom: str,
+         bnd1,
+         bnd2,
+     ) -> Tuple[str, List[SequenceSegment]]:
+         """
+         Build fusion sequence respecting coordinated breakend pair orientations.
+
+         Coordinated BND fusion patterns:
+
+         1. RC Coordination Patterns (require special handling):
+            - [p[t + [p[t : RC(seq2[pos2:]) + seq1[pos1:]
+            - t]p] + t]p] : seq1[:pos1] + RC(seq2[:pos2])
+
+         2. Same Direction Patterns (simple concatenation):
+            - ]p]t + ]p]t : seq2[:pos2] + seq1[pos1:]
+            - t[p[ + t[p[ : seq1[:pos1] + seq2[pos2:]
+
+         Mixed combinations of these orientations are handled case by case below.
+         """
+         from .sequence_utils import rc_str as reverse_complement
+
+         # Handle coordinated patterns by looking at both orientations together
+         if orient1 == "[p[t" and orient2 == "[p[t":
+             # [p[t + [p[t pattern: RC(seq2[pos2_0:]) + seq1[pos1_0:]
+             left_part = reverse_complement(seq2[pos2_0:])
+             right_part = seq1[pos1_0:]
+             left_chrom = seq2_chrom
+             right_chrom = seq1_chrom
+             left_orientation = "reverse"
+             right_orientation = "forward"
+
+         elif orient1 == "t]p]" and orient2 == "t]p]":
+             # t]p] + t]p] pattern: seq1[:pos1] + RC(seq2[:pos2]) (use VCF positions as base counts)
+             left_part = seq1[: bnd1.pos]  # Include pos1 bases from seq1
+             right_part = reverse_complement(
+                 seq2[: bnd2.pos]
+             )  # Include pos2 bases from seq2
+             left_chrom = seq1_chrom
+             right_chrom = seq2_chrom
+             left_orientation = "forward"
+             right_orientation = "reverse"
+
+         elif orient1 == "]p]t" and orient2 == "]p]t":
+             # ]p]t + ]p]t pattern: seq2[:pos2] + seq1[pos1_0:] (use VCF pos2 as base count)
+             left_part = seq2[: bnd2.pos]  # Include pos2 bases from seq2
+             right_part = seq1[pos1_0:]  # From pos1_0 to end of seq1
+             left_chrom = seq2_chrom
+             right_chrom = seq1_chrom
+             left_orientation = "forward"
+             right_orientation = "forward"
+
+         elif orient1 == "t[p[" and orient2 == "t[p[":
+             # t[p[ + t[p[ pattern: seq1[:pos1_0] + seq2[pos2_0:]
+             left_part = seq1[:pos1_0]
+             right_part = seq2[pos2_0:]
+             left_chrom = seq1_chrom
+             right_chrom = seq2_chrom
+             left_orientation = "forward"
+             right_orientation = "forward"
+
+         elif orient1 == "]p]t" and orient2 == "t[p[":
+             # ]p]t + t[p[ pattern: seq2[:pos2] + seq1[pos1_0:] (use VCF pos2 as base count)
+             left_part = seq2[: bnd2.pos]  # Include pos2 bases from seq2
+             right_part = seq1[pos1_0:]  # From pos1_0 to end of seq1
+             left_chrom = seq2_chrom
+             right_chrom = seq1_chrom
+             left_orientation = "forward"
+             right_orientation = "forward"
+
+         elif orient1 == "t[p[" and orient2 == "]p]t":
+             # t[p[ + ]p]t pattern: seq1[:pos1_0] + seq2[pos2_0:]
+             left_part = seq1[:pos1_0]
+             right_part = seq2[pos2_0:]
+             left_chrom = seq1_chrom
+             right_chrom = seq2_chrom
+             left_orientation = "forward"
+             right_orientation = "forward"
+
+         elif orient1 == "t]p]" and orient2 == "]p]t":
+             # t]p] + ]p]t pattern (mixed coordination): seq1[:pos1] + seq2[pos2_0:]
+             left_part = seq1[: bnd1.pos]  # Include pos1 bases from seq1 (VCF position)
+             right_part = seq2[pos2_0:]  # From pos2_0 to end of seq2
+             left_chrom = seq1_chrom
+             right_chrom = seq2_chrom
+             left_orientation = "forward"
+             right_orientation = "forward"
+
+         else:
+             # Unknown orientation pattern - fail fast to ensure proper implementation
+             supported_patterns = [
+                 "[p[t + [p[t (RC coordination)",
+                 "t]p] + t]p] (RC coordination)",
+                 "]p]t + ]p]t (same direction)",
+                 "t[p[ + t[p[ (same direction)",
+                 "t]p] + ]p]t (mixed coordination)",
+             ]
+             raise ValueError(
+                 f"Unsupported BND orientation pattern: '{orient1}' + '{orient2}'. "
+                 f"Supported patterns: {', '.join(supported_patterns)}. "
+                 f"This pattern requires explicit implementation."
+             )
+
+         # Build fusion sequence and track segments
+         segments = []
+         current_pos = 0
+
+         # Add left segment
+         if len(left_part) > 0:
+             left_type = "rc_reference" if left_orientation == "reverse" else "reference"
+             segments.append(
+                 SequenceSegment(
+                     source_type=left_type,
+                     source_chrom=left_chrom,
+                     start_pos=current_pos,
+                     end_pos=current_pos + len(left_part),
+                     length=len(left_part),
+                     orientation=left_orientation,
+                 )
+             )
+             current_pos += len(left_part)
+
+         # Add novel sequence segment
+         if len(novel_seq) > 0:
+             segments.append(
+                 SequenceSegment(
+                     source_type="novel",
+                     source_chrom="NOVEL",
+                     start_pos=current_pos,
+                     end_pos=current_pos + len(novel_seq),
+                     length=len(novel_seq),
+                     orientation="novel",
+                 )
+             )
+             current_pos += len(novel_seq)
+
+         # Add right segment
+         if len(right_part) > 0:
+             right_type = (
+                 "rc_reference" if right_orientation == "reverse" else "reference"
+             )
+             segments.append(
+                 SequenceSegment(
+                     source_type=right_type,
+                     source_chrom=right_chrom,
+                     start_pos=current_pos,
+                     end_pos=current_pos + len(right_part),
+                     length=len(right_part),
+                     orientation=right_orientation,
+                 )
+             )
+
+         # Combine parts to create fusion sequence
+         fusion = left_part + novel_seq + right_part
+
+         return fusion, segments
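+     # Slicing sketch for the simplest same-direction case (t[p[ + t[p[),
+     # with assumed toy values and an empty novel insertion:
+     #
+     #     seq1, pos1_0 = "AAAA", 2
+     #     seq2, pos2_0 = "TTTT", 1
+     #     fusion = seq1[:pos1_0] + "" + seq2[pos2_0:]   # -> "AA" + "TTT"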
+
+     def get_sequence_segments(self, sequence_name: str) -> List[SequenceSegment]:
+         """Get segment metadata for a sequence."""
+         return self.sequence_segments.get(sequence_name, [])
+
+
+ def _load_reference(reference_fn: Union[str, Dict, Fasta]) -> Union[Dict, Fasta]:
+     """Load reference genome from file or return as-is if already loaded."""
+     if isinstance(reference_fn, str) and os.path.isfile(reference_fn):
+         return Fasta(reference_fn)
+     return reference_fn
+
+
+ def _encode_genome_sequences(reference, encode=True, encoder=None):
+     """Helper function to encode genome sequences for output."""
+     genome = {}
+     for chrom, seq in reference.items():
+         seq_str = str(seq)
+         if encode:
+             if encoder:
+                 genome[chrom] = encoder(seq_str)
+             else:
+                 genome[chrom] = encode_seq(seq_str)
+         else:
+             genome[chrom] = seq_str
+     return genome
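+ # A custom encoder only needs to map a sequence string to an (L, 4) array
+ # (see the `encoder` argument of get_personal_genome below). Hypothetical
+ # one-hot sketch with an assumed ACGT column order:
+ #
+ #     LUT = {"A": 0, "C": 1, "G": 2, "T": 3}
+ #     def my_encoder(seq):
+ #         out = np.zeros((len(seq), 4), dtype=np.float32)
+ #         for i, base in enumerate(seq.upper()):
+ #             if base in LUT:
+ #                 out[i, LUT[base]] = 1.0   # unknown bases stay all-zero
+ #         return out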
+
+
+ def _load_variants(variants_fn: Union[str, pd.DataFrame]) -> pd.DataFrame:
+     """
+     Load variants from file or return as-is if already a DataFrame.
+     Ensures variant classification happens once during loading.
+
+     For DataFrames, assumes the position column is either 'pos', 'pos1', or the
+     second column. If the DataFrame lacks a variant_type column, classification
+     will be added.
+     """
+     if isinstance(variants_fn, str):
+         # Always load all variants with classification; read_vcf returns
+         # positions in the pos1 column
+         variants_df = read_vcf(variants_fn, classify_variants=True)
+         return variants_df
+     else:
+         # Handle DataFrame input
+         variants_df = variants_fn.copy()
+
+         # Always use the second column as pos1, regardless of its current name
+         if len(variants_df.columns) >= 2:
+             # Rename second column to pos1 if it's not already named that
+             if variants_df.columns[1] != "pos1":
+                 new_columns = list(variants_df.columns)
+                 new_columns[1] = "pos1"
+                 variants_df.columns = new_columns
+
+             # Validate that pos1 column is numeric
+             if not pd.api.types.is_numeric_dtype(variants_df["pos1"]):
+                 raise ValueError(
+                     f"Position column (second column) must be numeric, got {variants_df['pos1'].dtype}"
+                 )
+         else:
+             raise ValueError(
+                 "DataFrame must have at least 2 columns with position in second column"
+             )
+
+         # Ensure variant classification exists
+         if "variant_type" not in variants_df.columns:
+             if len(variants_df) > 0:
+                 # Add variant classification to non-empty DataFrame
+                 variants_df["variant_type"] = variants_df.apply(
+                     lambda row: classify_variant_type(
+                         row["ref"],
+                         row["alt"],
+                         parse_vcf_info(row.get("info", "")) if "info" in row else None,
+                     ),
+                     axis=1,
+                 )
+             else:
+                 # Handle empty DataFrame - just add an empty column
+                 variants_df["variant_type"] = pd.Series(dtype="object")
+
+         return variants_df
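+ # DataFrame input sketch (hypothetical values): only the second column must
+ # hold positions; it is renamed to pos1 and rows are classified on the fly.
+ #
+ #     df = pd.DataFrame({
+ #         "chrom": ["chr1"], "pos": [12345],
+ #         "ref": ["A"], "alt": ["AGG"],       # longer ALT -> insertion
+ #     })
+ #     variants = _load_variants(df)           # adds pos1 + variant_type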
+
+
+ def _preprocess_bnd_derived_variants(chrom_variants, vcf_path=None, verbose=False):
+     """
+     Convert BND-derived DUP/INV pairs to synthetic SV_DUP/SV_INV variants.
+
+     This pre-processing step allows BND-derived structural variants to be processed
+     by the existing SV_DUP/SV_INV logic, ensuring proper frozen region tracking
+     and coordinate transformation.
+
+     Args:
+         chrom_variants: DataFrame of variants for a single chromosome
+         vcf_path: Path to VCF file for BND classification (optional)
+         verbose: Print processing information
+
+     Returns:
+         DataFrame with BND-derived variants replaced by synthetic variants
+     """
+     from .variant_utils import parse_breakend_alt
+
+     # Extract BND-derived variants that need pair processing
+     bnd_dup_variants = chrom_variants[
+         chrom_variants["variant_type"] == "SV_BND_DUP"
+     ].copy()
+     bnd_inv_variants = chrom_variants[
+         chrom_variants["variant_type"] == "SV_BND_INV"
+     ].copy()
+
+     if len(bnd_dup_variants) == 0 and len(bnd_inv_variants) == 0:
+         return chrom_variants
+
+     if verbose:
+         print(
+             f" 🔄 Pre-processing {len(bnd_dup_variants)} BND-DUP + {len(bnd_inv_variants)} BND-INV variants"
+         )
+
+     # Get BND classification results to find mate coordinates
+     synthetic_variants = []
+     processed_ids = set()
+
+     # Process BND-derived duplications
+     for _, variant in bnd_dup_variants.iterrows():
+         if variant["id"] in processed_ids:
+             continue
+
+         # Parse mate coordinates from ALT field
+         alt_info = parse_breakend_alt(variant["alt"])
+         if not alt_info["is_valid"]:
+             if verbose:
+                 print(f" ⚠️ Could not parse BND ALT field: {variant['alt']}")
+             continue
+
+         mate_chrom = alt_info["mate_chrom"]
+         mate_pos = alt_info["mate_pos"]
+
+         # Ensure this is an intrachromosomal duplication (same chromosome)
+         if mate_chrom != variant["chrom"]:
+             if verbose:
+                 print(
+                     f" ⚠️ Skipping interchromosomal BND: {variant['chrom']}:{variant['pos1']} -> {mate_chrom}:{mate_pos}"
+                 )
+             continue
+
+         # Calculate duplication region boundaries
+         start_pos = min(variant["pos1"], mate_pos)
+         end_pos = max(variant["pos1"], mate_pos)
+
+         # Create synthetic SV_DUP variant
+         synthetic_variant = variant.copy()
+         synthetic_variant["variant_type"] = "SV_DUP"
+         synthetic_variant["pos1"] = start_pos
+         synthetic_variant["ref"] = "N"  # Placeholder
+         synthetic_variant["alt"] = "<DUP>"
+         synthetic_variant["info"] = f"END={end_pos};SVTYPE=DUP"
+
+         synthetic_variants.append(synthetic_variant)
+         processed_ids.add(variant["id"])
+
+         if verbose:
+             region_length = end_pos - start_pos
+             print(
+                 f" ✅ Created synthetic DUP: {variant['chrom']}:{start_pos}-{end_pos} ({region_length}bp)"
+             )
+
+     # Process BND-derived inversions: handle 4-breakend inversion topology
+     if len(bnd_inv_variants) > 0:
+         # Group BND inversions by chromosome to handle 4-breakend patterns
+         chrom_groups = bnd_inv_variants.groupby("chrom")
+
+         for chrom, chrom_bnd_invs in chrom_groups:
+             chrom_breakends = chrom_bnd_invs.copy()
+
+             # Check if we have exactly 4 breakends (standard inversion pattern)
+             if len(chrom_breakends) == 4:
+                 # Sort breakends by position to identify topology
+                 chrom_breakends = chrom_breakends.sort_values("pos1")
+                 positions = chrom_breakends["pos1"].tolist()
+
+                 # 4-breakend inversion: the outer breakpoints define the
+                 # boundaries, the inner breakpoints define the inverted region.
+                 # Positions: [W, V, U, X] where W-X are outer, V-U are inner
+                 # (the region that gets inverted)
+                 outer_start = positions[0]  # W
+                 inner_start = positions[1]  # V
+                 inner_end = positions[2]  # U
+                 outer_end = positions[3]  # X
+
+                 # Create a single synthetic SV_INV for the inner region (what gets inverted)
+                 first_variant = chrom_breakends.iloc[0].copy()
+                 synthetic_variant = first_variant.copy()
+                 synthetic_variant["variant_type"] = "SV_INV"
+                 synthetic_variant["pos1"] = inner_start  # Start of inverted region
+                 synthetic_variant["ref"] = "N"  # Placeholder
+                 synthetic_variant["alt"] = "<INV>"
+                 synthetic_variant["info"] = (
+                     f"END={inner_end};SVTYPE=INV"  # End of inverted region
+                 )
+
+                 synthetic_variants.append(synthetic_variant)
+
+                 # Mark all 4 breakends as processed
+                 for _, variant in chrom_breakends.iterrows():
+                     processed_ids.add(variant["id"])
+
+                 if verbose:
+                     inversion_length = inner_end - inner_start
+                     boundary_span = outer_end - outer_start
+                     print(
+                         f" ✅ Created synthetic INV: {chrom}:{inner_start}-{inner_end} ({inversion_length}bp) [4-breakend topology, boundary span {outer_start}-{outer_end}]"
+                     )
+
+             else:
+                 # Handle non-standard cases (not exactly 4 breakends)
+                 if verbose:
+                     print(
+                         f" ⚠️ Non-standard BND inversion pattern: {len(chrom_breakends)} breakends on {chrom}"
+                     )
+
+                 # Fallback: process individually for non-4-breakend cases
+                 for _, variant in chrom_breakends.iterrows():
+                     if variant["id"] in processed_ids:
+                         continue
+
+                     # Parse mate coordinates from ALT field
+                     alt_info = parse_breakend_alt(variant["alt"])
+                     if not alt_info["is_valid"]:
+                         if verbose:
+                             print(
+                                 f" ⚠️ Could not parse BND ALT field: {variant['alt']}"
+                             )
+                         continue
+
+                     mate_pos = alt_info["mate_pos"]
+                     start_pos = min(variant["pos1"], mate_pos)
+                     end_pos = max(variant["pos1"], mate_pos)
+
+                     # Create synthetic SV_INV variant
+                     synthetic_variant = variant.copy()
+                     synthetic_variant["variant_type"] = "SV_INV"
+                     synthetic_variant["pos1"] = start_pos
+                     synthetic_variant["ref"] = "N"
+                     synthetic_variant["alt"] = "<INV>"
+                     synthetic_variant["info"] = f"END={end_pos};SVTYPE=INV"
+
+                     synthetic_variants.append(synthetic_variant)
+                     processed_ids.add(variant["id"])
+
+                     if verbose:
+                         region_length = end_pos - start_pos
+                         print(
+                             f" ✅ Created synthetic INV: {chrom}:{start_pos}-{end_pos} ({region_length}bp) [fallback]"
+                         )
+
+     # Create result DataFrame: remove BND-derived variants, add synthetic variants
+     result_variants = chrom_variants[
+         ~chrom_variants["variant_type"].isin(["SV_BND_DUP", "SV_BND_INV"])
+     ].copy()
+
+     if synthetic_variants:
+         synthetic_df = pd.DataFrame(synthetic_variants)
+         result_variants = pd.concat([result_variants, synthetic_df], ignore_index=True)
+         # Re-sort by position to maintain VCF order
+         result_variants = result_variants.sort_values("pos1")
+
+     if verbose and len(synthetic_variants) > 0:
+         print(
+             f" 🎯 Pre-processing complete: {len(synthetic_variants)} synthetic variants created"
+         )
+
+     return result_variants
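+ # Net effect, sketched with assumed coordinates: a same-chromosome SV_BND_DUP
+ # at pos1=5000 whose ALT mate points at 8000 becomes one synthetic record
+ #
+ #     variant_type="SV_DUP", pos1=5000, ref="N", alt="<DUP>",
+ #     info="END=8000;SVTYPE=DUP"
+ #
+ # which the ordinary SV_DUP branch of _modify_sequence then applies.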
+
+
+ def _format_skipped_variant_report(skipped_variants_list):
+     """
+     Format skipped variant details for reporting.
+
+     Args:
+         skipped_variants_list: List of (vcf_line, chrom, pos1, ref, alt, reason) tuples
+
+     Returns:
+         Formatted string with grouped skip reasons
+     """
+     if not skipped_variants_list:
+         return ""
+
+     from collections import defaultdict
+
+     # Group by reason
+     by_reason = defaultdict(list)
+     for vcf_line, chrom, pos1, ref, alt, reason in skipped_variants_list:
+         by_reason[reason].append((vcf_line, chrom, pos1, ref, alt))
+
+     # Format output
+     lines = []
+     reason_labels = {
+         "overlap": "overlap with previous variant",
+         "unsupported_type": "unsupported variant type",
+         "validation_error": "validation error",
+         "missing_type": "missing/unknown type",
+     }
+
+     for reason, variants in sorted(by_reason.items()):
+         label = reason_labels.get(reason, reason)
+         # Group by position for concise output
+         by_pos = defaultdict(list)
+         for vcf_line, chrom, pos1, ref, alt in variants:
+             by_pos[f"{chrom}:{pos1}"].append(vcf_line)
+
+         for pos, vcf_lines in sorted(by_pos.items()):
+             vcf_lines_str = ", ".join(map(str, sorted(vcf_lines)))
+             lines.append(f" • {label}: VCF line(s) {vcf_lines_str} at {pos}")
+
+     return "\n".join(lines)
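+ # Example report lines (values assumed):
+ #
+ #     • overlap with previous variant: VCF line(s) 12, 14 at chr1:10042
+ #     • unsupported variant type: VCF line(s) 31 at chr2:5511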
+
+
+ def get_personal_genome(
+     reference_fn,
+     variants_fn,
+     encode=True,
+     n_chunks=1,
+     verbose=False,
+     encoder=None,
+     auto_map_chromosomes=False,
+ ):
+     """
+     Create a personalized genome by applying variants to a reference genome.
+
+     This function implements multi-phase variant processing with proper frozen
+     region tracking:
+
+     Phase 1: Standard variants + early structural variants (in VCF order):
+         - SNV, MNV, INS, DEL, SV_DUP, SV_INV
+
+     Phase 2: BND semantic classification and application:
+         - Classify BNDs to identify SV_BND_DUP and SV_BND_INV patterns
+         - Apply SV_BND_DUP and SV_BND_INV first
+         - Apply remaining true BND translocations
+
+     Frozen region enforcement:
+         - Each variant freezes its genomic region after application
+         - Later variants overlapping frozen regions are skipped with warnings
+         - BND breakpoints in frozen regions cause the entire BND to be skipped
+
+     Output chromosome ordering:
+         - Chromosomes are returned in the same order as the reference genome
+         - BND-generated fusion sequences appear after all original chromosomes
+         - Leftover sequences (from consumed chromosomes) follow fusion sequences
+
+     Args:
+         reference_fn: Path to reference genome file or dictionary-like object
+         variants_fn: Path to variants file or DataFrame. Supports VCF 4.2 format
+             including BND (breakend) variants with bracket notation.
+         encode: Return sequences as one-hot encoded arrays (default: True)
+         n_chunks: Number of chunks to split variants into for processing (default: 1)
+         verbose: Print progress information (default: False)
+         encoder: Optional custom encoding function. If provided, should accept a
+             single sequence string and return an encoded array with shape (L, 4).
+             Default: None
+         auto_map_chromosomes: Automatically map chromosome names between VCF and
+             reference when they don't match exactly (e.g., 'chr1' <-> '1',
+             'chrM' <-> 'MT'). When False, raises ChromosomeMismatchError if names
+             don't match. (default: False)
+
+     Returns:
+         If encode=True: A dictionary mapping chromosome names to encoded tensors/arrays
+         If encode=False: A dictionary mapping chromosome names to sequence strings
+
+         The dictionary preserves reference genome chromosome order, with any fusion
+         or leftover sequences appended at the end.
+
+     Raises:
+         ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names
+             in the VCF and reference don't match exactly
+
+     Examples:
+         # Apply variants with proper ordering and conflict resolution
+         personal_genome = get_personal_genome('reference.fa', 'variants.vcf')
+
+         # Get raw sequences without encoding
+         personal_genome = get_personal_genome('reference.fa', 'variants.vcf', encode=False)
+
+         # Enable automatic chromosome mapping if VCF uses 'chr1' and reference uses '1'
+         personal_genome = get_personal_genome('reference.fa', 'variants.vcf', auto_map_chromosomes=True)
+
+         # Verify chromosome order is preserved
+         ref_chroms = list(pyfaidx.Fasta('reference.fa').keys())
+         personal_chroms = list(personal_genome.keys())
+         assert personal_chroms[:len(ref_chroms)] == ref_chroms  # Original order preserved
+     """
+     # Load ALL variants with classification
+     from .variant_utils import group_variants_by_semantic_type
+
+     variants_df = _load_variants(variants_fn)
+     reference = _load_reference(reference_fn)
+
+     if len(variants_df) == 0:
+         if verbose:
+             print("🧬 No variants found - returning reference genome")
+         return _encode_genome_sequences(reference, encode, encoder)
+
+     # Group variants by semantic type for proper processing order.
+     # Pass the VCF path for BND semantic classification if available.
+     vcf_path = variants_fn if isinstance(variants_fn, str) else None
+     grouped_variants = group_variants_by_semantic_type(variants_df, vcf_path)
+
+     if verbose:
+         total_variants = len(variants_df)
+         print(
+             f"🧬 Processing {total_variants:,} variants across {len(variants_df['chrom'].unique())} chromosomes"
+         )
+         print(
+             f" Phase 1: {len(grouped_variants['standard']) + len(grouped_variants['dup_variants']) + len(grouped_variants['inv_variants'])} standard variants (SNV, MNV, INS, DEL, SV_DUP, SV_INV)"
+         )
+         print(
+             f" Phase 2: {len(grouped_variants['bnd_variants'])} BND variants for semantic classification"
+         )
+
+     # Apply chromosome name matching
+     ref_chroms = set(reference.keys())
+     vcf_chroms = set(variants_df["chrom"].unique())
+
+     mapping, unmatched = match_chromosomes_with_report(
+         ref_chroms,
+         vcf_chroms,
+         verbose=verbose,
+         auto_map_chromosomes=auto_map_chromosomes,
+     )
+
+     # Apply chromosome name mapping to all variants
+     if mapping:
+         for group_name, variant_group in grouped_variants.items():
+             if len(variant_group) > 0:
+                 grouped_variants[group_name] = apply_chromosome_mapping(
+                     variant_group, mapping
+                 )
+
+     # Initialize processing state
+     personal_genome = {}
+     total_processed = 0
+     offset_tracker = ChromosomeOffsetTracker()
+     modified_sequences = {}
+
+     # PHASE 1: Apply standard variants + early structural variants (in VCF order).
+     # Include both symbolic and BND-derived DUP/INV variants.
+     symbolic_dup_variants = (
+         grouped_variants["dup_variants"][
+             grouped_variants["dup_variants"]["variant_type"] == "SV_DUP"
+         ]
+         if len(grouped_variants["dup_variants"]) > 0
+         else pd.DataFrame()
+     )
+
+     symbolic_inv_variants = (
+         grouped_variants["inv_variants"][
+             grouped_variants["inv_variants"]["variant_type"] == "SV_INV"
+         ]
+         if len(grouped_variants["inv_variants"]) > 0
+         else pd.DataFrame()
+     )
+
+     # Extract BND-derived DUP/INV variants for Phase 1 processing
+     bnd_dup_variants = (
+         grouped_variants["dup_variants"][
+             grouped_variants["dup_variants"]["variant_type"] == "SV_BND_DUP"
+         ]
+         if len(grouped_variants["dup_variants"]) > 0
+         else pd.DataFrame()
+     )
+
+     bnd_inv_variants = (
+         grouped_variants["inv_variants"][
+             grouped_variants["inv_variants"]["variant_type"] == "SV_BND_INV"
+         ]
+         if len(grouped_variants["inv_variants"]) > 0
+         else pd.DataFrame()
+     )
+
+     phase1_variants = pd.concat(
+         [
+             grouped_variants["standard"],
+             symbolic_dup_variants,
+             symbolic_inv_variants,
+             bnd_dup_variants,
+             bnd_inv_variants,
+         ],
+         ignore_index=True,
+     )
+
+     if len(phase1_variants) > 0:
+         # Sort by chromosome and position to maintain VCF order
+         phase1_variants = phase1_variants.sort_values(["chrom", "pos1"])
+
+         for chrom, chrom_variants in phase1_variants.groupby("chrom"):
+             if chrom not in reference:
+                 if verbose:
+                     print(f"⚠️ Skipping {chrom}: not found in reference")
+                 continue
+
+             ref_seq = str(reference[chrom])
+
+             if verbose:
+                 variant_counts = chrom_variants["variant_type"].value_counts().to_dict()
+                 type_summary = ", ".join(
+                     [f"{count} {vtype}" for vtype, count in variant_counts.items()]
+                 )
+                 print(
+                     f"🔄 Processing chromosome {chrom}: {len(chrom_variants):,} variants ({type_summary})"
+                 )
+
+             # Process all Phase 1 variants with offset tracking (chunking is
+             # available for standard variants only)
+             if n_chunks == 1 or any(
+                 vtype in ["SV_DUP", "SV_INV"]
+                 for vtype in chrom_variants["variant_type"].unique()
+             ):
+                 # PRE-PROCESS: Convert BND-derived variants to synthetic variants
+                 processed_variants = _preprocess_bnd_derived_variants(
+                     chrom_variants, vcf_path, verbose
+                 )
+
+                 # Process all variants at once (required for structural variants)
+                 applicator = VariantApplicator(
+                     ref_seq,
+                     processed_variants,
+                     offset_tracker=offset_tracker,
+                     chrom=chrom,
+                 )
+                 personal_seq, stats = applicator.apply_variants()
+
+                 if verbose and stats["total"] > 0:
+                     # Report skipped variants if any
+                     if stats["skipped"] > 0 and stats.get("skipped_variants"):
+                         print(f" ⚠️ Skipped {stats['skipped']} variant(s):")
+                         print(_format_skipped_variant_report(stats["skipped_variants"]))
+                     print(
+                         f" ✅ Applied {stats['applied']}/{stats['total']} variants ({stats['skipped']} skipped)"
+                     )
+
+             else:
+                 # PRE-PROCESS: Convert BND-derived variants to synthetic variants
+                 # (for chunked processing too)
+                 processed_variants = _preprocess_bnd_derived_variants(
+                     chrom_variants, vcf_path, verbose
+                 )
+
+                 # Process in chunks (standard variants only)
+                 current_sequence = ref_seq
+                 shared_frozen_tracker = FrozenRegionTracker()
+                 total_applied = 0
+                 total_skipped = 0
+                 all_skipped_variants = []
+
+                 indices = np.array_split(np.arange(len(processed_variants)), n_chunks)
+
+                 if verbose:
+                     avg_chunk_size = len(processed_variants) // n_chunks
+                     print(
+                         f" 📦 Processing {n_chunks} chunks of ~{avg_chunk_size:,} variants each"
+                     )
+
+                 for i, chunk_indices in enumerate(indices):
+                     if len(chunk_indices) == 0:
+                         continue
+
+                     chunk_df = processed_variants.iloc[chunk_indices].reset_index(
+                         drop=True
+                     )
+
+                     applicator = VariantApplicator(
+                         current_sequence,
+                         chunk_df,
+                         shared_frozen_tracker,
+                         offset_tracker,
+                         chrom,
+                     )
+                     current_sequence, stats = applicator.apply_variants()
+
+                     total_applied += stats["applied"]
+                     total_skipped += stats["skipped"]
+                     all_skipped_variants.extend(stats.get("skipped_variants", []))
+
+                     if verbose:
+                         print(
+                             f" ✅ Chunk {i+1}: {stats['applied']}/{stats['total']} variants applied"
+                         )
+
+                 personal_seq = current_sequence
+
+                 if verbose:
+                     # Report skipped variants if any
+                     if total_skipped > 0 and all_skipped_variants:
+                         print(f" ⚠️ Skipped {total_skipped} variant(s):")
+                         print(_format_skipped_variant_report(all_skipped_variants))
+                     print(
+                         f" 🎯 Total: {total_applied}/{len(processed_variants)} variants applied ({total_skipped} skipped)"
+                     )
+
+             modified_sequences[chrom] = personal_seq
+             total_processed += len(chrom_variants)
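+     # Chunking sketch: np.array_split partitions the variant index into
+     # n_chunks nearly equal pieces (assumed sizes shown), and every chunk
+     # reuses the same FrozenRegionTracker, so overlaps are still caught
+     # across chunk boundaries.
+     #
+     #     np.array_split(np.arange(10), 3)
+     #     # -> [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]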
+
1538
+ # Initialize sequences for chromosomes not processed in Phase 1
1539
+ for chrom in reference.keys():
1540
+ if chrom not in modified_sequences:
1541
+ modified_sequences[chrom] = str(reference[chrom])
1542
+
1543
+ # PHASE 2: BND translocation processing
1544
+ # Only process true BND translocations (BND-derived DUP/INV are now handled in Phase 1)
1545
+ true_bnd_variants = grouped_variants["bnd_variants"]
1546
+
1547
+ # Phase 2 variants are now only true BND translocations
1548
+ phase2_variants = true_bnd_variants
1549
+
1550
+ if len(phase2_variants) > 0:
1551
+ if verbose:
1552
+ phase2_counts = phase2_variants["variant_type"].value_counts().to_dict()
1553
+ counts_msg = ", ".join(
1554
+ [f"{count} {vtype}" for vtype, count in phase2_counts.items()]
1555
+ )
1556
+ print(
1557
+ f"🔄 Phase 2: Processing {len(phase2_variants)} BND variants with semantic classification ({counts_msg})"
1558
+ )
1559
+
1560
+ # Use the BND classifier results directly instead of create_breakend_pairs
1561
+ # This ensures we get the inferred mates that the classifier created
1562
+ if vcf_path:
1563
+ from .variant_utils import BNDClassifier
1564
+
1565
+ classifier = BNDClassifier()
1566
+ classified_breakends = classifier.classify_all_breakends(
1567
+ vcf_path, verbose=verbose
1568
+ )
1569
+
1570
+ # Extract all paired breakends (including those with inferred mates)
1571
+ all_paired_breakends = classified_breakends["paired"]
1572
+
1573
+ # Convert to BreakendPair-like objects for ChimericSequenceBuilder compatibility
1574
+ breakend_pairs = []
1575
+ processed_ids = set()
1576
+
1577
+ for breakend in all_paired_breakends:
1578
+ if breakend.id in processed_ids or not breakend.mate_breakend:
1579
+ continue
1580
+
1581
+ # Create a pair tuple (bnd1, bnd2) for ChimericSequenceBuilder
1582
+ pair_tuple = (breakend, breakend.mate_breakend)
1583
+ breakend_pairs.append(pair_tuple)
1584
+ processed_ids.add(breakend.id)
1585
+ processed_ids.add(breakend.mate_breakend.id)
1586
+ else:
1587
+ # Fallback to create_breakend_pairs if no VCF path available
1588
+ from .variant_utils import create_breakend_pairs
1589
+
1590
+ breakend_pairs = create_breakend_pairs(phase2_variants)
1591
+
1592
+ if len(breakend_pairs) > 0:
1593
+ if verbose:
1594
+ print(f" Created {len(breakend_pairs)} BND pairs for processing")
1595
+
1596
+ # Transform BND coordinates using offset tracker from Phase 1
1597
+ for pair in breakend_pairs:
1598
+ # Handle both BreakendPair objects and tuple pairs
1599
+ if hasattr(pair, "breakend1"):
1600
+ bnd1 = pair.breakend1
1601
+ bnd2 = pair.breakend2
1602
+ else:
1603
+ bnd1, bnd2 = pair
1604
+
1605
+ original_pos1 = bnd1.pos
1606
+ original_pos2 = bnd2.pos
1607
+
1608
+ # Transform coordinates to account for applied Phase 1 variants
1609
+ if hasattr(offset_tracker, "get_offset_at_position"):
1610
+ bnd1_offset = offset_tracker.get_offset_at_position(
1611
+ bnd1.chrom, bnd1.pos - 1
1612
+ )
1613
+ bnd2_offset = offset_tracker.get_offset_at_position(
1614
+ bnd2.chrom, bnd2.pos - 1
1615
+ )
1616
+ bnd1.pos += bnd1_offset
1617
+ bnd2.pos += bnd2_offset
1618
+
1619
+ if verbose and (bnd1_offset != 0 or bnd2_offset != 0):
1620
+ print(
1621
+ f" 📍 Transformed coordinates: {bnd1.chrom}:{original_pos1}→{bnd1.pos}, {bnd2.chrom}:{original_pos2}→{bnd2.pos}"
1622
+ )
1623
+
1624
+ # Note: BND semantic classification (SV_BND_DUP, SV_BND_INV) is handled by
1625
+ # group_variants_by_semantic_type() and _preprocess_bnd_derived_variants().
1626
+ # Remaining BND variants are processed as translocations using ChimericSequenceBuilder.
1627
+
1628
+ # Enhanced frozen region validation for BND breakpoints
1629
+ validated_pairs = []
1630
+ skipped_pairs = []
1631
+
1632
+ for pair in breakend_pairs:
1633
+ # Handle both BreakendPair objects and tuple pairs
1634
+ if hasattr(pair, "breakend1"):
1635
+ bnd1 = pair.breakend1
1636
+ bnd2 = pair.breakend2
1637
+ else:
1638
+ bnd1, bnd2 = pair
1639
+
1640
+ # Check whether both breakpoints fall in non-frozen regions.
1641
+ # Frozen-region tracking is handled per-chromosome by FrozenRegionTracker
1642
+ # during Phase 1; cross-phase conflict detection is not yet implemented,
1643
+ # so this list stays empty and every pair currently passes validation.
1644
+
1645
+ breakpoint_conflicts = []
1649
+ if breakpoint_conflicts:
1650
+ skipped_pairs.append(pair)
1651
+ if verbose:
1652
+ conflicts_msg = "; ".join(breakpoint_conflicts)
1653
+ print(
1654
+ f" ⚠️ Skipping BND pair {bnd1.id}-{bnd2.id}: {conflicts_msg}"
1655
+ )
1656
+ else:
1657
+ validated_pairs.append(pair)
1658
+
1659
+ if verbose and len(skipped_pairs) > 0:
1660
+ print(
1661
+ f" 📍 Skipped {len(skipped_pairs)} BND pairs due to frozen region conflicts"
1662
+ )
1663
+
1664
+ # Create chimeric sequences using validated pairs only
1665
+ sequence_builder = ChimericSequenceBuilder(modified_sequences)
1666
+
1667
+ # Initialize segment tracker with original reference chromosomes only
1668
+ original_ref_sequences = {
1669
+ chrom: seq
1670
+ for chrom, seq in modified_sequences.items()
1671
+ if "_fusion_" not in chrom
1672
+ }
1673
+ segment_tracker = ChromosomeSegmentTracker(original_ref_sequences)
1674
+
1675
+ for i, pair in enumerate(validated_pairs):
1676
+ # Handle both BreakendPair objects and tuple pairs for display
1677
+ if hasattr(pair, "breakend1"):
1678
+ bnd1_id, bnd2_id = pair.breakend1.id, pair.breakend2.id
1679
+ pair_tuple = (pair.breakend1, pair.breakend2)
1680
+ bnd1, bnd2 = pair.breakend1, pair.breakend2
1681
+ else:
1682
+ bnd1_id, bnd2_id = pair[0].id, pair[1].id
1683
+ pair_tuple = pair
1684
+ bnd1, bnd2 = pair[0], pair[1]
1685
+
1686
+ if verbose:
1687
+ print(
1688
+ f" 🔄 Creating fusion {i+1}/{len(validated_pairs)}: {bnd1_id}-{bnd2_id}"
1689
+ )
1690
+
1691
+ try:
1692
+ fusion_name, fusion_seq = sequence_builder.create_fusion_from_pair(
1693
+ pair_tuple
1694
+ )
1695
+ modified_sequences[fusion_name] = fusion_seq
1696
+ total_processed += 2 # Count both BNDs in the pair
1697
+
1698
+ if verbose:
1699
+ print(
1700
+ f" ✅ Created fusion: {fusion_name} ({len(fusion_seq)} bp)"
1701
+ )
1702
+
1703
+ # Track chromosome segment usage based on fusion orientations
1704
+ pos1_0 = bnd1.pos - 1 # Convert to 0-based
1705
+ pos2_0 = bnd2.pos - 1 # Convert to 0-based
1706
+ seq1_len = len(modified_sequences[bnd1.chrom])
1707
+ seq2_len = len(modified_sequences[bnd2.chrom])
1708
+
1709
+ # Track segments used based on the actual fusion logic from prototype
1710
+ if bnd1.orientation == "t]p]" and bnd2.orientation == "t]p]":
1711
+ # seq1[:pos1] + RC(seq2[:pos2]) - uses chromosome prefixes
1712
+ segment_tracker.add_used_segment(
1713
+ bnd1.chrom, 0, bnd1.pos, verbose
1714
+ ) # VCF pos as count
1715
+ segment_tracker.add_used_segment(
1716
+ bnd2.chrom, 0, bnd2.pos, verbose
1717
+ ) # VCF pos as count
1718
+ elif bnd1.orientation == "]p]t" and bnd2.orientation == "t[p[":
1719
+ # seq2[:pos2] + seq1[pos1_0:] - prefix from seq2, suffix from seq1
1720
+ segment_tracker.add_used_segment(
1721
+ bnd2.chrom, 0, bnd2.pos, verbose
1722
+ ) # VCF pos as count
1723
+ segment_tracker.add_used_segment(
1724
+ bnd1.chrom, pos1_0, seq1_len, verbose
1725
+ )
1726
+ elif bnd1.orientation == "[p[t" and bnd2.orientation == "[p[t":
1727
+ # RC(seq2[pos2_0:]) + seq1[pos1_0:] - uses chromosome suffixes
1728
+ segment_tracker.add_used_segment(
1729
+ bnd2.chrom, pos2_0, seq2_len, verbose
1730
+ )
1731
+ segment_tracker.add_used_segment(
1732
+ bnd1.chrom, pos1_0, seq1_len, verbose
1733
+ )
1734
+ elif bnd1.orientation == "t[p[" and bnd2.orientation == "t[p[":
1735
+ # seq1[:pos1_0] + seq2[pos2_0:] - prefix from seq1, suffix from seq2
1736
+ segment_tracker.add_used_segment(bnd1.chrom, 0, pos1_0, verbose)
1737
+ segment_tracker.add_used_segment(
1738
+ bnd2.chrom, pos2_0, seq2_len, verbose
1739
+ )
1740
+ elif bnd1.orientation == "t[p[" and bnd2.orientation == "]p]t":
1741
+ # seq1[:pos1_0] + seq2[pos2_0:] - prefix from seq1, suffix from seq2
1742
+ segment_tracker.add_used_segment(bnd1.chrom, 0, pos1_0, verbose)
1743
+ segment_tracker.add_used_segment(
1744
+ bnd2.chrom, pos2_0, seq2_len, verbose
1745
+ )
1746
+ elif bnd1.orientation == "]p]t" and bnd2.orientation == "]p]t":
1747
+ # seq2[:pos2] + seq1[pos1_0:] - prefix from seq2, suffix from seq1
1748
+ segment_tracker.add_used_segment(
1749
+ bnd2.chrom, 0, bnd2.pos, verbose
1750
+ ) # VCF pos as count
1751
+ segment_tracker.add_used_segment(
1752
+ bnd1.chrom, pos1_0, seq1_len, verbose
1753
+ )
1754
+ else:
1755
+ # Unknown orientation patterns - track conservatively
1756
+ if verbose:
1757
+ print(
1758
+ f" ⚠️ Unknown orientation pattern: {bnd1.orientation} + {bnd2.orientation}"
1759
+ )
1760
+ segment_tracker.add_used_segment(bnd1.chrom, 0, pos1_0, verbose)
1761
+ segment_tracker.add_used_segment(
1762
+ bnd2.chrom, pos2_0, seq2_len, verbose
1763
+ )
1764
+
1765
+ except Exception as e:
1766
+ if verbose:
1767
+ print(
1768
+ f" ⚠️ Failed to create fusion for {bnd1_id}-{bnd2_id}: {e}"
1769
+ )
1770
+
1771
+ # Calculate and add leftover sequences
1772
+ leftover_sequences = segment_tracker.get_leftover_sequences(verbose)
1773
+
1774
+ # Remove original chromosomes that were consumed by fusions and replace with leftovers
1775
+ chromosomes_with_fusions = set()
1776
+ for seq_name in list(modified_sequences.keys()):
1777
+ if "_fusion_" in seq_name:
1778
+ # Extract chromosome names from fusion sequence names
1779
+ parts = seq_name.split("_")
1780
+ if len(parts) >= 2:
1781
+ chromosomes_with_fusions.add(parts[0])
1782
+ chromosomes_with_fusions.add(parts[1])
1783
+
1784
+ # Remove consumed chromosomes and add their leftovers
1785
+ for chrom in chromosomes_with_fusions:
1786
+ if chrom in modified_sequences:
1787
+ del modified_sequences[chrom]
1788
+ if verbose:
1789
+ print(f" 🗑️ Removed consumed chromosome: {chrom}")
1790
+
1791
+ # Add leftover sequences
1792
+ modified_sequences.update(leftover_sequences)
1793
+
1794
+ # FINAL STEP: Encode sequences and create output
1795
+ # Preserve reference chromosome order, then append fusion/leftover sequences
1796
+ reference_chroms = list(reference.keys())
1797
+
1798
+ # First, add chromosomes in reference order
1799
+ for chrom in reference_chroms:
1800
+ if chrom in modified_sequences:
1801
+ seq = modified_sequences[chrom]
1802
+ if encode:
1803
+ if encoder:
1804
+ personal_genome[chrom] = encoder(seq)
1805
+ else:
1806
+ personal_genome[chrom] = encode_seq(seq)
1807
+ else:
1808
+ personal_genome[chrom] = seq
1809
+
1810
+ # Then, add fusion and leftover sequences (not in original reference)
1811
+ for chrom, seq in modified_sequences.items():
1812
+ if chrom not in reference_chroms:
1813
+ if encode:
1814
+ if encoder:
1815
+ personal_genome[chrom] = encoder(seq)
1816
+ else:
1817
+ personal_genome[chrom] = encode_seq(seq)
1818
+ else:
1819
+ personal_genome[chrom] = seq
1820
+
1821
+ if verbose:
1822
+ total_variants = len(variants_df)
1823
+ sequences_msg = f"{len(personal_genome):,} sequences"
1824
+ if any("_fusion_" in name for name in personal_genome.keys()):
1825
+ fusion_count = sum(
1826
+ 1 for name in personal_genome.keys() if "_fusion_" in name
1827
+ )
1828
+ leftover_count = len(personal_genome) - fusion_count
1829
+ sequences_msg = (
1830
+ f"{fusion_count} fusions, {leftover_count} leftover sequences"
1831
+ )
1832
+
1833
+ print(
1834
+ f"🧬 Completed: {total_processed:,}/{total_variants:,} variants processed → {sequences_msg}"
1835
+ )
1836
+
1837
+ return personal_genome
1838
+
1839
+
1840
+ def _generate_sequence_metadata(chunk_variants, seq_len):
1841
+ """
1842
+ Generate standardized metadata for sequence functions.
1843
+
1844
+ This centralizes metadata generation to eliminate duplication across
1845
+ get_alt_sequences, get_ref_sequences, and get_alt_ref_sequences.
1846
+
1847
+ Args:
1848
+ chunk_variants: DataFrame of variants for this chunk
1849
+ seq_len: Length of the sequence window
1850
+
1851
+ Returns:
1852
+ pandas.DataFrame: Comprehensive metadata with standardized columns
1853
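+
+ Example (illustrative; a minimal one-row variants DataFrame):
+ >>> chunk = pd.DataFrame([{"chrom": "chr1", "pos1": 150, "ref": "A",
+ ... "alt": "T", "variant_type": "SNV"}])
+ >>> meta = _generate_sequence_metadata(chunk, seq_len=100)
+ >>> int(meta.loc[0, "window_start"]), int(meta.loc[0, "window_end"])
+ (99, 199)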
+ """
1854
+ metadata = []
1855
+
1856
+ for _, var in chunk_variants.iterrows():
1857
+ # Basic position calculations
1858
+ pos = var["pos1"] # 1-based VCF position
1859
+ genomic_pos = pos - 1 # Convert to 0-based
1860
+ half_len = seq_len // 2
1861
+ window_start = max(0, genomic_pos - half_len)
1862
+ window_end = window_start + seq_len
1863
+
1864
+ # Variant classification
1865
+ variant_type = var.get("variant_type", "unknown")
1866
+
1867
+ # Build minimal metadata dictionary
1868
+ meta_dict = {
1869
+ "chrom": var["chrom"],
1870
+ "window_start": window_start,
1871
+ "window_end": window_end,
1872
+ "variant_pos0": genomic_pos, # 0-based absolute position
1873
+ "variant_pos1": pos, # 1-based absolute position
1874
+ "ref": var["ref"],
1875
+ "alt": var["alt"],
1876
+ "variant_type": variant_type,
1877
+ }
1878
+
1879
+ # Add sym_variant_end ONLY for symbolic alleles (<INV>, <DUP>, etc.)
1880
+ if variant_type.startswith("SV_") and "<" in var["alt"]:
1881
+ if "info" in var and var["info"] and var["info"] != ".":
1882
+ parsed_info = parse_vcf_info(var["info"])
1883
+ sym_end = parsed_info.get("END")
1884
+ if sym_end is not None:
1885
+ meta_dict["sym_variant_end"] = sym_end
1886
+
1887
+ metadata.append(meta_dict)
1888
+
1889
+ return pd.DataFrame(metadata)
1890
+
1891
+
1892
+ def _generate_bnd_ref_sequences(
1893
+ breakend_pairs, reference, seq_len, encode=True, encoder=None
1894
+ ):
1895
+ """
1896
+ Generate dual reference sequences for BND variants (no ALT sequences).
1897
+
1898
+ Args:
1899
+ breakend_pairs: List of breakend pairs from load_breakend_variants
1900
+ reference: Reference genome dictionary
1901
+ seq_len: Length of sequence window
1902
+ encode: Whether to encode sequences
1903
+ encoder: Optional custom encoder
1904
+
1905
+ Returns:
1906
+ Tuple of (left_ref_sequences, right_ref_sequences, metadata)
1907
+ left_ref_sequences contains left breakend reference (+ N-padding)
1908
+ right_ref_sequences contains right breakend reference (N-padding +)
1909
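+
+ Layout sketch (illustrative, seq_len=8 with 4 bases of context per side):
+ left_ref: "ACGT" + "NNNN" (context before bnd1, N-pad for the partner)
+ right_ref: "NNNN" + "TGCA" (N-pad for the partner, context from bnd2 on)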
+ """
1910
+ from .sequence_utils import encode_seq, rc_str
1911
+
1912
+ left_ref_sequences = []
1913
+ right_ref_sequences = []
1914
+ metadata = []
1915
+
1916
+ if not breakend_pairs:
1917
+ return left_ref_sequences, right_ref_sequences, metadata
1918
+
1919
+ # Process each breakend pair
1920
+ for bnd1, bnd2 in breakend_pairs:
1921
+ try:
1922
+ # Get chromosome sequences
1923
+ seq1 = str(reference[bnd1.chrom]) if bnd1.chrom in reference else ""
1924
+ seq2 = str(reference[bnd2.chrom]) if bnd2.chrom in reference else ""
1925
+
1926
+ if not seq1 or not seq2:
1927
+ continue
1928
+
1929
+ # Calculate window centered on first breakend
1930
+ center_pos = bnd1.pos - 1 # Convert to 0-based
1931
+
1932
+ # Insertions need no special handling here: the N-padding added below
1933
+ # already stands in for the missing fusion partner plus any inserted bases
1935
+
1936
+ # Generate left reference sequence (sequence before breakend + right-side N-padding)
1937
+ # For BNDs, we want to show what was there BEFORE the fusion point
1938
+ # Then pad the right side with N's to represent the missing fusion partner + insertion
1939
+ half_len = seq_len // 2
1940
+
1941
+ # Extract sequence leading up to the breakend (before the fusion point)
1942
+ left_start = max(0, center_pos - half_len)
1943
+ left_end = center_pos # Stop at the breakend position
1944
+ left_ref_raw = seq1[left_start:left_end]
1945
+
1946
+ # Pad the right side to represent where the fusion partner + insertion would attach
1947
+ left_padding_needed = seq_len - len(left_ref_raw)
1948
+ # Note: For BND with insertion, this padding represents both the missing chromosome and the insertion
1949
+ left_ref_seq = left_ref_raw + "N" * left_padding_needed
1950
+
1951
+ # Generate right reference sequence (left-side N-padding + sequence after breakend)
1952
+ # For the right side, we want to show what was there AFTER the fusion point
1953
+ # Pad the left side with N's to represent the missing fusion partner + insertion
1954
+ bnd2_center = bnd2.pos - 1 # Convert to 0-based
1955
+
1956
+ # Extract sequence starting from the breakend (after the fusion point)
1957
+ right_start = bnd2_center # Start at the breakend position
1958
+ right_end = min(len(seq2), bnd2_center + half_len)
1959
+ right_ref_raw = seq2[right_start:right_end]
1960
+
1961
+ # Pad the left side to represent where the fusion partner + insertion would attach
1962
+ right_padding_needed = seq_len - len(right_ref_raw)
1963
+ # Note: For BND with insertion, this padding represents both the missing chromosome and the insertion
1964
+ right_ref_seq = "N" * right_padding_needed + right_ref_raw
1965
+
1966
+ # Apply reverse complement if needed based on orientation
1967
+ if bnd1.orientation in ["t]p]", "[p[t"]: # orientations requiring RC
1968
+ left_ref_seq = rc_str(left_ref_seq)
1969
+ if bnd2.orientation in ["t]p]", "[p[t"]:
1970
+ right_ref_seq = rc_str(right_ref_seq)
1971
+
1972
+ # Ensure sequences are exactly seq_len
1973
+ left_ref_seq = left_ref_seq[:seq_len].ljust(seq_len, "N")
1974
+ right_ref_seq = right_ref_seq[:seq_len].ljust(seq_len, "N")
1975
+
1976
+ left_ref_sequences.append(left_ref_seq)
1977
+ right_ref_sequences.append(right_ref_seq)
1978
+
1979
+ # Create metadata for this BND
1980
+ window_start = max(0, center_pos - seq_len // 2)
1981
+ window_end = window_start + seq_len
1982
+ metadata.append(
1983
+ {
1984
+ "chrom": bnd1.chrom,
1985
+ "window_start": window_start,
1986
+ "window_end": window_end,
1987
+ "variant_pos0": center_pos,
1988
+ "variant_pos1": bnd1.pos,
1989
+ "ref": bnd1.ref,
1990
+ "alt": bnd1.alt,
1991
+ "variant_type": "SV_BND",
1992
+ "mate_chrom": bnd2.chrom,
1993
+ "mate_pos": bnd2.pos,
1994
+ "orientation_1": bnd1.orientation,
1995
+ "orientation_2": bnd2.orientation,
1996
+ }
1997
+ )
1998
+
1999
+ except Exception as e:
2000
+ # Log error but continue processing other BNDs
2001
+ import warnings
2002
+
2003
+ warnings.warn(f"Failed to process BND pair {bnd1.id}-{bnd2.id}: {e}")
2004
+ continue
2005
+
2006
+ # Encode sequences if requested
2007
+ if encode and left_ref_sequences:
2008
+ # Encode each sequence individually and collect them
2009
+ encoded_left_ref = []
2010
+ encoded_right_ref = []
2011
+
2012
+ for i in range(len(left_ref_sequences)):
2013
+ encoded_left_ref.append(encode_seq(left_ref_sequences[i], encoder))
2014
+ encoded_right_ref.append(encode_seq(right_ref_sequences[i], encoder))
2015
+
2016
+ # Stack the encoded sequences
2017
+ if TORCH_AVAILABLE:
2018
+ left_ref_sequences = (
2019
+ torch.stack(encoded_left_ref) if encoded_left_ref else []
2020
+ )
2021
+ right_ref_sequences = (
2022
+ torch.stack(encoded_right_ref) if encoded_right_ref else []
2023
+ )
2024
+ else:
2025
+ left_ref_sequences = np.stack(encoded_left_ref) if encoded_left_ref else []
2026
+ right_ref_sequences = (
2027
+ np.stack(encoded_right_ref) if encoded_right_ref else []
2028
+ )
2029
+
2030
+ return left_ref_sequences, right_ref_sequences, metadata
2031
+
2032
+
2033
+ def _generate_bnd_sequences(
2034
+ breakend_pairs, reference, seq_len, encode=True, encoder=None
2035
+ ):
2036
+ """
2037
+ Generate ALT and reference sequences for BND variants.
2038
+
2039
+ Args:
2040
+ breakend_pairs: List of breakend pairs from load_breakend_variants
2041
+ reference: Reference genome dictionary
2042
+ seq_len: Length of sequence window
2043
+ encode: Whether to encode sequences
2044
+ encoder: Optional custom encoder
2045
+
2046
+ Returns:
2047
+ Tuple of (alt_sequences, left_ref_sequences, right_ref_sequences, metadata)
2048
+ For BNDs: alt_sequences contains fusion sequences
2049
+ left_ref_sequences contains left breakend reference (+ N-padding)
2050
+ right_ref_sequences contains right breakend reference (N-padding +)
2051
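+
+ Example (a minimal sketch; `pairs` and `reference` stand for breakend
+ pairs and a genome dict loaded elsewhere; encode=False keeps strings):
+ >>> alts, lrefs, rrefs, meta = _generate_bnd_sequences(
+ ... pairs, reference, seq_len=100, encode=False)
+ >>> len(alts) == len(lrefs) == len(rrefs) == len(meta)
+ True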
+ """
2052
+ from .sequence_utils import encode_seq, rc_str
2053
+
2054
+ alt_sequences = []
2055
+ left_ref_sequences = []
2056
+ right_ref_sequences = []
2057
+ metadata = []
2058
+
2059
+ if not breakend_pairs:
2060
+ return alt_sequences, left_ref_sequences, right_ref_sequences, metadata
2061
+
2062
+ # Process each breakend pair
2063
+ for bnd1, bnd2 in breakend_pairs:
2064
+ try:
2065
+ # Get chromosome sequences
2066
+ seq1 = str(reference[bnd1.chrom]) if bnd1.chrom in reference else ""
2067
+ seq2 = str(reference[bnd2.chrom]) if bnd2.chrom in reference else ""
2068
+
2069
+ if not seq1 or not seq2:
2070
+ continue
2071
+
2072
+ # Generate fusion sequence using existing ChimericSequenceBuilder
2073
+ builder = ChimericSequenceBuilder({bnd1.chrom: seq1, bnd2.chrom: seq2})
2074
+ fusion_name, fusion_seq = builder.create_fusion_from_pair((bnd1, bnd2))
2075
+
2076
+ # Detect if this is a BND with insertion and center appropriately
2077
+ has_insertion = bool(bnd1.inserted_seq or bnd2.inserted_seq)
+ center_pos = bnd1.pos - 1 # 0-based anchor; always defined so the metadata below never sees an unbound name
2078
+
2079
+ if has_insertion:
2080
+ # For BND with insertion, center window on the inserted sequence
2081
+ # Use segment metadata to find where the novel sequence is located
2082
+ segments = builder.get_sequence_segments(fusion_name)
2083
+ novel_segment = None
2084
+ for seg in segments:
2085
+ if seg.source_type == "novel":
2086
+ novel_segment = seg
2087
+ break
2088
+
2089
+ if novel_segment:
2090
+ # Center window on the novel sequence
2091
+ novel_center = (
2092
+ novel_segment.start_pos + novel_segment.end_pos
2093
+ ) // 2
2094
+ window_start = max(0, novel_center - seq_len // 2)
2095
+ window_end = window_start + seq_len
2096
+ else:
2097
+ # Fallback to standard centering if no novel segment found
2098
+ center_pos = bnd1.pos - 1 # Convert to 0-based
2099
+ window_start = max(0, center_pos - seq_len // 2)
2100
+ window_end = window_start + seq_len
2101
+ else:
2102
+ # Standard BND: center on first breakend position
2103
+ center_pos = bnd1.pos - 1 # Convert to 0-based
2104
+ window_start = max(0, center_pos - seq_len // 2)
2105
+ window_end = window_start + seq_len
2106
+
2107
+ # Generate ALT sequence (fusion sequence window)
2108
+ if len(fusion_seq) >= seq_len:
2109
+ alt_seq = fusion_seq[window_start:window_end]
2110
+ else:
2111
+ # Pad if fusion is shorter than window
2112
+ alt_seq = fusion_seq + "N" * (seq_len - len(fusion_seq))
2113
+
2114
+ # Generate reference sequences with appropriate padding
2115
+ # For BNDs with insertions, the N-padding below implicitly covers the
2116
+ # inserted bases along with the missing partner chromosome
2117
+ half_len = seq_len // 2
2118
+
2119
+ # Generate left reference sequence (sequence before breakend + right-side N-padding)
2120
+ # For BNDs, we want to show what was there BEFORE the fusion point
2121
+ # Then pad the right side with N's to represent the missing fusion partner + insertion
2122
+ # The reference windows are built identically with and without an
2123
+ # insertion, so no branch on has_insertion is needed here
2125
+ left_start = max(0, center_pos - half_len)
2126
+ left_end = center_pos # Stop at the breakend position
2127
+ left_ref_raw = seq1[left_start:left_end]
2128
+
2129
+ # For BND with insertion, pad for both the missing chromosome and the insertion
2130
+ left_padding_needed = seq_len - len(left_ref_raw)
2131
+ # Note: The padding represents what's missing (other chromosome + insertion)
2132
+ # but we don't artificially inflate it since the user wants to see proper N-padding
2133
+ left_ref_seq = left_ref_raw + "N" * left_padding_needed
2134
+
2135
+ # Generate right reference sequence (left-side N-padding + sequence after breakend)
2136
+ # For the right side, we want to show what was there AFTER the fusion point
2137
+ # Pad the left side with N's to represent the missing fusion partner + insertion
2138
+ bnd2_center = bnd2.pos - 1 # Convert to 0-based
2139
+
2140
+ # Extract sequence starting from the breakend (after the fusion point)
2141
+ right_start = bnd2_center # Start at the breakend position
2142
+ right_end = min(len(seq2), bnd2_center + half_len)
2143
+ right_ref_raw = seq2[right_start:right_end]
2144
+
2145
+ # For BND with insertion, pad for both the missing chromosome and the insertion
2146
+ right_padding_needed = seq_len - len(right_ref_raw)
2147
+ # Note: The padding represents what's missing (other chromosome + insertion)
2148
+ # but we don't artificially inflate it since the user wants to see proper N-padding
2149
+ right_ref_seq = "N" * right_padding_needed + right_ref_raw
2150
+
2151
+ # Apply reverse complement if needed based on orientation
2152
+ if bnd1.orientation in ["t]p]", "[p[t"]: # orientations requiring RC
2153
+ left_ref_seq = rc_str(left_ref_seq)
2154
+ if bnd2.orientation in ["t]p]", "[p[t"]:
2155
+ right_ref_seq = rc_str(right_ref_seq)
2156
+
2157
+ # Ensure sequences are exactly seq_len
2158
+ alt_seq = alt_seq[:seq_len].ljust(seq_len, "N")
2159
+ left_ref_seq = left_ref_seq[:seq_len].ljust(seq_len, "N")
2160
+ right_ref_seq = right_ref_seq[:seq_len].ljust(seq_len, "N")
2161
+
2162
+ alt_sequences.append(alt_seq)
2163
+ left_ref_sequences.append(left_ref_seq)
2164
+ right_ref_sequences.append(right_ref_seq)
2165
+
2166
+ # Create metadata for this BND
2167
+ metadata.append(
2168
+ {
2169
+ "chrom": bnd1.chrom,
2170
+ "window_start": window_start,
2171
+ "window_end": window_end,
2172
+ "variant_pos0": center_pos,
2173
+ "variant_pos1": bnd1.pos,
2174
+ "ref": bnd1.ref,
2175
+ "alt": bnd1.alt,
2176
+ "variant_type": "SV_BND",
2177
+ "mate_chrom": bnd2.chrom,
2178
+ "mate_pos": bnd2.pos,
2179
+ "orientation_1": bnd1.orientation,
2180
+ "orientation_2": bnd2.orientation,
2181
+ "fusion_name": fusion_name,
2182
+ }
2183
+ )
2184
+
2185
+ except Exception as e:
2186
+ # Log error but continue processing other BNDs
2187
+ import warnings
2188
+
2189
+ warnings.warn(f"Failed to process BND pair {bnd1.id}-{bnd2.id}: {e}")
2190
+ continue
2191
+
2192
+ # Encode sequences if requested
2193
+ if encode and alt_sequences:
2194
+ # Encode each sequence individually and collect them
2195
+ encoded_alt = []
2196
+ encoded_left_ref = []
2197
+ encoded_right_ref = []
2198
+
2199
+ for i in range(len(alt_sequences)):
2200
+ encoded_alt.append(encode_seq(alt_sequences[i], encoder))
2201
+ encoded_left_ref.append(encode_seq(left_ref_sequences[i], encoder))
2202
+ encoded_right_ref.append(encode_seq(right_ref_sequences[i], encoder))
2203
+
2204
+ # Stack the encoded sequences
2205
+ if TORCH_AVAILABLE:
2206
+ alt_sequences = torch.stack(encoded_alt) if encoded_alt else []
2207
+ left_ref_sequences = (
2208
+ torch.stack(encoded_left_ref) if encoded_left_ref else []
2209
+ )
2210
+ right_ref_sequences = (
2211
+ torch.stack(encoded_right_ref) if encoded_right_ref else []
2212
+ )
2213
+ else:
2214
+ alt_sequences = np.stack(encoded_alt) if encoded_alt else []
2215
+ left_ref_sequences = np.stack(encoded_left_ref) if encoded_left_ref else []
2216
+ right_ref_sequences = (
2217
+ np.stack(encoded_right_ref) if encoded_right_ref else []
2218
+ )
2219
+
2220
+ return alt_sequences, left_ref_sequences, right_ref_sequences, metadata
2221
+
2222
+
2223
+ def get_alt_sequences(
2224
+ reference_fn,
2225
+ variants_fn,
2226
+ seq_len,
2227
+ encode=True,
2228
+ n_chunks=1,
2229
+ encoder=None,
2230
+ auto_map_chromosomes=False,
2231
+ ):
2232
+ """
2233
+ Create sequence windows centered on each variant position with variants applied.
2234
+ Now supports both standard variants and BND variants.
2235
+
2236
+ Args:
2237
+ reference_fn: Path to reference genome file or dictionary-like object
2238
+ variants_fn: Path to VCF file (string) or DataFrame with variant data.
2239
+ For DataFrames, position column can be 'pos', 'pos1', or assumes second column is position.
2240
+ seq_len: Length of the sequence window
2241
+ encode: Return sequences as one-hot encoded numpy arrays (default: True)
2242
+ n_chunks: Number of chunks to split variants into (default: 1)
2243
+ encoder: Optional custom encoding function
2244
+ auto_map_chromosomes: Automatically map chromosome names between VCF and reference
2245
+ when they don't match exactly (default: False).
2246
+
2247
+ Yields:
2248
+ Tuple containing (sequences, metadata_df) where:
2249
+ If encode=True: sequences is a tensor/array of shape (chunk_size, seq_len, 4) for each chunk
2250
+ If encode=False: sequences is a list of tuples containing (chrom, start, end, sequence_string) for each chunk
2251
+ metadata_df is a DataFrame with variant information including position offsets
2252
+ For BND variants: sequences contain fusion sequences
2253
+
2254
+ Raises:
2255
+ ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't match
2256
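+
+ Example (a minimal sketch; `ref`, `vcf`, and `model` are placeholders
+ for your own reference path, VCF path, and prediction model):
+ >>> for alt_seqs, metadata in get_alt_sequences(ref, vcf, seq_len=1000,
+ ... n_chunks=4):
+ ... predictions = model.predict(alt_seqs)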
+ """
2257
+ # Load reference and variants, separating BNDs from standard variants
2258
+ from .variant_utils import load_breakend_variants
2259
+
2260
+ reference = _load_reference(reference_fn)
2261
+
2262
+ # Load variants and separate BNDs
2263
+ standard_variants, breakend_pairs = load_breakend_variants(variants_fn)
2264
+
2265
+ # Combine chromosome names from both standard variants and breakend pairs
2266
+ ref_chroms = set(reference.keys())
2267
+ standard_chroms = (
2268
+ set(standard_variants["chrom"].unique())
2269
+ if len(standard_variants) > 0
2270
+ else set()
2271
+ )
2272
+ breakend_chroms = set()
2273
+ for bnd1, bnd2 in breakend_pairs:
2274
+ breakend_chroms.add(bnd1.chrom)
2275
+ breakend_chroms.add(bnd2.chrom)
2276
+ vcf_chroms = standard_chroms | breakend_chroms
2277
+
2278
+ # Use chromosome matching to handle name mismatches
2279
+ mapping, unmatched = match_chromosomes_with_report(
2280
+ ref_chroms, vcf_chroms, verbose=True, auto_map_chromosomes=auto_map_chromosomes
2281
+ )
2282
+
2283
+ # Apply chromosome name mapping to standard variants
2284
+ if mapping and len(standard_variants) > 0:
2285
+ standard_variants = apply_chromosome_mapping(standard_variants, mapping)
2286
+
2287
+ # Apply chromosome name mapping to breakend pairs
2288
+ if mapping and breakend_pairs:
2289
+ updated_pairs = []
2290
+ for bnd1, bnd2 in breakend_pairs:
2291
+ # Update chromosome names in breakend objects if needed
2292
+ if bnd1.chrom in mapping:
2293
+ bnd1.chrom = mapping[bnd1.chrom]
2294
+ if bnd2.chrom in mapping:
2295
+ bnd2.chrom = mapping[bnd2.chrom]
2296
+ if bnd1.mate_chrom in mapping:
2297
+ bnd1.mate_chrom = mapping[bnd1.mate_chrom]
2298
+ if bnd2.mate_chrom in mapping:
2299
+ bnd2.mate_chrom = mapping[bnd2.mate_chrom]
2300
+ updated_pairs.append((bnd1, bnd2))
2301
+ breakend_pairs = updated_pairs
2302
+
2303
+ # Process standard variants and BNDs separately, then combine results
2304
+ # Standard variants are chunked below; BNDs are yielded as a single batch (BND chunking is more complex)
2305
+
2306
+ # Process standard variants first - yield each chunk individually
2307
+ if len(standard_variants) > 0:
2308
+ # Split standard variants into chunks
2309
+ std_indices = np.array_split(np.arange(len(standard_variants)), n_chunks)
2310
+ std_variant_chunks = (
2311
+ standard_variants.iloc[chunk_indices].reset_index(drop=True)
2312
+ for chunk_indices in std_indices
2313
+ if len(chunk_indices) > 0
2314
+ )
2315
+
2316
+ for chunk_variants in std_variant_chunks:
2317
+ sequences = []
2318
+
2319
+ # Vectorized calculation of window positions for ALL variants in chunk
2320
+ positions = chunk_variants["pos1"].values - 1 # Convert to 0-based
2321
+ half_len = seq_len // 2
2322
+ window_starts = positions - half_len
2323
+ window_ends = window_starts + seq_len
2324
+
2325
+ # Cache for reference chromosome access
2326
+ current_chrom = None
2327
+ ref_seq = None
2328
+ chrom_length = None
2329
+
2330
+ # Track valid indices for metadata filtering
2331
+ valid_indices = []
2332
+
2333
+ # Process each variant individually (applying only that single variant)
2334
+ for idx, (_, var) in enumerate(chunk_variants.iterrows()):
2335
+ chrom = var["chrom"]
2336
+ pos = var["pos1"]
2337
+
2338
+ # Load reference chromosome (with caching)
2339
+ if chrom != current_chrom:
2340
+ if chrom not in reference:
2341
+ warnings.warn(
2342
+ f"Chromosome {chrom} not found in reference. Skipping variant at {chrom}:{pos}."
2343
+ )
2344
+ # Skip this variant - don't add to sequences
2345
+ continue
2346
+ ref_seq = str(reference[chrom])
2347
+ chrom_length = len(ref_seq)
2348
+ current_chrom = chrom
2349
+
2350
+ # Track that this variant was successfully processed
2351
+ valid_indices.append(idx)
2352
+
2353
+ # Use pre-calculated window positions
2354
+ window_start = window_starts[idx]
2355
+ window_end = window_ends[idx]
2356
+
2357
+ # Calculate padding and actual bounds
2358
+ if window_start < 0:
2359
+ left_pad = -window_start
2360
+ actual_start = 0
2361
+ else:
2362
+ left_pad = 0
2363
+ actual_start = window_start
2364
+
2365
+ if window_end > chrom_length:
2366
+ right_pad = window_end - chrom_length
2367
+ actual_end = chrom_length
2368
+ else:
2369
+ right_pad = 0
2370
+ actual_end = window_end
2371
+
2372
+ # Apply ONLY this single variant to reference sequence
2373
+ single_var_df = pd.DataFrame([var])
2374
+ applicator = VariantApplicator(ref_seq, single_var_df)
2375
+ modified_seq, _ = applicator.apply_variants()
2376
+
2377
+ # Extract window from the single-variant modified sequence
2378
+ window_seq = modified_seq[actual_start:actual_end]
2379
+
2380
+ # Add padding if needed
2381
+ if left_pad > 0:
2382
+ window_seq = "N" * left_pad + window_seq
2383
+ if right_pad > 0:
2384
+ window_seq = window_seq + "N" * right_pad
2385
+
2386
+ # Truncate or pad as needed
2387
+ if len(window_seq) < seq_len:
2388
+ window_seq += "N" * (seq_len - len(window_seq))
2389
+ else:
2390
+ window_seq = window_seq[:seq_len]
2391
+
2392
+ # Ensure correct length
2393
+ if len(window_seq) != seq_len:
2394
+ warnings.warn(
2395
+ f"Sequence length mismatch for variant at {chrom}:{pos}. "
2396
+ f"Expected {seq_len}, got {len(window_seq)}"
2397
+ )
2398
+
2399
+ if encode:
2400
+ sequences.append(encode_seq(window_seq, encoder))
2401
+ else:
2402
+ genomic_pos = positions[idx]
2403
+ sequences.append(
2404
+ (
2405
+ chrom,
2406
+ max(0, genomic_pos - half_len),
2407
+ max(0, genomic_pos - half_len) + seq_len,
2408
+ window_seq,
2409
+ )
2410
+ )
2411
+
2412
+ # Generate metadata only for successfully processed variants
2413
+ if valid_indices:
2414
+ filtered_chunk = chunk_variants.iloc[valid_indices].reset_index(
2415
+ drop=True
2416
+ )
2417
+ metadata_df = _generate_sequence_metadata(filtered_chunk, seq_len)
2418
+ else:
2419
+ # No valid variants in this chunk, create empty metadata
2420
+ metadata_df = pd.DataFrame()
2421
+
2422
+ # Yield each chunk immediately
2423
+ if encode and sequences:
2424
+ if TORCH_AVAILABLE:
2425
+ sequences_result = torch.stack(sequences)
2426
+ else:
2427
+ sequences_result = np.stack(sequences)
2428
+ else:
2429
+ sequences_result = sequences
2430
+
2431
+ yield (sequences_result, metadata_df)
2432
+
2433
+ # Process BND variants
2434
+ bnd_alt_sequences, bnd_left_refs, bnd_right_refs, bnd_metadata = (
2435
+ _generate_bnd_sequences(breakend_pairs, reference, seq_len, encode, encoder)
2436
+ )
2437
+
2438
+ # Process BND variants after standard variants (if any)
2439
+ # BND variants are yielded as a single batch for now
2440
+ if len(bnd_alt_sequences) > 0:
2441
+ bnd_metadata_df = pd.DataFrame(bnd_metadata) if bnd_metadata else pd.DataFrame()
2442
+
2443
+ # BND sequences are already stacked by _generate_bnd_sequences
2444
+ bnd_sequences_result = bnd_alt_sequences
2445
+
2446
+ yield (bnd_sequences_result, bnd_metadata_df)
2447
+
2448
+
2449
+ def get_ref_sequences(
2450
+ reference_fn,
2451
+ variants_fn,
2452
+ seq_len,
2453
+ encode=True,
2454
+ n_chunks=1,
2455
+ encoder=None,
2456
+ auto_map_chromosomes=False,
2457
+ ):
2458
+ """
2459
+ Create reference sequence windows centered on each variant position (no variants applied).
2460
+
2461
+ Args:
2462
+ reference_fn: Path to reference genome file or dictionary-like object
2463
+ variants_fn: Path to VCF file (string) or DataFrame with variant data.
2464
+ For DataFrames, position column can be 'pos', 'pos1', or assumes second column is position.
2465
+ seq_len: Length of the sequence window
2466
+ encode: Return sequences as one-hot encoded numpy arrays (default: True)
2467
+ n_chunks: Number of chunks to split variants into (default: 1)
2468
+ encoder: Optional custom encoding function
2469
+ auto_map_chromosomes: Automatically map chromosome names between VCF and reference
2470
+ when they don't match exactly (default: False).
2471
+
2472
+ Yields:
2473
+ Tuple containing (sequences, metadata_df) where:
2474
+ If encode=True: sequences is a tensor/array of shape (chunk_size, seq_len, 4) for each chunk
2475
+ If encode=False: sequences is a list of tuples containing (chrom, start, end, sequence_string) for each chunk
2476
+ metadata_df is a DataFrame with variant information including position offsets
2477
+ For BND variants: sequences contain dual reference sequences (left + right)
2478
+
2479
+ Raises:
2480
+ ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't match
2481
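+
+ Example (a minimal sketch; `ref` and `vcf` are placeholders; standard
+ chunks yield one array, BND chunks yield a (left, right) tuple):
+ >>> for ref_seqs, metadata in get_ref_sequences(ref, vcf, seq_len=1000):
+ ... if isinstance(ref_seqs, tuple):
+ ... left_refs, right_refs = ref_seqs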
+ """
2482
+ # Load reference and variants, separating BNDs from standard variants
2483
+ from .variant_utils import load_breakend_variants
2484
+
2485
+ reference = _load_reference(reference_fn)
2486
+
2487
+ # Load variants and separate BNDs
2488
+ standard_variants, breakend_pairs = load_breakend_variants(variants_fn)
2489
+
2490
+ # Combine chromosome names from both standard variants and breakend pairs
2491
+ ref_chroms = set(reference.keys())
2492
+ standard_chroms = (
2493
+ set(standard_variants["chrom"].unique())
2494
+ if len(standard_variants) > 0
2495
+ else set()
2496
+ )
2497
+ breakend_chroms = set()
2498
+ for bnd1, bnd2 in breakend_pairs:
2499
+ breakend_chroms.add(bnd1.chrom)
2500
+ breakend_chroms.add(bnd2.chrom)
2501
+ vcf_chroms = standard_chroms | breakend_chroms
2502
+
2503
+ # Use chromosome matching to handle name mismatches
2504
+ mapping, unmatched = match_chromosomes_with_report(
2505
+ ref_chroms, vcf_chroms, verbose=True, auto_map_chromosomes=auto_map_chromosomes
2506
+ )
2507
+
2508
+ # Apply chromosome name mapping to standard variants
2509
+ if mapping and len(standard_variants) > 0:
2510
+ standard_variants = apply_chromosome_mapping(standard_variants, mapping)
2511
+
2512
+ # Apply chromosome name mapping to breakend pairs
2513
+ if mapping and breakend_pairs:
2514
+ updated_pairs = []
2515
+ for bnd1, bnd2 in breakend_pairs:
2516
+ # Update chromosome names in breakend objects if needed
2517
+ if bnd1.chrom in mapping:
2518
+ bnd1.chrom = mapping[bnd1.chrom]
2519
+ if bnd2.chrom in mapping:
2520
+ bnd2.chrom = mapping[bnd2.chrom]
2521
+ if bnd1.mate_chrom in mapping:
2522
+ bnd1.mate_chrom = mapping[bnd1.mate_chrom]
2523
+ if bnd2.mate_chrom in mapping:
2524
+ bnd2.mate_chrom = mapping[bnd2.mate_chrom]
2525
+ updated_pairs.append((bnd1, bnd2))
2526
+ breakend_pairs = updated_pairs
2527
+
2528
+ # Process standard variants first - yield each chunk individually
2529
+ if len(standard_variants) > 0:
2530
+ # Split standard variants into chunks
2531
+ std_indices = np.array_split(np.arange(len(standard_variants)), n_chunks)
2532
+ std_variant_chunks = (
2533
+ standard_variants.iloc[chunk_indices].reset_index(drop=True)
2534
+ for chunk_indices in std_indices
2535
+ if len(chunk_indices) > 0
2536
+ )
2537
+
2538
+ for chunk_variants in std_variant_chunks:
2539
+ sequences = []
2540
+ # Generate metadata only for variants whose chromosome exists in the
+ # reference, ordered the same way the groupby loop below emits sequences
2541
+ valid_chunk = chunk_variants[chunk_variants["chrom"].isin(set(reference.keys()))]
+ valid_chunk = valid_chunk.sort_values("chrom", kind="stable").reset_index(drop=True)
+ metadata_df = _generate_sequence_metadata(valid_chunk, seq_len)
2542
+
2543
+ # Group variants by chromosome for efficient processing
2544
+ for chrom, chrom_variants in chunk_variants.groupby("chrom"):
2545
+ if chrom not in reference:
2546
+ warnings.warn(
2547
+ f"Chromosome {chrom} not found in reference. Skipping {len(chrom_variants)} variants."
2548
+ )
2549
+ continue
2550
+
2551
+ ref_seq = str(reference[chrom])
2552
+ chrom_length = len(ref_seq)
2553
+
2554
+ # Vectorized calculation of window positions
2555
+ positions = chrom_variants["pos1"].values - 1 # Convert to 0-based
2556
+ half_len = seq_len // 2
2557
+ window_starts = positions - half_len
2558
+ window_ends = window_starts + seq_len
2559
+
2560
+ # Process each variant in this chromosome using the precomputed window arrays
2561
+ for idx, (_, var) in enumerate(chrom_variants.iterrows()):
2562
+ pos = var["pos1"]
2563
+ genomic_pos = positions[idx]
2564
+ window_start = window_starts[idx]
2565
+ window_end = window_ends[idx]
2566
+
2567
+ # Handle edge cases and extract window
2568
+ if window_start < 0:
2569
+ left_pad = -window_start
2570
+ actual_start = 0
2571
+ else:
2572
+ left_pad = 0
2573
+ actual_start = window_start
2574
+
2575
+ if window_end > chrom_length:
2576
+ right_pad = window_end - chrom_length
2577
+ actual_end = chrom_length
2578
+ else:
2579
+ right_pad = 0
2580
+ actual_end = window_end
2581
+
2582
+ # Extract window from reference chromosome (no variants applied)
2583
+ window_seq = ref_seq[actual_start:actual_end]
2584
+
2585
+ # Add padding if needed
2586
+ if left_pad > 0:
2587
+ window_seq = "N" * left_pad + window_seq
2588
+ if right_pad > 0:
2589
+ window_seq = window_seq + "N" * right_pad
2590
+
2591
+ # Ensure correct length
2592
+ if len(window_seq) != seq_len:
2593
+ warnings.warn(
2594
+ f"Sequence length mismatch for variant at {chrom}:{pos}. "
2595
+ f"Expected {seq_len}, got {len(window_seq)}"
2596
+ )
2597
+ # Truncate or pad as needed
2598
+ if len(window_seq) < seq_len:
2599
+ window_seq += "N" * (seq_len - len(window_seq))
2600
+ else:
2601
+ window_seq = window_seq[:seq_len]
2602
+
2603
+ if encode:
2604
+ sequences.append(encode_seq(window_seq, encoder))
2605
+ else:
2606
+ sequences.append(
2607
+ (
2608
+ chrom,
2609
+ max(0, genomic_pos - half_len),
2610
+ max(0, genomic_pos - half_len) + seq_len,
2611
+ window_seq,
2612
+ )
2613
+ )
2614
+
2615
+ # Yield each chunk immediately
2616
+ if encode and sequences:
2617
+ if TORCH_AVAILABLE:
2618
+ sequences_result = torch.stack(sequences)
2619
+ else:
2620
+ sequences_result = np.stack(sequences)
2621
+ else:
2622
+ sequences_result = sequences
2623
+
2624
+ yield (sequences_result, metadata_df)
2625
+
2626
+ # Process BND variants after standard variants (if any)
2627
+ # BND variants are yielded as dual references
2628
+ if breakend_pairs:
2629
+ bnd_left_refs, bnd_right_refs, bnd_metadata = _generate_bnd_ref_sequences(
2630
+ breakend_pairs, reference, seq_len, encode, encoder
2631
+ )
2632
+
2633
+ if len(bnd_left_refs) > 0 or len(bnd_right_refs) > 0:
2634
+ bnd_metadata_df = (
2635
+ pd.DataFrame(bnd_metadata) if bnd_metadata else pd.DataFrame()
2636
+ )
2637
+
2638
+ # For ref sequences, we return dual references as a tuple
2639
+ # This is different from get_alt_sequences which returns fusion sequences
2640
+ # BND ref sequences are already stacked by _generate_bnd_ref_sequences
2641
+ bnd_left_result = bnd_left_refs
2642
+ bnd_right_result = bnd_right_refs
2643
+
2644
+ # Return dual references as a tuple (left_refs, right_refs)
2645
+ yield ((bnd_left_result, bnd_right_result), bnd_metadata_df)
2646
+
2647
+
2648
+ def get_alt_ref_sequences(
2649
+ reference_fn,
2650
+ variants_fn,
2651
+ seq_len,
2652
+ encode=True,
2653
+ n_chunks=1,
2654
+ encoder=None,
2655
+ auto_map_chromosomes=False,
2656
+ ):
2657
+ """
2658
+ Create both reference and variant sequence windows for alt/ref ratio calculations.
2659
+ Maintains backward compatibility while supporting BND variants with dual references.
2660
+
2661
+ This wrapper function calls both get_ref_sequences and get_alt_sequences to return
2662
+ matching pairs of reference and variant sequences for computing ratios.
2663
+
2664
+ Args:
2665
+ reference_fn: Path to reference genome file or dictionary-like object
2666
+ variants_fn: Path to VCF file (string) or DataFrame with variant data.
2667
+ For DataFrames, position column can be 'pos', 'pos1', or assumes second column is position.
2668
+ seq_len: Length of the sequence window
2669
+ encode: Return sequences as one-hot encoded numpy arrays (default: True)
2670
+ n_chunks: Number of chunks to split variants into (default: 1)
2671
+ encoder: Optional custom encoder function
2672
+ auto_map_chromosomes: Automatically map chromosome names between VCF and reference
2673
+ when they don't match exactly (default: False).
2674
+
2675
+ Yields:
2676
+ Tuple containing (alt_sequences, ref_sequences, metadata_df):
2677
+ For standard variants:
2678
+ - alt_sequences: Variant sequences with mutations applied
2679
+ - ref_sequences: Reference sequences without mutations
2680
+ - metadata_df: Variant metadata (pandas DataFrame)
2681
+
2682
+ For BND variants:
2683
+ - alt_sequences: Fusion sequences from breakend pairs
2684
+ - ref_sequences: Tuple of (left_ref_sequences, right_ref_sequences)
2685
+ - metadata_df: BND metadata with orientation and mate information
2686
+
2687
+ Metadata DataFrame columns:
2688
+ Standard fields (all variants):
2689
+ - chrom: Chromosome name (str)
2690
+ - window_start: Window start position, 0-based (int)
2691
+ - window_end: Window end position, 0-based exclusive (int)
2692
+ - variant_pos0: Variant position, 0-based (int)
2693
+ - variant_pos1: Variant position, 1-based VCF standard (int)
2694
+ - ref: Reference allele (str)
2695
+ - alt: Alternate allele (str)
2696
+ - variant_type: Variant classification (str)
2697
+ Examples: 'SNV', 'INS', 'DEL', 'MNV', 'SV_INV', 'SV_DUP', 'SV_BND'
2698
+
2699
+ Additional field for symbolic alleles (<INV>, <DUP>, etc.):
2700
+ - sym_variant_end: END position from INFO field, 1-based (int, optional)
2701
+
2702
+ BND-specific fields:
2703
+ - mate_chrom: Mate breakend chromosome (str)
2704
+ - mate_pos: Mate breakend position, 1-based (int)
2705
+ - orientation_1: First breakend orientation (str)
2706
+ - orientation_2: Second breakend orientation (str)
2707
+ - fusion_name: Fusion sequence identifier (str, optional)
2708
+
2709
+ Raises:
2710
+ ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't match
2711
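+
+ Example (a minimal sketch; `ref`, `vcf`, and `model` are placeholders):
+ >>> for alt_seqs, ref_seqs, metadata in get_alt_ref_sequences(
+ ... ref, vcf, seq_len=1000):
+ ... if isinstance(ref_seqs, tuple): # BND chunk: dual references
+ ... left_refs, right_refs = ref_seqs
+ ... else:
+ ... delta = model.predict(alt_seqs) - model.predict(ref_seqs)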
+ """
2712
+ # Get generators for both reference and variant sequences
2713
+ # These already handle variant loading, chromosome matching, and chunking consistently
2714
+ ref_gen = get_ref_sequences(
2715
+ reference_fn,
2716
+ variants_fn,
2717
+ seq_len,
2718
+ encode,
2719
+ n_chunks,
2720
+ encoder,
2721
+ auto_map_chromosomes,
2722
+ )
2723
+ alt_gen = get_alt_sequences(
2724
+ reference_fn,
2725
+ variants_fn,
2726
+ seq_len,
2727
+ encode,
2728
+ n_chunks,
2729
+ encoder,
2730
+ auto_map_chromosomes,
2731
+ )
2732
+
2733
+ # Process chunks from both generators
2734
+ # Both generators will yield chunks in the same order:
2735
+ # 1. Standard variant chunks first (if any) - maintains existing behavior
2736
+ # 2. BND variant chunks last (if any) - new dual reference structure
2737
+ for (ref_chunk, ref_metadata), (alt_chunk, alt_metadata) in zip(ref_gen, alt_gen):
2738
+ # For standard variants: preserve existing behavior exactly
2739
+ # For BND variants: ref_chunk will be (left_refs, right_refs) tuple
2740
+ # The caller can detect BND chunks by checking if ref_chunk is a tuple
2741
+ yield (alt_chunk, ref_chunk, ref_metadata)
2742
+
2743
+
2744
+ def get_pam_disrupting_alt_sequences(
2745
+ reference_fn,
2746
+ variants_fn,
2747
+ seq_len,
2748
+ max_pam_distance,
2749
+ pam_sequence="NGG",
2750
+ encode=True,
2751
+ n_chunks=1,
2752
+ encoder=None,
2753
+ auto_map_chromosomes=False,
2754
+ ):
2755
+ """
2756
+ Generate sequences for variants that disrupt PAM sites.
2757
+
2758
+ This function identifies variants that disrupt existing PAM sites in the reference
2759
+ genome and generates sequence pairs for each disrupting variant. Works like
2760
+ get_alt_ref_sequences() but filtered to only PAM-disrupting variants.
2761
+
2762
+ Args:
2763
+ reference_fn: Path to reference genome file or dictionary-like object
2764
+ variants_fn: Path to variants file or DataFrame
2765
+ seq_len: Length of sequence windows
2766
+ max_pam_distance: Maximum distance from variant to PAM site
2767
+ pam_sequence: PAM sequence pattern (default: 'NGG' for SpCas9).
2768
+ Supports all IUPAC degenerate nucleotide codes:
2769
+ N (any), R (A/G), Y (C/T), W (A/T), S (C/G), M (A/C),
2770
+ K (G/T), B (C/G/T), D (A/G/T), H (A/C/T), V (A/C/G)
2771
+ encode: Return sequences as one-hot encoded numpy arrays (default: True)
2772
+ n_chunks: Number of chunks to split variants for processing (default: 1)
2773
+ encoder: Optional custom encoding function
2774
+ auto_map_chromosomes: Automatically map chromosome names between VCF and reference
2775
+ when they don't match exactly (default: False).
2776
+
2777
+ Yields:
2778
+ Tuple containing (alt_sequences, ref_sequences, metadata_df):
2779
+ - alt_sequences: Variant sequences with mutations applied
2780
+ - ref_sequences: Reference sequences without mutations
2781
+ - metadata_df: Variant metadata (pandas DataFrame) with PAM-specific columns
2782
+
2783
+ Metadata DataFrame columns:
2784
+ Standard fields:
2785
+ - chrom: Chromosome name (str)
2786
+ - window_start: Window start position, 0-based (int)
2787
+ - window_end: Window end position, 0-based exclusive (int)
2788
+ - variant_pos0: Variant position, 0-based (int)
2789
+ - variant_pos1: Variant position, 1-based VCF standard (int)
2790
+ - ref: Reference allele (str)
2791
+ - alt: Alternate allele (str)
2792
+ - variant_type: Variant classification (str)
2793
+
2794
+ PAM-specific fields:
2795
+ - pam_site_pos: 0-based start position of PAM site in window (int)
2796
+ - pam_ref_sequence: PAM sequence in reference (str)
2797
+ - pam_alt_sequence: PAM sequence after variant (str)
2798
+ - pam_distance: Distance from variant to PAM start (int)
2799
+
2800
+ Raises:
2801
+ ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't match
2802
+
2803
+ Example:
2804
+ >>> # Process all PAM-disrupting variants at once
2805
+ >>> gen = get_pam_disrupting_alt_sequences(ref, vcf, seq_len=50,
2806
+ ... max_pam_distance=10, n_chunks=1)
2807
+ >>> alt_seqs, ref_seqs, metadata = next(gen)
2808
+ >>>
2809
+ >>> # Or iterate through chunks
2810
+ >>> for alt_seqs, ref_seqs, metadata in get_pam_disrupting_alt_sequences(
2811
+ ... ref, vcf, seq_len=50, max_pam_distance=10, n_chunks=5):
2812
+ ... predictions = model.predict(alt_seqs, ref_seqs)
2813
+ """
2814
+ # Helper function to find PAM sites in a sequence
2815
+ def _find_pam_sites(sequence, pam_pattern):
2816
+ """Find all PAM site positions in a sequence using IUPAC codes.
2817
+
2818
+ Supports IUPAC degenerate nucleotide codes in the pattern (N, R, Y, W, S, M, K, B, D, H, V).
2819
+ Pattern wildcards match corresponding bases in the sequence.
2820
+ Sequence 'N' (used for padding or unknown bases) matches any pattern base.
2821
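+
+ Example (illustrative):
+ >>> _find_pam_sites("AATGGC", "NGG")
+ [2]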
+ """
2822
+ sites = []
2823
+ seq_upper = sequence.upper()
2824
+ pat_upper = pam_pattern.upper()
2825
+
2826
+ for i in range(len(seq_upper) - len(pat_upper) + 1):
2827
+ match = True
2828
+ for j in range(len(pat_upper)):
2829
+ seq_base = seq_upper[i + j]
2830
+ pat_base = pat_upper[j]
2831
+
2832
+ # Sequence 'N' (padding or unknown) matches any pattern base
2833
+ if seq_base == 'N':
2834
+ continue # Always matches
2835
+
2836
+ # Get allowed bases for this pattern position
2837
+ allowed_bases = IUPAC_CODES.get(pat_base, pat_base)
2838
+
2839
+ # Remove brackets from character class if present
2840
+ if allowed_bases.startswith("["):
2841
+ allowed_bases = allowed_bases[1:-1]
2842
+
2843
+ # Check if sequence base is in the pattern's allowed bases
2844
+ if seq_base not in allowed_bases:
2845
+ match = False
2846
+ break
2847
+
2848
+ if match:
2849
+ sites.append(i)
2850
+
2851
+ return sites
2852
+
2853
+ # Load reference and variants
2854
+ reference = _load_reference(reference_fn)
2855
+ variants = _load_variants(variants_fn)
2856
+
2857
+ # Get all chromosome names and apply chromosome matching
2858
+ ref_chroms = set(reference.keys())
2859
+ vcf_chroms = set(variants["chrom"].unique())
2860
+
2861
+ # Use chromosome matching to handle name mismatches
2862
+ mapping, unmatched = match_chromosomes_with_report(
2863
+ ref_chroms, vcf_chroms, verbose=True, auto_map_chromosomes=auto_map_chromosomes
2864
+ )
2865
+
2866
+ # Apply chromosome name mapping to variants
2867
+ if mapping:
2868
+ variants = apply_chromosome_mapping(variants, mapping)
2869
+
2870
+ # Filter variants to find those that disrupt PAM sites
2871
+ pam_disrupting_variants_list = []
2872
+ pam_metadata_list = []
2873
+
2874
+ # Process each variant to identify PAM disruption
2875
+ for _, var in variants.iterrows():
2876
+ chrom = var["chrom"]
2877
+ pos = var["pos1"] # 1-based position
2878
+
2879
+ # Get reference sequence for this chromosome
2880
+ if chrom not in reference:
2881
+ warnings.warn(
2882
+ f"Chromosome {chrom} not found in reference. Skipping variant at {chrom}:{pos}."
2883
+ )
2884
+ continue
2885
+
2886
+ ref_seq = str(reference[chrom])
2887
+ chrom_length = len(ref_seq)
2888
+
2889
+ # Convert to 0-based position
2890
+ genomic_pos = pos - 1
2891
+
2892
+ # Calculate window boundaries centered on variant start
2893
+ half_len = seq_len // 2
2894
+ window_start = genomic_pos - half_len
2895
+ window_end = window_start + seq_len
2896
+
2897
+ # Check if variant extends past window boundaries
2898
+ ref_allele = var.get("ref", "")
2899
+ alt_allele = var.get("alt", "")
2900
+ variant_length = max(len(ref_allele), len(alt_allele))
2901
+ variant_end = genomic_pos + variant_length
2902
+
2903
+ if variant_end > window_end:
2904
+ overflow = variant_end - window_end
2905
+ warnings.warn(
2906
+ f"Variant at {chrom}:{pos} extends {overflow} bp beyond the "
2907
+ f"requested window (length: {seq_len} bp). This may affect "
2908
+ f"PAM site detection accuracy.",
2909
+ UserWarning,
2910
+ )
2911
+
2912
+ # Handle edge cases for reference sequence PAM detection
2913
+ if window_start < 0:
2914
+ left_pad = -window_start
2915
+ ref_window_start = 0
2916
+ else:
2917
+ left_pad = 0
2918
+ ref_window_start = window_start
2919
+
2920
+ if window_end > chrom_length:
2921
+ right_pad = window_end - chrom_length
2922
+ ref_window_end = chrom_length
2923
+ else:
2924
+ right_pad = 0
2925
+ ref_window_end = window_end
2926
+
2927
+ # Extract window from reference for PAM detection
2928
+ ref_window_seq = ref_seq[ref_window_start:ref_window_end]
2929
+
2930
+ # Add padding for PAM detection
2931
+ if left_pad > 0:
2932
+ ref_window_seq = "N" * left_pad + ref_window_seq
2933
+ if right_pad > 0:
2934
+ ref_window_seq = ref_window_seq + "N" * right_pad
2935
+
2936
+ # Find PAM sites in the reference sequence window
2937
+ ref_pam_sites = _find_pam_sites(ref_window_seq, pam_sequence)
2938
+
2939
+ # Calculate variant position in padded window
2940
+ variant_pos_in_window = left_pad + (genomic_pos - ref_window_start)
2941
+
2942
+ # Filter PAM sites that are within max_pam_distance of the variant
2943
+ nearby_ref_pam_sites = [
2944
+ p
2945
+ for p in ref_pam_sites
2946
+ if abs(p - variant_pos_in_window) <= max_pam_distance
2947
+ ]
2948
+
2949
+ # Skip if no nearby PAM sites
2950
+ if not nearby_ref_pam_sites:
2951
+ continue
2952
+
2953
+ # Create a temporary applicator with just this variant
2954
+ single_var_df = pd.DataFrame([var])
2955
+ temp_applicator = VariantApplicator(ref_seq, single_var_df)
2956
+
2957
+ # Apply the variant to get the full modified chromosome
2958
+ modified_chrom, stats = temp_applicator.apply_variants()
2959
+
2960
+ # Extract window from modified chromosome
2961
+ if window_start < 0:
2962
+ actual_start = 0
2963
+ else:
2964
+ actual_start = window_start
2965
+
2966
+ if window_end > len(modified_chrom):
2967
+ actual_end = len(modified_chrom)
2968
+ else:
2969
+ actual_end = window_end
2970
+
2971
+ modified_window = modified_chrom[actual_start:actual_end]
2972
+
2973
+ # Add padding
2974
+ if left_pad > 0:
2975
+ modified_window = "N" * left_pad + modified_window
2976
+ if right_pad > 0:
2977
+ modified_window = modified_window + "N" * right_pad
2978
+
2979
+ # Ensure correct length
2980
+ if len(modified_window) != seq_len:
2981
+ if len(modified_window) < seq_len:
2982
+ modified_window += "N" * (seq_len - len(modified_window))
2983
+ else:
2984
+ modified_window = modified_window[:seq_len]
2985
+
2986
+ # Check for new PAM formation in the alternate sequence
2987
+ # Find PAM sites in the modified (alternate) sequence
2988
+ alt_pam_sites = _find_pam_sites(modified_window, pam_sequence)
2989
+
2990
+ # Filter to nearby PAM sites in the alternate sequence
2991
+ nearby_alt_pam_sites = [
2992
+ p
2993
+ for p in alt_pam_sites
2994
+ if abs(p - variant_pos_in_window) <= max_pam_distance
2995
+ ]
2996
+
2997
+ # Identify which reference PAM sites are truly disrupted
2998
+ # Different logic for SNVs vs INDELs:
2999
+ # - SNV: PAM disrupted if the exact PAM sequence changes at that position
3000
+ # - INDEL: PAM disrupted ONLY if no PAM exists in ALT (even at shifted position)
3001
+ # If INDEL creates/shifts a PAM, it's NOT considered disrupting
3002
+
3003
+ ref_allele = var.get("ref", "")
3004
+ alt_allele = var.get("alt", "")
3005
+ is_indel = (
3006
+ len(ref_allele) != len(alt_allele)
3007
+ or ref_allele == "-"
3008
+ or alt_allele == "-"
3009
+ )
3010
+
3011
+ truly_disrupted_pam_sites = []
3012
+
3013
+ if is_indel:
3014
+ # For INDELs: check if PAM still exists anywhere nearby (allowing for shifts)
3015
+ for ref_pam_pos in nearby_ref_pam_sites:
3016
+ pam_still_exists = False
3017
+ for alt_pam_pos in nearby_alt_pam_sites:
3018
+ # Allow for positional shifts due to the INDEL
3019
+ # If a PAM exists within a reasonable distance, consider it maintained
3020
+ if abs(ref_pam_pos - alt_pam_pos) <= len(pam_sequence):
3021
+ pam_still_exists = True
3022
+ break
3023
+
3024
+ if not pam_still_exists:
3025
+ truly_disrupted_pam_sites.append(ref_pam_pos)
3026
+ else:
3027
+ # For SNVs: check if the PAM sequence at the exact position has changed
3028
+ for ref_pam_pos in nearby_ref_pam_sites:
3029
+ # Extract the PAM sequence from both ref and alt at this exact position
3030
+ ref_pam_seq = ref_window_seq[
3031
+ ref_pam_pos : ref_pam_pos + len(pam_sequence)
3032
+ ]
3033
+ alt_pam_seq = modified_window[
3034
+ ref_pam_pos : ref_pam_pos + len(pam_sequence)
3035
+ ]
3036
+
3037
+ # Check if the PAM pattern still matches after the variant, honoring
3038
+ # all IUPAC degenerate codes in the pattern (consistent with _find_pam_sites)
3039
+ alt_matches_pattern = len(alt_pam_seq) == len(pam_sequence) and all(
3040
+ a == "N" or a in IUPAC_CODES.get(b, b).strip("[]")
3041
+ for a, b in zip(alt_pam_seq.upper(), pam_sequence.upper())
+ )
3042
+
3043
+ # If the ALT no longer matches the PAM pattern, it's disrupted
3044
+ if not alt_matches_pattern:
3045
+ truly_disrupted_pam_sites.append(ref_pam_pos)
3046
+
3047
+ # Only proceed if there are truly disrupted PAM sites
3048
+ # If all reference PAMs are maintained (possibly at shifted positions for INDELs),
3049
+ # skip this variant
3050
+ if not truly_disrupted_pam_sites:
3051
+ continue
3052
+
3053
+ # For each disrupted PAM site, create a metadata entry
3054
+ for pam_site_pos in truly_disrupted_pam_sites:
3055
+ # Extract PAM sequences
3056
+ ref_pam_seq = ref_window_seq[pam_site_pos : pam_site_pos + len(pam_sequence)]
3057
+ alt_pam_seq = modified_window[pam_site_pos : pam_site_pos + len(pam_sequence)]
3058
+
3059
+ # Calculate distance from variant to PAM
3060
+ pam_distance = abs(pam_site_pos - variant_pos_in_window)
3061
+
3062
+ # Store the variant (may be duplicate if multiple PAMs disrupted)
3063
+ pam_disrupting_variants_list.append(var)
3064
+
3065
+ # Store PAM-specific metadata
3066
+ pam_metadata_list.append({
3067
+ 'pam_site_pos': pam_site_pos,
3068
+ 'pam_ref_sequence': ref_pam_seq,
3069
+ 'pam_alt_sequence': alt_pam_seq,
3070
+ 'pam_distance': pam_distance
3071
+ })
3072
+
3073
+ # If no PAM-disrupting variants found, yield empty results
3074
+ if not pam_disrupting_variants_list:
3075
+ # Return empty generator
3076
+ return
3077
+
3078
+ # Create DataFrame with filtered PAM-disrupting variants
3079
+ filtered_variants_df = pd.DataFrame(pam_disrupting_variants_list).reset_index(drop=True)
3080
+ pam_metadata_df = pd.DataFrame(pam_metadata_list)
3081
+
3082
+ # Call get_alt_ref_sequences with the filtered variants, keeping a running
+ # row offset so each yielded chunk lines up with its slice of PAM metadata
+ row_offset = 0
3083
+ for alt_seqs, ref_seqs, base_metadata in get_alt_ref_sequences(
3084
+ reference_fn,
3085
+ filtered_variants_df,
3086
+ seq_len,
3087
+ encode,
3088
+ n_chunks,
3089
+ encoder,
3090
+ auto_map_chromosomes
3091
+ ):
3092
+ # Merge PAM-specific metadata with base metadata, slicing pam_metadata_df
3093
+ # by the running row offset so rows stay aligned when n_chunks > 1
3094
+ n_rows = len(base_metadata)
3095
+ pam_chunk = pam_metadata_df.iloc[row_offset : row_offset + n_rows].reset_index(drop=True)
+ row_offset += n_rows
+ enriched_metadata = pd.concat([base_metadata.reset_index(drop=True), pam_chunk], axis=1)
3096
+
3097
+ # Yield the chunk with enriched metadata
3098
+ yield (alt_seqs, ref_seqs, enriched_metadata)