supremo-lite 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1645 @@
1
+ """
2
+ Variant reading and handling utilities for supremo_lite.
3
+
4
+ This module provides functions for reading variants from VCF files
5
+ and other related operations.
6
+ """
7
+
8
import io
import re
import warnings
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
15
+
16
+
17
@dataclass
class BreakendVariant:
    """
    A single breakend (BND) record parsed from a VCF file.

    Holds the record's own coordinates together with the mate coordinates
    and junction orientation parsed out of the ALT field, plus any novel
    sequence inserted at the junction — everything needed to apply the
    breakend downstream.
    """

    id: str  # VCF ID field (e.g., "bnd_W")
    chrom: str  # Chromosome name
    pos: int  # 1-based position
    ref: str  # Reference allele
    alt: str  # Complete ALT field (e.g., "G]17:198982]")
    mate_id: str  # MATEID from INFO field
    mate_chrom: str  # Mate chromosome (parsed from ALT)
    mate_pos: int  # Mate position (parsed from ALT)
    orientation: str  # Breakend orientation (e.g., "ref_then_mate")
    inserted_seq: str  # Novel sequence at junction
    info: str  # Complete INFO field
    variant_type: str = "SV_BND"  # Always BND for breakend variants

    def __post_init__(self):
        """Reject records with an empty ID or non-positive coordinates."""
        if not self.id:
            raise ValueError("Breakend ID cannot be empty")
        for label, coordinate in (("position", self.pos), ("mate position", self.mate_pos)):
            if coordinate <= 0:
                raise ValueError(f"Breakend {label} must be positive")
47
+
48
+
49
@dataclass
class BreakendPair:
    """
    A pair of mated breakend variants that create a novel adjacency.

    On construction the pair is cross-validated: each breakend's MATEID
    must name the other record, and breakend1's parsed mate coordinates
    must match breakend2's actual coordinates.  Every mismatch is
    appended to ``validation_errors`` and flips ``is_valid`` to False.
    """

    breakend1: BreakendVariant
    breakend2: BreakendVariant
    is_valid: bool = True
    # FIX: use default_factory instead of a ``None`` default so each
    # instance owns its own lists; ``None`` is still accepted (and
    # normalized) for backward compatibility with existing callers.
    validation_errors: List[str] = field(default_factory=list)
    validation_warnings: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Validate that the two breakends form a consistent pair."""
        if self.validation_errors is None:
            self.validation_errors = []
        if self.validation_warnings is None:
            self.validation_warnings = []

        self._check_mate_ids()
        self._check_coordinates()

    def _check_mate_ids(self):
        """Each breakend's MATEID must reference the other breakend's ID."""
        if self.breakend1.mate_id != self.breakend2.id:
            self.validation_errors.append(
                f"Breakend {self.breakend1.id} MATEID {self.breakend1.mate_id} "
                f"does not match mate ID {self.breakend2.id}"
            )
            self.is_valid = False

        if self.breakend2.mate_id != self.breakend1.id:
            self.validation_errors.append(
                f"Breakend {self.breakend2.id} MATEID {self.breakend2.mate_id} "
                f"does not match mate ID {self.breakend1.id}"
            )
            self.is_valid = False

    def _check_coordinates(self):
        """Breakend1's parsed mate coordinates must match breakend2's actual ones."""
        if self.breakend1.mate_chrom != self.breakend2.chrom:
            self.validation_errors.append(
                f"Breakend {self.breakend1.id} mate chromosome {self.breakend1.mate_chrom} "
                f"does not match actual chromosome {self.breakend2.chrom}"
            )
            self.is_valid = False

        if self.breakend1.mate_pos != self.breakend2.pos:
            self.validation_errors.append(
                f"Breakend {self.breakend1.id} mate position {self.breakend1.mate_pos} "
                f"does not match actual position {self.breakend2.pos}"
            )
            self.is_valid = False

    @property
    def rearrangement_type(self) -> str:
        """
        Determine the type of rearrangement represented by this breakend pair.

        Returns:
            str: 'translocation', 'inversion', 'duplication', 'complex',
            or 'invalid' when validation failed.
        """
        if not self.is_valid:
            return "invalid"

        # Different chromosomes always mean a translocation.
        if self.breakend1.chrom != self.breakend2.chrom:
            return "translocation"

        # Same chromosome - could be inversion, duplication, or deletion.
        pos1, pos2 = self.breakend1.pos, self.breakend2.pos
        orient1, orient2 = self.breakend1.orientation, self.breakend2.orientation

        # Simple heuristics for now - a detailed implementation would need
        # to inspect the orientations more carefully.
        if abs(pos1 - pos2) < 1000:  # Close positions might be duplication
            return "duplication"
        elif orient1 != orient2:  # Different orientations suggest inversion
            return "inversion"
        else:
            return "complex"

    def get_affected_regions(self) -> List[Tuple[str, int, int]]:
        """
        Get genomic regions affected by this breakend pair.

        Returns:
            List of tuples (chrom, start, end) for affected regions, one
            per breakend, each spanning from the base before the breakend
            to the end of its REF allele.
        """
        regions = []

        # Add region around first breakend
        regions.append(
            (
                self.breakend1.chrom,
                max(1, self.breakend1.pos - 1),  # Include position before breakend
                self.breakend1.pos + len(self.breakend1.ref),
            )
        )

        # Add region around second breakend
        regions.append(
            (
                self.breakend2.chrom,
                max(1, self.breakend2.pos - 1),  # Include position before breakend
                self.breakend2.pos + len(self.breakend2.ref),
            )
        )

        return regions
157
+
158
+
159
@dataclass
class Breakend:
    """Breakend record enriched with a semantic classification label."""

    id: str
    chrom: str
    pos: int
    ref: str
    alt: str
    mate_chrom: str
    mate_pos: int
    orientation: str
    inserted_seq: str
    classification: str  # 'paired', 'missing_mate', 'singleton_insertion'
    mate_breakend: Optional["Breakend"] = None

    @classmethod
    def from_breakend_variant(
        cls, variant: BreakendVariant, classification: str
    ) -> "Breakend":
        """Copy the shared fields out of *variant* and attach *classification*."""
        return cls(
            variant.id,
            variant.chrom,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.mate_chrom,
            variant.mate_pos,
            variant.orientation,
            variant.inserted_seq,
            classification,
            None,
        )
193
+
194
+
195
class BNDClassifier:
    """
    BND classifier that doesn't depend on MATEID fields.

    Breakends are matched to their mates purely by the coordinates parsed
    out of the ALT field; when no record exists at the mate coordinates,
    a mate is inferred so every breakend still forms a fusion.  After
    pairing, coordinate-based heuristics reclassify pairs that look like
    tandem duplications (SV_BND_DUP) or inversions (SV_BND_INV).
    """

    def __init__(self):
        # All BreakendVariant records parsed from the VCF, in file order.
        self.all_breakends: List[BreakendVariant] = []
        # Map (chrom, pos) -> breakend, used to look up mates by coordinates.
        self.coordinate_index: Dict[Tuple[str, int], BreakendVariant] = {}

    def classify_all_breakends(
        self, vcf_path: str, verbose: bool = False
    ) -> Dict[str, List[Breakend]]:
        """
        Classify all BND variants from a VCF file.

        Args:
            vcf_path: Path to VCF file containing BND variants
            verbose: Print detailed classification information (default: False)

        Returns:
            Dict with keys 'paired', 'missing_mate', 'insertion_with_mate',
            'insertion_missing_mate', 'dup_breakends', and 'inv_breakends'.
            Only 'paired' and the two semantic keys are actually populated;
            unmatched breakends get an inferred mate and land in 'paired'.
        """
        # Load VCF with variant classification
        variants_df = read_vcf(vcf_path, include_info=True, classify_variants=True)
        bnd_variants = variants_df[
            variants_df["variant_type"].isin(["SV_BND", "SV_BND_INS"])
        ]

        if verbose:
            print(f"Found {len(bnd_variants)} BND variants")

        # Parse all breakends and build the (chrom, pos) coordinate index
        self._parse_and_index_breakends(bnd_variants)

        # Classify breakends
        classified = {
            "paired": [],
            "missing_mate": [],
            "insertion_with_mate": [],
            "insertion_missing_mate": [],
        }

        processed_ids = set()

        for breakend in self.all_breakends:
            if breakend.id in processed_ids:
                continue

            # Try to find the mate by its parsed coordinates
            mate_key = (breakend.mate_chrom, breakend.mate_pos)

            if mate_key in self.coordinate_index:
                # Found mate - this is a paired breakend
                mate_breakend = self.coordinate_index[mate_key]

                # Create enhanced breakends for the pair
                enhanced1 = Breakend.from_breakend_variant(breakend, "paired")
                enhanced2 = Breakend.from_breakend_variant(mate_breakend, "paired")

                enhanced1.mate_breakend = enhanced2
                enhanced2.mate_breakend = enhanced1

                classified["paired"].extend([enhanced1, enhanced2])
                processed_ids.add(breakend.id)
                processed_ids.add(mate_breakend.id)

            else:
                # No mate found - ALWAYS infer the missing mate and create a fusion
                # Create an inferred mate breakend
                inferred_mate = BreakendVariant(
                    id=f"{breakend.id}_inferred_mate",
                    chrom=breakend.mate_chrom,
                    pos=breakend.mate_pos,
                    ref="N",  # Unknown reference at mate position
                    alt="<INFERRED>",
                    mate_id=breakend.id,
                    mate_chrom=breakend.chrom,
                    mate_pos=breakend.pos,
                    orientation=self._infer_mate_orientation(breakend.orientation),
                    inserted_seq="",  # Novel sequence stays on the original breakend
                    info=f"INFERRED_FROM={breakend.id}",
                    variant_type="SV_BND",
                )

                # Create enhanced breakends for the inferred pair
                enhanced1 = Breakend.from_breakend_variant(breakend, "paired")
                enhanced2 = Breakend.from_breakend_variant(inferred_mate, "paired")

                enhanced1.mate_breakend = enhanced2
                enhanced2.mate_breakend = enhanced1

                classified["paired"].extend([enhanced1, enhanced2])

                if verbose:
                    if breakend.inserted_seq:
                        print(
                            f" INFO: Inferred missing mate for {breakend.id} with novel sequence '{breakend.inserted_seq}' "
                            f"-> created fusion with inferred mate at {breakend.mate_chrom}:{breakend.mate_pos}"
                        )
                    else:
                        print(
                            f" INFO: Inferred missing mate for {breakend.id} "
                            f"-> created fusion with inferred mate at {breakend.mate_chrom}:{breakend.mate_pos}"
                        )

                processed_ids.add(breakend.id)

        # Apply semantic classification to detect DUP and INV patterns
        # First detect duplications
        dup_breakends = self._detect_duplication_pattern(classified["paired"])

        # Remove duplication breakends from paired list before inversion detection
        dup_ids = {b.id for b in dup_breakends}
        remaining_paired = [b for b in classified["paired"] if b.id not in dup_ids]

        # Then detect inversions from remaining breakends
        inv_breakends = self._detect_inversion_pattern(remaining_paired)

        # Remove reclassified breakends from 'paired' category and add to semantic categories
        reclassified_ids = {b.id for b in dup_breakends + inv_breakends}
        classified["paired"] = [
            b for b in classified["paired"] if b.id not in reclassified_ids
        ]

        # Add semantic classifications
        classified["dup_breakends"] = dup_breakends
        classified["inv_breakends"] = inv_breakends

        # Print classification summary
        if verbose:
            print(f"\nBND Classification Summary:")
            print(
                f" Paired breakends (true translocations): {len(classified['paired'])}"
            )
            print(f" Duplication breakends (SV_BND_DUP): {len(dup_breakends)}")
            print(f" Inversion breakends (SV_BND_INV): {len(inv_breakends)}")
            total_inferred = len(
                [
                    bnd
                    for bnd in classified["paired"] + dup_breakends + inv_breakends
                    if "inferred" in bnd.id
                ]
            )
            print(f" Inferred mates created: {total_inferred}")

        return classified

    def _detect_duplication_pattern(
        self, paired_breakends: List[Breakend]
    ) -> List[Breakend]:
        """
        Detect duplication patterns from paired breakends.

        A duplication pattern consists of 2 breakends:
        - BND1: position A pointing to position B
        - BND2: position B pointing to position A
        - Same chromosome, A < B (tandem duplication)

        Returns:
            List of breakends reclassified as SV_BND_DUP
        """
        dup_breakends = []
        processed_ids = set()

        for breakend in paired_breakends:
            if breakend.id in processed_ids or not breakend.mate_breakend:
                continue

            mate = breakend.mate_breakend

            # Check if this forms a duplication pattern
            if (
                breakend.chrom == mate.chrom  # Same chromosome
                and breakend.chrom
                == breakend.mate_chrom  # Mate points to same chromosome
                and mate.chrom
                == mate.mate_chrom  # Mate's mate points to same chromosome
                and breakend.pos != mate.pos
            ):  # Different positions

                # Check mutual pointing (A->B, B->A)
                points_to_mate = (
                    breakend.mate_chrom == mate.chrom and breakend.mate_pos == mate.pos
                )
                mate_points_back = (
                    mate.mate_chrom == breakend.chrom and mate.mate_pos == breakend.pos
                )

                if points_to_mate and mate_points_back:
                    # Check orientations to determine if this is truly a duplication or inversion
                    # Duplication: orientations should be compatible with copy-paste behavior
                    # Inversion: orientations should indicate sequence reversal

                    # NOTE(review): the orientations are read here but not yet
                    # used by the heuristic below — only the breakend count is.
                    orientation1 = breakend.orientation
                    orientation2 = mate.orientation

                    # Simple heuristic: if we have more than 2 breakends on same chromosome pointing to each other,
                    # it's likely an inversion pattern, not duplication (which typically involves 2 breakends)
                    # Count breakends on this chromosome
                    same_chrom_breakends = [
                        b
                        for b in paired_breakends
                        if b.chrom == breakend.chrom and b.mate_chrom == breakend.chrom
                    ]

                    if len(same_chrom_breakends) > 2:
                        # Likely inversion pattern with multiple breakends - skip duplication classification
                        continue

                    # This is a duplication pattern - reclassify both breakends
                    dup_breakend1 = Breakend.from_breakend_variant(
                        BreakendVariant(
                            id=breakend.id,
                            chrom=breakend.chrom,
                            pos=breakend.pos,
                            ref=breakend.ref,
                            alt=breakend.alt,
                            mate_id=getattr(breakend, "mate_id", ""),
                            mate_chrom=breakend.mate_chrom,
                            mate_pos=breakend.mate_pos,
                            orientation=breakend.orientation,
                            inserted_seq=breakend.inserted_seq,
                            info="",
                            variant_type="SV_BND_DUP",
                        ),
                        "SV_BND_DUP",
                    )
                    dup_breakend2 = Breakend.from_breakend_variant(
                        BreakendVariant(
                            id=mate.id,
                            chrom=mate.chrom,
                            pos=mate.pos,
                            ref=mate.ref,
                            alt=mate.alt,
                            mate_id=getattr(mate, "mate_id", ""),
                            mate_chrom=mate.mate_chrom,
                            mate_pos=mate.mate_pos,
                            orientation=mate.orientation,
                            inserted_seq=mate.inserted_seq,
                            info="",
                            variant_type="SV_BND_DUP",
                        ),
                        "SV_BND_DUP",
                    )

                    # Maintain mate relationships
                    dup_breakend1.mate_breakend = dup_breakend2
                    dup_breakend2.mate_breakend = dup_breakend1

                    dup_breakends.extend([dup_breakend1, dup_breakend2])
                    processed_ids.add(breakend.id)
                    processed_ids.add(mate.id)

        return dup_breakends

    def _detect_inversion_pattern(
        self, paired_breakends: List[Breakend]
    ) -> List[Breakend]:
        """
        Detect inversion patterns from paired breakends.

        An inversion pattern consists of 4 breakends forming 2 pairs:
        - Pair 1: Outer breakpoints (A, B) with inverted orientations
        - Pair 2: Inner breakpoints (C, D) with inverted orientations
        - Same chromosome, positions in order A < C < D < B

        Returns:
            List of breakends reclassified as SV_BND_INV
        """
        inv_breakends = []
        processed_ids = set()

        # Group breakends by chromosome for efficiency
        chrom_groups = {}
        for breakend in paired_breakends:
            if breakend.id in processed_ids:
                continue
            chrom = breakend.chrom
            if chrom not in chrom_groups:
                chrom_groups[chrom] = []
            chrom_groups[chrom].append(breakend)

        # Look for inversion patterns within each chromosome
        for chrom, breakends in chrom_groups.items():
            if len(breakends) < 4:  # Need at least 4 breakends for inversion
                continue

            # Sort by position
            breakends_sorted = sorted(breakends, key=lambda x: x.pos)

            # Check for inversion patterns - simplified heuristic
            # Look for breakends that point inward (toward each other)
            for i in range(len(breakends_sorted) - 1):
                breakend1 = breakends_sorted[i]
                breakend2 = breakends_sorted[i + 1]

                if (
                    breakend1.id in processed_ids
                    or breakend2.id in processed_ids
                    or not breakend1.mate_breakend
                    or not breakend2.mate_breakend
                ):
                    continue

                # Check if this is part of an inversion pattern
                # For inversion: we expect 4 breakends on same chromosome with crossed connections
                if (
                    breakend1.chrom == breakend2.chrom == chrom
                    and breakend1.mate_chrom == breakend2.mate_chrom == chrom
                    and breakend1.mate_breakend
                    and breakend2.mate_breakend
                ):

                    # Check if the 4 breakends form inversion pattern
                    # Simple heuristic: 4 breakends all pointing to each other on same chromosome
                    same_chrom_count = len(
                        [
                            b
                            for b in breakends_sorted
                            if b.chrom == chrom and b.mate_chrom == chrom
                        ]
                    )

                    if same_chrom_count == 4:
                        # Reclassify all 4 breakends as inversion
                        for breakend_to_classify in breakends_sorted:
                            if breakend_to_classify.id in processed_ids:
                                continue

                            # Reclassify as inversion
                            inv_breakend = Breakend.from_breakend_variant(
                                BreakendVariant(
                                    id=breakend_to_classify.id,
                                    chrom=breakend_to_classify.chrom,
                                    pos=breakend_to_classify.pos,
                                    ref=breakend_to_classify.ref,
                                    alt=breakend_to_classify.alt,
                                    mate_id=getattr(
                                        breakend_to_classify, "mate_id", ""
                                    ),
                                    mate_chrom=breakend_to_classify.mate_chrom,
                                    mate_pos=breakend_to_classify.mate_pos,
                                    orientation=breakend_to_classify.orientation,
                                    inserted_seq=breakend_to_classify.inserted_seq,
                                    info="",
                                    variant_type="SV_BND_INV",
                                ),
                                "SV_BND_INV",
                            )
                            inv_breakends.append(inv_breakend)
                            processed_ids.add(breakend_to_classify.id)

                        # Exit the loop since we processed all breakends for this chromosome
                        break

        return inv_breakends

    def _infer_mate_orientation(self, original_orientation: str) -> str:
        """
        Infer the orientation of a missing mate breakend based on the original breakend's orientation.

        BND orientation pairs (original -> inferred mate):

        - t[p[ -> ]p]t
        - t]p] -> [p[t
        - ]p]t -> t[p[
        - [p[t -> t]p]

        Unknown orientations fall back to "]p]t".
        """
        orientation_pairs = {
            "t[p[": "]p]t",
            "t]p]": "[p[t",
            "]p]t": "t[p[",
            "[p[t": "t]p]",
        }
        return orientation_pairs.get(original_orientation, "]p]t")

    def _parse_and_index_breakends(self, bnd_variants: pd.DataFrame):
        """
        Parse breakends and build the coordinate index.

        Rows whose ALT field cannot be parsed, or that raise during
        construction, are skipped with a warning rather than aborting.
        NOTE(review): if two records share the same (chrom, pos), the
        later one overwrites the earlier in the index — confirm intended.
        """
        for _, variant in bnd_variants.iterrows():
            try:
                # Parse ALT field
                alt_info = parse_breakend_alt(variant["alt"])
                if not alt_info["is_valid"]:
                    warnings.warn(
                        f"Could not parse ALT field for {variant['id']}: {variant['alt']}"
                    )
                    continue

                # Parse INFO field for optional MATEID
                info_dict = parse_vcf_info(variant.get("info", ""))
                mate_id = info_dict.get("MATEID", None)

                # Create BreakendVariant
                breakend_var = BreakendVariant(
                    id=variant["id"],
                    chrom=variant["chrom"],
                    pos=variant["pos1"],
                    ref=variant["ref"],
                    alt=variant["alt"],
                    mate_id=mate_id,
                    mate_chrom=alt_info["mate_chrom"],
                    mate_pos=alt_info["mate_pos"],
                    orientation=alt_info["orientation"],
                    inserted_seq=alt_info["inserted_seq"],
                    info=variant.get("info", ""),
                    variant_type="SV_BND",
                )

                self.all_breakends.append(breakend_var)

                # Index by coordinates
                coord_key = (breakend_var.chrom, breakend_var.pos)
                self.coordinate_index[coord_key] = breakend_var

            except Exception as e:
                warnings.warn(f"Error processing breakend {variant['id']}: {e}")
618
+
619
+
620
+ def _count_vcf_header_lines(path: str) -> int:
621
+ """
622
+ Count the number of header lines in a VCF file.
623
+
624
+ VCF files have two types of header lines:
625
+ - Lines starting with ## (metadata)
626
+ - Line starting with #CHROM (column header)
627
+
628
+ Args:
629
+ path: Path to VCF file
630
+
631
+ Returns:
632
+ Number of lines to skip (all ## lines + the #CHROM line)
633
+ """
634
+ with open(path, "r") as f:
635
+ header_count = 0
636
+ for line in f:
637
+ if line.startswith("##"):
638
+ header_count += 1
639
+ elif line.startswith("#"):
640
+ header_count += 1 # Skip the #CHROM header line
641
+ break
642
+ else:
643
+ break # Reached data lines
644
+ return header_count
645
+
646
+
647
def read_vcf(path, include_info=True, classify_variants=True):
    """
    Read VCF file into pandas DataFrame with enhanced variant classification.

    Args:
        path: Path to VCF file
        include_info: Whether to include INFO field (default: True)
        classify_variants: Whether to classify variant types (default: True)

    Returns:
        DataFrame with columns: chrom, pos1, id, ref, alt, [info],
        vcf_line, [variant_type]

    Raises:
        ValueError: If the position column is not numeric.

    Notes:
        - INFO field parsing enables structural variant classification
        - variant_type column uses VCF 4.2 compliant classification
        - Multiallelic records (comma-separated ALT) are filtered out
    """
    # Column selection depends on whether the INFO field is wanted.
    columns = ["chrom", "pos1", "id", "ref", "alt"]
    usecols = [0, 1, 2, 3, 4]
    if include_info:
        columns = columns + ["info"]
        usecols = usecols + [7]

    # Header-line count is needed to recover original VCF line numbers.
    header_count = _count_vcf_header_lines(path)

    # comment='#' lets pandas skip every header line automatically.
    df = pd.read_table(path, comment="#", header=None, names=columns, usecols=usecols)

    # 1-indexed original line numbers: header lines + 1 + row index.
    df["vcf_line"] = df.index + header_count + 1

    if not pd.api.types.is_numeric_dtype(df["pos1"]):
        raise ValueError(
            f"Position column (second column) must be numeric, got {df['pos1'].dtype}"
        )

    # Drop multiallelic variants (ALT alleles containing commas).
    df = _filter_multiallelic_variants(df)

    if classify_variants:
        # Parse INFO per row only when it was actually loaded.
        extract_info = (
            (lambda row: parse_vcf_info(row.get("info", "")))
            if include_info
            else (lambda row: None)
        )
        df["variant_type"] = df.apply(
            lambda row: classify_variant_type(row["ref"], row["alt"], extract_info(row)),
            axis=1,
        )
    # Note: INV and DUP variants represented by multiple BNDs are handled by
    # BNDClassifier and group_variants_by_semantic_type() functions
    return df
706
+
707
+
708
def read_vcf_chunked(path, n_chunks=1, include_info=True, classify_variants=True):
    """
    Read VCF file in chunks using a generator, with variant classification.

    Args:
        path: Path to VCF file
        n_chunks: Number of chunks to split variants into (default: 1)
        include_info: Whether to include INFO field (default: True)
        classify_variants: Whether to classify variant types (default: True)

    Yields:
        DataFrame chunks with columns: chrom, pos1, id, ref, alt, [info],
        vcf_line, [variant_type].  Yields nothing for an empty VCF.

    Raises:
        ValueError: If the position column is not numeric.
    """
    # Column selection depends on whether the INFO field is wanted.
    names = ["chrom", "pos1", "id", "ref", "alt"] + (["info"] if include_info else [])
    usecols = [0, 1, 2, 3, 4] + ([7] if include_info else [])

    # Header-line count is needed to recover original VCF line numbers.
    header_count = _count_vcf_header_lines(path)

    # comment='#' lets pandas skip every header line automatically.
    frame = pd.read_table(path, comment="#", header=None, names=names, usecols=usecols)

    # No data lines at all: nothing to yield.
    if frame.empty:
        return

    # 1-indexed original line numbers: header lines + 1 + row index.
    frame["vcf_line"] = frame.index + header_count + 1

    if not pd.api.types.is_numeric_dtype(frame["pos1"]):
        raise ValueError(
            f"Position column (second column) must be numeric, got {frame['pos1'].dtype}"
        )

    # Drop multiallelic variants (ALT alleles containing commas).
    frame = _filter_multiallelic_variants(frame)

    if classify_variants:
        frame["variant_type"] = frame.apply(
            lambda row: classify_variant_type(
                row["ref"],
                row["alt"],
                parse_vcf_info(row.get("info", "")) if include_info else None,
            ),
            axis=1,
        )

    # np.array_split gives n_chunks approximately equal index slices.
    for piece in np.array_split(np.arange(len(frame)), n_chunks):
        if piece.size:
            yield frame.iloc[piece].reset_index(drop=True)
772
+
773
+
774
def get_vcf_chromosomes(path):
    """
    Get list of chromosomes in VCF file without loading all variants.

    Only the first tab-separated column of each data line is inspected,
    so the file is scanned once without parsing full records.

    Args:
        path: Path to VCF file

    Returns:
        Set of chromosome names found in the VCF file
    """
    seen = set()
    with open(path, "r") as handle:
        for line in handle:
            # Skip metadata and the column-header line.
            if line.startswith("##") or line.startswith("#CHROM"):
                continue
            seen.add(line.split("\t", 1)[0])
    return seen
795
+
796
+
797
def read_vcf_chromosome(
    path, target_chromosome, include_info=True, classify_variants=True
):
    """
    Read VCF file for a specific chromosome only, with variant classification.

    Args:
        path: Path to VCF file
        target_chromosome: Chromosome name to filter for
        include_info: Whether to include INFO field (default: True)
        classify_variants: Whether to classify variant types (default: True)

    Returns:
        DataFrame with variants only from specified chromosome
        (columns: chrom, pos1, id, ref, alt, [info], [variant_type]).
        Empty DataFrame (with the same columns) when no variants match.

    Raises:
        ValueError: If the position column is not numeric.
    """
    chromosome_lines = []
    header_line = None

    with open(path, "r") as f:
        for line in f:
            if line.startswith("##"):
                continue
            if line.startswith("#CHROM"):
                header_line = line
                continue

            # Keep only lines belonging to the target chromosome.
            if line.split("\t")[0] == target_chromosome:
                chromosome_lines.append(line)

    # Column selection depends on whether the INFO field is wanted.
    if include_info:
        usecols = [0, 1, 2, 3, 4, 7]  # Include INFO field
        base_columns = ["chrom", "pos1", "id", "ref", "alt", "info"]
    else:
        usecols = [0, 1, 2, 3, 4]  # Include ID field by default
        base_columns = ["chrom", "pos1", "id", "ref", "alt"]

    if not chromosome_lines:
        # Return empty DataFrame with correct columns if no variants found
        empty_columns = base_columns.copy()
        if classify_variants:
            empty_columns.append("variant_type")
        return pd.DataFrame(columns=empty_columns)

    # BUG FIX: previously a VCF without a #CHROM line crashed here with
    # ``TypeError: unsupported operand`` (None + str).  Parse headerless
    # data with header=None instead of requiring the header line.
    if header_line is not None:
        vcf_data = header_line + "".join(chromosome_lines)
        df = pd.read_csv(io.StringIO(vcf_data), sep="\t", usecols=usecols)
    else:
        df = pd.read_csv(
            io.StringIO("".join(chromosome_lines)),
            sep="\t",
            header=None,
            usecols=usecols,
        )

    # Normalize column names regardless of the original header.
    df.columns = base_columns

    if len(df) > 0 and not pd.api.types.is_numeric_dtype(df["pos1"]):
        raise ValueError(
            f"Position column (second column) must be numeric, got {df['pos1'].dtype}"
        )

    # Drop multiallelic variants (ALT alleles containing commas).
    if len(df) > 0:
        df = _filter_multiallelic_variants(df)

    if classify_variants and len(df) > 0:
        df["variant_type"] = df.apply(
            lambda row: classify_variant_type(
                row["ref"],
                row["alt"],
                parse_vcf_info(row.get("info", "")) if include_info else None,
            ),
            axis=1,
        )

    return df
875
+
876
+
877
def read_vcf_chromosomes_chunked(
    path, target_chromosomes, n_chunks=1, include_info=True, classify_variants=True
):
    """
    Read VCF file for specific chromosomes in chunks, with classification.

    Args:
        path: Path to VCF file
        target_chromosomes: List/set of chromosome names to include
        n_chunks: Number of chunks per chromosome (default: 1)
        include_info: Whether to include INFO field (default: True)
        classify_variants: Whether to classify variant types (default: True)

    Yields:
        Tuples of (chromosome, variants_dataframe) for each chunk; with
        n_chunks > 1 the label becomes "<chrom>_chunk_<i>".  Chromosomes
        with no variants are skipped.
    """
    for chrom in set(target_chromosomes):
        variants = read_vcf_chromosome(path, chrom, include_info, classify_variants)

        if variants.empty:
            continue

        if n_chunks == 1:
            # Single chunk: everything for this chromosome at once.
            yield chrom, variants
            continue

        # Split this chromosome's variants into approximately equal parts.
        for i, idx in enumerate(np.array_split(np.arange(len(variants)), n_chunks)):
            if idx.size:
                yield f"{chrom}_chunk_{i+1}", variants.iloc[idx].reset_index(drop=True)
915
+
916
+
917
def group_variants_by_semantic_type(
    variants_df: pd.DataFrame, vcf_path: Optional[str] = None
) -> Dict[str, pd.DataFrame]:
    """
    Group variants by semantic type for unified processing.

    Symbolic and breakend representations of the same rearrangement are merged:
    DUP and SV_BND_DUP end up in one group, INV and SV_BND_INV in another, etc.

    Args:
        variants_df: DataFrame with variants including a 'variant_type' column.
        vcf_path: Optional VCF path; required for semantic classification of
            BND records (which may encode DUP/INV events across multiple rows).

    Returns:
        Dict with keys 'standard', 'dup_variants', 'inv_variants',
        'bnd_variants', each mapping to a DataFrame.
    """

    def _subset(type_names):
        # Copy so later mutation of a group never aliases the caller's frame.
        return variants_df[variants_df["variant_type"].isin(type_names)].copy()

    grouped = {
        # Standard variants (SNV, INS, DEL, MNV)
        "standard": _subset(["SNV", "MNV", "INS", "DEL", "complex"]),
        # Symbolic DUP variants
        "dup_variants": _subset(["SV_DUP"]),
        # Symbolic INV variants
        "inv_variants": _subset(["SV_INV"]),
        "bnd_variants": pd.DataFrame(),
    }

    # Breakend records need semantic classification via the VCF file itself.
    bnd_subset = variants_df[
        variants_df["variant_type"].isin(["SV_BND", "SV_BND_INS"])
    ]

    if len(bnd_subset) == 0 or not vcf_path:
        # No BND records, or no VCF available for semantic classification:
        # pass the (possibly empty) BND rows through untouched.
        grouped["bnd_variants"] = bnd_subset.copy()
        return grouped

    # Use BNDClassifier to decide which BND records actually encode DUP/INV
    # events versus true translocation pairs.
    classifier = BNDClassifier()
    classified = classifier.classify_all_breakends(vcf_path, verbose=False)

    def _ids(group_key):
        return {b.id for b in classified.get(group_key, [])}

    # Fold semantically-classified BND records into the matching symbolic
    # groups, relabelling their variant_type for downstream consistency.
    reclass_plan = (
        ("dup_breakends", "dup_variants", "SV_BND_DUP"),
        ("inv_breakends", "inv_variants", "SV_BND_INV"),
    )
    for source_key, group_name, new_type in reclass_plan:
        matched = bnd_subset[bnd_subset["id"].isin(_ids(source_key))].copy()
        matched["variant_type"] = new_type
        grouped[group_name] = pd.concat(
            [grouped[group_name], matched], ignore_index=True
        )

    # Only genuinely paired breakends remain in the BND group.
    grouped["bnd_variants"] = bnd_subset[
        bnd_subset["id"].isin(_ids("paired"))
    ].copy()

    return grouped
995
+ return grouped
996
+
997
+
998
def parse_vcf_info(info_string: str) -> Dict:
    """
    Parse VCF INFO field to extract variant information according to VCF 4.2 specification.

    Args:
        info_string: VCF INFO field string (e.g., "SVTYPE=INV;END=1234;SVLEN=100")

    Returns:
        dict: Parsed INFO field values with appropriate type conversion

    VCF 4.2 INFO field specification:

    - Key=Value pairs separated by semicolons
    - Boolean flags have no value (key presence = True)
    - Numeric values auto-converted to int/float (including scientific
      notation such as "1e-05", common in AF fields)
    - Comma-separated values become lists with per-element conversion
    - Reserved keys: AA, AC, AF, AN, BQ, CIGAR, DB, DP, END, H2, H3, MQ, MQ0, NS, SB, etc.

    Examples:

    >>> parse_vcf_info("SVTYPE=INV;END=1234;SVLEN=100")
    {'SVTYPE': 'INV', 'END': 1234, 'SVLEN': 100}

    >>> parse_vcf_info("DB;H2;AF=0.5")
    {'DB': True, 'H2': True, 'AF': 0.5}

    >>> parse_vcf_info("AF=1e-05")
    {'AF': 1e-05}
    """

    def _convert(token: str):
        # Try int first so "100" stays an int; float() then handles both
        # decimals ("0.5") and scientific notation ("1e-05"), which the
        # previous '"." in value' heuristic missed; everything else stays str.
        try:
            return int(token)
        except ValueError:
            pass
        try:
            return float(token)
        except ValueError:
            return token

    info_dict: Dict = {}
    # "." is the VCF convention for a missing INFO field.
    if not info_string or info_string == ".":
        return info_dict

    for field in info_string.split(";"):
        field = field.strip()
        if not field:
            continue

        if "=" not in field:
            # Boolean flag (presence = True)
            info_dict[field] = True
            continue

        key, value = field.split("=", 1)
        key = key.strip()
        value = value.strip()

        if "," in value:
            # Comma-separated lists (like AC=1,2,3) with per-element conversion
            info_dict[key] = [_convert(v.strip()) for v in value.split(",")]
        else:
            info_dict[key] = _convert(value)

    return info_dict
1065
+
1066
+
1067
def classify_variant_type(
    ref_allele: str, alt_allele: str, info_dict: Optional[Dict] = None
) -> str:
    """
    Classify variant type according to VCF 4.2 specification using comprehensive heuristics.

    Note: This function only correctly classifies variants that are represented in a single
    VCF record; an additional classification step is needed for BNDs that actually
    represent INV or DUP variants, as those can be represented as 4 or 2 VCF records
    respectively.

    This function implements the complete VCF 4.2 variant classification rules with proper
    handling of structural variants, standard sequence variants, and edge cases.

    Args:
        ref_allele: Reference allele sequence (REF field)
        alt_allele: Alternate allele sequence (ALT field)
        info_dict: Parsed INFO field dictionary (optional, for structural variants)

    Returns:
        str: Variant type classification

    Classification labels (in priority order):

    - 'missing_ref_or_alt': REF or ALT field is empty
    - 'multiallelic': Multiple comma-separated ALT alleles
    - 'missing': Missing/upstream deletion allele (ALT = '*')
    - 'SV_INV': Inversion structural variant
    - 'SV_DUP': Duplication structural variant
    - 'SV_DEL': Deletion structural variant
    - 'SV_INS': Insertion structural variant
    - 'SV_CNV': Copy number variant
    - 'SV_BND': Breakend/translocation
    - 'SV_BND_INS': Breakend/translocation with inserted sequence
    - 'SNV': Single nucleotide variant
    - 'MNV': Multi-nucleotide variant (same length, limited shared prefix/suffix)
    - 'INS': Sequence insertion
    - 'DEL': Sequence deletion
    - otherwise: the (uppercased) ALT string is returned unchanged as a fallback
      for unclassifiable or non-standard records

    Note: MNV is not part of the official VCF 4.2 spec; MNVs are treated the same
    as SNVs for all functions in supremo_lite.

    Examples:

    # Multiallelic variants
    >>> classify_variant_type('A', 'G,T')
    'multiallelic'
    >>> classify_variant_type('T', 'TGGG,C')
    'multiallelic'

    # Standard variants
    >>> classify_variant_type('A', 'G')
    'SNV'
    >>> classify_variant_type('AGG', 'TCG')
    'MNV'
    >>> classify_variant_type('T', 'TGGG')
    'INS'
    >>> classify_variant_type('CGAGAA', 'C')
    'DEL'

    # Structural variants
    >>> classify_variant_type('N', '<INV>')
    'SV_INV'
    >>> classify_variant_type('G', 'G]17:198982]')
    'SV_BND'
    >>> classify_variant_type('T', ']chr2:20]ATCGT')
    'SV_BND_INS'

    # Special cases
    >>> classify_variant_type('T', '*')
    'missing'

    VCF 4.2 Reference: https://samtools.github.io/hts-specs/VCFv4.2.pdf
    """
    if not ref_allele or not alt_allele:
        return "missing_ref_or_alt"

    # Normalize alleles (VCF allows mixed case)
    ref = ref_allele.upper().strip()
    alt = alt_allele.upper().strip()

    # PRIORITY 0: Multiallelic variants (comma-separated ALT alleles)
    # Multiple alternative alleles in single ALT field indicate complex variant
    if "," in alt:
        return "multiallelic"

    # PRIORITY 1: Handle missing/upstream deletion alleles
    # The '*' allele indicates missing due to upstream deletion (VCF 4.2 spec)
    if alt == "*":
        return "missing"

    # PRIORITY 2: Structural variants with symbolic alleles
    # Format: <ID> where ID indicates structural variant type
    if alt.startswith("<") and alt.endswith(">"):
        sv_type = alt[1:-1].upper()  # Extract type from <INV>, <DUP>, etc.

        # Map symbolic alleles to standard classifications
        if sv_type in ["INV"]:
            return "SV_INV"
        elif sv_type in ["DUP", "DUP:TANDEM"]:
            return "SV_DUP"
        elif sv_type in ["DEL"]:
            return "SV_DEL"
        elif sv_type in ["INS"]:
            return "SV_INS"
        elif sv_type in ["CNV"]:
            return "SV_CNV"
        elif sv_type in ["BND", "TRA"]:
            return "SV_BND"
        else:
            # Fallback to returning the ALT
            return alt

    # PRIORITY 3: Breakend notation (complex rearrangements)
    # Format examples: A[chr2:1000[, ]chr1:100]T, etc.
    breakend_pattern = r"[\[\]]"
    if re.search(breakend_pattern, alt):
        # Check if BND has inserted sequence by parsing the ALT field
        try:
            breakend_info = parse_breakend_alt(alt)
            if breakend_info["is_valid"] and breakend_info["inserted_seq"]:
                return "SV_BND_INS"  # BND with insertion
            else:
                return "SV_BND"  # Standard BND
        except Exception:
            # If parsing fails, fallback to returning the ALT.
            # (Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed.)
            return alt

    # PRIORITY 4: Check SVTYPE in INFO field for additional SV classification
    # Note: Symbolic ALT fields (<INV>, <DUP>) are handled by priority 2, so this mainly
    # serves as fallback for non-standard VCF files
    if info_dict and "SVTYPE" in info_dict:
        svtype = str(info_dict["SVTYPE"]).upper()
        if svtype in ["INV"]:
            return "SV_INV"
        elif svtype in ["DUP"]:
            return "SV_DUP"
        elif svtype in ["DEL"]:
            return "SV_DEL"
        elif svtype in ["INS"]:
            return "SV_INS"
        elif svtype in ["CNV"]:
            return "SV_CNV"
        elif svtype in ["BND", "TRA", "TRANSLOCATION"]:
            return "SV_BND"

    # PRIORITY 5: Standard sequence variants based on length comparison
    ref_len = len(ref)
    alt_len = len(alt)

    if ref_len == 1 and alt_len == 1:
        # Single base substitution
        if ref != alt:
            return "SNV"
        else:
            # Identical alleles - should not occur in valid VCF
            return alt

    elif ref_len == 1 and alt_len > 1:
        # Potential insertion: check if REF is prefix of ALT
        if alt.startswith(ref):
            return "INS"
        else:
            # REF not a prefix - complex variant
            return alt

    elif ref_len > 1 and alt_len == 1:
        # Potential deletion: check if ALT is prefix of REF
        if ref.startswith(alt):
            return "DEL"
        else:
            # ALT not a prefix - complex variant
            return alt

    elif ref_len > 1 and alt_len > 1:
        # Multi-base variant - determine if complex substitution or indel
        # Check for shared prefix/suffix to identify indel vs substitution

        # Find longest common prefix
        prefix_len = 0
        min_len = min(ref_len, alt_len)
        while prefix_len < min_len and ref[prefix_len] == alt[prefix_len]:
            prefix_len += 1

        # Find longest common suffix
        suffix_len = 0
        while (
            suffix_len < min_len - prefix_len
            and ref[ref_len - 1 - suffix_len] == alt[alt_len - 1 - suffix_len]
        ):
            suffix_len += 1

        # Analyze the variant structure
        if prefix_len + suffix_len >= min_len:
            # Significant overlap - likely indel
            if ref_len > alt_len:
                return "DEL"
            elif alt_len > ref_len:
                return "INS"
            else:
                # Same length with shared prefix/suffix - substitution
                return alt
        else:
            # Limited overlap - substitution
            return "MNV"

    else:
        # Not parsed - should not occur in valid VCF
        return alt
1277
+
1278
+
1279
def parse_breakend_alt(alt_allele: str) -> Dict:
    """
    Parse breakend ALT field to extract mate information and inserted sequence.

    Args:
        alt_allele: ALT field from BND variant (e.g., "G]17:198982]", "]13:123456]AGTNNNNNCAT")

    Returns:
        dict: Parsed breakend information with keys:
            - 'mate_chrom': Chromosome of mate breakend
            - 'mate_pos': Position of mate breakend (1-based)
            - 'orientation': Breakend orientation ('t[p[', 't]p]', ']p]t', '[p[t')
            - 'inserted_seq': Novel sequence inserted at junction (empty string if none)
            - 'is_valid': Boolean indicating if ALT field was successfully parsed

    Breakend ALT format examples (VCF 4.2):

    - t[p[: piece extending to the right of p is joined after t
    - t]p]: reverse comp piece extending left of p is joined after t
    - ]p]t: piece extending to the left of p is joined before t
    - [p[t: reverse comp piece extending right of p is joined before t

    The single REF base embedded in t is stripped from 'inserted_seq': the
    leading base for the t[p[ / t]p] forms, the trailing base for ]p]t / [p[t.

    Examples:

    >>> parse_breakend_alt("G]17:198982]")
    {'mate_chrom': '17', 'mate_pos': 198982, 'orientation': 't]p]',
     'inserted_seq': '', 'is_valid': True}

    >>> parse_breakend_alt("]13:123456]AGTNNNNNCAT")
    {'mate_chrom': '13', 'mate_pos': 123456, 'orientation': ']p]t',
     'inserted_seq': 'AGTNNNNNCA', 'is_valid': True}
    """
    result = {
        "mate_chrom": None,
        "mate_pos": None,
        "orientation": None,
        "inserted_seq": "",
        "is_valid": False,
    }

    if not alt_allele or not isinstance(alt_allele, str):
        return result

    # One (regex, orientation, sequence-comes-first) row per VCF 4.2 breakend
    # form, tried in order. "sequence-comes-first" distinguishes the t[p[ /
    # t]p] forms (REF base leads) from ]p]t / [p[t (REF base trails).
    bnd_forms = (
        (r"^(.+?)\[([^:]+):(\d+)\[$", "t[p[", True),
        (r"^(.+?)\]([^:]+):(\d+)\]$", "t]p]", True),
        (r"^\]([^:]+):(\d+)\](.+?)$", "]p]t", False),
        (r"^\[([^:]+):(\d+)\[(.+?)$", "[p[t", False),
    )

    for pattern, orientation, seq_leads in bnd_forms:
        match = re.match(pattern, alt_allele)
        if not match:
            continue

        if seq_leads:
            seq, mate_chrom, mate_pos = match.groups()
            # Drop the leading reference base; the rest is inserted sequence.
            inserted = seq[1:] if len(seq) > 1 else ""
        else:
            mate_chrom, mate_pos, seq = match.groups()
            # Drop the trailing reference base; the rest is inserted sequence.
            inserted = seq[:-1] if len(seq) > 1 else ""

        result["mate_chrom"] = mate_chrom
        result["mate_pos"] = int(mate_pos)
        result["orientation"] = orientation
        result["inserted_seq"] = inserted
        result["is_valid"] = True
        return result

    return result
1384
+
1385
+
1386
def validate_breakend_pair(bnd1: Dict, bnd2: Dict) -> Dict:
    """
    Validate that two breakend variants form a consistent mate pair.

    All checks are symmetric: MATEID cross-references and mate coordinates are
    validated in both directions (bnd1 -> bnd2 AND bnd2 -> bnd1), so an
    inconsistent record on either side of the pair is reported.

    Args:
        bnd1: First breakend variant (dict with id, mate_id, chrom, pos,
            mate_chrom, mate_pos, orientation keys)
        bnd2: Second breakend variant (same keys as bnd1)

    Returns:
        dict: Validation result with keys:
            - 'is_valid': Boolean indicating if pair is valid
            - 'errors': List of validation error messages
            - 'warnings': List of validation warning messages
    """
    result = {"is_valid": True, "errors": [], "warnings": []}

    # Check that they reference each other as mates
    if bnd1.get("mate_id") != bnd2.get("id"):
        result["errors"].append(
            f"BND {bnd1.get('id')} MATEID {bnd1.get('mate_id')} does not match mate ID {bnd2.get('id')}"
        )
        result["is_valid"] = False

    if bnd2.get("mate_id") != bnd1.get("id"):
        result["errors"].append(
            f"BND {bnd2.get('id')} MATEID {bnd2.get('mate_id')} does not match mate ID {bnd1.get('id')}"
        )
        result["is_valid"] = False

    # Check that each record's parsed mate coordinates agree with its
    # partner's actual coordinates, in both directions.
    for record, partner in ((bnd1, bnd2), (bnd2, bnd1)):
        if record.get("mate_chrom") != partner.get("chrom"):
            result["errors"].append(
                f"BND {record.get('id')} mate chromosome {record.get('mate_chrom')} does not match actual chromosome {partner.get('chrom')}"
            )
            result["is_valid"] = False

        if record.get("mate_pos") != partner.get("pos"):
            result["errors"].append(
                f"BND {record.get('id')} mate position {record.get('mate_pos')} does not match actual position {partner.get('pos')}"
            )
            result["is_valid"] = False

    # Check orientation consistency (complex logic depending on rearrangement type)
    orientation1 = bnd1.get("orientation")
    orientation2 = bnd2.get("orientation")

    # For now, just warn about complex orientation validation - this would need detailed implementation
    if orientation1 and orientation2:
        result["warnings"].append(
            f"Orientation validation not fully implemented: {orientation1} vs {orientation2}"
        )

    return result
1439
+
1440
+
1441
def create_breakend_pairs(variants_df: pd.DataFrame) -> List[BreakendPair]:
    """
    Create BreakendPair objects from BND variants in a DataFrame.

    This function pairs breakend variants based on coordinate matching rather than MATEID,
    making it more robust and not dependent on optional INFO fields.

    Args:
        variants_df: DataFrame containing BND variants; rows whose variant_type
            is 'SV_BND' or 'SV_BND_INS' are considered. Expected columns:
            id, chrom, pos1, ref, alt, variant_type, and optionally info.

    Returns:
        List of BreakendPair objects representing valid breakend pairs

    Notes:
        - Pairs breakends by matching coordinates from ALT field parsing
        - Does not require MATEID field to be present (used only if available)
        - Issues warnings for unpaired or invalid breakends
        - Pairing is greedy first-match and O(n^2) in the number of BND rows;
          presumably acceptable for typical per-file BND counts (verify for
          very large callsets)
    """
    # Filter for BND variants only (including BND with insertions)
    bnd_variants = variants_df[
        variants_df["variant_type"].isin(["SV_BND", "SV_BND_INS"])
    ].copy()

    if len(bnd_variants) == 0:
        return []

    # Phase 1: parse every BND row into a BreakendVariant object.
    # Rows whose ALT cannot be parsed (or that raise on construction)
    # are skipped with a warning rather than aborting the whole batch.
    breakend_variants = []
    for _, variant in bnd_variants.iterrows():
        try:
            # Parse ALT field to get mate information
            breakend_info = parse_breakend_alt(variant["alt"])

            if not breakend_info["is_valid"]:
                warnings.warn(
                    f"Could not parse breakend ALT field for variant {variant['id']}: {variant['alt']}"
                )
                continue

            # Parse INFO field for optional MATEID (but don't require it)
            info_dict = parse_vcf_info(variant.get("info", ""))
            mate_id = info_dict.get("MATEID", None)

            # Create BreakendVariant object
            breakend = BreakendVariant(
                id=variant["id"],
                chrom=variant["chrom"],
                pos=variant["pos1"],
                ref=variant["ref"],
                alt=variant["alt"],
                mate_id=mate_id,  # May be None
                mate_chrom=breakend_info["mate_chrom"],
                mate_pos=breakend_info["mate_pos"],
                orientation=breakend_info["orientation"],
                inserted_seq=breakend_info["inserted_seq"],
                info=variant.get("info", ""),
                variant_type="SV_BND",
            )
            breakend_variants.append(breakend)

        except Exception as e:
            warnings.warn(f"Error processing breakend variant {variant['id']}: {e}")
            continue

    # Phase 2: create pairs by coordinate matching. Two breakends are mates
    # when each one's parsed mate coordinates point at the other's own
    # coordinates (mutual match, checked in both directions below).
    pairs = []
    used_breakends = set()

    for i, bnd1 in enumerate(breakend_variants):
        if bnd1.id in used_breakends:
            continue

        # Find mate by coordinate matching (greedy: first mutual match wins)
        mate_found = False
        for j, bnd2 in enumerate(breakend_variants):
            if i == j or bnd2.id in used_breakends:
                continue

            # Check if these breakends are mates based on coordinates
            if (
                bnd1.mate_chrom == bnd2.chrom
                and bnd1.mate_pos == bnd2.pos
                and bnd2.mate_chrom == bnd1.chrom
                and bnd2.mate_pos == bnd1.pos
            ):

                try:
                    # Create pair (validation happens in BreakendPair.__post_init__)
                    pair = BreakendPair(bnd1, bnd2)
                    pairs.append(pair)
                    used_breakends.add(bnd1.id)
                    used_breakends.add(bnd2.id)
                    mate_found = True
                    break

                except Exception as e:
                    # An invalid pairing does not consume either breakend;
                    # keep scanning for another candidate mate.
                    warnings.warn(f"Invalid breakend pair {bnd1.id}-{bnd2.id}: {e}")
                    continue

        if not mate_found:
            warnings.warn(
                f"No mate found for breakend {bnd1.id} at {bnd1.chrom}:{bnd1.pos}"
            )

    return pairs
1546
+
1547
+
1548
def load_breakend_variants(
    variants_fn: Union[str, pd.DataFrame],
) -> Tuple[pd.DataFrame, List[Tuple]]:
    """
    Load variants and separate BND variants into pairs using enhanced classifier.

    Args:
        variants_fn: Path to VCF file or DataFrame with variant data

    Returns:
        Tuple of (standard_variants_df, breakend_pairs_list)
        - standard_variants_df: DataFrame with non-BND variants
        - breakend_pairs_list: List of tuples (bnd1, bnd2) for BND pairs
    """
    # Import here to avoid circular imports
    from .personalize import _load_variants

    if isinstance(variants_fn, str):
        # VCF path: read with normalization + classification enabled.
        vcf_path = variants_fn
        all_variants = read_vcf(variants_fn, include_info=True, classify_variants=True)
    else:
        # DataFrame input: _load_variants handles normalization
        # (pos->pos1, variant_type, etc.). No VCF path for BND pairing.
        vcf_path = None
        all_variants = _load_variants(variants_fn)

    # Split into BND-family rows and everything else.
    bnd_type_names = ["SV_BND", "SV_BND_INS", "SV_BND_DUP", "SV_BND_INV"]
    is_bnd = all_variants["variant_type"].isin(bnd_type_names)
    bnd_variants = all_variants[is_bnd]
    standard_variants = all_variants[~is_bnd]

    # BND pairing requires re-reading the VCF through the classifier, so it
    # only happens when a path was provided and BND rows exist.
    breakend_pairs = []
    if vcf_path and len(bnd_variants) > 0:
        classified = BNDClassifier().classify_all_breakends(vcf_path, verbose=False)

        seen_ids = set()
        for breakend in classified["paired"]:
            if breakend.id in seen_ids:
                continue
            mate = breakend.mate_breakend
            if not mate:
                continue
            breakend_pairs.append((breakend, mate))
            seen_ids.add(breakend.id)
            seen_ids.add(mate.id)

    return standard_variants, breakend_pairs
1610
+
1611
+
1612
def _filter_multiallelic_variants(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter out variants with multiallelic ALT fields (containing commas).

    Args:
        df: DataFrame with variant data including 'alt' column

    Returns:
        DataFrame with multiallelic variants removed; the input DataFrame is
        returned unchanged when it is empty, lacks an 'alt' column, or holds
        no multiallelic rows.

    Notes:
        Issues a warning when multiallelic variants are found and removed.
        Multiallelic variants have ALT fields like "G,T" indicating multiple
        alternative alleles at the same position.
    """
    # Nothing to inspect: empty frame or no ALT column.
    if len(df) == 0 or "alt" not in df.columns:
        return df

    # Multiallelic rows are those whose ALT field contains a comma.
    is_multiallelic = df["alt"].str.contains(",", na=False)
    n_removed = int(is_multiallelic.sum())

    if n_removed == 0:
        return df

    warnings.warn(
        f"Found {n_removed} multiallelic variants with comma-separated ALT alleles. "
        f"These variants have been removed from the dataset. "
        f"Consider preprocessing your VCF file to split multiallelic sites if needed.",
        UserWarning,
    )

    return df[~is_multiallelic].reset_index(drop=True)