supremo-lite 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supremo_lite/__init__.py +59 -0
- supremo_lite/chromosome_utils.py +322 -0
- supremo_lite/core.py +41 -0
- supremo_lite/mock_models/__init__.py +110 -0
- supremo_lite/mock_models/testmodel_1d.py +184 -0
- supremo_lite/mock_models/testmodel_2d.py +203 -0
- supremo_lite/mutagenesis.py +414 -0
- supremo_lite/personalize.py +3098 -0
- supremo_lite/prediction_alignment.py +1014 -0
- supremo_lite/sequence_utils.py +137 -0
- supremo_lite/variant_utils.py +1645 -0
- supremo_lite-0.5.4.dist-info/METADATA +216 -0
- supremo_lite-0.5.4.dist-info/RECORD +15 -0
- supremo_lite-0.5.4.dist-info/WHEEL +4 -0
- supremo_lite-0.5.4.dist-info/licenses/LICENSE +22 -0
|
@@ -0,0 +1,1645 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Variant reading and handling utilities for supremo_lite.
|
|
3
|
+
|
|
4
|
+
This module provides functions for reading variants from VCF files
|
|
5
|
+
and other related operations.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import io
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import numpy as np
|
|
11
|
+
import re
|
|
12
|
+
import warnings
|
|
13
|
+
from typing import Dict, Optional, List, Tuple, Union
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class BreakendVariant:
    """
    A single breakend (BND) record parsed from a VCF file.

    Holds everything required to process one side of a novel adjacency,
    including the mate coordinates parsed out of the ALT field and any
    novel sequence inserted at the junction.
    """

    id: str  # VCF ID field (e.g., "bnd_W")
    chrom: str  # Chromosome name
    pos: int  # 1-based position
    ref: str  # Reference allele
    alt: str  # Complete ALT field (e.g., "G]17:198982]")
    mate_id: str  # MATEID from INFO field
    mate_chrom: str  # Mate chromosome (parsed from ALT)
    mate_pos: int  # Mate position (parsed from ALT)
    orientation: str  # Breakend orientation (e.g., "ref_then_mate")
    inserted_seq: str  # Novel sequence at junction
    info: str  # Complete INFO field
    variant_type: str = "SV_BND"  # Always BND for breakend variants

    def __post_init__(self):
        """Reject records with an empty ID or non-positive coordinates."""
        if not self.id:
            raise ValueError("Breakend ID cannot be empty")
        # Both coordinates are 1-based, so zero or negative is invalid.
        for coordinate, message in (
            (self.pos, "Breakend position must be positive"),
            (self.mate_pos, "Breakend mate position must be positive"),
        ):
            if coordinate <= 0:
                raise ValueError(message)
48
|
+
|
|
49
|
+
@dataclass
class BreakendPair:
    """
    A pair of mated breakend variants that create a novel adjacency.

    Coordinates the application of both breakends to create complex
    rearrangements like translocations, inversions, etc.  Validation
    problems found in ``__post_init__`` are accumulated in
    ``validation_errors`` / ``validation_warnings`` and summarized by
    ``is_valid``.
    """

    # Forward-reference annotations so this class does not require
    # BreakendVariant to be defined before it at import time.
    breakend1: "BreakendVariant"
    breakend2: "BreakendVariant"
    is_valid: bool = True
    # default_factory gives each instance its own list (standard dataclass
    # idiom for mutable defaults) instead of a None sentinel.
    validation_errors: List[str] = field(default_factory=list)
    validation_warnings: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Validate that the two breakends form a consistent pair."""
        # Backward compatibility: callers may still pass None explicitly.
        if self.validation_errors is None:
            self.validation_errors = []
        if self.validation_warnings is None:
            self.validation_warnings = []

        # Validate mate relationships: each record's MATEID must reference
        # the other record's ID.
        if self.breakend1.mate_id != self.breakend2.id:
            self.validation_errors.append(
                f"Breakend {self.breakend1.id} MATEID {self.breakend1.mate_id} "
                f"does not match mate ID {self.breakend2.id}"
            )
            self.is_valid = False

        if self.breakend2.mate_id != self.breakend1.id:
            self.validation_errors.append(
                f"Breakend {self.breakend2.id} MATEID {self.breakend2.mate_id} "
                f"does not match mate ID {self.breakend1.id}"
            )
            self.is_valid = False

        # Validate coordinate consistency: the coordinates breakend1 claims
        # for its mate must match breakend2's actual coordinates.
        if self.breakend1.mate_chrom != self.breakend2.chrom:
            self.validation_errors.append(
                f"Breakend {self.breakend1.id} mate chromosome {self.breakend1.mate_chrom} "
                f"does not match actual chromosome {self.breakend2.chrom}"
            )
            self.is_valid = False

        if self.breakend1.mate_pos != self.breakend2.pos:
            self.validation_errors.append(
                f"Breakend {self.breakend1.id} mate position {self.breakend1.mate_pos} "
                f"does not match actual position {self.breakend2.pos}"
            )
            self.is_valid = False

    @property
    def rearrangement_type(self) -> str:
        """
        Determine the type of rearrangement represented by this breakend pair.

        Returns:
            str: 'invalid', 'translocation', 'inversion', 'duplication', or
            'complex'.  The same-chromosome cases use simple heuristics
            (position distance and orientation mismatch), not a full
            orientation analysis.
        """
        if not self.is_valid:
            return "invalid"

        # Different chromosomes always mean a translocation.
        if self.breakend1.chrom != self.breakend2.chrom:
            return "translocation"

        # Same chromosome - could be inversion, duplication, or deletion.
        pos1, pos2 = self.breakend1.pos, self.breakend2.pos
        orient1, orient2 = self.breakend1.orientation, self.breakend2.orientation

        # Simple heuristics for now - detailed implementation would need more logic.
        if abs(pos1 - pos2) < 1000:  # Close positions might be duplication
            return "duplication"
        if orient1 != orient2:  # Different orientations suggest inversion
            return "inversion"
        return "complex"

    def get_affected_regions(self) -> List[Tuple[str, int, int]]:
        """
        Get genomic regions affected by this breakend pair.

        Returns:
            List of (chrom, start, end) tuples, one per breakend, each
            spanning from the base before the breakend (clamped to 1)
            through the end of that breakend's REF allele.
        """
        return [
            (
                self.breakend1.chrom,
                max(1, self.breakend1.pos - 1),  # Include position before breakend
                self.breakend1.pos + len(self.breakend1.ref),
            ),
            (
                self.breakend2.chrom,
                max(1, self.breakend2.pos - 1),  # Include position before breakend
                self.breakend2.pos + len(self.breakend2.ref),
            ),
        ]
+
|
|
159
|
+
@dataclass
class Breakend:
    """Enhanced breakend carrying its classification and (optionally) its mate."""

    id: str
    chrom: str
    pos: int
    ref: str
    alt: str
    mate_chrom: str
    mate_pos: int
    orientation: str
    inserted_seq: str
    classification: str  # 'paired', 'missing_mate', 'singleton_insertion'
    mate_breakend: Optional["Breakend"] = None

    @classmethod
    def from_breakend_variant(
        cls, variant: "BreakendVariant", classification: str
    ) -> "Breakend":
        """
        Build an enhanced Breakend from a parsed BreakendVariant.

        The annotation is a string (forward reference) so this block does not
        require BreakendVariant to be defined first at import time.  The
        mate link starts out as None; callers wire up ``mate_breakend`` once
        both sides of the adjacency are known.
        """
        return cls(
            id=variant.id,
            chrom=variant.chrom,
            pos=variant.pos,
            ref=variant.ref,
            alt=variant.alt,
            mate_chrom=variant.mate_chrom,
            mate_pos=variant.mate_pos,
            orientation=variant.orientation,
            inserted_seq=variant.inserted_seq,
            classification=classification,
        )
194
|
+
|
|
195
|
+
class BNDClassifier:
    """
    BND classifier that doesn't depend on MATEID fields.

    Mates are matched purely by coordinates: each breakend's
    (mate_chrom, mate_pos), parsed from the ALT field, is looked up
    against the coordinates of every other breakend in the file.

    Classifies BNDs into categories:

    1. Paired breakends - have matching mates by coordinates
    2. Missing mates - reference coordinates not present in VCF (can be inferred)
    3. Insertions with mates - insertions where mate is present
    4. Insertions without mates - insertions where mate is missing (inferred)
    """

    def __init__(self):
        # All BreakendVariant records parsed from the VCF, in file order.
        self.all_breakends = []
        # Map (chrom, pos) -> breakend.  NOTE(review): if two breakends share
        # the same coordinate, the later record silently overwrites the
        # earlier one in this index.
        self.coordinate_index = {}

    def classify_all_breakends(
        self, vcf_path: str, verbose: bool = False
    ) -> Dict[str, List[Breakend]]:
        """
        Classify all BND variants from a VCF file.

        Args:
            vcf_path: Path to VCF file containing BND variants
            verbose: Print detailed classification information (default: False)

        Returns:
            Dict with keys 'paired', 'missing_mate', 'insertion_with_mate',
            'insertion_missing_mate', 'dup_breakends', and 'inv_breakends'.
            In the current implementation every breakend ends up either in
            'paired' (a real or inferred pair) or reclassified into
            'dup_breakends'/'inv_breakends'; the 'missing_mate' and two
            insertion lists are created but never populated.
        """
        # Load VCF with variant classification
        variants_df = read_vcf(vcf_path, include_info=True, classify_variants=True)
        bnd_variants = variants_df[
            variants_df["variant_type"].isin(["SV_BND", "SV_BND_INS"])
        ]

        if verbose:
            print(f"Found {len(bnd_variants)} BND variants")

        # Parse all breakends and build coordinate index
        self._parse_and_index_breakends(bnd_variants)

        # Classify breakends.  The last two keys are always empty today
        # (kept for interface stability); see the Returns note above.
        classified = {
            "paired": [],
            "missing_mate": [],
            "insertion_with_mate": [],
            "insertion_missing_mate": [],
        }

        processed_ids = set()

        for breakend in self.all_breakends:
            if breakend.id in processed_ids:
                continue

            # Try to find mate by coordinates
            mate_key = (breakend.mate_chrom, breakend.mate_pos)

            if mate_key in self.coordinate_index:
                # Found mate - this is a paired breakend
                mate_breakend = self.coordinate_index[mate_key]

                # Create enhanced breakends for the pair
                enhanced1 = Breakend.from_breakend_variant(breakend, "paired")
                enhanced2 = Breakend.from_breakend_variant(mate_breakend, "paired")

                enhanced1.mate_breakend = enhanced2
                enhanced2.mate_breakend = enhanced1

                classified["paired"].extend([enhanced1, enhanced2])
                processed_ids.add(breakend.id)
                processed_ids.add(mate_breakend.id)

            else:
                # No mate found - ALWAYS infer the missing mate and create a fusion
                # Create an inferred mate breakend
                inferred_mate = BreakendVariant(
                    id=f"{breakend.id}_inferred_mate",
                    chrom=breakend.mate_chrom,
                    pos=breakend.mate_pos,
                    ref="N",  # Unknown reference at mate position
                    alt="<INFERRED>",
                    mate_id=breakend.id,
                    mate_chrom=breakend.chrom,
                    mate_pos=breakend.pos,
                    orientation=self._infer_mate_orientation(breakend.orientation),
                    inserted_seq="",  # Novel sequence stays on the original breakend
                    info=f"INFERRED_FROM={breakend.id}",
                    variant_type="SV_BND",
                )

                # Create enhanced breakends for the inferred pair; both are
                # tagged "paired" so downstream pattern detection treats
                # real and inferred pairs the same way.
                enhanced1 = Breakend.from_breakend_variant(breakend, "paired")
                enhanced2 = Breakend.from_breakend_variant(inferred_mate, "paired")

                enhanced1.mate_breakend = enhanced2
                enhanced2.mate_breakend = enhanced1

                classified["paired"].extend([enhanced1, enhanced2])

                if verbose:
                    if breakend.inserted_seq:
                        print(
                            f" INFO: Inferred missing mate for {breakend.id} with novel sequence '{breakend.inserted_seq}' "
                            f"-> created fusion with inferred mate at {breakend.mate_chrom}:{breakend.mate_pos}"
                        )
                    else:
                        print(
                            f" INFO: Inferred missing mate for {breakend.id} "
                            f"-> created fusion with inferred mate at {breakend.mate_chrom}:{breakend.mate_pos}"
                        )

                processed_ids.add(breakend.id)

        # Apply semantic classification to detect DUP and INV patterns
        # First detect duplications
        dup_breakends = self._detect_duplication_pattern(classified["paired"])

        # Remove duplication breakends from paired list before inversion detection
        dup_ids = {b.id for b in dup_breakends}
        remaining_paired = [b for b in classified["paired"] if b.id not in dup_ids]

        # Then detect inversions from remaining breakends
        inv_breakends = self._detect_inversion_pattern(remaining_paired)

        # Remove reclassified breakends from 'paired' category and add to semantic categories
        reclassified_ids = {b.id for b in dup_breakends + inv_breakends}
        classified["paired"] = [
            b for b in classified["paired"] if b.id not in reclassified_ids
        ]

        # Add semantic classifications
        classified["dup_breakends"] = dup_breakends
        classified["inv_breakends"] = inv_breakends

        # Print classification summary
        if verbose:
            print(f"\nBND Classification Summary:")
            print(
                f" Paired breakends (true translocations): {len(classified['paired'])}"
            )
            print(f" Duplication breakends (SV_BND_DUP): {len(dup_breakends)}")
            print(f" Inversion breakends (SV_BND_INV): {len(inv_breakends)}")
            # Inferred mates are recognizable by the "_inferred_mate" suffix
            # added above; matching on the substring "inferred" covers them.
            total_inferred = len(
                [
                    bnd
                    for bnd in classified["paired"] + dup_breakends + inv_breakends
                    if "inferred" in bnd.id
                ]
            )
            print(f" Inferred mates created: {total_inferred}")

        return classified

    def _detect_duplication_pattern(
        self, paired_breakends: List[Breakend]
    ) -> List[Breakend]:
        """
        Detect duplication patterns from paired breakends.

        A duplication pattern consists of 2 breakends:
        - BND1: position A pointing to position B
        - BND2: position B pointing to position A
        - Same chromosome, A < B (tandem duplication)

        Returns:
            List of breakends reclassified as SV_BND_DUP
        """
        dup_breakends = []
        processed_ids = set()

        for breakend in paired_breakends:
            if breakend.id in processed_ids or not breakend.mate_breakend:
                continue

            mate = breakend.mate_breakend

            # Check if this forms a duplication pattern
            if (
                breakend.chrom == mate.chrom  # Same chromosome
                and breakend.chrom
                == breakend.mate_chrom  # Mate points to same chromosome
                and mate.chrom
                == mate.mate_chrom  # Mate's mate points to same chromosome
                and breakend.pos != mate.pos
            ):  # Different positions

                # Check mutual pointing (A->B, B->A)
                points_to_mate = (
                    breakend.mate_chrom == mate.chrom and breakend.mate_pos == mate.pos
                )
                mate_points_back = (
                    mate.mate_chrom == breakend.chrom and mate.mate_pos == breakend.pos
                )

                if points_to_mate and mate_points_back:
                    # Check orientations to determine if this is truly a duplication or inversion
                    # Duplication: orientations should be compatible with copy-paste behavior
                    # Inversion: orientations should indicate sequence reversal

                    # NOTE(review): these two locals are currently unused —
                    # the decision below relies only on the breakend count.
                    orientation1 = breakend.orientation
                    orientation2 = mate.orientation

                    # Simple heuristic: if we have more than 2 breakends on same chromosome pointing to each other,
                    # it's likely an inversion pattern, not duplication (which typically involves 2 breakends)
                    # Count breakends on this chromosome
                    same_chrom_breakends = [
                        b
                        for b in paired_breakends
                        if b.chrom == breakend.chrom and b.mate_chrom == breakend.chrom
                    ]

                    if len(same_chrom_breakends) > 2:
                        # Likely inversion pattern with multiple breakends - skip duplication classification
                        continue

                    # This is a duplication pattern - reclassify both breakends
                    dup_breakend1 = Breakend.from_breakend_variant(
                        BreakendVariant(
                            id=breakend.id,
                            chrom=breakend.chrom,
                            pos=breakend.pos,
                            ref=breakend.ref,
                            alt=breakend.alt,
                            mate_id=getattr(breakend, "mate_id", ""),
                            mate_chrom=breakend.mate_chrom,
                            mate_pos=breakend.mate_pos,
                            orientation=breakend.orientation,
                            inserted_seq=breakend.inserted_seq,
                            info="",
                            variant_type="SV_BND_DUP",
                        ),
                        "SV_BND_DUP",
                    )
                    dup_breakend2 = Breakend.from_breakend_variant(
                        BreakendVariant(
                            id=mate.id,
                            chrom=mate.chrom,
                            pos=mate.pos,
                            ref=mate.ref,
                            alt=mate.alt,
                            mate_id=getattr(mate, "mate_id", ""),
                            mate_chrom=mate.mate_chrom,
                            mate_pos=mate.mate_pos,
                            orientation=mate.orientation,
                            inserted_seq=mate.inserted_seq,
                            info="",
                            variant_type="SV_BND_DUP",
                        ),
                        "SV_BND_DUP",
                    )

                    # Maintain mate relationships
                    dup_breakend1.mate_breakend = dup_breakend2
                    dup_breakend2.mate_breakend = dup_breakend1

                    dup_breakends.extend([dup_breakend1, dup_breakend2])
                    processed_ids.add(breakend.id)
                    processed_ids.add(mate.id)

        return dup_breakends

    def _detect_inversion_pattern(
        self, paired_breakends: List[Breakend]
    ) -> List[Breakend]:
        """
        Detect inversion patterns from paired breakends.

        An inversion pattern consists of 4 breakends forming 2 pairs:
        - Pair 1: Outer breakpoints (A, B) with inverted orientations
        - Pair 2: Inner breakpoints (C, D) with inverted orientations
        - Same chromosome, positions in order A < C < D < B

        Returns:
            List of breakends reclassified as SV_BND_INV
        """
        inv_breakends = []
        processed_ids = set()

        # Group breakends by chromosome for efficiency
        chrom_groups = {}
        for breakend in paired_breakends:
            if breakend.id in processed_ids:
                continue
            chrom = breakend.chrom
            if chrom not in chrom_groups:
                chrom_groups[chrom] = []
            chrom_groups[chrom].append(breakend)

        # Look for inversion patterns within each chromosome
        for chrom, breakends in chrom_groups.items():
            if len(breakends) < 4:  # Need at least 4 breakends for inversion
                continue

            # Sort by position
            breakends_sorted = sorted(breakends, key=lambda x: x.pos)

            # Check for inversion patterns - simplified heuristic
            # Look for breakends that point inward (toward each other)
            for i in range(len(breakends_sorted) - 1):
                breakend1 = breakends_sorted[i]
                breakend2 = breakends_sorted[i + 1]

                if (
                    breakend1.id in processed_ids
                    or breakend2.id in processed_ids
                    or not breakend1.mate_breakend
                    or not breakend2.mate_breakend
                ):
                    continue

                # Check if this is part of an inversion pattern
                # For inversion: we expect 4 breakends on same chromosome with crossed connections
                if (
                    breakend1.chrom == breakend2.chrom == chrom
                    and breakend1.mate_chrom == breakend2.mate_chrom == chrom
                    and breakend1.mate_breakend
                    and breakend2.mate_breakend
                ):

                    # Check if the 4 breakends form inversion pattern
                    # Simple heuristic: 4 breakends all pointing to each other on same chromosome
                    same_chrom_count = len(
                        [
                            b
                            for b in breakends_sorted
                            if b.chrom == chrom and b.mate_chrom == chrom
                        ]
                    )

                    # NOTE(review): this requires exactly 4 intra-chromosomal
                    # breakends on the chromosome — more or fewer means no
                    # inversion is reported for it.
                    if same_chrom_count == 4:
                        # Reclassify all 4 breakends as inversion
                        for breakend_to_classify in breakends_sorted:
                            if breakend_to_classify.id in processed_ids:
                                continue

                            # Reclassify as inversion
                            inv_breakend = Breakend.from_breakend_variant(
                                BreakendVariant(
                                    id=breakend_to_classify.id,
                                    chrom=breakend_to_classify.chrom,
                                    pos=breakend_to_classify.pos,
                                    ref=breakend_to_classify.ref,
                                    alt=breakend_to_classify.alt,
                                    mate_id=getattr(
                                        breakend_to_classify, "mate_id", ""
                                    ),
                                    mate_chrom=breakend_to_classify.mate_chrom,
                                    mate_pos=breakend_to_classify.mate_pos,
                                    orientation=breakend_to_classify.orientation,
                                    inserted_seq=breakend_to_classify.inserted_seq,
                                    info="",
                                    variant_type="SV_BND_INV",
                                ),
                                "SV_BND_INV",
                            )
                            inv_breakends.append(inv_breakend)
                            processed_ids.add(breakend_to_classify.id)

                        # Exit the loop since we processed all breakends for this chromosome
                        break

        return inv_breakends

    def _infer_mate_orientation(self, original_orientation: str) -> str:
        """
        Infer the orientation of a missing mate breakend based on the original breakend's orientation.

        BND orientation pairs (original -> inferred mate):

        - t[p[ -> ]p]t
        - t]p] -> [p[t
        - ]p]t -> t[p[
        - [p[t -> t]p]

        Unrecognized orientations fall back to "]p]t".
        """
        orientation_pairs = {
            "t[p[": "]p]t",
            "t]p]": "[p[t",
            "]p]t": "t[p[",
            "[p[t": "t]p]",
        }
        return orientation_pairs.get(original_orientation, "]p]t")

    def _parse_and_index_breakends(self, bnd_variants: pd.DataFrame):
        """Parse breakends and build coordinate index.

        Records that fail ALT parsing or raise during construction are
        skipped with a warning rather than aborting the whole run.
        """
        for _, variant in bnd_variants.iterrows():
            try:
                # Parse ALT field.  parse_breakend_alt is a module-level
                # helper defined elsewhere in this file; presumably it
                # extracts mate coordinates/orientation per the VCF BND
                # ALT syntax — confirm against its definition.
                alt_info = parse_breakend_alt(variant["alt"])
                if not alt_info["is_valid"]:
                    warnings.warn(
                        f"Could not parse ALT field for {variant['id']}: {variant['alt']}"
                    )
                    continue

                # Parse INFO field for optional MATEID
                info_dict = parse_vcf_info(variant.get("info", ""))
                mate_id = info_dict.get("MATEID", None)

                # Create BreakendVariant
                breakend_var = BreakendVariant(
                    id=variant["id"],
                    chrom=variant["chrom"],
                    pos=variant["pos1"],
                    ref=variant["ref"],
                    alt=variant["alt"],
                    mate_id=mate_id,
                    mate_chrom=alt_info["mate_chrom"],
                    mate_pos=alt_info["mate_pos"],
                    orientation=alt_info["orientation"],
                    inserted_seq=alt_info["inserted_seq"],
                    info=variant.get("info", ""),
                    variant_type="SV_BND",
                )

                self.all_breakends.append(breakend_var)

                # Index by coordinates (last record wins on collision).
                coord_key = (breakend_var.chrom, breakend_var.pos)
                self.coordinate_index[coord_key] = breakend_var

            except Exception as e:
                warnings.warn(f"Error processing breakend {variant['id']}: {e}")
|
|
620
|
+
def _count_vcf_header_lines(path: str) -> int:
    """
    Count the number of header lines in a VCF file.

    VCF files have two types of header lines:
    - Lines starting with ## (metadata)
    - Line starting with #CHROM (column header)

    Args:
        path: Path to VCF file

    Returns:
        Number of lines to skip (all ## lines + the #CHROM line)
    """
    n_header = 0
    with open(path, "r") as handle:
        for line in handle:
            if line.startswith("##"):
                n_header += 1
                continue
            if line.startswith("#"):
                # The #CHROM column header is the final header line.
                n_header += 1
            # First non-## line (data or #CHROM) ends the header scan.
            break
    return n_header
+
|
|
647
|
+
def read_vcf(path, include_info=True, classify_variants=True):
    """
    Read VCF file into pandas DataFrame with enhanced variant classification.

    Args:
        path: Path to VCF file
        include_info: Whether to include INFO field (default: True)
        classify_variants: Whether to classify variant types (default: True)

    Returns:
        DataFrame with columns: chrom, pos1, id, ref, alt, [info],
        vcf_line, [variant_type]

    Notes:
        - INFO field parsing enables structural variant classification
        - variant_type column uses VCF 4.2 compliant classification
        - Compatible with existing code expecting basic 5-column format
    """
    # Column 7 holds INFO in the fixed VCF column layout.
    if include_info:
        selected_columns = [0, 1, 2, 3, 4, 7]
        column_names = ["chrom", "pos1", "id", "ref", "alt", "info"]
    else:
        selected_columns = [0, 1, 2, 3, 4]
        column_names = ["chrom", "pos1", "id", "ref", "alt"]

    # Header count lets us report the original 1-indexed file line of
    # each record (vcf_line column below).
    n_header_lines = _count_vcf_header_lines(path)

    # comment='#' makes pandas skip every header line automatically.
    frame = pd.read_table(
        path, comment="#", header=None, names=column_names, usecols=selected_columns
    )

    # vcf_line = header lines + 1 (for 1-indexing) + row offset.
    frame["vcf_line"] = frame.index + n_header_lines + 1

    if not pd.api.types.is_numeric_dtype(frame["pos1"]):
        raise ValueError(
            f"Position column (second column) must be numeric, got {frame['pos1'].dtype}"
        )

    # Multiallelic records (ALT alleles containing commas) are not supported.
    frame = _filter_multiallelic_variants(frame)

    # Add variant classification if requested.
    if classify_variants:

        def _row_type(row):
            info_dict = parse_vcf_info(row.get("info", "")) if include_info else None
            return classify_variant_type(row["ref"], row["alt"], info_dict)

        frame["variant_type"] = frame.apply(_row_type, axis=1)

    # Note: INV and DUP variants represented by multiple BNDs are handled by
    # BNDClassifier and group_variants_by_semantic_type() functions
    return frame
+
|
|
708
|
+
def read_vcf_chunked(path, n_chunks=1, include_info=True, classify_variants=True):
    """
    Read VCF file in chunks using generator with enhanced variant classification.

    Args:
        path: Path to VCF file
        n_chunks: Number of chunks to split variants into (default: 1)
        include_info: Whether to include INFO field (default: True)
        classify_variants: Whether to classify variant types (default: True)

    Yields:
        DataFrame chunks with columns: chrom, pos1, id, ref, alt, [info],
        vcf_line, [variant_type]
    """
    # Column 7 holds INFO in the fixed VCF column layout.
    if include_info:
        selected_columns = [0, 1, 2, 3, 4, 7]
        column_names = ["chrom", "pos1", "id", "ref", "alt", "info"]
    else:
        selected_columns = [0, 1, 2, 3, 4]
        column_names = ["chrom", "pos1", "id", "ref", "alt"]

    # Header count lets us report the original 1-indexed file line of
    # each record (vcf_line column below).
    n_header_lines = _count_vcf_header_lines(path)

    # comment='#' makes pandas skip every header line automatically.
    full_df = pd.read_table(
        path, comment="#", header=None, names=column_names, usecols=selected_columns
    )

    # Nothing to yield for an empty result.
    if len(full_df) == 0:
        return

    # vcf_line = header lines + 1 (for 1-indexing) + row offset.
    full_df["vcf_line"] = full_df.index + n_header_lines + 1

    if not pd.api.types.is_numeric_dtype(full_df["pos1"]):
        raise ValueError(
            f"Position column (second column) must be numeric, got {full_df['pos1'].dtype}"
        )

    # Multiallelic records (ALT alleles containing commas) are not supported.
    full_df = _filter_multiallelic_variants(full_df)

    # Add variant classification if requested.
    if classify_variants:

        def _row_type(row):
            info_dict = parse_vcf_info(row.get("info", "")) if include_info else None
            return classify_variant_type(row["ref"], row["alt"], info_dict)

        full_df["variant_type"] = full_df.apply(_row_type, axis=1)

    # array_split yields n_chunks approximately equal index groups; empty
    # groups (more chunks than rows) are skipped.
    for chunk_indices in np.array_split(np.arange(len(full_df)), n_chunks):
        if len(chunk_indices) > 0:
            yield full_df.iloc[chunk_indices].reset_index(drop=True)
+
|
|
774
|
+
def get_vcf_chromosomes(path):
    """
    Get the set of chromosomes present in a VCF file without loading all variants.

    Only the first tab-separated column of each record line is inspected, so
    the whole file is streamed with minimal memory.

    Args:
        path: Path to VCF file

    Returns:
        Set of chromosome names found in the VCF file
    """
    with open(path, "r") as handle:
        # Meta lines ("##...") and the column header ("#CHROM...") are skipped;
        # every remaining line contributes its first column.
        return {
            record.split("\t")[0]
            for record in handle
            if not (record.startswith("##") or record.startswith("#CHROM"))
        }
|
|
795
|
+
|
|
796
|
+
|
|
797
|
+
def read_vcf_chromosome(
    path, target_chromosome, include_info=True, classify_variants=True
):
    """
    Read VCF file for a specific chromosome only with enhanced variant classification.

    Args:
        path: Path to VCF file
        target_chromosome: Chromosome name to filter for
        include_info: Whether to include INFO field (default: True)
        classify_variants: Whether to classify variant types (default: True)

    Returns:
        DataFrame with variants only from specified chromosome
        (columns: chrom, pos1, id, ref, alt, [info], [variant_type])

    Raises:
        ValueError: If the position column (second column) is not numeric.
    """
    chromosome_lines = []
    header_line = None

    with open(path, "r") as f:
        for line in f:
            if line.startswith("##"):
                continue
            if line.startswith("#CHROM"):
                header_line = line
                continue

            # Check if this line is for our target chromosome
            chrom = line.split("\t")[0]
            if chrom == target_chromosome:
                chromosome_lines.append(line)

    # Determine columns to read based on parameters
    if include_info:
        usecols = [0, 1, 2, 3, 4, 7]  # Include INFO field
        base_columns = ["chrom", "pos1", "id", "ref", "alt", "info"]
    else:
        usecols = [0, 1, 2, 3, 4]  # Include ID field by default
        base_columns = ["chrom", "pos1", "id", "ref", "alt"]

    if not chromosome_lines:
        # Return empty DataFrame with correct columns if no variants found
        empty_columns = base_columns.copy()
        if classify_variants:
            empty_columns.append("variant_type")
        return pd.DataFrame(columns=empty_columns)

    # Parse into DataFrame. Some VCF-like files lack a "#CHROM" header line;
    # previously that left header_line as None and "None + str" raised a
    # TypeError. Fall back to header=None so the first record is not
    # consumed as a header row.
    if header_line is not None:
        vcf_data = header_line + "".join(chromosome_lines)
        df = pd.read_csv(io.StringIO(vcf_data), sep="\t", usecols=usecols)
    else:
        df = pd.read_csv(
            io.StringIO("".join(chromosome_lines)),
            sep="\t",
            header=None,
            usecols=usecols,
        )

    # Set column names (positional usecols keep file order, so this is safe)
    df.columns = base_columns

    # Validate that pos1 column is numeric
    if len(df) > 0 and not pd.api.types.is_numeric_dtype(df["pos1"]):
        raise ValueError(
            f"Position column (second column) must be numeric, got {df['pos1'].dtype}"
        )

    # Filter out multiallelic variants (ALT alleles containing commas)
    if len(df) > 0:
        df = _filter_multiallelic_variants(df)

    # Add variant classification if requested
    if classify_variants and len(df) > 0:
        df["variant_type"] = df.apply(
            lambda row: classify_variant_type(
                row["ref"],
                row["alt"],
                parse_vcf_info(row.get("info", "")) if include_info else None,
            ),
            axis=1,
        )

    return df
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
def read_vcf_chromosomes_chunked(
    path, target_chromosomes, n_chunks=1, include_info=True, classify_variants=True
):
    """
    Read VCF file for specific chromosomes in chunks with enhanced variant classification.

    Args:
        path: Path to VCF file
        target_chromosomes: Iterable of chromosome names to include
        n_chunks: Number of chunks per chromosome (default: 1)
        include_info: Whether to include INFO field (default: True)
        classify_variants: Whether to classify variant types (default: True)

    Yields:
        Tuples of (chromosome, variants_dataframe) for each chunk.
        With n_chunks > 1 the chromosome label becomes "<chrom>_chunk_<i>".
        DataFrame columns: chrom, pos1, id, ref, alt, [info], [variant_type]
    """
    # Deduplicate while preserving the caller's ordering. Iterating a plain
    # set here previously made the yield order nondeterministic across runs.
    target_chromosomes = list(dict.fromkeys(target_chromosomes))

    for chrom in target_chromosomes:
        chrom_variants = read_vcf_chromosome(
            path, chrom, include_info, classify_variants
        )

        if len(chrom_variants) == 0:
            continue

        if n_chunks == 1:
            # Single chunk - yield all variants for this chromosome
            yield chrom, chrom_variants
        else:
            # Multiple chunks - split chromosome variants into n_chunks
            # approximately equal pieces (empty pieces are skipped)
            indices = np.array_split(np.arange(len(chrom_variants)), n_chunks)

            for i, chunk_indices in enumerate(indices):
                if len(chunk_indices) > 0:
                    chunk_df = chrom_variants.iloc[chunk_indices].reset_index(drop=True)
                    yield f"{chrom}_chunk_{i+1}", chunk_df
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
def group_variants_by_semantic_type(
    variants_df: pd.DataFrame, vcf_path: Optional[str] = None
) -> Dict[str, pd.DataFrame]:
    """
    Group variants by semantic type for unified processing.

    This function groups variants so that DUP and SV_BND_DUP are processed together,
    INV and SV_BND_INV are processed together, etc. BND records can only be
    promoted to their semantic type (DUP/INV) when the original VCF file is
    available for re-analysis via ``vcf_path``.

    Args:
        variants_df: DataFrame with variants including a ``variant_type`` column
        vcf_path: Optional VCF path for BND semantic classification

    Returns:
        Dict with keys: 'standard', 'dup_variants', 'inv_variants', 'bnd_variants',
        each mapping to an independent (copied) DataFrame.
    """
    grouped = {
        "standard": pd.DataFrame(),
        "dup_variants": pd.DataFrame(),
        "inv_variants": pd.DataFrame(),
        "bnd_variants": pd.DataFrame(),
    }

    # Standard variants (SNV, INS, DEL, MNV)
    standard_types = ["SNV", "MNV", "INS", "DEL", "complex"]
    grouped["standard"] = variants_df[
        variants_df["variant_type"].isin(standard_types)
    ].copy()

    # Symbolic DUP variants
    dup_types = ["SV_DUP"]
    grouped["dup_variants"] = variants_df[
        variants_df["variant_type"].isin(dup_types)
    ].copy()

    # Symbolic INV variants
    inv_types = ["SV_INV"]
    grouped["inv_variants"] = variants_df[
        variants_df["variant_type"].isin(inv_types)
    ].copy()

    # Handle BND variants with semantic classification
    bnd_types = ["SV_BND", "SV_BND_INS"]
    bnd_variants = variants_df[variants_df["variant_type"].isin(bnd_types)]

    if len(bnd_variants) > 0 and vcf_path:
        # Use BNDClassifier to get semantic classifications.
        # NOTE(review): BNDClassifier is defined elsewhere in this module; this
        # code assumes classify_all_breakends returns a dict with keys
        # 'dup_breakends', 'inv_breakends' and 'paired' whose elements expose
        # an ``.id`` attribute — confirm against the classifier's definition.
        classifier = BNDClassifier()
        classified_breakends = classifier.classify_all_breakends(
            vcf_path, verbose=False
        )

        # Extract variant IDs for each semantic type
        dup_bnd_ids = {b.id for b in classified_breakends.get("dup_breakends", [])}
        inv_bnd_ids = {b.id for b in classified_breakends.get("inv_breakends", [])}
        true_bnd_ids = {b.id for b in classified_breakends.get("paired", [])}

        # Group BND variants by semantic type
        dup_bnd_variants = bnd_variants[bnd_variants["id"].isin(dup_bnd_ids)].copy()
        inv_bnd_variants = bnd_variants[bnd_variants["id"].isin(inv_bnd_ids)].copy()
        true_bnd_variants = bnd_variants[bnd_variants["id"].isin(true_bnd_ids)].copy()

        # Update variant_type for semantic consistency
        dup_bnd_variants["variant_type"] = "SV_BND_DUP"
        inv_bnd_variants["variant_type"] = "SV_BND_INV"

        # Combine with symbolic variants
        grouped["dup_variants"] = pd.concat(
            [grouped["dup_variants"], dup_bnd_variants], ignore_index=True
        )
        grouped["inv_variants"] = pd.concat(
            [grouped["inv_variants"], inv_bnd_variants], ignore_index=True
        )
        grouped["bnd_variants"] = true_bnd_variants.copy()
    else:
        # No BND semantic classification possible: keep all BND records as-is
        grouped["bnd_variants"] = bnd_variants.copy()

    return grouped
|
|
996
|
+
|
|
997
|
+
|
|
998
|
+
def parse_vcf_info(info_string: str) -> Dict:
    """
    Parse VCF INFO field to extract variant information according to VCF 4.2 specification.

    Args:
        info_string: VCF INFO field string (e.g., "SVTYPE=INV;END=1234;SVLEN=100")

    Returns:
        dict: Parsed INFO field values with appropriate type conversion

    VCF 4.2 INFO field specification:

    - Key=Value pairs separated by semicolons
    - Boolean flags have no value (key presence = True)
    - Numeric values auto-converted to int/float
    - Comma-separated values become lists of converted elements

    Examples:

        >>> parse_vcf_info("SVTYPE=INV;END=1234;SVLEN=100")
        {'SVTYPE': 'INV', 'END': 1234, 'SVLEN': 100}

        >>> parse_vcf_info("DB;H2;AF=0.5")
        {'DB': True, 'H2': True, 'AF': 0.5}
    """

    def _coerce(token):
        # Tokens containing "." are tried as float, others as int;
        # anything non-numeric is kept as the original string.
        try:
            return float(token) if "." in token else int(token)
        except ValueError:
            return token

    parsed: Dict = {}
    # "." is the VCF missing-value marker; treat it like an empty field.
    if not info_string or info_string == ".":
        return parsed

    for raw_field in info_string.split(";"):
        entry = raw_field.strip()
        if not entry:
            continue

        if "=" not in entry:
            # Boolean flag (presence = True)
            parsed[entry] = True
            continue

        key, _, value = entry.partition("=")
        key = key.strip()
        value = value.strip()

        if "," in value:
            # Comma-separated list (like AC=1,2,3) - coerce each element
            parsed[key] = [_coerce(v.strip()) for v in value.split(",")]
        else:
            parsed[key] = _coerce(value)

    return parsed
|
|
1065
|
+
|
|
1066
|
+
|
|
1067
|
+
def classify_variant_type(
    ref_allele: str, alt_allele: str, info_dict: Optional[Dict] = None
) -> str:
    """
    Classify variant type according to VCF 4.2 specification using comprehensive heuristics.

    Note: This function only correctly classifies variants that are represented in a single
    VCF record; an additional classification step is needed for BNDs that actually
    represent INV or DUP variants, as those can be represented as 4 or 2 VCF records
    respectively.

    Args:
        ref_allele: Reference allele sequence (REF field)
        alt_allele: Alternate allele sequence (ALT field)
        info_dict: Parsed INFO field dictionary (optional, for structural variants)

    Returns:
        str: Variant type classification

    VCF 4.2 Variant Types (in classification priority order):

    - 'multiallelic': Multiallelic variants (ALT contains comma)
    - 'missing': Missing/upstream deletion allele (ALT = '*')
    - 'SV_INV': Inversion structural variant
    - 'SV_DUP': Duplication structural variant
    - 'SV_DEL': Deletion structural variant
    - 'SV_INS': Insertion structural variant
    - 'SV_CNV': Copy number variant
    - 'SV_BND': Breakend/translocation
    - 'SV_BND_INS': Breakend/translocation with inserted sequence
    - 'SNV': Single nucleotide variant
    - 'MNV': Multi-nucleotide variant (alt len = ref len but no shared prefix/suffix)
    - 'INS': Sequence insertion
    - 'DEL': Sequence deletion
    - 'missing_ref_or_alt': REF or ALT field is empty
    - For unclassifiable/complex cases the raw (upper-cased) ALT string is returned.

    Note: MNV is not part of the official VCF 4.2 spec, they are treated the same as SNVs
    for all functions in supremo_lite.

    Examples:

        # Multiallelic variants
        >>> classify_variant_type('A', 'G,T')
        'multiallelic'

        # Standard variants
        >>> classify_variant_type('A', 'G')
        'SNV'
        >>> classify_variant_type('AGG', 'TCG')
        'MNV'
        >>> classify_variant_type('T', 'TGGG')
        'INS'
        >>> classify_variant_type('CGAGAA', 'C')
        'DEL'

        # Structural variants
        >>> classify_variant_type('N', '<INV>')
        'SV_INV'
        >>> classify_variant_type('G', 'G]17:198982]')
        'SV_BND'
        >>> classify_variant_type('T', ']chr2:20]ATCGT')
        'SV_BND_INS'

        # Special cases
        >>> classify_variant_type('T', '*')
        'missing'

    VCF 4.2 Reference: https://samtools.github.io/hts-specs/VCFv4.2.pdf
    """
    if not ref_allele or not alt_allele:
        return "missing_ref_or_alt"

    # Normalize alleles (VCF allows mixed case)
    ref = ref_allele.upper().strip()
    alt = alt_allele.upper().strip()

    # PRIORITY 0: Multiallelic variants (comma-separated ALT alleles)
    # Multiple alternative alleles in a single ALT field
    if "," in alt:
        return "multiallelic"

    # PRIORITY 1: Handle missing/upstream deletion alleles
    # The '*' allele indicates missing due to upstream deletion (VCF 4.2 spec)
    if alt == "*":
        return "missing"

    # PRIORITY 2: Structural variants with symbolic alleles
    # Format: <ID> where ID indicates structural variant type
    if alt.startswith("<") and alt.endswith(">"):
        sv_type = alt[1:-1].upper()  # Extract type from <INV>, <DUP>, etc.

        # Map symbolic alleles to standard classifications
        if sv_type in ["INV"]:
            return "SV_INV"
        elif sv_type in ["DUP", "DUP:TANDEM"]:
            return "SV_DUP"
        elif sv_type in ["DEL"]:
            return "SV_DEL"
        elif sv_type in ["INS"]:
            return "SV_INS"
        elif sv_type in ["CNV"]:
            return "SV_CNV"
        elif sv_type in ["BND", "TRA"]:
            return "SV_BND"
        else:
            # Fallback to returning the ALT
            return alt

    # PRIORITY 3: Breakend notation (complex rearrangements)
    # Format examples: A[chr2:1000[, ]chr1:100]T, etc.
    breakend_pattern = r"[\[\]]"
    if re.search(breakend_pattern, alt):
        # Check if BND has inserted sequence by parsing the ALT field
        try:
            breakend_info = parse_breakend_alt(alt)
            if breakend_info["is_valid"] and breakend_info["inserted_seq"]:
                return "SV_BND_INS"  # BND with insertion
            else:
                return "SV_BND"  # Standard BND
        except Exception:
            # If parsing fails, fall back to returning the ALT.
            # (Was a bare "except:", which would also swallow
            # KeyboardInterrupt/SystemExit.)
            return alt

    # PRIORITY 4: Check SVTYPE in INFO field for additional SV classification
    # Note: Symbolic ALT fields (<INV>, <DUP>) are handled by priority 2, so this
    # mainly serves as fallback for non-standard VCF files
    if info_dict and "SVTYPE" in info_dict:
        svtype = str(info_dict["SVTYPE"]).upper()
        if svtype in ["INV"]:
            return "SV_INV"
        elif svtype in ["DUP"]:
            return "SV_DUP"
        elif svtype in ["DEL"]:
            return "SV_DEL"
        elif svtype in ["INS"]:
            return "SV_INS"
        elif svtype in ["CNV"]:
            return "SV_CNV"
        elif svtype in ["BND", "TRA", "TRANSLOCATION"]:
            return "SV_BND"

    # PRIORITY 5: Standard sequence variants based on length comparison
    ref_len = len(ref)
    alt_len = len(alt)

    if ref_len == 1 and alt_len == 1:
        # Single base substitution
        if ref != alt:
            return "SNV"
        else:
            # Identical alleles - should not occur in valid VCF
            return alt

    elif ref_len == 1 and alt_len > 1:
        # Potential insertion: check if REF is prefix of ALT
        if alt.startswith(ref):
            return "INS"
        else:
            # REF not a prefix - complex variant
            return alt

    elif ref_len > 1 and alt_len == 1:
        # Potential deletion: check if ALT is prefix of REF
        if ref.startswith(alt):
            return "DEL"
        else:
            # ALT not a prefix - complex variant
            return alt

    elif ref_len > 1 and alt_len > 1:
        # Multi-base variant - determine if complex substitution or indel
        # Check for shared prefix/suffix to identify indel vs substitution

        # Find longest common prefix
        prefix_len = 0
        min_len = min(ref_len, alt_len)
        while prefix_len < min_len and ref[prefix_len] == alt[prefix_len]:
            prefix_len += 1

        # Find longest common suffix (bounded so it cannot overlap the prefix)
        suffix_len = 0
        while (
            suffix_len < min_len - prefix_len
            and ref[ref_len - 1 - suffix_len] == alt[alt_len - 1 - suffix_len]
        ):
            suffix_len += 1

        # Analyze the variant structure
        if prefix_len + suffix_len >= min_len:
            # Significant overlap - likely indel
            if ref_len > alt_len:
                return "DEL"
            elif alt_len > ref_len:
                return "INS"
            else:
                # Same length with shared prefix/suffix - substitution
                return alt
        else:
            # Limited overlap - multi-nucleotide substitution
            return "MNV"

    else:
        # Not parsed - should not occur in valid VCF
        return alt
|
|
1277
|
+
|
|
1278
|
+
|
|
1279
|
+
def parse_breakend_alt(alt_allele: str) -> Dict:
    """
    Parse breakend ALT field to extract mate information and inserted sequence.

    Args:
        alt_allele: ALT field from BND variant (e.g., "G]17:198982]", "]13:123456]AGTNNNNNCAT")

    Returns:
        dict: Parsed breakend information with keys:
        - 'mate_chrom': Chromosome of mate breakend
        - 'mate_pos': Position of mate breakend (1-based)
        - 'orientation': Breakend orientation ('t[p[', 't]p]', ']p]t', '[p[t')
        - 'inserted_seq': Novel sequence inserted at junction (empty string if none)
        - 'is_valid': Boolean indicating if ALT field was successfully parsed

    Breakend ALT format examples (VCF 4.2):

    - t[p[: piece extending to the right of p is joined after t
    - t]p]: reverse comp piece extending left of p is joined after t
    - ]p]t: piece extending to the left of p is joined before t
    - [p[t: reverse comp piece extending right of p is joined before t

    The REF base adjacent to the junction is part of ``t`` and is stripped from
    'inserted_seq': first base for t[p[/t]p], last base for ]p]t/[p[t.

    Examples:

        >>> parse_breakend_alt("G]17:198982]")
        {'mate_chrom': '17', 'mate_pos': 198982, 'orientation': 't]p]',
         'inserted_seq': '', 'is_valid': True}

        >>> parse_breakend_alt("]13:123456]AGTNNNNNCAT")
        {'mate_chrom': '13', 'mate_pos': 123456, 'orientation': ']p]t',
         'inserted_seq': 'AGTNNNNNCA', 'is_valid': True}
    """
    result = {
        "mate_chrom": None,
        "mate_pos": None,
        "orientation": None,
        "inserted_seq": "",
        "is_valid": False,
    }

    if not alt_allele or not isinstance(alt_allele, str):
        return result

    # One regex per VCF 4.2 orientation, tried in order. The third tuple
    # element records whether the sequence part precedes the mate locus
    # (t[p[ and t]p]) or follows it (]p]t and [p[t).
    patterns = [
        (r"^(.+?)\[([^:]+):(\d+)\[$", "t[p[", True),
        (r"^(.+?)\]([^:]+):(\d+)\]$", "t]p]", True),
        (r"^\]([^:]+):(\d+)\](.+?)$", "]p]t", False),
        (r"^\[([^:]+):(\d+)\[(.+?)$", "[p[t", False),
    ]

    for pattern, orientation, seq_first in patterns:
        match = re.match(pattern, alt_allele)
        if not match:
            continue

        if seq_first:
            seq, mate_chrom, mate_pos = match.groups()
            # Drop the leading reference base; anything after it is insertion
            inserted = seq[1:] if len(seq) > 1 else ""
        else:
            mate_chrom, mate_pos, seq = match.groups()
            # Drop the trailing reference base; anything before it is insertion
            inserted = seq[:-1] if len(seq) > 1 else ""

        result["mate_chrom"] = mate_chrom
        result["mate_pos"] = int(mate_pos)
        result["orientation"] = orientation
        result["inserted_seq"] = inserted
        result["is_valid"] = True
        break

    return result
|
|
1384
|
+
|
|
1385
|
+
|
|
1386
|
+
def validate_breakend_pair(bnd1: Dict, bnd2: Dict) -> Dict:
    """
    Validate that two breakend variants form a consistent mate pair.

    Args:
        bnd1: First breakend variant (dict with id, mate_id, chrom, pos, etc.)
        bnd2: Second breakend variant (dict with id, mate_id, chrom, pos, etc.)

    Returns:
        dict: Validation result with keys:
        - 'is_valid': Boolean indicating if pair is valid
        - 'errors': List of validation error messages
        - 'warnings': List of validation warning messages
    """
    outcome = {"is_valid": True, "errors": [], "warnings": []}

    def _fail(message):
        # Record the problem and mark the pair invalid in one step
        outcome["errors"].append(message)
        outcome["is_valid"] = False

    # Mutual MATEID references must agree in both directions
    if bnd1.get("mate_id") != bnd2.get("id"):
        _fail(
            f"BND {bnd1.get('id')} MATEID {bnd1.get('mate_id')} does not match mate ID {bnd2.get('id')}"
        )
    if bnd2.get("mate_id") != bnd1.get("id"):
        _fail(
            f"BND {bnd2.get('id')} MATEID {bnd2.get('mate_id')} does not match mate ID {bnd1.get('id')}"
        )

    # Mate coordinates parsed from bnd1's ALT must point at bnd2's locus
    if bnd1.get("mate_chrom") != bnd2.get("chrom"):
        _fail(
            f"BND {bnd1.get('id')} mate chromosome {bnd1.get('mate_chrom')} does not match actual chromosome {bnd2.get('chrom')}"
        )
    if bnd1.get("mate_pos") != bnd2.get("pos"):
        _fail(
            f"BND {bnd1.get('id')} mate position {bnd1.get('mate_pos')} does not match actual position {bnd2.get('pos')}"
        )

    # Orientation cross-checking would require rearrangement-type-specific
    # logic, so only a warning is emitted when both orientations are present.
    if bnd1.get("orientation") and bnd2.get("orientation"):
        outcome["warnings"].append(
            f"Orientation validation not fully implemented: {bnd1.get('orientation')} vs {bnd2.get('orientation')}"
        )

    return outcome
|
|
1439
|
+
|
|
1440
|
+
|
|
1441
|
+
def create_breakend_pairs(variants_df: pd.DataFrame) -> List[BreakendPair]:
    """
    Create BreakendPair objects from BND variants in a DataFrame.

    This function pairs breakend variants based on coordinate matching rather than MATEID,
    making it more robust and not dependent on optional INFO fields.

    Args:
        variants_df: DataFrame containing BND variants with variant_type='SV_BND'
            (rows with 'SV_BND_INS' are also picked up)

    Returns:
        List of BreakendPair objects representing valid breakend pairs

    Notes:
        - Pairs breakends by matching coordinates from ALT field parsing
        - Does not require MATEID field to be present
        - Issues warnings for unpaired or invalid breakends
        - Pairing is O(n^2) in the number of BND records; fine for typical
          VCFs but worth noting for very large call sets
    """
    # Filter for BND variants only (including BND with insertions)
    bnd_variants = variants_df[
        variants_df["variant_type"].isin(["SV_BND", "SV_BND_INS"])
    ].copy()

    if len(bnd_variants) == 0:
        return []

    # Phase 1: parse each BND row into a BreakendVariant record
    breakend_variants = []
    for _, variant in bnd_variants.iterrows():
        try:
            # Parse ALT field to get mate information
            breakend_info = parse_breakend_alt(variant["alt"])

            if not breakend_info["is_valid"]:
                warnings.warn(
                    f"Could not parse breakend ALT field for variant {variant['id']}: {variant['alt']}"
                )
                continue

            # Parse INFO field for optional MATEID (but don't require it)
            info_dict = parse_vcf_info(variant.get("info", ""))
            mate_id = info_dict.get("MATEID", None)

            # Create BreakendVariant object
            breakend = BreakendVariant(
                id=variant["id"],
                chrom=variant["chrom"],
                pos=variant["pos1"],
                ref=variant["ref"],
                alt=variant["alt"],
                mate_id=mate_id,  # May be None
                mate_chrom=breakend_info["mate_chrom"],
                mate_pos=breakend_info["mate_pos"],
                orientation=breakend_info["orientation"],
                inserted_seq=breakend_info["inserted_seq"],
                info=variant.get("info", ""),
                variant_type="SV_BND",
            )
            breakend_variants.append(breakend)

        except Exception as e:
            # Skip unparseable rows rather than aborting the whole batch
            warnings.warn(f"Error processing breakend variant {variant['id']}: {e}")
            continue

    # Phase 2: pair records by coordinates — two breakends are mates when
    # each one's parsed mate locus points at the other's own (chrom, pos)
    pairs = []
    used_breakends = set()

    for i, bnd1 in enumerate(breakend_variants):
        if bnd1.id in used_breakends:
            continue

        # Find mate by coordinate matching
        mate_found = False
        for j, bnd2 in enumerate(breakend_variants):
            if i == j or bnd2.id in used_breakends:
                continue

            # Check if these breakends are mates based on coordinates
            if (
                bnd1.mate_chrom == bnd2.chrom
                and bnd1.mate_pos == bnd2.pos
                and bnd2.mate_chrom == bnd1.chrom
                and bnd2.mate_pos == bnd1.pos
            ):

                try:
                    # Create pair (validation happens in BreakendPair.__post_init__)
                    # NOTE(review): BreakendPair is defined elsewhere in this
                    # module; its constructor is assumed to raise on invalid
                    # pairs — confirm against its definition.
                    pair = BreakendPair(bnd1, bnd2)
                    pairs.append(pair)
                    used_breakends.add(bnd1.id)
                    used_breakends.add(bnd2.id)
                    mate_found = True
                    break

                except Exception as e:
                    # Coordinates matched but the pair failed validation;
                    # keep looking for another candidate mate
                    warnings.warn(f"Invalid breakend pair {bnd1.id}-{bnd2.id}: {e}")
                    continue

        if not mate_found:
            warnings.warn(
                f"No mate found for breakend {bnd1.id} at {bnd1.chrom}:{bnd1.pos}"
            )

    return pairs
|
|
1546
|
+
|
|
1547
|
+
|
|
1548
|
+
def load_breakend_variants(
    variants_fn: Union[str, pd.DataFrame],
) -> Tuple[pd.DataFrame, List[Tuple]]:
    """
    Load variants and separate BND variants into pairs using enhanced classifier.

    Args:
        variants_fn: Path to VCF file or DataFrame with variant data.

    Returns:
        Tuple of (standard_variants_df, breakend_pairs_list)
        - standard_variants_df: DataFrame with non-BND variants
        - breakend_pairs_list: List of tuples (bnd1, bnd2) for BND pairs

    Notes:
        Breakend mate pairing re-reads the VCF file via BNDClassifier, so it
        is only available when a file path is supplied. When a DataFrame is
        passed and it contains BND records, a warning is emitted and no
        pairs are produced (previously they were dropped silently).
    """
    # Import here to avoid circular imports.
    from .personalize import _load_variants

    # Load all variants with proper normalization and classification.
    if isinstance(variants_fn, str):
        all_variants = read_vcf(variants_fn, include_info=True, classify_variants=True)
        vcf_path = variants_fn
    else:
        # _load_variants properly handles DataFrame normalization
        # (pos -> pos1, variant_type, etc.).
        all_variants = _load_variants(variants_fn)
        vcf_path = None

    # Define the BND type list once and derive both subsets from a single
    # mask (the list was previously duplicated in two isin() calls).
    bnd_types = ["SV_BND", "SV_BND_INS", "SV_BND_DUP", "SV_BND_INV"]
    bnd_mask = all_variants["variant_type"].isin(bnd_types)
    bnd_variants = all_variants[bnd_mask]
    standard_variants = all_variants[~bnd_mask]

    # Create breakend pairs using the enhanced classifier.
    breakend_pairs = []
    if len(bnd_variants) > 0:
        if vcf_path:
            classifier = BNDClassifier()
            classified_breakends = classifier.classify_all_breakends(
                vcf_path, verbose=False
            )

            # Convert classified breakends to pairs, emitting each pair once.
            processed_ids = set()
            for breakend in classified_breakends["paired"]:
                if breakend.id in processed_ids:
                    continue

                if breakend.mate_breakend:
                    breakend_pairs.append((breakend, breakend.mate_breakend))
                    processed_ids.add(breakend.id)
                    processed_ids.add(breakend.mate_breakend.id)
        else:
            # DataFrame input: pairing is impossible without the VCF file.
            # Warn instead of silently returning an empty pair list.
            warnings.warn(
                f"{len(bnd_variants)} BND variants found in DataFrame input, "
                f"but breakend pairing requires a VCF file path; "
                f"no breakend pairs were created."
            )

    return standard_variants, breakend_pairs
1612
|
+
def _filter_multiallelic_variants(df: pd.DataFrame) -> pd.DataFrame:
|
|
1613
|
+
"""
|
|
1614
|
+
Filter out variants with multiallelic ALT fields (containing commas).
|
|
1615
|
+
|
|
1616
|
+
Args:
|
|
1617
|
+
df: DataFrame with variant data including 'alt' column
|
|
1618
|
+
|
|
1619
|
+
Returns:
|
|
1620
|
+
DataFrame with multiallelic variants removed
|
|
1621
|
+
|
|
1622
|
+
Notes:
|
|
1623
|
+
Issues a warning when multiallelic variants are found and removed.
|
|
1624
|
+
Multiallelic variants have ALT fields like "G,T" indicating multiple
|
|
1625
|
+
alternative alleles at the same position.
|
|
1626
|
+
"""
|
|
1627
|
+
if "alt" not in df.columns or len(df) == 0:
|
|
1628
|
+
return df
|
|
1629
|
+
|
|
1630
|
+
# Identify multiallelic variants (ALT field contains comma)
|
|
1631
|
+
multiallelic_mask = df["alt"].str.contains(",", na=False)
|
|
1632
|
+
n_multiallelic = multiallelic_mask.sum()
|
|
1633
|
+
|
|
1634
|
+
if n_multiallelic > 0:
|
|
1635
|
+
warnings.warn(
|
|
1636
|
+
f"Found {n_multiallelic} multiallelic variants with comma-separated ALT alleles. "
|
|
1637
|
+
f"These variants have been removed from the dataset. "
|
|
1638
|
+
f"Consider preprocessing your VCF file to split multiallelic sites if needed.",
|
|
1639
|
+
UserWarning,
|
|
1640
|
+
)
|
|
1641
|
+
|
|
1642
|
+
# Filter out multiallelic variants
|
|
1643
|
+
df = df[~multiallelic_mask].reset_index(drop=True)
|
|
1644
|
+
|
|
1645
|
+
return df
|