supremo_lite-0.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supremo_lite/__init__.py +59 -0
- supremo_lite/chromosome_utils.py +322 -0
- supremo_lite/core.py +41 -0
- supremo_lite/mock_models/__init__.py +110 -0
- supremo_lite/mock_models/testmodel_1d.py +184 -0
- supremo_lite/mock_models/testmodel_2d.py +203 -0
- supremo_lite/mutagenesis.py +414 -0
- supremo_lite/personalize.py +3098 -0
- supremo_lite/prediction_alignment.py +1014 -0
- supremo_lite/sequence_utils.py +137 -0
- supremo_lite/variant_utils.py +1645 -0
- supremo_lite-0.5.4.dist-info/METADATA +216 -0
- supremo_lite-0.5.4.dist-info/RECORD +15 -0
- supremo_lite-0.5.4.dist-info/WHEEL +4 -0
- supremo_lite-0.5.4.dist-info/licenses/LICENSE +22 -0
supremo_lite/personalize.py
@@ -0,0 +1,3098 @@
"""
Personalized sequence generation for supremo_lite.

This module provides functions for creating personalized genomes by applying
variants to a reference genome and generating sequence windows around variants.
"""

import bisect
import re
import warnings
import os
from typing import Dict, List, Tuple, Union, NamedTuple
import pandas as pd
import numpy as np
from pyfaidx import Fasta
from .variant_utils import read_vcf
from .chromosome_utils import match_chromosomes_with_report, apply_chromosome_mapping
from .sequence_utils import encode_seq
from .core import TORCH_AVAILABLE
from .variant_utils import classify_variant_type, parse_vcf_info

try:
    import torch
except ImportError:
    pass  # Already handled in core


# IUPAC degenerate nucleotide codes for PAM pattern matching
IUPAC_CODES = {
    "A": "A",
    "C": "C",
    "G": "G",
    "T": "T",
    "U": "U",
    "W": "[AT]",
    "S": "[CG]",
    "M": "[AC]",
    "K": "[GT]",
    "R": "[AG]",
    "Y": "[CT]",
    "B": "[CGT]",
    "D": "[AGT]",
    "H": "[ACT]",
    "V": "[ACG]",
    "N": "[ACGT]",
}
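
# Illustrative sketch (editorial, not part of the released file): each IUPAC
# code maps to a regex character class, so a degenerate PAM pattern such as
# "NGG" can be compiled into a searchable regular expression.
#
#     pam_regex = re.compile("".join(IUPAC_CODES[c] for c in "NGG"))
#     bool(pam_regex.search("TTAGGC"))  # True: "AGG" matches [ACGT]GG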


class ChromosomeOffsetTracker:
    """
    Tracks cumulative coordinate offsets per chromosome from applied variants.

    When standard variants (INS/DEL) change chromosome lengths, the original VCF
    coordinates for later BND variants become invalid. This class tracks the
    cumulative offset at each position to enable coordinate transformation.
    """

    def __init__(self):
        """Initialize empty offset tracker."""
        self.chromosome_offsets: Dict[str, List[Tuple[int, int]]] = {}  # chrom -> [(pos, cumulative_offset)]

    def add_offset(self, chrom: str, pos: int, offset: int) -> None:
        """
        Add an offset at a specific position on a chromosome.

        Args:
            chrom: Chromosome name
            pos: Genomic position (0-based) where offset occurs
            offset: Length change (+/- bases) from the variant
        """
        if chrom not in self.chromosome_offsets:
            self.chromosome_offsets[chrom] = []

        # Find insertion point and update cumulative offsets
        offset_list = self.chromosome_offsets[chrom]

        # Calculate cumulative offset at this position
        cumulative_offset = offset
        for existing_pos, existing_cumulative in offset_list:
            if existing_pos <= pos:
                cumulative_offset += existing_cumulative

        # Insert new offset entry, maintaining sorted order by position
        inserted = False
        for i, (existing_pos, existing_cumulative) in enumerate(offset_list):
            if pos < existing_pos:
                offset_list.insert(i, (pos, cumulative_offset))
                inserted = True
                # Update all downstream offsets
                for j in range(i + 1, len(offset_list)):
                    old_pos, old_cumulative = offset_list[j]
                    offset_list[j] = (old_pos, old_cumulative + offset)
                break

        if not inserted:
            offset_list.append((pos, cumulative_offset))

    def get_offset_at_position(self, chrom: str, pos: int) -> int:
        """
        Get the cumulative offset at a specific position.

        Args:
            chrom: Chromosome name
            pos: Genomic position (0-based) to query

        Returns:
            Cumulative offset at this position
        """
        if chrom not in self.chromosome_offsets:
            return 0

        offset_list = self.chromosome_offsets[chrom]
        cumulative_offset = 0

        for offset_pos, offset_cumulative in offset_list:
            if offset_pos <= pos:
                cumulative_offset = offset_cumulative
            else:
                break

        return cumulative_offset

    def transform_coordinate(self, chrom: str, pos: int) -> int:
        """
        Transform a VCF coordinate to account for applied variants.

        Args:
            chrom: Chromosome name
            pos: Original VCF position (1-based)

        Returns:
            Transformed position (1-based) in the modified sequence
        """
        # Convert to 0-based, apply offset, convert back to 1-based
        pos_0based = pos - 1
        offset = self.get_offset_at_position(chrom, pos_0based)
        return pos + offset
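
# Illustrative sketch (editorial, not part of the released file): after
# recording a 5 bp insertion at 0-based position 100, 1-based coordinates
# upstream of it are unchanged while downstream coordinates shift by +5.
#
#     tracker = ChromosomeOffsetTracker()
#     tracker.add_offset("chr1", 100, 5)
#     tracker.transform_coordinate("chr1", 50)   # -> 50  (upstream)
#     tracker.transform_coordinate("chr1", 200)  # -> 205 (downstream)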


class SequenceSegment(NamedTuple):
    """Represents a segment within a sequence with its source and position."""

    source_type: str  # 'reference', 'novel', 'rc_reference'
    source_chrom: str  # chromosome name or 'NOVEL'
    start_pos: int  # start position in the final sequence
    end_pos: int  # end position in the final sequence
    length: int  # segment length
    orientation: str  # 'forward', 'reverse', 'novel'


class FrozenRegionTracker:
    """
    Efficiently tracks genomic regions that are 'frozen' due to applied variants.

    Frozen regions prevent overlapping variants from being applied to the same
    genomic coordinates. Uses a sorted list of non-overlapping intervals with
    binary search for lookup.
    """

    def __init__(self):
        """Initialize empty interval tracker."""
        self.intervals: List[Tuple[int, int]] = []  # sorted list of (start, end) tuples

    def is_frozen(self, pos: int) -> bool:
        """
        Check if a genomic position is within any frozen region.

        Args:
            pos: Genomic position (0-based)

        Returns:
            True if position is frozen, False otherwise
        """
        if not self.intervals:
            return False

        # Binary search for interval that could contain pos
        idx = bisect.bisect_right(self.intervals, (pos, float("inf"))) - 1

        if idx >= 0:
            start, end = self.intervals[idx]
            return start <= pos <= end

        return False

    def add_range(self, start: int, end: int) -> None:
        """
        Add a new frozen region, merging with existing overlapping intervals.

        Args:
            start: Start position of region (0-based, inclusive)
            end: End position of region (0-based, inclusive)
        """
        if start > end:
            return

        # Find insertion point and overlapping intervals
        left_idx = bisect.bisect_left(self.intervals, (start, start))
        right_idx = bisect.bisect_right(self.intervals, (end, end))

        # Check for overlap with interval before insertion point
        if left_idx > 0:
            prev_start, prev_end = self.intervals[left_idx - 1]
            if prev_end >= start - 1:  # Adjacent or overlapping
                left_idx -= 1
                start = min(start, prev_start)
                end = max(end, prev_end)

        # Merge with all overlapping intervals
        for i in range(left_idx, min(right_idx, len(self.intervals))):
            interval_start, interval_end = self.intervals[i]
            if interval_start <= end + 1:  # Adjacent or overlapping
                start = min(start, interval_start)
                end = max(end, interval_end)

        # Remove old intervals and insert merged interval
        del self.intervals[left_idx:right_idx]
        self.intervals.insert(left_idx, (start, end))
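
# Illustrative sketch (editorial, not part of the released file): overlapping
# or adjacent ranges collapse into one non-overlapping interval.
#
#     tracker = FrozenRegionTracker()
#     tracker.add_range(10, 20)
#     tracker.add_range(21, 30)  # adjacent, so it merges with (10, 20)
#     tracker.intervals          # [(10, 30)]
#     tracker.is_frozen(25)      # True
#     tracker.is_frozen(31)      # False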


class ChromosomeSegmentTracker:
    """Track which segments of each chromosome are used by fusions."""

    def __init__(self, ref_sequences: Dict[str, str]):
        self.ref_sequences = ref_sequences
        self.used_segments = {chrom: [] for chrom in ref_sequences.keys()}

    def add_used_segment(self, chrom: str, start: int, end: int, verbose: bool = False):
        """Add a used segment (0-based coordinates)."""
        if chrom in self.used_segments:
            self.used_segments[chrom].append((start, end))
            if verbose:
                print(
                    f"  🔍 Tracking used segment: {chrom}[{start}:{end}] = {end-start}bp"
                )

    def get_leftover_sequences(self, verbose: bool = False) -> Dict[str, str]:
        """Calculate leftover sequences not used by any fusion."""
        leftover_sequences = {}

        for chrom, ref_seq in self.ref_sequences.items():
            segments = sorted(self.used_segments[chrom])
            leftover_parts = []

            if not segments:
                # No segments used - entire chromosome is leftover
                leftover_parts = [ref_seq]
            else:
                # Find gaps between used segments
                current_pos = 0

                for start, end in segments:
                    # Add leftover before this segment
                    if current_pos < start:
                        leftover_parts.append(ref_seq[current_pos:start])
                    current_pos = max(current_pos, end)

                # Add leftover after last segment
                if current_pos < len(ref_seq):
                    leftover_parts.append(ref_seq[current_pos:])

            # Combine leftover parts
            if leftover_parts:
                leftover_seq = "".join(leftover_parts)
                if leftover_seq:  # Only add non-empty leftovers
                    leftover_sequences[chrom] = leftover_seq
                    if verbose:
                        print(f"  ✂️ Created leftover {chrom}: {len(leftover_seq)} bp")

        return leftover_sequences
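
# Illustrative sketch (editorial, not part of the released file): bases that
# fall outside every recorded segment are concatenated into the leftover.
#
#     tracker = ChromosomeSegmentTracker({"chr1": "AAACCCGGGTTT"})
#     tracker.add_used_segment("chr1", 3, 6)  # consumes "CCC"
#     tracker.get_leftover_sequences()        # {"chr1": "AAAGGGTTT"}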


class VariantApplicator:
    """
    Applies VCF variants to a reference sequence in memory.

    Handles coordinate system transformations, frozen region tracking,
    and sequence modifications for SNVs, insertions, and deletions.
    """

    def __init__(
        self,
        sequence_str: str,
        variants_df: pd.DataFrame,
        frozen_tracker: FrozenRegionTracker = None,
        offset_tracker: ChromosomeOffsetTracker = None,
        chrom: str = None,
    ):
        """
        Initialize variant applicator for a single chromosome.

        Args:
            sequence_str: Reference sequence as string
            variants_df: DataFrame containing variants for this chromosome
            frozen_tracker: Optional existing FrozenRegionTracker to preserve overlap state across chunks
            offset_tracker: Optional ChromosomeOffsetTracker to track coordinate offsets
            chrom: Chromosome name (required if offset_tracker is provided)
        """
        self.sequence = bytearray(sequence_str.encode())  # Mutable sequence
        self.variants = variants_df.sort_values("pos1").reset_index(drop=True)
        self.frozen_tracker = (
            frozen_tracker if frozen_tracker is not None else FrozenRegionTracker()
        )
        self.offset_tracker = offset_tracker
        self.chrom = chrom
        self.cumulative_offset = 0  # Track length changes from applied variants
        self.applied_count = 0
        self.skipped_count = 0
        self.skipped_variants = []  # List of (vcf_line, chrom, pos1, ref, alt, reason) tuples

    def apply_variants(self) -> Tuple[str, Dict[str, int]]:
        """
        Apply all variants to the sequence.

        Returns:
            Tuple of (modified_sequence, statistics_dict)
        """
        for _, variant in self.variants.iterrows():
            try:
                self._apply_single_variant(variant)
            except Exception as e:
                # Extract concise error message and context
                vcf_line = variant.get("vcf_line", "?")
                chrom = variant.get("chrom", self.chrom)
                error_msg = str(e).split(":")[0] if ":" in str(e) else str(e)
                warnings.warn(
                    f"Skipped variant at {chrom}:{variant.pos1} (VCF line {vcf_line}): {error_msg}"
                )
                self.skipped_count += 1
                # Record skip details for reporting
                self.skipped_variants.append(
                    (
                        vcf_line,
                        chrom,
                        variant.pos1,
                        variant.ref,
                        variant.alt,
                        "validation_error",
                    )
                )

        stats = {
            "applied": self.applied_count,
            "skipped": self.skipped_count,
            "total": len(self.variants),
            "skipped_variants": self.skipped_variants,
        }

        return self.sequence.decode(), stats
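
    # Illustrative sketch (editorial, not part of the released file): applying
    # a single SNV to a toy sequence, with the 1-based "pos1" column and a
    # pre-classified "variant_type" as used throughout this module.
    #
    #     variants = pd.DataFrame(
    #         [{"chrom": "chr1", "pos1": 3, "ref": "C", "alt": "T",
    #           "variant_type": "SNV"}]
    #     )
    #     applicator = VariantApplicator("AACGTT", variants, chrom="chr1")
    #     seq, stats = applicator.apply_variants()
    #     # seq == "AATGTT", stats["applied"] == 1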

    def apply_single_variant_to_window(
        self, variant: pd.Series, window_start: int, window_end: int
    ) -> str:
        """
        Apply a single variant to a sequence window.

        Args:
            variant: Series containing variant information (pos, ref, alt)
            window_start: Start position of window (0-based)
            window_end: End position of window (0-based, exclusive)

        Returns:
            Modified sequence string
        """
        # Create a copy of the window sequence
        window_seq = self.sequence[window_start:window_end].copy()

        # Handle multiple ALT alleles - take first one
        alt_allele = variant.alt.split(",")[0]

        # Calculate variant position relative to window
        genomic_pos = variant.pos1 - 1  # Convert VCF 1-based to 0-based
        var_pos_in_window = genomic_pos - window_start

        # Check if variant is within window
        if var_pos_in_window < 0 or var_pos_in_window >= len(window_seq):
            return window_seq.decode()

        # Check if entire variant fits in window
        ref_end = var_pos_in_window + len(variant.ref)
        if ref_end > len(window_seq):
            return window_seq.decode()

        # Validate reference matches
        expected_ref = window_seq[var_pos_in_window:ref_end].decode()
        if expected_ref.upper() != variant.ref.upper():
            warnings.warn(
                f"Reference mismatch at position {variant.pos1}: "
                f"expected '{variant.ref}', found '{expected_ref}'"
            )
            return window_seq.decode()

        # Apply variant
        if len(alt_allele) == len(variant.ref):
            # SNV: Direct substitution
            window_seq[var_pos_in_window:ref_end] = alt_allele.encode()
        elif len(alt_allele) < len(variant.ref):
            # Deletion
            window_seq[var_pos_in_window : var_pos_in_window + len(alt_allele)] = (
                alt_allele.encode()
            )
            del window_seq[var_pos_in_window + len(alt_allele) : ref_end]
        else:
            # Insertion
            window_seq[var_pos_in_window:ref_end] = alt_allele.encode()

        return window_seq.decode()
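
    # Illustrative sketch (editorial, not part of the released file): the same
    # SNV applied only within the 0-based, half-open window [1, 5).
    #
    #     fresh = VariantApplicator("AACGTT", variants, chrom="chr1")
    #     fresh.apply_single_variant_to_window(variants.iloc[0], 1, 5)
    #     # -> "ATGT": window "ACGT" with the "C" at genomic index 2 -> "T"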

    def _apply_single_variant(self, variant: pd.Series) -> None:
        """
        Apply a single variant to the sequence using variant type classifications.

        Args:
            variant: Series containing variant information (pos, ref, alt, variant_type)
        """
        # 1. VARIANT TYPE VALIDATION
        variant_type = variant.get("variant_type", "unknown")

        # Define supported variant types for standard variant processing
        supported_types = {
            "SNV",
            "MNV",
            "INS",
            "DEL",
            "complex",
            "SV_DUP",
            "SV_INV",
            "SV_BND_DUP",
            "SV_BND_INV",
        }

        # Handle variants that should be processed elsewhere or are unsupported
        vcf_line = variant.get("vcf_line", "?")
        chrom = variant.get("chrom", self.chrom)

        if variant_type in ["SV_BND"]:
            warnings.warn(
                f"Skipped variant at {chrom}:{variant.pos1} (VCF line {vcf_line}): type '{variant_type}' not supported"
            )
            self.skipped_count += 1
            self.skipped_variants.append(
                (
                    vcf_line,
                    chrom,
                    variant.pos1,
                    variant.ref,
                    variant.alt,
                    "unsupported_type",
                )
            )
            return
        elif variant_type in {"SV_DEL", "SV_INS", "SV_CNV"}:
            warnings.warn(
                f"Skipped variant at {chrom}:{variant.pos1} (VCF line {vcf_line}): type '{variant_type}' not supported"
            )
            self.skipped_count += 1
            self.skipped_variants.append(
                (
                    vcf_line,
                    chrom,
                    variant.pos1,
                    variant.ref,
                    variant.alt,
                    "unsupported_type",
                )
            )
            return
        elif variant_type in {"missing", "unknown"}:
            warnings.warn(
                f"Skipped variant at {chrom}:{variant.pos1} (VCF line {vcf_line}): type '{variant_type}' not supported"
            )
            self.skipped_count += 1
            self.skipped_variants.append(
                (
                    vcf_line,
                    chrom,
                    variant.pos1,
                    variant.ref,
                    variant.alt,
                    "missing_type",
                )
            )
            return
        elif variant_type not in supported_types:
            warnings.warn(
                f"Skipped variant at {chrom}:{variant.pos1} (VCF line {vcf_line}): type '{variant_type}' not supported"
            )
            self.skipped_count += 1
            self.skipped_variants.append(
                (
                    vcf_line,
                    chrom,
                    variant.pos1,
                    variant.ref,
                    variant.alt,
                    "unsupported_type",
                )
            )
            return

        # 2. STRUCTURAL VARIANT INFO PARSING
        info_dict = {}
        if variant_type in ["SV_DUP", "SV_INV"] and "info" in variant:
            info_dict = parse_vcf_info(variant["info"])

        # 3. BASIC VALIDATION CHECKS
        if variant.alt == variant.ref:
            self.skipped_count += 1
            return  # Skip ref-only variants

        # Handle multiple ALT alleles - take first one
        alt_allele = variant.alt.split(",")[0]

        # 4. COORDINATE CALCULATION
        genomic_pos = variant.pos1 - 1  # Convert VCF 1-based to 0-based
        buffer_pos = genomic_pos + self.cumulative_offset

        # For structural variants, calculate affected region from INFO fields
        if variant_type in ["SV_DUP", "SV_INV"]:
            end_pos = info_dict.get("END", None)
            svlen = info_dict.get("SVLEN", None)

            # Calculate END position if not provided
            if end_pos is None and svlen is not None:
                end_pos = variant.pos1 + abs(svlen) - 1
            elif end_pos is None:
                # Fallback to REF length for structural variants
                end_pos = variant.pos1 + len(variant.ref) - 1
                warnings.warn(
                    f"Cannot determine structural variant end position for {variant.get('id', 'unknown')} at {variant.pos1}"
                )

            ref_length = end_pos - variant.pos1 + 1  # Total affected region length
        else:
            ref_length = len(variant.ref)

        # 5. FROZEN REGION CHECK
        ref_start = genomic_pos
        ref_end = genomic_pos + ref_length - 1

        if self.frozen_tracker.is_frozen(ref_start) or self.frozen_tracker.is_frozen(
            ref_end
        ):
            self.skipped_count += 1
            # Record skip details for reporting
            vcf_line = variant.get("vcf_line", "?")
            chrom = variant.get("chrom", self.chrom)
            self.skipped_variants.append(
                (vcf_line, chrom, variant.pos1, variant.ref, variant.alt, "overlap")
            )
            return  # Skip overlapping variants

        # 6. BOUNDS CHECK
        if buffer_pos < 0 or buffer_pos + ref_length > len(self.sequence):
            raise ValueError(f"Variant position {variant.pos1} out of sequence bounds")

        # 7. REFERENCE VALIDATION (skip for symbolic structural variants)
        if variant_type not in ["SV_DUP", "SV_INV"]:
            expected_ref = self.sequence[
                buffer_pos : buffer_pos + len(variant.ref)
            ].decode()
            if expected_ref.upper() != variant.ref.upper():
                raise ValueError(
                    f"Reference mismatch at position {variant.pos1}: "
                    f"expected '{variant.ref}', found '{expected_ref}'"
                )

        # 8. SEQUENCE MODIFICATION
        self._modify_sequence(
            buffer_pos, variant.ref, alt_allele, variant_type, info_dict
        )

        # 9. UPDATE TRACKING
        if variant_type in ["SV_DUP", "SV_INV"]:
            # For structural variants, calculate length difference based on variant type
            if variant_type == "SV_DUP":
                # Duplication adds the duplicated region length
                length_diff = ref_length
            elif variant_type == "SV_INV":
                # Inversion doesn't change sequence length
                length_diff = 0
        else:
            length_diff = len(alt_allele) - len(variant.ref)
        self.cumulative_offset += length_diff
        self.frozen_tracker.add_range(ref_start, ref_end)

        # Record offset for coordinate transformation if tracker is provided
        if self.offset_tracker and self.chrom and length_diff != 0:
            self.offset_tracker.add_offset(self.chrom, ref_start, length_diff)

        self.applied_count += 1
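
    # Worked example (editorial, not part of the released file): with a 2 bp
    # insertion already applied upstream, self.cumulative_offset is +2, so a
    # later variant at VCF pos1 = 100 is written at buffer position
    # (100 - 1) + 2 = 101 of the mutable sequence buffer.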

    def _modify_sequence(
        self,
        pos: int,
        ref_allele: str,
        alt_allele: str,
        variant_type: str,
        info_dict: dict = None,
    ) -> None:
        """
        Modify sequence at specified position using variant type classification.

        Args:
            pos: Buffer position (0-based)
            ref_allele: Reference allele sequence
            alt_allele: Alternate allele sequence
            variant_type: Classified variant type (SNV, MNV, INS, DEL, complex, SV_DUP, SV_INV)
            info_dict: Parsed INFO field for structural variants (optional)
        """
        # Dispatch based on variant type classification
        if variant_type in ["SNV", "MNV"]:
            # Single or multi-nucleotide substitution
            self.sequence[pos : pos + len(ref_allele)] = alt_allele.encode()

        elif variant_type == "INS":
            # Insertion: replace reference with longer alternate sequence
            self.sequence[pos : pos + len(ref_allele)] = alt_allele.encode()

        elif variant_type == "DEL":
            # Deletion: replace reference with shorter alternate sequence
            self.sequence[pos : pos + len(alt_allele)] = alt_allele.encode()
            del self.sequence[pos + len(alt_allele) : pos + len(ref_allele)]

        elif variant_type == "complex":
            # Complex variant: use length-based logic as fallback
            ref_len = len(ref_allele)
            alt_len = len(alt_allele)

            if alt_len == ref_len:
                # Same length substitution
                self.sequence[pos : pos + ref_len] = alt_allele.encode()
            elif alt_len < ref_len:
                # Deletion-like complex variant
                self.sequence[pos : pos + alt_len] = alt_allele.encode()
                del self.sequence[pos + alt_len : pos + ref_len]
            else:
                # Insertion-like complex variant
                self.sequence[pos : pos + ref_len] = alt_allele.encode()

        elif variant_type == "SV_DUP":
            # Tandem duplication: insert duplicated region after original
            if not info_dict:
                raise ValueError("INFO field required for SV_DUP variant")

            end_pos = info_dict.get("END")

            # Calculate duplication region using END field only
            if end_pos is None:
                raise ValueError("END field required for SV_DUP variant")

            # Calculate from buffer position (already offset-adjusted)
            genomic_start = (
                pos - self.cumulative_offset + len(ref_allele)
            )  # Position after current cumulative changes
            genomic_end = end_pos - 1  # Convert VCF 1-based to 0-based
            dup_length = genomic_end - genomic_start + 1

            # Extract the region to duplicate from current sequence
            duplicated_region = self.sequence[pos : pos + dup_length]

            # Insert duplicated region after original (tandem duplication)
            self.sequence[pos + dup_length : pos + dup_length] = duplicated_region

        elif variant_type == "SV_INV":
            # Inversion: reverse complement the affected region
            if not info_dict:
                raise ValueError("INFO field required for SV_INV variant")

            end_pos = info_dict.get("END")

            # Calculate inversion region using END field only
            if end_pos is None:
                raise ValueError("END field required for SV_INV variant")

            # pos is already the correct buffer position (0-based) where inversion starts
            # END field is 1-based, so convert to 0-based buffer position
            buffer_start = pos
            buffer_end = end_pos - 1  # Convert 1-based END to 0-based
            inv_length = buffer_end - buffer_start + 1

            # Extract region to invert
            region_to_invert = self.sequence[
                buffer_start : buffer_start + inv_length
            ].decode()

            # Apply reverse complement
            from .sequence_utils import rc_str

            inverted_region = rc_str(region_to_invert)

            # Replace with inverted sequence
            self.sequence[buffer_start : buffer_start + inv_length] = (
                inverted_region.encode()
            )

        elif variant_type == "SV_BND_DUP":
            # BND-derived tandem duplication
            # Note: Individual SV_BND_DUP variants should not reach this point as they are
            # preprocessed by _preprocess_bnd_derived_variants() into synthetic SV_DUP variants
            genomic_pos = (
                pos + self.cumulative_offset + 1
            )  # Convert back to 1-based genomic position
            raise ValueError(
                f"SV_BND_DUP variants should be preprocessed into SV_DUP variants. Position: {genomic_pos}"
            )

        elif variant_type == "SV_BND_INV":
            # BND-derived inversion
            # Note: Individual SV_BND_INV variants should not reach this point as they are
            # preprocessed by _preprocess_bnd_derived_variants() into synthetic SV_INV variants
            genomic_pos = (
                pos + self.cumulative_offset + 1
            )  # Convert back to 1-based genomic position
            raise ValueError(
                f"SV_BND_INV variants should be preprocessed into SV_INV variants. Position: {genomic_pos}"
            )

        else:
            # This should not happen due to validation in _apply_single_variant
            raise ValueError(
                f"Unsupported variant type in sequence modification: {variant_type}"
            )
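
    # Worked example (editorial, not part of the released file) tracing the
    # SV_DUP arithmetic above on the buffer "AAACCCGGG" with pos = 3,
    # len(ref_allele) = 1, cumulative_offset = 0, and INFO END = 7:
    # genomic_start = 3 - 0 + 1 = 4, genomic_end = 6, dup_length = 3, so
    # buffer[3:6] = "CCC" is re-inserted at index 6, giving "AAACCCCCCGGG".
    # An SV_INV with END = 6 instead rewrites buffer[3:6] as rc("CCC") = "GGG"
    # without changing the sequence length.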


class ChimericSequenceBuilder:
    """Builds chimeric sequences from BND rearrangements."""

    def __init__(self, reference_sequences: Dict[str, str]):
        self.reference_sequences = reference_sequences
        self.chimeric_sequences = {}
        self.sequence_segments = {}  # Store segment metadata for each sequence

    def create_fusion_from_pair(self, breakend_pair: Tuple) -> Tuple[str, str]:
        """
        Create fusion sequence from a pair of breakends.

        Returns:
            Tuple of (fusion_name, fusion_sequence)
        """
        bnd1, bnd2 = breakend_pair

        # Generate fusion name
        fusion_name = f"{bnd1.chrom}_{bnd2.chrom}_fusion_{bnd1.id}_{bnd2.id}"

        # Get sequences
        seq1 = self.reference_sequences[bnd1.chrom]
        seq2 = self.reference_sequences[bnd2.chrom]

        # Convert VCF 1-based coordinates to 0-based array indices
        pos1_0 = bnd1.pos - 1  # VCF 1-based -> 0-based array index
        pos2_0 = bnd2.pos - 1  # VCF 1-based -> 0-based array index

        # Create fusion based on orientation
        fusion_seq, segments = self._build_oriented_fusion(
            seq1,
            pos1_0,
            bnd1.orientation,
            seq2,
            pos2_0,
            bnd2.orientation,
            bnd1.inserted_seq + bnd2.inserted_seq,
            bnd1.chrom,
            bnd2.chrom,
            bnd1,
            bnd2,  # Pass original breakends for VCF positions
        )

        # Store segment metadata
        self.sequence_segments[fusion_name] = segments

        return fusion_name, fusion_seq

    def _build_oriented_fusion(
        self,
        seq1: str,
        pos1_0: int,
        orient1: str,
        seq2: str,
        pos2_0: int,
        orient2: str,
        novel_seq: str,
        seq1_chrom: str,
        seq2_chrom: str,
        bnd1,
        bnd2,
    ) -> Tuple[str, List[SequenceSegment]]:
        """
        Build fusion sequence respecting coordinated breakend pair orientations.

        Coordinated BND fusion patterns:

        1. RC Coordination Patterns (require special handling):
           - [p[t + [p[t : RC(seq2[pos2:]) + seq1[pos1:]
           - t]p] + t]p] : seq1[:pos1] + RC(seq2[:pos2])

        2. Same Direction Patterns (simple concatenation):
           - ]p]t + ]p]t : seq2[:pos2] + seq1[pos1:]
           - t[p[ + t[p[ : seq1[:pos1] + seq2[pos2:]
        """
        from .sequence_utils import rc_str as reverse_complement

        # Handle coordinated patterns by looking at both orientations together
        if orient1 == "[p[t" and orient2 == "[p[t":
            # [p[t + [p[t pattern: RC(seq2[pos2_0:]) + seq1[pos1_0:]
            left_part = reverse_complement(seq2[pos2_0:])
            right_part = seq1[pos1_0:]
            left_chrom = seq2_chrom
            right_chrom = seq1_chrom
            left_orientation = "reverse"
            right_orientation = "forward"

        elif orient1 == "t]p]" and orient2 == "t]p]":
            # t]p] + t]p] pattern: seq1[:pos1] + RC(seq2[:pos2]) (use VCF positions as base counts)
            left_part = seq1[: bnd1.pos]  # Include pos1 bases from seq1
            right_part = reverse_complement(
                seq2[: bnd2.pos]
            )  # Include pos2 bases from seq2
            left_chrom = seq1_chrom
            right_chrom = seq2_chrom
            left_orientation = "forward"
            right_orientation = "reverse"

        elif orient1 == "]p]t" and orient2 == "]p]t":
            # ]p]t + ]p]t pattern: seq2[:pos2] + seq1[pos1_0:] (use VCF pos2 as base count)
            left_part = seq2[: bnd2.pos]  # Include pos2 bases from seq2
            right_part = seq1[pos1_0:]  # From pos1_0 to end of seq1
            left_chrom = seq2_chrom
            right_chrom = seq1_chrom
            left_orientation = "forward"
            right_orientation = "forward"

        elif orient1 == "t[p[" and orient2 == "t[p[":
            # t[p[ + t[p[ pattern: seq1[:pos1_0] + seq2[pos2_0:]
            left_part = seq1[:pos1_0]
            right_part = seq2[pos2_0:]
            left_chrom = seq1_chrom
            right_chrom = seq2_chrom
            left_orientation = "forward"
            right_orientation = "forward"

        elif orient1 == "]p]t" and orient2 == "t[p[":
            # ]p]t + t[p[ pattern: seq2[:pos2] + seq1[pos1_0:] (use VCF pos2 as base count)
            left_part = seq2[: bnd2.pos]  # Include pos2 bases from seq2
            right_part = seq1[pos1_0:]  # From pos1_0 to end of seq1
            left_chrom = seq2_chrom
            right_chrom = seq1_chrom
            left_orientation = "forward"
            right_orientation = "forward"

        elif orient1 == "t[p[" and orient2 == "]p]t":
            # t[p[ + ]p]t pattern: seq1[:pos1_0] + seq2[pos2_0:]
            left_part = seq1[:pos1_0]
            right_part = seq2[pos2_0:]
            left_chrom = seq1_chrom
            right_chrom = seq2_chrom
            left_orientation = "forward"
            right_orientation = "forward"

        elif orient1 == "t]p]" and orient2 == "]p]t":
            # t]p] + ]p]t pattern (mixed coordination): seq1[:pos1] + seq2[pos2_0:]
            left_part = seq1[: bnd1.pos]  # Include pos1 bases from seq1 (VCF position)
            right_part = seq2[pos2_0:]  # From pos2_0 to end of seq2
            left_chrom = seq1_chrom
            right_chrom = seq2_chrom
            left_orientation = "forward"
            right_orientation = "forward"

        else:
            # Unknown orientation pattern - fail fast to ensure proper implementation
            supported_patterns = [
                "[p[t + [p[t (RC coordination)",
                "t]p] + t]p] (RC coordination)",
                "]p]t + ]p]t (same direction)",
                "t[p[ + t[p[ (same direction)",
                "t]p] + ]p]t (mixed coordination)",
            ]
            raise ValueError(
                f"Unsupported BND orientation pattern: '{orient1}' + '{orient2}'. "
                f"Supported patterns: {', '.join(supported_patterns)}. "
                f"This pattern requires explicit implementation."
            )

        # Build fusion sequence and track segments
        segments = []
        current_pos = 0

        # Add left segment
        if len(left_part) > 0:
            left_type = "rc_reference" if left_orientation == "reverse" else "reference"
            segments.append(
                SequenceSegment(
                    source_type=left_type,
                    source_chrom=left_chrom,
                    start_pos=current_pos,
                    end_pos=current_pos + len(left_part),
                    length=len(left_part),
                    orientation=left_orientation,
                )
            )
            current_pos += len(left_part)

        # Add novel sequence segment
        if len(novel_seq) > 0:
            segments.append(
                SequenceSegment(
                    source_type="novel",
                    source_chrom="NOVEL",
                    start_pos=current_pos,
                    end_pos=current_pos + len(novel_seq),
                    length=len(novel_seq),
                    orientation="novel",
                )
            )
            current_pos += len(novel_seq)

        # Add right segment
        if len(right_part) > 0:
            right_type = (
                "rc_reference" if right_orientation == "reverse" else "reference"
            )
            segments.append(
                SequenceSegment(
                    source_type=right_type,
                    source_chrom=right_chrom,
                    start_pos=current_pos,
                    end_pos=current_pos + len(right_part),
                    length=len(right_part),
                    orientation=right_orientation,
                )
            )

        # Combine parts to create fusion sequence
        fusion = left_part + novel_seq + right_part

        return fusion, segments

    def get_sequence_segments(self, sequence_name: str) -> List[SequenceSegment]:
        """Get segment metadata for a sequence."""
        return self.sequence_segments.get(sequence_name, [])
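
    # Worked example (editorial, not part of the released file): for the
    # "t[p[" + "t[p[" pattern with seq1 = "AAAA" (pos1_0 = 2), seq2 = "TTTT"
    # (pos2_0 = 1) and no novel bases, the fusion is
    # seq1[:2] + seq2[1:] = "AA" + "TTT" = "AATTT", tracked as two forward
    # reference segments.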


def _load_reference(reference_fn: Union[str, Dict, Fasta]) -> Union[Dict, Fasta]:
    """Load reference genome from file or return as-is if already loaded."""
    if isinstance(reference_fn, str) and os.path.isfile(reference_fn):
        return Fasta(reference_fn)
    return reference_fn


def _encode_genome_sequences(reference, encode=True, encoder=None):
    """Helper function to encode genome sequences for output."""
    genome = {}
    for chrom, seq in reference.items():
        seq_str = str(seq)
        if encode:
            if encoder:
                genome[chrom] = encoder(seq_str)
            else:
                genome[chrom] = encode_seq(seq_str)
        else:
            genome[chrom] = seq_str
    return genome


def _load_variants(variants_fn: Union[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Load variants from file or return as-is if already a DataFrame.
    Ensures variant classification happens once during loading.

    For DataFrames, assumes position column is either 'pos', 'pos1', or the second column.
    If DataFrame lacks variant_type column, classification will be added.
    """
    if isinstance(variants_fn, str):
        # Always load all variants with classification
        variants_df = read_vcf(variants_fn, classify_variants=True)
        # Rename pos to pos1 for consistency

        return variants_df
    else:
        # Handle DataFrame input
        variants_df = variants_fn.copy()

        # Always use second column as pos1, regardless of current name
        if len(variants_df.columns) >= 2:
            # Rename second column to pos1 if it's not already named that
            if variants_df.columns[1] != "pos1":
                new_columns = list(variants_df.columns)
                new_columns[1] = "pos1"
                variants_df.columns = new_columns

            # Validate that pos1 column is numeric
            if not pd.api.types.is_numeric_dtype(variants_df["pos1"]):
                raise ValueError(
                    f"Position column (second column) must be numeric, got {variants_df['pos1'].dtype}"
                )
        else:
            raise ValueError(
                "DataFrame must have at least 2 columns with position in second column"
            )

        # Ensure variant classification exists
        if "variant_type" not in variants_df.columns:
            if len(variants_df) > 0:
                # Add variant classification to non-empty DataFrame
                variants_df["variant_type"] = variants_df.apply(
                    lambda row: classify_variant_type(
                        row["ref"],
                        row["alt"],
                        parse_vcf_info(row.get("info", "")) if "info" in row else None,
                    ),
                    axis=1,
                )
            else:
                # Handle empty DataFrame - just add empty column
                variants_df["variant_type"] = pd.Series(dtype="object")

        return variants_df
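
# Illustrative sketch (editorial, not part of the released file), assuming
# classify_variant_type labels a 1-to-1 substitution as "SNV": a DataFrame
# whose second column holds 1-based positions is normalized to "pos1" and
# classified on load.
#
#     df = pd.DataFrame({"chrom": ["chr1"], "pos": [3],
#                        "ref": ["C"], "alt": ["T"]})
#     _load_variants(df).loc[0, "variant_type"]  # -> "SNV" (assumed)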


def _preprocess_bnd_derived_variants(chrom_variants, vcf_path=None, verbose=False):
    """
    Convert BND-derived DUP/INV pairs to synthetic SV_DUP/SV_INV variants.

    This pre-processing step allows BND-derived structural variants to be processed
    by the existing SV_DUP/SV_INV logic, ensuring proper frozen region tracking
    and coordinate transformation.

    Args:
        chrom_variants: DataFrame of variants for a single chromosome
        vcf_path: Path to VCF file for BND classification (optional)
        verbose: Print processing information

    Returns:
        DataFrame with BND-derived variants replaced by synthetic variants
    """
    import pandas as pd
    from .variant_utils import parse_breakend_alt

    # Extract BND-derived variants that need pair processing
    bnd_dup_variants = chrom_variants[
        chrom_variants["variant_type"] == "SV_BND_DUP"
    ].copy()
    bnd_inv_variants = chrom_variants[
        chrom_variants["variant_type"] == "SV_BND_INV"
    ].copy()

    if len(bnd_dup_variants) == 0 and len(bnd_inv_variants) == 0:
        return chrom_variants

    if verbose:
        print(
            f"  🔄 Pre-processing {len(bnd_dup_variants)} BND-DUP + {len(bnd_inv_variants)} BND-INV variants"
        )

    # Get BND classification results to find mate coordinates
    synthetic_variants = []
    processed_ids = set()

    # Process BND-derived duplications
    for _, variant in bnd_dup_variants.iterrows():
        if variant["id"] in processed_ids:
            continue

        # Parse mate coordinates from ALT field
        alt_info = parse_breakend_alt(variant["alt"])
        if not alt_info["is_valid"]:
            if verbose:
                print(f"  ⚠️ Could not parse BND ALT field: {variant['alt']}")
            continue

        mate_chrom = alt_info["mate_chrom"]
        mate_pos = alt_info["mate_pos"]

        # Ensure this is an intrachromosomal duplication (same chromosome)
        if mate_chrom != variant["chrom"]:
            if verbose:
                print(
                    f"  ⚠️ Skipping interchromosomal BND: {variant['chrom']}:{variant['pos1']} -> {mate_chrom}:{mate_pos}"
                )
            continue

        # Calculate duplication region boundaries
        start_pos = min(variant["pos1"], mate_pos)
        end_pos = max(variant["pos1"], mate_pos)

        # Create synthetic SV_DUP variant
        synthetic_variant = variant.copy()
        synthetic_variant["variant_type"] = "SV_DUP"
        synthetic_variant["pos1"] = start_pos
        synthetic_variant["ref"] = "N"  # Placeholder
        synthetic_variant["alt"] = "<DUP>"
        synthetic_variant["info"] = f"END={end_pos};SVTYPE=DUP"

        synthetic_variants.append(synthetic_variant)
        processed_ids.add(variant["id"])

        if verbose:
            region_length = end_pos - start_pos
            print(
                f"  ✅ Created synthetic DUP: {variant['chrom']}:{start_pos}-{end_pos} ({region_length}bp)"
            )

    # Process BND-derived inversions: handle 4-breakend inversion topology
    if len(bnd_inv_variants) > 0:
        # Group BND inversions by chromosome to handle 4-breakend patterns
        chrom_groups = bnd_inv_variants.groupby("chrom")

        for chrom, chrom_bnd_invs in chrom_groups:
            chrom_breakends = chrom_bnd_invs.copy()

            # Check if we have exactly 4 breakends (standard inversion pattern)
            if len(chrom_breakends) == 4:
                # Sort breakends by position to identify topology
                chrom_breakends = chrom_breakends.sort_values("pos1")
                positions = chrom_breakends["pos1"].tolist()

                # 4-breakend inversion: outer breakpoints define boundaries, inner breakpoints define inverted region
                # Positions: [W, V, U, X] where W-X are outer, V-U are inner (get inverted)
                outer_start = positions[0]  # W (position 10)
                inner_start = positions[1]  # V (position 11)
                inner_end = positions[2]  # U (position 30)
                outer_end = positions[3]  # X (position 31)

                # Create single synthetic SV_INV for the inner region (what gets inverted)
                first_variant = chrom_breakends.iloc[0].copy()
                synthetic_variant = first_variant.copy()
                synthetic_variant["variant_type"] = "SV_INV"
                synthetic_variant["pos1"] = inner_start  # Start of inverted region
                synthetic_variant["ref"] = "N"  # Placeholder
                synthetic_variant["alt"] = "<INV>"
                synthetic_variant["info"] = (
                    f"END={inner_end};SVTYPE=INV"  # End of inverted region
                )

                synthetic_variants.append(synthetic_variant)

                # Mark all 4 breakends as processed
                for _, variant in chrom_breakends.iterrows():
                    processed_ids.add(variant["id"])

                if verbose:
                    inversion_length = inner_end - inner_start
                    boundary_span = outer_end - outer_start
                    print(
                        f"  ✅ Created synthetic INV: {chrom}:{inner_start}-{inner_end} ({inversion_length}bp) [4-breakend topology, boundary span {outer_start}-{outer_end}]"
                    )

            else:
                # Handle non-standard cases (not exactly 4 breakends)
                if verbose:
                    print(
                        f"  ⚠️ Non-standard BND inversion pattern: {len(chrom_breakends)} breakends on {chrom}"
                    )

                # Fallback: process individually for non-4-breakend cases
                for _, variant in chrom_breakends.iterrows():
                    if variant["id"] in processed_ids:
                        continue

                    # Parse mate coordinates from ALT field
                    alt_info = parse_breakend_alt(variant["alt"])
                    if not alt_info["is_valid"]:
                        if verbose:
                            print(
                                f"  ⚠️ Could not parse BND ALT field: {variant['alt']}"
                            )
                        continue

                    mate_pos = alt_info["mate_pos"]
                    start_pos = min(variant["pos1"], mate_pos)
                    end_pos = max(variant["pos1"], mate_pos)

                    # Create synthetic SV_INV variant
                    synthetic_variant = variant.copy()
                    synthetic_variant["variant_type"] = "SV_INV"
                    synthetic_variant["pos1"] = start_pos
                    synthetic_variant["ref"] = "N"
                    synthetic_variant["alt"] = "<INV>"
                    synthetic_variant["info"] = f"END={end_pos};SVTYPE=INV"

                    synthetic_variants.append(synthetic_variant)
                    processed_ids.add(variant["id"])

                    if verbose:
                        region_length = end_pos - start_pos
                        print(
                            f"  ✅ Created synthetic INV: {chrom}:{start_pos}-{end_pos} ({region_length}bp) [fallback]"
                        )

    # Create result DataFrame: remove BND-derived variants, add synthetic variants
    result_variants = chrom_variants[
        ~chrom_variants["variant_type"].isin(["SV_BND_DUP", "SV_BND_INV"])
    ].copy()

    if synthetic_variants:
        synthetic_df = pd.DataFrame(synthetic_variants)
        result_variants = pd.concat([result_variants, synthetic_df], ignore_index=True)
        # Re-sort by position to maintain VCF order
        result_variants = result_variants.sort_values("pos1")

    if verbose and len(synthetic_variants) > 0:
        print(
            f"  🎯 Pre-processing complete: {len(synthetic_variants)} synthetic variants created"
        )

    return result_variants


def _format_skipped_variant_report(skipped_variants_list):
    """
    Format skipped variant details for reporting.

    Args:
        skipped_variants_list: List of (vcf_line, chrom, pos1, ref, alt, reason) tuples

    Returns:
        Formatted string with grouped skip reasons
    """
    if not skipped_variants_list:
        return ""

    from collections import defaultdict

    # Group by reason
    by_reason = defaultdict(list)
    for vcf_line, chrom, pos1, ref, alt, reason in skipped_variants_list:
        by_reason[reason].append((vcf_line, chrom, pos1, ref, alt))

    # Format output
    lines = []
    reason_labels = {
        "overlap": "overlap with previous variant",
        "unsupported_type": "unsupported variant type",
        "validation_error": "validation error",
        "missing_type": "missing/unknown type",
    }

    for reason, variants in sorted(by_reason.items()):
        label = reason_labels.get(reason, reason)
        # Group by position for concise output
        by_pos = defaultdict(list)
        for vcf_line, chrom, pos1, ref, alt in variants:
            by_pos[f"{chrom}:{pos1}"].append(vcf_line)

        for pos, vcf_lines in sorted(by_pos.items()):
            vcf_lines_str = ", ".join(map(str, sorted(vcf_lines)))
            lines.append(f"  • {label}: VCF line(s) {vcf_lines_str} at {pos}")

    return "\n".join(lines)
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
def get_personal_genome(
|
|
1258
|
+
reference_fn,
|
|
1259
|
+
variants_fn,
|
|
1260
|
+
encode=True,
|
|
1261
|
+
n_chunks=1,
|
|
1262
|
+
verbose=False,
|
|
1263
|
+
encoder=None,
|
|
1264
|
+
auto_map_chromosomes=False,
|
|
1265
|
+
):
|
|
1266
|
+
"""
|
|
1267
|
+
Create a personalized genome by applying variants to a reference genome.
|
|
1268
|
+
|
|
1269
|
+
This function implements multi-phase variant processing with proper frozen region tracking:
|
|
1270
|
+
|
|
1271
|
+
Phase 1: Standard variants + Early structural variants (in VCF order):
|
|
1272
|
+
- SNV, MNV, INS, DEL, SV_DUP, SV_INV
|
|
1273
|
+
|
|
1274
|
+
Phase 2: BND semantic classification and application:
|
|
1275
|
+
- Classify BNDs to identify SV_BND_DUP and SV_BND_INV patterns
|
|
1276
|
+
- Apply SV_BND_DUP and SV_BND_INV first
|
|
1277
|
+
- Apply remaining true BND translocations
|
|
1278
|
+
|
|
1279
|
+
Frozen region enforcement:
|
|
1280
|
+
- Each variant freezes its genomic region after application
|
|
1281
|
+
- Later variants overlapping frozen regions are skipped with warnings
|
|
1282
|
+
- BND breakpoints in frozen regions cause entire BND to be skipped
|
|
1283
|
+
|
|
1284
|
+
Output chromosome ordering:
|
|
1285
|
+
- Chromosomes are returned in the same order as the reference genome
|
|
1286
|
+
- BND-generated fusion sequences appear after all original chromosomes
|
|
1287
|
+
- Leftover sequences (from consumed chromosomes) follow fusion sequences
|
|
1288
|
+
|
|
1289
|
+
Args:
|
|
1290
|
+
reference_fn: Path to reference genome file or dictionary-like object
|
|
1291
|
+
variants_fn: Path to variants file or DataFrame. Supports VCF 4.2 format
|
|
1292
|
+
including BND (breakend) variants with bracket notation.
|
|
1293
|
+
encode: Return sequences as one-hot encoded arrays (default: True)
|
|
1294
|
+
n_chunks: Number of chunks to split variants into for processing (default: 1)
|
|
1295
|
+
verbose: Print progress information (default: False)
|
|
1296
|
+
encoder: Optional custom encoding function. If provided, should accept a single
|
|
1297
|
+
sequence string and return encoded array with shape (L, 4). Default: None
|
|
1298
|
+
auto_map_chromosomes: Automatically map chromosome names between VCF and reference
|
|
1299
|
+
when they don't match exactly (e.g., 'chr1' <-> '1', 'chrM' <-> 'MT').
|
|
1300
|
+
Default: False. When False, raises ChromosomeMismatchError if names
|
|
1301
|
+
don't match. (default: False)
|
|
1302
|
+
|
|
1303
|
+
Returns:
|
|
1304
|
+
If encode=True: A dictionary mapping chromosome names to encoded tensors/arrays
|
|
1305
|
+
If encode=False: A dictionary mapping chromosome names to sequence strings
|
|
1306
|
+
|
|
1307
|
+
The dictionary preserves reference genome chromosome order, with any fusion
|
|
1308
|
+
or leftover sequences appended at the end.
|
|
1309
|
+
|
|
1310
|
+
Raises:
|
|
1311
|
+
ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names in VCF
|
|
1312
|
+
and reference don't match exactly
|
|
1313
|
+
|
|
1314
|
+
Examples:
|
|
1315
|
+
# Apply variants with proper ordering and conflict resolution
|
|
1316
|
+
personal_genome = get_personal_genome('reference.fa', 'variants.vcf')
|
|
1317
|
+
|
|
1318
|
+
# Get raw sequences without encoding
|
|
1319
|
+
personal_genome = get_personal_genome('reference.fa', 'variants.vcf', encode=False)
|
|
1320
|
+
|
|
1321
|
+
# Enable automatic chromosome mapping if VCF uses 'chr1' and reference uses '1'
|
|
1322
|
+
personal_genome = get_personal_genome('reference.fa', 'variants.vcf', auto_map_chromosomes=True)
|
|
1323
|
+
|
|
1324
|
+
# Verify chromosome order is preserved
|
|
1325
|
+
ref_chroms = list(pyfaidx.Fasta('reference.fa').keys())
|
|
1326
|
+
personal_chroms = list(personal_genome.keys())
|
|
1327
|
+
assert personal_chroms[:len(ref_chroms)] == ref_chroms # Original order preserved
|
|
1328
|
+
"""
|
|
1329
|
+
# Load ALL variants with classification
|
|
1330
|
+
from .variant_utils import group_variants_by_semantic_type
|
|
1331
|
+
|
|
1332
|
+
variants_df = _load_variants(variants_fn)
|
|
1333
|
+
reference = _load_reference(reference_fn)
|
|
1334
|
+
|
|
1335
|
+
if len(variants_df) == 0:
|
|
1336
|
+
if verbose:
|
|
1337
|
+
print("🧬 No variants found - returning reference genome")
|
|
1338
|
+
return _encode_genome_sequences(reference, encode, encoder)
|
|
1339
|
+
|
|
1340
|
+
# Group variants by semantic type for proper processing order
|
|
1341
|
+
# Pass VCF path for BND semantic classification if available
|
|
1342
|
+
vcf_path = variants_fn if isinstance(variants_fn, str) else None
|
|
1343
|
+
grouped_variants = group_variants_by_semantic_type(variants_df, vcf_path)
|
|
1344
|
+
|
|
1345
|
+
if verbose:
|
|
1346
|
+
total_variants = len(variants_df)
|
|
1347
|
+
print(
|
|
1348
|
+
f"🧬 Processing {total_variants:,} variants across {len(variants_df['chrom'].unique())} chromosomes"
|
|
1349
|
+
)
|
|
1350
|
+
print(
|
|
1351
|
+
f" Phase 1: {len(grouped_variants['standard']) + len(grouped_variants['dup_variants']) + len(grouped_variants['inv_variants'])} standard variants (SNV, MNV, INS, DEL, SV_DUP, SV_INV)"
|
|
1352
|
+
)
|
|
1353
|
+
print(
|
|
1354
|
+
f" Phase 2: {len(grouped_variants['bnd_variants'])} BND variants for semantic classification"
|
|
1355
|
+
)
|
|
1356
|
+
|
|
1357
|
+
# Apply chromosome name matching
|
|
1358
|
+
ref_chroms = set(reference.keys())
|
|
1359
|
+
vcf_chroms = set(variants_df["chrom"].unique())
|
|
1360
|
+
|
|
1361
|
+
mapping, unmatched = match_chromosomes_with_report(
|
|
1362
|
+
ref_chroms,
|
|
1363
|
+
vcf_chroms,
|
|
1364
|
+
verbose=verbose,
|
|
1365
|
+
auto_map_chromosomes=auto_map_chromosomes,
|
|
1366
|
+
)
|
|
1367
|
+
|
|
1368
|
+
# Apply chromosome name mapping to all variants
|
|
1369
|
+
if mapping:
|
|
1370
|
+
for group_name, variant_group in grouped_variants.items():
|
|
1371
|
+
if len(variant_group) > 0:
|
|
1372
|
+
grouped_variants[group_name] = apply_chromosome_mapping(
|
|
1373
|
+
variant_group, mapping
|
|
1374
|
+
)
|
|
1375
|
+
|
|
1376
|
+
# Initialize processing state
|
|
1377
|
+
personal_genome = {}
|
|
1378
|
+
total_processed = 0
|
|
1379
|
+
offset_tracker = ChromosomeOffsetTracker()
|
|
1380
|
+
modified_sequences = {}
|
|
1381
|
+
|
|
1382
|
+
# PHASE 1: Apply standard variants + early structural variants (in VCF order)
|
|
1383
|
+
# Include both symbolic and BND-derived DUP/INV variants
|
|
1384
|
+
symbolic_dup_variants = (
|
|
1385
|
+
grouped_variants["dup_variants"][
|
|
1386
|
+
grouped_variants["dup_variants"]["variant_type"] == "SV_DUP"
|
|
1387
|
+
]
|
|
1388
|
+
if len(grouped_variants["dup_variants"]) > 0
|
|
1389
|
+
else pd.DataFrame()
|
|
1390
|
+
)
|
|
1391
|
+
|
|
1392
|
+
symbolic_inv_variants = (
|
|
1393
|
+
grouped_variants["inv_variants"][
|
|
1394
|
+
grouped_variants["inv_variants"]["variant_type"] == "SV_INV"
|
|
1395
|
+
]
|
|
1396
|
+
if len(grouped_variants["inv_variants"]) > 0
|
|
1397
|
+
else pd.DataFrame()
|
|
1398
|
+
)
|
|
1399
|
+
|
|
1400
|
+
# Extract BND-derived DUP/INV variants for Phase 1 processing
|
|
1401
|
+
bnd_dup_variants = (
|
|
1402
|
+
grouped_variants["dup_variants"][
|
|
1403
|
+
grouped_variants["dup_variants"]["variant_type"] == "SV_BND_DUP"
|
|
1404
|
+
]
|
|
1405
|
+
if len(grouped_variants["dup_variants"]) > 0
|
|
1406
|
+
else pd.DataFrame()
|
|
1407
|
+
)
|
|
1408
|
+
|
|
1409
|
+
bnd_inv_variants = (
|
|
1410
|
+
grouped_variants["inv_variants"][
|
|
1411
|
+
grouped_variants["inv_variants"]["variant_type"] == "SV_BND_INV"
|
|
1412
|
+
]
|
|
1413
|
+
if len(grouped_variants["inv_variants"]) > 0
|
|
1414
|
+
else pd.DataFrame()
|
|
1415
|
+
)
|
|
1416
|
+
|
|
1417
|
+
phase1_variants = pd.concat(
|
|
1418
|
+
[
|
|
1419
|
+
grouped_variants["standard"],
|
|
1420
|
+
symbolic_dup_variants,
|
|
1421
|
+
symbolic_inv_variants,
|
|
1422
|
+
bnd_dup_variants,
|
|
1423
|
+
bnd_inv_variants,
|
|
1424
|
+
],
|
|
1425
|
+
ignore_index=True,
|
|
1426
|
+
)
|
|
1427
|
+
|
|
1428
|
+
if len(phase1_variants) > 0:
|
|
1429
|
+
# Sort by chromosome and position so variants are applied in coordinate order
|
|
1430
|
+
phase1_variants = phase1_variants.sort_values(["chrom", "pos1"])
|
|
1431
|
+
|
|
1432
|
+
for chrom, chrom_variants in phase1_variants.groupby("chrom"):
|
|
1433
|
+
if chrom not in reference:
|
|
1434
|
+
if verbose:
|
|
1435
|
+
print(f"⚠️ Skipping {chrom}: not found in reference")
|
|
1436
|
+
continue
|
|
1437
|
+
|
|
1438
|
+
ref_seq = str(reference[chrom])
|
|
1439
|
+
|
|
1440
|
+
if verbose:
|
|
1441
|
+
variant_counts = chrom_variants["variant_type"].value_counts().to_dict()
|
|
1442
|
+
type_summary = ", ".join(
|
|
1443
|
+
[f"{count} {vtype}" for vtype, count in variant_counts.items()]
|
|
1444
|
+
)
|
|
1445
|
+
print(
|
|
1446
|
+
f"🔄 Processing chromosome {chrom}: {len(chrom_variants):,} variants ({type_summary})"
|
|
1447
|
+
)
|
|
1448
|
+
|
|
1449
|
+
# Process all Phase 1 variants with offset tracking (chunking available for standard variants only)
|
|
1450
|
+
if n_chunks == 1 or any(
|
|
1451
|
+
vtype in ["SV_DUP", "SV_INV"]
|
|
1452
|
+
for vtype in chrom_variants["variant_type"].unique()
|
|
1453
|
+
):
|
|
1454
|
+
# PRE-PROCESS: Convert BND-derived variants to synthetic variants
|
|
1455
|
+
processed_variants = _preprocess_bnd_derived_variants(
|
|
1456
|
+
chrom_variants, vcf_path, verbose
|
|
1457
|
+
)
|
|
1458
|
+
|
|
1459
|
+
# Process all variants at once (required for structural variants)
|
|
1460
|
+
applicator = VariantApplicator(
|
|
1461
|
+
ref_seq,
|
|
1462
|
+
processed_variants,
|
|
1463
|
+
offset_tracker=offset_tracker,
|
|
1464
|
+
chrom=chrom,
|
|
1465
|
+
)
|
|
1466
|
+
personal_seq, stats = applicator.apply_variants()
|
|
1467
|
+
|
|
1468
|
+
if verbose and stats["total"] > 0:
|
|
1469
|
+
# Report skipped variants if any
|
|
1470
|
+
if stats["skipped"] > 0 and stats.get("skipped_variants"):
|
|
1471
|
+
print(f" ⚠️ Skipped {stats['skipped']} variant(s):")
|
|
1472
|
+
print(_format_skipped_variant_report(stats["skipped_variants"]))
|
|
1473
|
+
print(
|
|
1474
|
+
f" ✅ Applied {stats['applied']}/{stats['total']} variants ({stats['skipped']} skipped)"
|
|
1475
|
+
)
|
|
1476
|
+
|
|
1477
|
+
else:
|
|
1478
|
+
# PRE-PROCESS: Convert BND-derived variants to synthetic variants (for chunked processing too)
|
|
1479
|
+
processed_variants = _preprocess_bnd_derived_variants(
|
|
1480
|
+
chrom_variants, vcf_path, verbose
|
|
1481
|
+
)
|
|
1482
|
+
|
|
1483
|
+
# Process in chunks (standard variants only)
|
|
1484
|
+
current_sequence = ref_seq
|
|
1485
|
+
shared_frozen_tracker = FrozenRegionTracker()
|
|
1486
|
+
total_applied = 0
|
|
1487
|
+
total_skipped = 0
|
|
1488
|
+
all_skipped_variants = []
|
|
1489
|
+
|
|
1490
|
+
indices = np.array_split(np.arange(len(processed_variants)), n_chunks)
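# np.array_split tolerates uneven division, so any n_chunks works; e.g.
# np.array_split(np.arange(10), 3) -> [array([0,1,2,3]), array([4,5,6]), array([7,8,9])]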
|
|
1491
|
+
|
|
1492
|
+
if verbose:
|
|
1493
|
+
avg_chunk_size = len(processed_variants) // n_chunks
|
|
1494
|
+
print(
|
|
1495
|
+
f" 📦 Processing {n_chunks} chunks of ~{avg_chunk_size:,} variants each"
|
|
1496
|
+
)
|
|
1497
|
+
|
|
1498
|
+
for i, chunk_indices in enumerate(indices):
|
|
1499
|
+
if len(chunk_indices) == 0:
|
|
1500
|
+
continue
|
|
1501
|
+
|
|
1502
|
+
chunk_df = processed_variants.iloc[chunk_indices].reset_index(
|
|
1503
|
+
drop=True
|
|
1504
|
+
)
|
|
1505
|
+
|
|
1506
|
+
applicator = VariantApplicator(
|
|
1507
|
+
current_sequence,
|
|
1508
|
+
chunk_df,
|
|
1509
|
+
shared_frozen_tracker,
|
|
1510
|
+
offset_tracker,
|
|
1511
|
+
chrom,
|
|
1512
|
+
)
|
|
1513
|
+
current_sequence, stats = applicator.apply_variants()
|
|
1514
|
+
|
|
1515
|
+
total_applied += stats["applied"]
|
|
1516
|
+
total_skipped += stats["skipped"]
|
|
1517
|
+
all_skipped_variants.extend(stats.get("skipped_variants", []))
|
|
1518
|
+
|
|
1519
|
+
if verbose:
|
|
1520
|
+
print(
|
|
1521
|
+
f" ✅ Chunk {i+1}: {stats['applied']}/{stats['total']} variants applied"
|
|
1522
|
+
)
|
|
1523
|
+
|
|
1524
|
+
personal_seq = current_sequence
|
|
1525
|
+
|
|
1526
|
+
if verbose:
|
|
1527
|
+
# Report skipped variants if any
|
|
1528
|
+
if total_skipped > 0 and all_skipped_variants:
|
|
1529
|
+
print(f" ⚠️ Skipped {total_skipped} variant(s):")
|
|
1530
|
+
print(_format_skipped_variant_report(all_skipped_variants))
|
|
1531
|
+
print(
|
|
1532
|
+
f" 🎯 Total: {total_applied}/{len(processed_variants)} variants applied ({total_skipped} skipped)"
|
|
1533
|
+
)
|
|
1534
|
+
|
|
1535
|
+
modified_sequences[chrom] = personal_seq
|
|
1536
|
+
total_processed += len(chrom_variants)
|
|
1537
|
+
|
|
1538
|
+
# Initialize sequences for chromosomes not processed in Phase 1
|
|
1539
|
+
for chrom in reference.keys():
|
|
1540
|
+
if chrom not in modified_sequences:
|
|
1541
|
+
modified_sequences[chrom] = str(reference[chrom])
|
|
1542
|
+
|
|
1543
|
+
# PHASE 2: BND translocation processing
|
|
1544
|
+
# Only process true BND translocations (BND-derived DUP/INV are now handled in Phase 1)
|
|
1545
|
+
true_bnd_variants = grouped_variants["bnd_variants"]
|
|
1546
|
+
|
|
1547
|
+
# Phase 2 variants are now only true BND translocations
|
|
1548
|
+
phase2_variants = true_bnd_variants
|
|
1549
|
+
|
|
1550
|
+
if len(phase2_variants) > 0:
|
|
1551
|
+
if verbose:
|
|
1552
|
+
phase2_counts = phase2_variants["variant_type"].value_counts().to_dict()
|
|
1553
|
+
counts_msg = ", ".join(
|
|
1554
|
+
[f"{count} {vtype}" for vtype, count in phase2_counts.items()]
|
|
1555
|
+
)
|
|
1556
|
+
print(
|
|
1557
|
+
f"🔄 Phase 2: Processing {len(phase2_variants)} BND variants with semantic classification ({counts_msg})"
|
|
1558
|
+
)
|
|
1559
|
+
|
|
1560
|
+
# Use the BND classifier results directly instead of create_breakend_pairs
|
|
1561
|
+
# This ensures we get the inferred mates that the classifier created
|
|
1562
|
+
if vcf_path:
|
|
1563
|
+
from .variant_utils import BNDClassifier
|
|
1564
|
+
|
|
1565
|
+
classifier = BNDClassifier()
|
|
1566
|
+
classified_breakends = classifier.classify_all_breakends(
|
|
1567
|
+
vcf_path, verbose=verbose
|
|
1568
|
+
)
|
|
1569
|
+
|
|
1570
|
+
# Extract all paired breakends (including those with inferred mates)
|
|
1571
|
+
all_paired_breakends = classified_breakends["paired"]
|
|
1572
|
+
|
|
1573
|
+
# Convert to BreakendPair-like objects for ChimericSequenceBuilder compatibility
|
|
1574
|
+
breakend_pairs = []
|
|
1575
|
+
processed_ids = set()
|
|
1576
|
+
|
|
1577
|
+
for breakend in all_paired_breakends:
|
|
1578
|
+
if breakend.id in processed_ids or not breakend.mate_breakend:
|
|
1579
|
+
continue
|
|
1580
|
+
|
|
1581
|
+
# Create a pair tuple (bnd1, bnd2) for ChimericSequenceBuilder
|
|
1582
|
+
pair_tuple = (breakend, breakend.mate_breakend)
|
|
1583
|
+
breakend_pairs.append(pair_tuple)
|
|
1584
|
+
processed_ids.add(breakend.id)
|
|
1585
|
+
processed_ids.add(breakend.mate_breakend.id)
|
|
1586
|
+
else:
|
|
1587
|
+
# Fallback to create_breakend_pairs if no VCF path available
|
|
1588
|
+
from .variant_utils import create_breakend_pairs
|
|
1589
|
+
|
|
1590
|
+
breakend_pairs = create_breakend_pairs(phase2_variants)
|
|
1591
|
+
|
|
1592
|
+
if len(breakend_pairs) > 0:
|
|
1593
|
+
if verbose:
|
|
1594
|
+
print(f" Created {len(breakend_pairs)} BND pairs for processing")
|
|
1595
|
+
|
|
1596
|
+
# Transform BND coordinates using offset tracker from Phase 1
|
|
1597
|
+
for pair in breakend_pairs:
|
|
1598
|
+
# Handle both BreakendPair objects and tuple pairs
|
|
1599
|
+
if hasattr(pair, "breakend1"):
|
|
1600
|
+
bnd1 = pair.breakend1
|
|
1601
|
+
bnd2 = pair.breakend2
|
|
1602
|
+
else:
|
|
1603
|
+
bnd1, bnd2 = pair
|
|
1604
|
+
|
|
1605
|
+
original_pos1 = bnd1.pos
|
|
1606
|
+
original_pos2 = bnd2.pos
|
|
1607
|
+
|
|
1608
|
+
# Transform coordinates to account for applied Phase 1 variants
|
|
1609
|
+
if hasattr(offset_tracker, "get_offset_at_position"):
|
|
1610
|
+
bnd1_offset = offset_tracker.get_offset_at_position(
|
|
1611
|
+
bnd1.chrom, bnd1.pos - 1
|
|
1612
|
+
)
|
|
1613
|
+
bnd2_offset = offset_tracker.get_offset_at_position(
|
|
1614
|
+
bnd2.chrom, bnd2.pos - 1
|
|
1615
|
+
)
|
|
1616
|
+
bnd1.pos += bnd1_offset
|
|
1617
|
+
bnd2.pos += bnd2_offset
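# Example: a 5 bp insertion applied in Phase 1 upstream of this breakend on
# the same chromosome yields an offset of +5 here, moving the breakend to
# its coordinate in the already-modified sequence.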
|
|
1618
|
+
|
|
1619
|
+
if verbose and (bnd1_offset != 0 or bnd2_offset != 0):
|
|
1620
|
+
print(
|
|
1621
|
+
f" 📍 Transformed coordinates: {bnd1.chrom}:{original_pos1}→{bnd1.pos}, {bnd2.chrom}:{original_pos2}→{bnd2.pos}"
|
|
1622
|
+
)
|
|
1623
|
+
|
|
1624
|
+
# Note: BND semantic classification (SV_BND_DUP, SV_BND_INV) is handled by
|
|
1625
|
+
# group_variants_by_semantic_type() and _preprocess_bnd_derived_variants().
|
|
1626
|
+
# Remaining BND variants are processed as translocations using ChimericSequenceBuilder.
|
|
1627
|
+
|
|
1628
|
+
# Enhanced frozen region validation for BND breakpoints
|
|
1629
|
+
validated_pairs = []
|
|
1630
|
+
skipped_pairs = []
|
|
1631
|
+
|
|
1632
|
+
for pair in breakend_pairs:
|
|
1633
|
+
# Handle both BreakendPair objects and tuple pairs
|
|
1634
|
+
if hasattr(pair, "breakend1"):
|
|
1635
|
+
bnd1 = pair.breakend1
|
|
1636
|
+
bnd2 = pair.breakend2
|
|
1637
|
+
else:
|
|
1638
|
+
bnd1, bnd2 = pair
|
|
1639
|
+
|
|
1640
|
+
# Check if both breakpoints are in non-frozen regions
|
|
1641
|
+
# Create a temporary FrozenRegionTracker to check current frozen regions
|
|
1642
|
+
# Note: This is a simplified check - a more sophisticated implementation would
|
|
1643
|
+
# track frozen regions across all chromosomes from Phase 1
|
|
1644
|
+
|
|
1645
|
+
breakpoint_conflicts = []
|
|
1646
|
+
|
|
1647
|
+
# Note: Frozen region tracking is handled by FrozenRegionTracker within each chromosome
|
|
1648
|
+
# processing. Cross-Phase conflict detection could be enhanced in future versions.
|
|
1649
|
+
if breakpoint_conflicts:
|
|
1650
|
+
skipped_pairs.append(pair)
|
|
1651
|
+
if verbose:
|
|
1652
|
+
conflicts_msg = "; ".join(breakpoint_conflicts)
|
|
1653
|
+
print(
|
|
1654
|
+
f" ⚠️ Skipping BND pair {bnd1.id}-{bnd2.id}: {conflicts_msg}"
|
|
1655
|
+
)
|
|
1656
|
+
else:
|
|
1657
|
+
validated_pairs.append(pair)
|
|
1658
|
+
|
|
1659
|
+
if verbose and len(skipped_pairs) > 0:
|
|
1660
|
+
print(
|
|
1661
|
+
f" 📍 Skipped {len(skipped_pairs)} BND pairs due to frozen region conflicts"
|
|
1662
|
+
)
|
|
1663
|
+
|
|
1664
|
+
# Create chimeric sequences using validated pairs only
|
|
1665
|
+
sequence_builder = ChimericSequenceBuilder(modified_sequences)
|
|
1666
|
+
|
|
1667
|
+
# Initialize segment tracker with original reference chromosomes only
|
|
1668
|
+
original_ref_sequences = {
|
|
1669
|
+
chrom: seq
|
|
1670
|
+
for chrom, seq in modified_sequences.items()
|
|
1671
|
+
if "_fusion_" not in chrom
|
|
1672
|
+
}
|
|
1673
|
+
segment_tracker = ChromosomeSegmentTracker(original_ref_sequences)
|
|
1674
|
+
|
|
1675
|
+
for i, pair in enumerate(validated_pairs):
|
|
1676
|
+
# Handle both BreakendPair objects and tuple pairs for display
|
|
1677
|
+
if hasattr(pair, "breakend1"):
|
|
1678
|
+
bnd1_id, bnd2_id = pair.breakend1.id, pair.breakend2.id
|
|
1679
|
+
pair_tuple = (pair.breakend1, pair.breakend2)
|
|
1680
|
+
bnd1, bnd2 = pair.breakend1, pair.breakend2
|
|
1681
|
+
else:
|
|
1682
|
+
bnd1_id, bnd2_id = pair[0].id, pair[1].id
|
|
1683
|
+
pair_tuple = pair
|
|
1684
|
+
bnd1, bnd2 = pair[0], pair[1]
|
|
1685
|
+
|
|
1686
|
+
if verbose:
|
|
1687
|
+
print(
|
|
1688
|
+
f" 🔄 Creating fusion {i+1}/{len(validated_pairs)}: {bnd1_id}-{bnd2_id}"
|
|
1689
|
+
)
|
|
1690
|
+
|
|
1691
|
+
try:
|
|
1692
|
+
fusion_name, fusion_seq = sequence_builder.create_fusion_from_pair(
|
|
1693
|
+
pair_tuple
|
|
1694
|
+
)
|
|
1695
|
+
modified_sequences[fusion_name] = fusion_seq
|
|
1696
|
+
total_processed += 2 # Count both BNDs in the pair
|
|
1697
|
+
|
|
1698
|
+
if verbose:
|
|
1699
|
+
print(
|
|
1700
|
+
f" ✅ Created fusion: {fusion_name} ({len(fusion_seq)} bp)"
|
|
1701
|
+
)
|
|
1702
|
+
|
|
1703
|
+
# Track chromosome segment usage based on fusion orientations
|
|
1704
|
+
pos1_0 = bnd1.pos - 1 # Convert to 0-based
|
|
1705
|
+
pos2_0 = bnd2.pos - 1 # Convert to 0-based
|
|
1706
|
+
seq1_len = len(modified_sequences[bnd1.chrom])
|
|
1707
|
+
seq2_len = len(modified_sequences[bnd2.chrom])
|
|
1708
|
+
|
|
1709
|
+
# Track segments used based on the actual fusion logic from prototype
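# For reference, the four VCF 4.2 breakend bracket forms encode:
#   t[p[  piece extending to the right of p is joined after t
#   t]p]  reverse-complemented piece extending to the left of p is joined after t
#   ]p]t  piece extending to the left of p is joined before t
#   [p[t  reverse-complemented piece extending to the right of p is joined before t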
|
|
1710
|
+
if bnd1.orientation == "t]p]" and bnd2.orientation == "t]p]":
|
|
1711
|
+
# seq1[:pos1] + RC(seq2[:pos2]) - uses chromosome prefixes
|
|
1712
|
+
segment_tracker.add_used_segment(
|
|
1713
|
+
bnd1.chrom, 0, bnd1.pos, verbose
|
|
1714
|
+
) # VCF pos as count
|
|
1715
|
+
segment_tracker.add_used_segment(
|
|
1716
|
+
bnd2.chrom, 0, bnd2.pos, verbose
|
|
1717
|
+
) # VCF pos as count
|
|
1718
|
+
elif bnd1.orientation == "]p]t" and bnd2.orientation == "t[p[":
|
|
1719
|
+
# seq2[:pos2] + seq1[pos1_0:] - prefix from seq2, suffix from seq1
|
|
1720
|
+
segment_tracker.add_used_segment(
|
|
1721
|
+
bnd2.chrom, 0, bnd2.pos, verbose
|
|
1722
|
+
) # VCF pos as count
|
|
1723
|
+
segment_tracker.add_used_segment(
|
|
1724
|
+
bnd1.chrom, pos1_0, seq1_len, verbose
|
|
1725
|
+
)
|
|
1726
|
+
elif bnd1.orientation == "[p[t" and bnd2.orientation == "[p[t":
|
|
1727
|
+
# RC(seq2[pos2_0:]) + seq1[pos1_0:] - uses chromosome suffixes
|
|
1728
|
+
segment_tracker.add_used_segment(
|
|
1729
|
+
bnd2.chrom, pos2_0, seq2_len, verbose
|
|
1730
|
+
)
|
|
1731
|
+
segment_tracker.add_used_segment(
|
|
1732
|
+
bnd1.chrom, pos1_0, seq1_len, verbose
|
|
1733
|
+
)
|
|
1734
|
+
elif bnd1.orientation == "t[p[" and bnd2.orientation == "t[p[":
|
|
1735
|
+
# seq1[:pos1_0] + seq2[pos2_0:] - prefix from seq1, suffix from seq2
|
|
1736
|
+
segment_tracker.add_used_segment(bnd1.chrom, 0, pos1_0, verbose)
|
|
1737
|
+
segment_tracker.add_used_segment(
|
|
1738
|
+
bnd2.chrom, pos2_0, seq2_len, verbose
|
|
1739
|
+
)
|
|
1740
|
+
elif bnd1.orientation == "t[p[" and bnd2.orientation == "]p]t":
|
|
1741
|
+
# seq1[:pos1_0] + seq2[pos2_0:] - prefix from seq1, suffix from seq2
|
|
1742
|
+
segment_tracker.add_used_segment(bnd1.chrom, 0, pos1_0, verbose)
|
|
1743
|
+
segment_tracker.add_used_segment(
|
|
1744
|
+
bnd2.chrom, pos2_0, seq2_len, verbose
|
|
1745
|
+
)
|
|
1746
|
+
elif bnd1.orientation == "]p]t" and bnd2.orientation == "]p]t":
|
|
1747
|
+
# seq2[:pos2] + seq1[pos1_0:] - prefix from seq2, suffix from seq1
|
|
1748
|
+
segment_tracker.add_used_segment(
|
|
1749
|
+
bnd2.chrom, 0, bnd2.pos, verbose
|
|
1750
|
+
) # VCF pos as count
|
|
1751
|
+
segment_tracker.add_used_segment(
|
|
1752
|
+
bnd1.chrom, pos1_0, seq1_len, verbose
|
|
1753
|
+
)
|
|
1754
|
+
else:
|
|
1755
|
+
# Unknown orientation patterns - track conservatively
|
|
1756
|
+
if verbose:
|
|
1757
|
+
print(
|
|
1758
|
+
f" ⚠️ Unknown orientation pattern: {bnd1.orientation} + {bnd2.orientation}"
|
|
1759
|
+
)
|
|
1760
|
+
segment_tracker.add_used_segment(bnd1.chrom, 0, pos1_0, verbose)
|
|
1761
|
+
segment_tracker.add_used_segment(
|
|
1762
|
+
bnd2.chrom, pos2_0, seq2_len, verbose
|
|
1763
|
+
)
|
|
1764
|
+
|
|
1765
|
+
except Exception as e:
|
|
1766
|
+
if verbose:
|
|
1767
|
+
print(
|
|
1768
|
+
f" ⚠️ Failed to create fusion for {bnd1_id}-{bnd2_id}: {e}"
|
|
1769
|
+
)
|
|
1770
|
+
|
|
1771
|
+
# Calculate and add leftover sequences
|
|
1772
|
+
leftover_sequences = segment_tracker.get_leftover_sequences(verbose)
|
|
1773
|
+
|
|
1774
|
+
# Remove original chromosomes that were consumed by fusions and replace with leftovers
|
|
1775
|
+
chromosomes_with_fusions = set()
|
|
1776
|
+
for seq_name in list(modified_sequences.keys()):
|
|
1777
|
+
if "_fusion_" in seq_name:
|
|
1778
|
+
# Extract chromosome names from fusion sequence names
|
|
1779
|
+
parts = seq_name.split("_")
|
|
1780
|
+
if len(parts) >= 2:
|
|
1781
|
+
chromosomes_with_fusions.add(parts[0])
|
|
1782
|
+
chromosomes_with_fusions.add(parts[1])
|
|
1783
|
+
|
|
1784
|
+
# Remove consumed chromosomes and add their leftovers
|
|
1785
|
+
for chrom in chromosomes_with_fusions:
|
|
1786
|
+
if chrom in modified_sequences:
|
|
1787
|
+
del modified_sequences[chrom]
|
|
1788
|
+
if verbose:
|
|
1789
|
+
print(f" 🗑️ Removed consumed chromosome: {chrom}")
|
|
1790
|
+
|
|
1791
|
+
# Add leftover sequences
|
|
1792
|
+
modified_sequences.update(leftover_sequences)
|
|
1793
|
+
|
|
1794
|
+
# FINAL STEP: Encode sequences and create output
|
|
1795
|
+
# Preserve reference chromosome order, then append fusion/leftover sequences
|
|
1796
|
+
reference_chroms = list(reference.keys())
|
|
1797
|
+
|
|
1798
|
+
# First, add chromosomes in reference order
|
|
1799
|
+
for chrom in reference_chroms:
|
|
1800
|
+
if chrom in modified_sequences:
|
|
1801
|
+
seq = modified_sequences[chrom]
|
|
1802
|
+
if encode:
|
|
1803
|
+
if encoder:
|
|
1804
|
+
personal_genome[chrom] = encoder(seq)
|
|
1805
|
+
else:
|
|
1806
|
+
personal_genome[chrom] = encode_seq(seq)
|
|
1807
|
+
else:
|
|
1808
|
+
personal_genome[chrom] = seq
|
|
1809
|
+
|
|
1810
|
+
# Then, add fusion and leftover sequences (not in original reference)
|
|
1811
|
+
for chrom, seq in modified_sequences.items():
|
|
1812
|
+
if chrom not in reference_chroms:
|
|
1813
|
+
if encode:
|
|
1814
|
+
if encoder:
|
|
1815
|
+
personal_genome[chrom] = encoder(seq)
|
|
1816
|
+
else:
|
|
1817
|
+
personal_genome[chrom] = encode_seq(seq)
|
|
1818
|
+
else:
|
|
1819
|
+
personal_genome[chrom] = seq
|
|
1820
|
+
|
|
1821
|
+
if verbose:
|
|
1822
|
+
total_variants = len(variants_df)
|
|
1823
|
+
sequences_msg = f"{len(personal_genome):,} sequences"
|
|
1824
|
+
if any("_fusion_" in name for name in personal_genome.keys()):
|
|
1825
|
+
fusion_count = sum(
|
|
1826
|
+
1 for name in personal_genome.keys() if "_fusion_" in name
|
|
1827
|
+
)
|
|
1828
|
+
leftover_count = len(personal_genome) - fusion_count
|
|
1829
|
+
sequences_msg = (
|
|
1830
|
+
f"{fusion_count} fusions, {leftover_count} leftover sequences"
|
|
1831
|
+
)
|
|
1832
|
+
|
|
1833
|
+
print(
|
|
1834
|
+
f"🧬 Completed: {total_processed:,}/{total_variants:,} variants processed → {sequences_msg}"
|
|
1835
|
+
)
|
|
1836
|
+
|
|
1837
|
+
return personal_genome
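# Minimal usage sketch for the function above (illustrative only; the file
# names are hypothetical placeholders, not part of the package):
def _demo_get_personal_genome():  # documentation sketch
    genome = get_personal_genome(
        "reference.fa",
        "variants.vcf",
        encode=False,  # raw strings are easiest to inspect
        verbose=True,
        auto_map_chromosomes=True,
    )
    # Reference chromosome order is preserved; fusions/leftovers come last.
    for name, seq in genome.items():
        kind = "fusion" if "_fusion_" in name else "chrom/leftover"
        print(f"{name}\t{kind}\t{len(seq):,} bp")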
|
|
1838
|
+
|
|
1839
|
+
|
|
1840
|
+
def _generate_sequence_metadata(chunk_variants, seq_len):
|
|
1841
|
+
"""
|
|
1842
|
+
Generate standardized metadata for sequence functions.
|
|
1843
|
+
|
|
1844
|
+
This centralizes metadata generation to eliminate duplication across
|
|
1845
|
+
get_alt_sequences, get_ref_sequences, and get_alt_ref_sequences.
|
|
1846
|
+
|
|
1847
|
+
Args:
|
|
1848
|
+
chunk_variants: DataFrame of variants for this chunk
|
|
1849
|
+
seq_len: Length of the sequence window
|
|
1850
|
+
|
|
1851
|
+
Returns:
|
|
1852
|
+
pandas.DataFrame: Comprehensive metadata with standardized columns
|
|
1853
|
+
"""
|
|
1854
|
+
metadata = []
|
|
1855
|
+
|
|
1856
|
+
for _, var in chunk_variants.iterrows():
|
|
1857
|
+
# Basic position calculations
|
|
1858
|
+
pos = var["pos1"] # 1-based VCF position
|
|
1859
|
+
genomic_pos = pos - 1 # Convert to 0-based
|
|
1860
|
+
half_len = seq_len // 2
|
|
1861
|
+
window_start = max(0, genomic_pos - half_len)
|
|
1862
|
+
window_end = window_start + seq_len
|
|
1863
|
+
|
|
1864
|
+
# Variant classification
|
|
1865
|
+
variant_type = var.get("variant_type", "unknown")
|
|
1866
|
+
|
|
1867
|
+
# Build minimal metadata dictionary
|
|
1868
|
+
meta_dict = {
|
|
1869
|
+
"chrom": var["chrom"],
|
|
1870
|
+
"window_start": window_start,
|
|
1871
|
+
"window_end": window_end,
|
|
1872
|
+
"variant_pos0": genomic_pos, # 0-based absolute position
|
|
1873
|
+
"variant_pos1": pos, # 1-based absolute position
|
|
1874
|
+
"ref": var["ref"],
|
|
1875
|
+
"alt": var["alt"],
|
|
1876
|
+
"variant_type": variant_type,
|
|
1877
|
+
}
|
|
1878
|
+
|
|
1879
|
+
# Add sym_variant_end ONLY for symbolic alleles (<INV>, <DUP>, etc.)
|
|
1880
|
+
if variant_type.startswith("SV_") and "<" in var["alt"]:
|
|
1881
|
+
if "info" in var and var["info"] and var["info"] != ".":
|
|
1882
|
+
parsed_info = parse_vcf_info(var["info"])
|
|
1883
|
+
sym_end = parsed_info.get("END")
|
|
1884
|
+
if sym_end is not None:
|
|
1885
|
+
meta_dict["sym_variant_end"] = sym_end
|
|
1886
|
+
|
|
1887
|
+
metadata.append(meta_dict)
|
|
1888
|
+
|
|
1889
|
+
return pd.DataFrame(metadata)
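# Window-math sketch mirroring the calculation above: windows are centered on
# the 0-based variant position (hypothetical helper, for illustration only).
def _demo_window_bounds(pos1: int, seq_len: int):
    genomic_pos = pos1 - 1  # 1-based VCF position -> 0-based
    half_len = seq_len // 2
    window_start = max(0, genomic_pos - half_len)
    return window_start, window_start + seq_len
# e.g. _demo_window_bounds(1000, 200) == (899, 1099)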
|
|
1890
|
+
|
|
1891
|
+
|
|
1892
|
+
def _generate_bnd_ref_sequences(
|
|
1893
|
+
breakend_pairs, reference, seq_len, encode=True, encoder=None
|
|
1894
|
+
):
|
|
1895
|
+
"""
|
|
1896
|
+
Generate dual reference sequences for BND variants (no ALT sequences).
|
|
1897
|
+
|
|
1898
|
+
Args:
|
|
1899
|
+
breakend_pairs: List of breakend pairs from load_breakend_variants
|
|
1900
|
+
reference: Reference genome dictionary
|
|
1901
|
+
seq_len: Length of sequence window
|
|
1902
|
+
encode: Whether to encode sequences
|
|
1903
|
+
encoder: Optional custom encoder
|
|
1904
|
+
|
|
1905
|
+
Returns:
|
|
1906
|
+
Tuple of (left_ref_sequences, right_ref_sequences, metadata)
|
|
1907
|
+
left_ref_sequences contains the left breakend reference (sequence followed by right-side N-padding)
|
|
1908
|
+
right_ref_sequences contains the right breakend reference (left-side N-padding followed by sequence)
|
|
1909
|
+
"""
|
|
1910
|
+
from .sequence_utils import encode_seq, rc_str
|
|
1911
|
+
|
|
1912
|
+
left_ref_sequences = []
|
|
1913
|
+
right_ref_sequences = []
|
|
1914
|
+
metadata = []
|
|
1915
|
+
|
|
1916
|
+
if not breakend_pairs:
|
|
1917
|
+
return left_ref_sequences, right_ref_sequences, metadata
|
|
1918
|
+
|
|
1919
|
+
# Process each breakend pair
|
|
1920
|
+
for bnd1, bnd2 in breakend_pairs:
|
|
1921
|
+
try:
|
|
1922
|
+
# Get chromosome sequences
|
|
1923
|
+
seq1 = str(reference[bnd1.chrom]) if bnd1.chrom in reference else ""
|
|
1924
|
+
seq2 = str(reference[bnd2.chrom]) if bnd2.chrom in reference else ""
|
|
1925
|
+
|
|
1926
|
+
if not seq1 or not seq2:
|
|
1927
|
+
continue
|
|
1928
|
+
|
|
1929
|
+
# Calculate window centered on first breakend
|
|
1930
|
+
center_pos = bnd1.pos - 1 # Convert to 0-based
|
|
1931
|
+
|
|
1932
|
+
# Detect if this is a BND with insertion for consistent handling
|
|
1933
|
+
has_insertion = bool(bnd1.inserted_seq or bnd2.inserted_seq)
|
|
1934
|
+
insertion_length = len(bnd1.inserted_seq) + len(bnd2.inserted_seq)
|
|
1935
|
+
|
|
1936
|
+
# Generate left reference sequence (sequence before breakend + right-side N-padding)
|
|
1937
|
+
# For BNDs, we want to show what was there BEFORE the fusion point
|
|
1938
|
+
# Then pad the right side with N's to represent the missing fusion partner + insertion
|
|
1939
|
+
half_len = seq_len // 2
|
|
1940
|
+
|
|
1941
|
+
# Extract sequence leading up to the breakend (before the fusion point)
|
|
1942
|
+
left_start = max(0, center_pos - half_len)
|
|
1943
|
+
left_end = center_pos # Stop at the breakend position
|
|
1944
|
+
left_ref_raw = seq1[left_start:left_end]
|
|
1945
|
+
|
|
1946
|
+
# Pad the right side to represent where the fusion partner + insertion would attach
|
|
1947
|
+
left_padding_needed = seq_len - len(left_ref_raw)
|
|
1948
|
+
# Note: For BND with insertion, this padding represents both the missing chromosome and the insertion
|
|
1949
|
+
left_ref_seq = left_ref_raw + "N" * left_padding_needed
|
|
1950
|
+
|
|
1951
|
+
# Generate right reference sequence (left-side N-padding + sequence after breakend)
|
|
1952
|
+
# For the right side, we want to show what was there AFTER the fusion point
|
|
1953
|
+
# Pad the left side with N's to represent the missing fusion partner + insertion
|
|
1954
|
+
bnd2_center = bnd2.pos - 1 # Convert to 0-based
|
|
1955
|
+
|
|
1956
|
+
# Extract sequence starting from the breakend (after the fusion point)
|
|
1957
|
+
right_start = bnd2_center # Start at the breakend position
|
|
1958
|
+
right_end = min(len(seq2), bnd2_center + half_len)
|
|
1959
|
+
right_ref_raw = seq2[right_start:right_end]
|
|
1960
|
+
|
|
1961
|
+
# Pad the left side to represent where the fusion partner + insertion would attach
|
|
1962
|
+
right_padding_needed = seq_len - len(right_ref_raw)
|
|
1963
|
+
# Note: For BND with insertion, this padding represents both the missing chromosome and the insertion
|
|
1964
|
+
right_ref_seq = "N" * right_padding_needed + right_ref_raw
|
|
1965
|
+
|
|
1966
|
+
# Apply reverse complement if needed based on orientation
|
|
1967
|
+
if bnd1.orientation in ["t]p]", "[p[t"]: # orientations requiring RC
|
|
1968
|
+
left_ref_seq = rc_str(left_ref_seq)
|
|
1969
|
+
if bnd2.orientation in ["t]p]", "[p[t"]:
|
|
1970
|
+
right_ref_seq = rc_str(right_ref_seq)
|
|
1971
|
+
|
|
1972
|
+
# Ensure sequences are exactly seq_len
|
|
1973
|
+
left_ref_seq = left_ref_seq[:seq_len].ljust(seq_len, "N")
|
|
1974
|
+
right_ref_seq = right_ref_seq[:seq_len].ljust(seq_len, "N")
|
|
1975
|
+
|
|
1976
|
+
left_ref_sequences.append(left_ref_seq)
|
|
1977
|
+
right_ref_sequences.append(right_ref_seq)
|
|
1978
|
+
|
|
1979
|
+
# Create metadata for this BND
|
|
1980
|
+
window_start = max(0, center_pos - seq_len // 2)
|
|
1981
|
+
window_end = window_start + seq_len
|
|
1982
|
+
metadata.append(
|
|
1983
|
+
{
|
|
1984
|
+
"chrom": bnd1.chrom,
|
|
1985
|
+
"window_start": window_start,
|
|
1986
|
+
"window_end": window_end,
|
|
1987
|
+
"variant_pos0": center_pos,
|
|
1988
|
+
"variant_pos1": bnd1.pos,
|
|
1989
|
+
"ref": bnd1.ref,
|
|
1990
|
+
"alt": bnd1.alt,
|
|
1991
|
+
"variant_type": "SV_BND",
|
|
1992
|
+
"mate_chrom": bnd2.chrom,
|
|
1993
|
+
"mate_pos": bnd2.pos,
|
|
1994
|
+
"orientation_1": bnd1.orientation,
|
|
1995
|
+
"orientation_2": bnd2.orientation,
|
|
1996
|
+
}
|
|
1997
|
+
)
|
|
1998
|
+
|
|
1999
|
+
except Exception as e:
|
|
2000
|
+
# Log error but continue processing other BNDs
|
|
2001
|
+
import warnings
|
|
2002
|
+
|
|
2003
|
+
warnings.warn(f"Failed to process BND pair {bnd1.id}-{bnd2.id}: {e}")
|
|
2004
|
+
continue
|
|
2005
|
+
|
|
2006
|
+
# Encode sequences if requested
|
|
2007
|
+
if encode and left_ref_sequences:
|
|
2008
|
+
# Encode each sequence individually and collect them
|
|
2009
|
+
encoded_left_ref = []
|
|
2010
|
+
encoded_right_ref = []
|
|
2011
|
+
|
|
2012
|
+
for i in range(len(left_ref_sequences)):
|
|
2013
|
+
encoded_left_ref.append(encode_seq(left_ref_sequences[i], encoder))
|
|
2014
|
+
encoded_right_ref.append(encode_seq(right_ref_sequences[i], encoder))
|
|
2015
|
+
|
|
2016
|
+
# Stack the encoded sequences
|
|
2017
|
+
if TORCH_AVAILABLE:
|
|
2018
|
+
left_ref_sequences = (
|
|
2019
|
+
torch.stack(encoded_left_ref) if encoded_left_ref else []
|
|
2020
|
+
)
|
|
2021
|
+
right_ref_sequences = (
|
|
2022
|
+
torch.stack(encoded_right_ref) if encoded_right_ref else []
|
|
2023
|
+
)
|
|
2024
|
+
else:
|
|
2025
|
+
left_ref_sequences = np.stack(encoded_left_ref) if encoded_left_ref else []
|
|
2026
|
+
right_ref_sequences = (
|
|
2027
|
+
np.stack(encoded_right_ref) if encoded_right_ref else []
|
|
2028
|
+
)
|
|
2029
|
+
|
|
2030
|
+
return left_ref_sequences, right_ref_sequences, metadata
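# Padding-layout sketch for the dual BND references built above (hypothetical
# helper; mirrors the slicing logic but is not part of the package):
def _demo_bnd_ref_padding(seq1, pos1, seq2, pos2, seq_len):
    half_len = seq_len // 2
    left_raw = seq1[max(0, (pos1 - 1) - half_len) : pos1 - 1]
    right_raw = seq2[pos2 - 1 : min(len(seq2), (pos2 - 1) + half_len)]
    left_ref = left_raw + "N" * (seq_len - len(left_raw))     # sequence, then Ns
    right_ref = "N" * (seq_len - len(right_raw)) + right_raw  # Ns, then sequence
    return left_ref, right_ref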
|
|
2031
|
+
|
|
2032
|
+
|
|
2033
|
+
def _generate_bnd_sequences(
|
|
2034
|
+
breakend_pairs, reference, seq_len, encode=True, encoder=None
|
|
2035
|
+
):
|
|
2036
|
+
"""
|
|
2037
|
+
Generate ALT and reference sequences for BND variants.
|
|
2038
|
+
|
|
2039
|
+
Args:
|
|
2040
|
+
breakend_pairs: List of breakend pairs from load_breakend_variants
|
|
2041
|
+
reference: Reference genome dictionary
|
|
2042
|
+
seq_len: Length of sequence window
|
|
2043
|
+
encode: Whether to encode sequences
|
|
2044
|
+
encoder: Optional custom encoder
|
|
2045
|
+
|
|
2046
|
+
Returns:
|
|
2047
|
+
Tuple of (alt_sequences, left_ref_sequences, right_ref_sequences, metadata)
|
|
2048
|
+
For BNDs: alt_sequences contains fusion sequences
|
|
2049
|
+
left_ref_sequences contains the left breakend reference (sequence followed by right-side N-padding)
|
|
2050
|
+
right_ref_sequences contains the right breakend reference (left-side N-padding followed by sequence)
|
|
2051
|
+
"""
|
|
2052
|
+
from .sequence_utils import encode_seq, rc_str
|
|
2053
|
+
|
|
2054
|
+
alt_sequences = []
|
|
2055
|
+
left_ref_sequences = []
|
|
2056
|
+
right_ref_sequences = []
|
|
2057
|
+
metadata = []
|
|
2058
|
+
|
|
2059
|
+
if not breakend_pairs:
|
|
2060
|
+
return alt_sequences, left_ref_sequences, right_ref_sequences, metadata
|
|
2061
|
+
|
|
2062
|
+
# Process each breakend pair
|
|
2063
|
+
for bnd1, bnd2 in breakend_pairs:
|
|
2064
|
+
try:
|
|
2065
|
+
# Get chromosome sequences
|
|
2066
|
+
seq1 = str(reference[bnd1.chrom]) if bnd1.chrom in reference else ""
|
|
2067
|
+
seq2 = str(reference[bnd2.chrom]) if bnd2.chrom in reference else ""
|
|
2068
|
+
|
|
2069
|
+
if not seq1 or not seq2:
|
|
2070
|
+
continue
|
|
2071
|
+
|
|
2072
|
+
# Generate fusion sequence using existing ChimericSequenceBuilder
|
|
2073
|
+
builder = ChimericSequenceBuilder({bnd1.chrom: seq1, bnd2.chrom: seq2})
|
|
2074
|
+
fusion_name, fusion_seq = builder.create_fusion_from_pair((bnd1, bnd2))
|
|
2075
|
+
|
|
2076
|
+
# Detect if this is a BND with insertion and center appropriately
|
|
2077
|
+
has_insertion = bool(bnd1.inserted_seq or bnd2.inserted_seq)
|
|
2078
|
+
|
|
2079
|
+
if has_insertion:
|
|
2080
|
+
# For BND with insertion, center window on the inserted sequence
|
|
2081
|
+
# Use segment metadata to find where the novel sequence is located
|
|
2082
|
+
segments = builder.get_sequence_segments(fusion_name)
|
|
2083
|
+
novel_segment = None
|
|
2084
|
+
for seg in segments:
|
|
2085
|
+
if seg.source_type == "novel":
|
|
2086
|
+
novel_segment = seg
|
|
2087
|
+
break
|
|
2088
|
+
|
|
2089
|
+
if novel_segment:
|
|
2090
|
+
# Center window on the novel sequence
|
|
2091
|
+
novel_center = (
|
|
2092
|
+
novel_segment.start_pos + novel_segment.end_pos
|
|
2093
|
+
) // 2
|
|
2094
|
+
window_start = max(0, novel_center - seq_len // 2)
|
|
2095
|
+
window_end = window_start + seq_len
|
|
2096
|
+
else:
|
|
2097
|
+
# Fallback to standard centering if no novel segment found
|
|
2098
|
+
center_pos = bnd1.pos - 1 # Convert to 0-based
|
|
2099
|
+
window_start = max(0, center_pos - seq_len // 2)
|
|
2100
|
+
window_end = window_start + seq_len
|
|
2101
|
+
else:
|
|
2102
|
+
# Standard BND: center on first breakend position
|
|
2103
|
+
center_pos = bnd1.pos - 1 # Convert to 0-based
|
|
2104
|
+
window_start = max(0, center_pos - seq_len // 2)
|
|
2105
|
+
window_end = window_start + seq_len
|
|
2106
|
+
|
|
2107
|
+
# Generate ALT sequence (fusion sequence window)
|
|
2108
|
+
if len(fusion_seq) >= seq_len:
|
|
2109
|
+
alt_seq = fusion_seq[window_start:window_end]
|
|
2110
|
+
else:
|
|
2111
|
+
# Pad if fusion is shorter than window
|
|
2112
|
+
alt_seq = fusion_seq + "N" * (seq_len - len(fusion_seq))
|
|
2113
|
+
|
|
2114
|
+
# Generate reference sequences with appropriate padding
|
|
2115
|
+
# For BNDs with insertions, we need to account for the inserted sequence length
|
|
2116
|
+
half_len = seq_len // 2
|
|
2117
|
+
insertion_length = len(bnd1.inserted_seq) + len(bnd2.inserted_seq)
|
|
2118
|
+
|
|
2119
|
+
# Generate left reference sequence (sequence before breakend + right-side N-padding)
|
|
2120
|
+
# For BNDs, we want to show what was there BEFORE the fusion point
|
|
2121
|
+
# Then pad the right side with N's to represent the missing fusion partner + insertion
|
|
2122
|
+
if not has_insertion:
|
|
2123
|
+
# Standard BND: use existing logic
|
|
2124
|
+
center_pos = bnd1.pos - 1 # Convert to 0-based if not set above
|
|
2125
|
+
left_start = max(0, center_pos - half_len)
|
|
2126
|
+
left_end = center_pos # Stop at the breakend position
|
|
2127
|
+
left_ref_raw = seq1[left_start:left_end]
|
|
2128
|
+
|
|
2129
|
+
# For BND with insertion, pad for both the missing chromosome and the insertion
|
|
2130
|
+
left_padding_needed = seq_len - len(left_ref_raw)
|
|
2131
|
+
# Note: The padding represents what's missing (other chromosome + insertion)
|
|
2132
|
+
# but the window is kept at exactly seq_len with N-padding rather than widened by the insertion length
|
|
2133
|
+
left_ref_seq = left_ref_raw + "N" * left_padding_needed
|
|
2134
|
+
|
|
2135
|
+
# Generate right reference sequence (left-side N-padding + sequence after breakend)
|
|
2136
|
+
# For the right side, we want to show what was there AFTER the fusion point
|
|
2137
|
+
# Pad the left side with N's to represent the missing fusion partner + insertion
|
|
2138
|
+
bnd2_center = bnd2.pos - 1 # Convert to 0-based
|
|
2139
|
+
|
|
2140
|
+
# Extract sequence starting from the breakend (after the fusion point)
|
|
2141
|
+
right_start = bnd2_center # Start at the breakend position
|
|
2142
|
+
right_end = min(len(seq2), bnd2_center + half_len)
|
|
2143
|
+
right_ref_raw = seq2[right_start:right_end]
|
|
2144
|
+
|
|
2145
|
+
# For BND with insertion, pad for both the missing chromosome and the insertion
|
|
2146
|
+
right_padding_needed = seq_len - len(right_ref_raw)
|
|
2147
|
+
# Note: The padding represents what's missing (other chromosome + insertion)
|
|
2148
|
+
# but the window is kept at exactly seq_len with N-padding rather than widened by the insertion length
|
|
2149
|
+
right_ref_seq = "N" * right_padding_needed + right_ref_raw
|
|
2150
|
+
|
|
2151
|
+
# Apply reverse complement if needed based on orientation
|
|
2152
|
+
if bnd1.orientation in ["t]p]", "[p[t"]: # orientations requiring RC
|
|
2153
|
+
left_ref_seq = rc_str(left_ref_seq)
|
|
2154
|
+
if bnd2.orientation in ["t]p]", "[p[t"]:
|
|
2155
|
+
right_ref_seq = rc_str(right_ref_seq)
|
|
2156
|
+
|
|
2157
|
+
# Ensure sequences are exactly seq_len
|
|
2158
|
+
alt_seq = alt_seq[:seq_len].ljust(seq_len, "N")
|
|
2159
|
+
left_ref_seq = left_ref_seq[:seq_len].ljust(seq_len, "N")
|
|
2160
|
+
right_ref_seq = right_ref_seq[:seq_len].ljust(seq_len, "N")
|
|
2161
|
+
|
|
2162
|
+
alt_sequences.append(alt_seq)
|
|
2163
|
+
left_ref_sequences.append(left_ref_seq)
|
|
2164
|
+
right_ref_sequences.append(right_ref_seq)
|
|
2165
|
+
|
|
2166
|
+
# Create metadata for this BND
|
|
2167
|
+
metadata.append(
|
|
2168
|
+
{
|
|
2169
|
+
"chrom": bnd1.chrom,
|
|
2170
|
+
"window_start": window_start,
|
|
2171
|
+
"window_end": window_end,
|
|
2172
|
+
"variant_pos0": center_pos,
|
|
2173
|
+
"variant_pos1": bnd1.pos,
|
|
2174
|
+
"ref": bnd1.ref,
|
|
2175
|
+
"alt": bnd1.alt,
|
|
2176
|
+
"variant_type": "SV_BND",
|
|
2177
|
+
"mate_chrom": bnd2.chrom,
|
|
2178
|
+
"mate_pos": bnd2.pos,
|
|
2179
|
+
"orientation_1": bnd1.orientation,
|
|
2180
|
+
"orientation_2": bnd2.orientation,
|
|
2181
|
+
"fusion_name": fusion_name,
|
|
2182
|
+
}
|
|
2183
|
+
)
|
|
2184
|
+
|
|
2185
|
+
except Exception as e:
|
|
2186
|
+
# Log error but continue processing other BNDs
|
|
2187
|
+
import warnings
|
|
2188
|
+
|
|
2189
|
+
warnings.warn(f"Failed to process BND pair {bnd1.id}-{bnd2.id}: {e}")
|
|
2190
|
+
continue
|
|
2191
|
+
|
|
2192
|
+
# Encode sequences if requested
|
|
2193
|
+
if encode and alt_sequences:
|
|
2194
|
+
# Encode each sequence individually and collect them
|
|
2195
|
+
encoded_alt = []
|
|
2196
|
+
encoded_left_ref = []
|
|
2197
|
+
encoded_right_ref = []
|
|
2198
|
+
|
|
2199
|
+
for i in range(len(alt_sequences)):
|
|
2200
|
+
encoded_alt.append(encode_seq(alt_sequences[i], encoder))
|
|
2201
|
+
encoded_left_ref.append(encode_seq(left_ref_sequences[i], encoder))
|
|
2202
|
+
encoded_right_ref.append(encode_seq(right_ref_sequences[i], encoder))
|
|
2203
|
+
|
|
2204
|
+
# Stack the encoded sequences
|
|
2205
|
+
if TORCH_AVAILABLE:
|
|
2206
|
+
alt_sequences = torch.stack(encoded_alt) if encoded_alt else []
|
|
2207
|
+
left_ref_sequences = (
|
|
2208
|
+
torch.stack(encoded_left_ref) if encoded_left_ref else []
|
|
2209
|
+
)
|
|
2210
|
+
right_ref_sequences = (
|
|
2211
|
+
torch.stack(encoded_right_ref) if encoded_right_ref else []
|
|
2212
|
+
)
|
|
2213
|
+
else:
|
|
2214
|
+
alt_sequences = np.stack(encoded_alt) if encoded_alt else []
|
|
2215
|
+
left_ref_sequences = np.stack(encoded_left_ref) if encoded_left_ref else []
|
|
2216
|
+
right_ref_sequences = (
|
|
2217
|
+
np.stack(encoded_right_ref) if encoded_right_ref else []
|
|
2218
|
+
)
|
|
2219
|
+
|
|
2220
|
+
return alt_sequences, left_ref_sequences, right_ref_sequences, metadata
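# Output-shape sketch (illustrative): with encode=True each non-empty
# collection returned above is stacked to (n_pairs, seq_len, 4); `pairs` and
# `ref` below are hypothetical inputs.
def _demo_bnd_shapes(pairs, ref, seq_len=2048):  # documentation sketch
    alt, left_ref, right_ref, meta = _generate_bnd_sequences(pairs, ref, seq_len)
    if len(meta) > 0:
        assert alt.shape == (len(meta), seq_len, 4)
        assert left_ref.shape == right_ref.shape == alt.shape
    return meta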
|
|
2221
|
+
|
|
2222
|
+
|
|
2223
|
+
def get_alt_sequences(
|
|
2224
|
+
reference_fn,
|
|
2225
|
+
variants_fn,
|
|
2226
|
+
seq_len,
|
|
2227
|
+
encode=True,
|
|
2228
|
+
n_chunks=1,
|
|
2229
|
+
encoder=None,
|
|
2230
|
+
auto_map_chromosomes=False,
|
|
2231
|
+
):
|
|
2232
|
+
"""
|
|
2233
|
+
Create sequence windows centered on each variant position with variants applied.
|
|
2234
|
+
Now supports both standard variants and BND variants.
|
|
2235
|
+
|
|
2236
|
+
Args:
|
|
2237
|
+
reference_fn: Path to reference genome file or dictionary-like object
|
|
2238
|
+
variants_fn: Path to VCF file (string) or DataFrame with variant data.
|
|
2239
|
+
For DataFrames, the position column may be named 'pos' or 'pos1'; otherwise the second column is assumed to hold positions.
|
|
2240
|
+
seq_len: Length of the sequence window
|
|
2241
|
+
encode: Return sequences as one-hot encoded tensors/arrays (default: True)
|
|
2242
|
+
n_chunks: Number of chunks to split variants into (default: 1)
|
|
2243
|
+
encoder: Optional custom encoding function
|
|
2244
|
+
auto_map_chromosomes: Automatically map chromosome names between VCF and reference
|
|
2245
|
+
when they don't match exactly (default: False).
|
|
2246
|
+
|
|
2247
|
+
Yields:
|
|
2248
|
+
Tuple containing (sequences, metadata_df) where:
|
|
2249
|
+
If encode=True: sequences is a tensor/array of shape (chunk_size, seq_len, 4) for each chunk
|
|
2250
|
+
If encode=False: sequences is a list of tuples containing (chrom, start, end, sequence_string) for each chunk
|
|
2251
|
+
metadata_df is a DataFrame with variant information including position offsets
|
|
2252
|
+
For BND variants: sequences contain fusion sequences
|
|
2253
|
+
|
|
2254
|
+
Raises:
|
|
2255
|
+
ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't match
|
|
2256
|
+
"""
|
|
2257
|
+
# Load reference and variants, separating BNDs from standard variants
|
|
2258
|
+
from .variant_utils import load_breakend_variants
|
|
2259
|
+
|
|
2260
|
+
reference = _load_reference(reference_fn)
|
|
2261
|
+
|
|
2262
|
+
# Load variants and separate BNDs
|
|
2263
|
+
standard_variants, breakend_pairs = load_breakend_variants(variants_fn)
|
|
2264
|
+
|
|
2265
|
+
# Combine chromosome names from both standard variants and breakend pairs
|
|
2266
|
+
ref_chroms = set(reference.keys())
|
|
2267
|
+
standard_chroms = (
|
|
2268
|
+
set(standard_variants["chrom"].unique())
|
|
2269
|
+
if len(standard_variants) > 0
|
|
2270
|
+
else set()
|
|
2271
|
+
)
|
|
2272
|
+
breakend_chroms = set()
|
|
2273
|
+
for bnd1, bnd2 in breakend_pairs:
|
|
2274
|
+
breakend_chroms.add(bnd1.chrom)
|
|
2275
|
+
breakend_chroms.add(bnd2.chrom)
|
|
2276
|
+
vcf_chroms = standard_chroms | breakend_chroms
|
|
2277
|
+
|
|
2278
|
+
# Use chromosome matching to handle name mismatches
|
|
2279
|
+
mapping, unmatched = match_chromosomes_with_report(
|
|
2280
|
+
ref_chroms, vcf_chroms, verbose=True, auto_map_chromosomes=auto_map_chromosomes
|
|
2281
|
+
)
|
|
2282
|
+
|
|
2283
|
+
# Apply chromosome name mapping to standard variants
|
|
2284
|
+
if mapping and len(standard_variants) > 0:
|
|
2285
|
+
standard_variants = apply_chromosome_mapping(standard_variants, mapping)
|
|
2286
|
+
|
|
2287
|
+
# Apply chromosome name mapping to breakend pairs
|
|
2288
|
+
if mapping and breakend_pairs:
|
|
2289
|
+
updated_pairs = []
|
|
2290
|
+
for bnd1, bnd2 in breakend_pairs:
|
|
2291
|
+
# Update chromosome names in breakend objects if needed
|
|
2292
|
+
if bnd1.chrom in mapping:
|
|
2293
|
+
bnd1.chrom = mapping[bnd1.chrom]
|
|
2294
|
+
if bnd2.chrom in mapping:
|
|
2295
|
+
bnd2.chrom = mapping[bnd2.chrom]
|
|
2296
|
+
if bnd1.mate_chrom in mapping:
|
|
2297
|
+
bnd1.mate_chrom = mapping[bnd1.mate_chrom]
|
|
2298
|
+
if bnd2.mate_chrom in mapping:
|
|
2299
|
+
bnd2.mate_chrom = mapping[bnd2.mate_chrom]
|
|
2300
|
+
updated_pairs.append((bnd1, bnd2))
|
|
2301
|
+
breakend_pairs = updated_pairs
|
|
2302
|
+
|
|
2303
|
+
# Process standard variants and BNDs separately, then combine results
|
|
2304
|
+
# BNDs are processed in a single batch for now (BND chunking is more complex)
|
|
2305
|
+
|
|
2306
|
+
# Process standard variants first - yield each chunk individually
|
|
2307
|
+
if len(standard_variants) > 0:
|
|
2308
|
+
# Split standard variants into chunks
|
|
2309
|
+
std_indices = np.array_split(np.arange(len(standard_variants)), n_chunks)
|
|
2310
|
+
std_variant_chunks = (
|
|
2311
|
+
standard_variants.iloc[chunk_indices].reset_index(drop=True)
|
|
2312
|
+
for chunk_indices in std_indices
|
|
2313
|
+
if len(chunk_indices) > 0
|
|
2314
|
+
)
|
|
2315
|
+
|
|
2316
|
+
for chunk_variants in std_variant_chunks:
|
|
2317
|
+
sequences = []
|
|
2318
|
+
|
|
2319
|
+
# Vectorized calculation of window positions for ALL variants in chunk
|
|
2320
|
+
positions = chunk_variants["pos1"].values - 1 # Convert to 0-based
|
|
2321
|
+
half_len = seq_len // 2
|
|
2322
|
+
window_starts = positions - half_len
|
|
2323
|
+
window_ends = window_starts + seq_len
|
|
2324
|
+
|
|
2325
|
+
# Cache for reference chromosome access
|
|
2326
|
+
current_chrom = None
|
|
2327
|
+
ref_seq = None
|
|
2328
|
+
chrom_length = None
|
|
2329
|
+
|
|
2330
|
+
# Track valid indices for metadata filtering
|
|
2331
|
+
valid_indices = []
|
|
2332
|
+
|
|
2333
|
+
# Process each variant individually (applying only that single variant)
|
|
2334
|
+
for idx, (_, var) in enumerate(chunk_variants.iterrows()):
|
|
2335
|
+
chrom = var["chrom"]
|
|
2336
|
+
pos = var["pos1"]
|
|
2337
|
+
|
|
2338
|
+
# Load reference chromosome (with caching)
|
|
2339
|
+
if chrom != current_chrom:
|
|
2340
|
+
if chrom not in reference:
|
|
2341
|
+
warnings.warn(
|
|
2342
|
+
f"Chromosome {chrom} not found in reference. Skipping variant at {chrom}:{pos}."
|
|
2343
|
+
)
|
|
2344
|
+
# Skip this variant - don't add to sequences
|
|
2345
|
+
continue
|
|
2346
|
+
ref_seq = str(reference[chrom])
|
|
2347
|
+
chrom_length = len(ref_seq)
|
|
2348
|
+
current_chrom = chrom
|
|
2349
|
+
|
|
2350
|
+
# Track that this variant was successfully processed
|
|
2351
|
+
valid_indices.append(idx)
|
|
2352
|
+
|
|
2353
|
+
# Use pre-calculated window positions
|
|
2354
|
+
window_start = window_starts[idx]
|
|
2355
|
+
window_end = window_ends[idx]
|
|
2356
|
+
|
|
2357
|
+
# Calculate padding and actual bounds
|
|
2358
|
+
if window_start < 0:
|
|
2359
|
+
left_pad = -window_start
|
|
2360
|
+
actual_start = 0
|
|
2361
|
+
else:
|
|
2362
|
+
left_pad = 0
|
|
2363
|
+
actual_start = window_start
|
|
2364
|
+
|
|
2365
|
+
if window_end > chrom_length:
|
|
2366
|
+
right_pad = window_end - chrom_length
|
|
2367
|
+
actual_end = chrom_length
|
|
2368
|
+
else:
|
|
2369
|
+
right_pad = 0
|
|
2370
|
+
actual_end = window_end
|
|
2371
|
+
|
|
2372
|
+
# Apply ONLY this single variant to reference sequence
|
|
2373
|
+
single_var_df = pd.DataFrame([var])
|
|
2374
|
+
applicator = VariantApplicator(ref_seq, single_var_df)
|
|
2375
|
+
modified_seq, _ = applicator.apply_variants()
|
|
2376
|
+
|
|
2377
|
+
# Extract window from the single-variant modified sequence
|
|
2378
|
+
window_seq = modified_seq[actual_start:actual_end]
|
|
2379
|
+
|
|
2380
|
+
# Add padding if needed
|
|
2381
|
+
if left_pad > 0:
|
|
2382
|
+
window_seq = "N" * left_pad + window_seq
|
|
2383
|
+
if right_pad > 0:
|
|
2384
|
+
window_seq = window_seq + "N" * right_pad
|
|
2385
|
+
|
|
2386
|
+
# Truncate or pad as needed
|
|
2387
|
+
if len(window_seq) < seq_len:
|
|
2388
|
+
window_seq += "N" * (seq_len - len(window_seq))
|
|
2389
|
+
else:
|
|
2390
|
+
window_seq = window_seq[:seq_len]
|
|
2391
|
+
|
|
2392
|
+
# Ensure correct length
|
|
2393
|
+
if len(window_seq) != seq_len:
|
|
2394
|
+
warnings.warn(
|
|
2395
|
+
f"Sequence length mismatch for variant at {chrom}:{pos}. "
|
|
2396
|
+
f"Expected {seq_len}, got {len(window_seq)}"
|
|
2397
|
+
)
|
|
2398
|
+
|
|
2399
|
+
if encode:
|
|
2400
|
+
sequences.append(encode_seq(window_seq, encoder))
|
|
2401
|
+
else:
|
|
2402
|
+
genomic_pos = positions[idx]
|
|
2403
|
+
sequences.append(
|
|
2404
|
+
(
|
|
2405
|
+
chrom,
|
|
2406
|
+
max(0, genomic_pos - half_len),
|
|
2407
|
+
max(0, genomic_pos - half_len) + seq_len,
|
|
2408
|
+
window_seq,
|
|
2409
|
+
)
|
|
2410
|
+
)
|
|
2411
|
+
|
|
2412
|
+
# Generate metadata only for successfully processed variants
|
|
2413
|
+
if valid_indices:
|
|
2414
|
+
filtered_chunk = chunk_variants.iloc[valid_indices].reset_index(
|
|
2415
|
+
drop=True
|
|
2416
|
+
)
|
|
2417
|
+
metadata_df = _generate_sequence_metadata(filtered_chunk, seq_len)
|
|
2418
|
+
else:
|
|
2419
|
+
# No valid variants in this chunk, create empty metadata
|
|
2420
|
+
metadata_df = pd.DataFrame()
|
|
2421
|
+
|
|
2422
|
+
# Yield each chunk immediately
|
|
2423
|
+
if encode and sequences:
|
|
2424
|
+
if TORCH_AVAILABLE:
|
|
2425
|
+
sequences_result = torch.stack(sequences)
|
|
2426
|
+
else:
|
|
2427
|
+
sequences_result = np.stack(sequences)
|
|
2428
|
+
else:
|
|
2429
|
+
sequences_result = sequences
|
|
2430
|
+
|
|
2431
|
+
yield (sequences_result, metadata_df)
|
|
2432
|
+
|
|
2433
|
+
# Process BND variants
|
|
2434
|
+
bnd_alt_sequences, bnd_left_refs, bnd_right_refs, bnd_metadata = (
|
|
2435
|
+
_generate_bnd_sequences(breakend_pairs, reference, seq_len, encode, encoder)
|
|
2436
|
+
)
|
|
2437
|
+
|
|
2438
|
+
# Process BND variants after standard variants (if any)
|
|
2439
|
+
# BND variants are yielded as a single batch for now
|
|
2440
|
+
if len(bnd_alt_sequences) > 0:
|
|
2441
|
+
bnd_metadata_df = pd.DataFrame(bnd_metadata) if bnd_metadata else pd.DataFrame()
|
|
2442
|
+
|
|
2443
|
+
# BND sequences are already stacked by _generate_bnd_sequences
|
|
2444
|
+
bnd_sequences_result = bnd_alt_sequences
|
|
2445
|
+
|
|
2446
|
+
yield (bnd_sequences_result, bnd_metadata_df)
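# Consumer sketch for the generator above (illustrative; file names are
# hypothetical placeholders):
def _demo_consume_alt_sequences():  # documentation sketch
    for seqs, meta in get_alt_sequences(
        "reference.fa", "variants.vcf", seq_len=1024, n_chunks=4
    ):
        # Standard chunks arrive first as stacked (chunk, 1024, 4) batches;
        # a final BND batch (if any) holds fusion windows, with mate and
        # orientation columns in its metadata.
        print(type(seqs).__name__, len(meta))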
|
|
2447
|
+
|
|
2448
|
+
|
|
2449
|
+
def get_ref_sequences(
|
|
2450
|
+
reference_fn,
|
|
2451
|
+
variants_fn,
|
|
2452
|
+
seq_len,
|
|
2453
|
+
encode=True,
|
|
2454
|
+
n_chunks=1,
|
|
2455
|
+
encoder=None,
|
|
2456
|
+
auto_map_chromosomes=False,
|
|
2457
|
+
):
|
|
2458
|
+
"""
|
|
2459
|
+
Create reference sequence windows centered on each variant position (no variants applied).
|
|
2460
|
+
|
|
2461
|
+
Args:
|
|
2462
|
+
reference_fn: Path to reference genome file or dictionary-like object
|
|
2463
|
+
variants_fn: Path to VCF file (string) or DataFrame with variant data.
|
|
2464
|
+
For DataFrames, the position column may be named 'pos' or 'pos1'; otherwise the second column is assumed to hold positions.
|
|
2465
|
+
seq_len: Length of the sequence window
|
|
2466
|
+
encode: Return sequences as one-hot encoded tensors/arrays (default: True)
|
|
2467
|
+
n_chunks: Number of chunks to split variants into (default: 1)
|
|
2468
|
+
encoder: Optional custom encoding function
|
|
2469
|
+
auto_map_chromosomes: Automatically map chromosome names between VCF and reference
|
|
2470
|
+
when they don't match exactly (default: False).
|
|
2471
|
+
|
|
2472
|
+
Yields:
|
|
2473
|
+
Tuple containing (sequences, metadata_df) where:
|
|
2474
|
+
If encode=True: sequences is a tensor/array of shape (chunk_size, seq_len, 4) for each chunk
|
|
2475
|
+
If encode=False: sequences is a list of tuples containing (chrom, start, end, sequence_string) for each chunk
|
|
2476
|
+
metadata_df is a DataFrame with variant information including position offsets
|
|
2477
|
+
For BND variants: sequences contain dual reference sequences (left + right)
|
|
2478
|
+
|
|
2479
|
+
Raises:
|
|
2480
|
+
ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't match
|
|
2481
|
+
"""
|
|
2482
|
+
# Load reference and variants, separating BNDs from standard variants
|
|
2483
|
+
from .variant_utils import load_breakend_variants
|
|
2484
|
+
|
|
2485
|
+
reference = _load_reference(reference_fn)
|
|
2486
|
+
|
|
2487
|
+
# Load variants and separate BNDs
|
|
2488
|
+
standard_variants, breakend_pairs = load_breakend_variants(variants_fn)
|
|
2489
|
+
|
|
2490
|
+
# Combine chromosome names from both standard variants and breakend pairs
|
|
2491
|
+
ref_chroms = set(reference.keys())
|
|
2492
|
+
standard_chroms = (
|
|
2493
|
+
set(standard_variants["chrom"].unique())
|
|
2494
|
+
if len(standard_variants) > 0
|
|
2495
|
+
else set()
|
|
2496
|
+
)
|
|
2497
|
+
breakend_chroms = set()
|
|
2498
|
+
for bnd1, bnd2 in breakend_pairs:
|
|
2499
|
+
breakend_chroms.add(bnd1.chrom)
|
|
2500
|
+
breakend_chroms.add(bnd2.chrom)
|
|
2501
|
+
vcf_chroms = standard_chroms | breakend_chroms
|
|
2502
|
+
|
|
2503
|
+
# Use chromosome matching to handle name mismatches
|
|
2504
|
+
mapping, unmatched = match_chromosomes_with_report(
|
|
2505
|
+
ref_chroms, vcf_chroms, verbose=True, auto_map_chromosomes=auto_map_chromosomes
|
|
2506
|
+
)
|
|
2507
|
+
|
|
2508
|
+
# Apply chromosome name mapping to standard variants
|
|
2509
|
+
if mapping and len(standard_variants) > 0:
|
|
2510
|
+
standard_variants = apply_chromosome_mapping(standard_variants, mapping)
|
|
2511
|
+
|
|
2512
|
+
# Apply chromosome name mapping to breakend pairs
|
|
2513
|
+
if mapping and breakend_pairs:
|
|
2514
|
+
updated_pairs = []
|
|
2515
|
+
for bnd1, bnd2 in breakend_pairs:
|
|
2516
|
+
# Update chromosome names in breakend objects if needed
|
|
2517
|
+
if bnd1.chrom in mapping:
|
|
2518
|
+
bnd1.chrom = mapping[bnd1.chrom]
|
|
2519
|
+
if bnd2.chrom in mapping:
|
|
2520
|
+
bnd2.chrom = mapping[bnd2.chrom]
|
|
2521
|
+
if bnd1.mate_chrom in mapping:
|
|
2522
|
+
bnd1.mate_chrom = mapping[bnd1.mate_chrom]
|
|
2523
|
+
if bnd2.mate_chrom in mapping:
|
|
2524
|
+
bnd2.mate_chrom = mapping[bnd2.mate_chrom]
|
|
2525
|
+
updated_pairs.append((bnd1, bnd2))
|
|
2526
|
+
breakend_pairs = updated_pairs
|
|
2527
|
+
|
|
2528
|
+
# Process standard variants first - yield each chunk individually
|
|
2529
|
+
if len(standard_variants) > 0:
|
|
2530
|
+
# Split standard variants into chunks
|
|
2531
|
+
std_indices = np.array_split(np.arange(len(standard_variants)), n_chunks)
|
|
2532
|
+
std_variant_chunks = (
|
|
2533
|
+
standard_variants.iloc[chunk_indices].reset_index(drop=True)
|
|
2534
|
+
for chunk_indices in std_indices
|
|
2535
|
+
if len(chunk_indices) > 0
|
|
2536
|
+
)
|
|
2537
|
+
|
|
2538
|
+
for chunk_variants in std_variant_chunks:
|
|
2539
|
+
sequences = []
|
|
2540
|
+
# Generate standardized metadata using the shared helper (rows for variants on chromosomes missing from the reference are kept here even though those variants are skipped below)
|
|
2541
|
+
metadata_df = _generate_sequence_metadata(chunk_variants, seq_len)
|
|
2542
|
+
|
|
2543
|
+
# Group variants by chromosome for efficient processing
|
|
2544
|
+
for chrom, chrom_variants in chunk_variants.groupby("chrom"):
|
|
2545
|
+
if chrom not in reference:
|
|
2546
|
+
warnings.warn(
|
|
2547
|
+
f"Chromosome {chrom} not found in reference. Skipping {len(chrom_variants)} variants."
|
|
2548
|
+
)
|
|
2549
|
+
continue
|
|
2550
|
+
|
|
2551
|
+
ref_seq = str(reference[chrom])
|
|
2552
|
+
chrom_length = len(ref_seq)
|
|
2553
|
+
|
|
2554
|
+
# Vectorized calculation of window positions
|
|
2555
|
+
positions = chrom_variants["pos1"].values - 1 # Convert to 0-based
|
|
2556
|
+
half_len = seq_len // 2
|
|
2557
|
+
window_starts = positions - half_len
|
|
2558
|
+
window_ends = window_starts + seq_len
|
|
2559
|
+
|
|
2560
|
+
# Process all variants in this chromosome using NumPy operations
|
|
2561
|
+
for idx, (_, var) in enumerate(chrom_variants.iterrows()):
|
|
2562
|
+
pos = var["pos1"]
|
|
2563
|
+
genomic_pos = positions[idx]
|
|
2564
|
+
window_start = window_starts[idx]
|
|
2565
|
+
window_end = window_ends[idx]
|
|
2566
|
+
|
|
2567
|
+
# Handle edge cases and extract window
|
|
2568
|
+
if window_start < 0:
|
|
2569
|
+
left_pad = -window_start
|
|
2570
|
+
actual_start = 0
|
|
2571
|
+
else:
|
|
2572
|
+
left_pad = 0
|
|
2573
|
+
actual_start = window_start
|
|
2574
|
+
|
|
2575
|
+
if window_end > chrom_length:
|
|
2576
|
+
right_pad = window_end - chrom_length
|
|
2577
|
+
actual_end = chrom_length
|
|
2578
|
+
else:
|
|
2579
|
+
right_pad = 0
|
|
2580
|
+
actual_end = window_end
|
|
2581
|
+
|
|
2582
|
+
# Extract window from reference chromosome (no variants applied)
|
|
2583
|
+
window_seq = ref_seq[actual_start:actual_end]
|
|
2584
|
+
|
|
2585
|
+
# Add padding if needed
|
|
2586
|
+
if left_pad > 0:
|
|
2587
|
+
window_seq = "N" * left_pad + window_seq
|
|
2588
|
+
if right_pad > 0:
|
|
2589
|
+
window_seq = window_seq + "N" * right_pad
|
|
2590
|
+
|
|
2591
|
+
# Ensure correct length
|
|
2592
|
+
if len(window_seq) != seq_len:
|
|
2593
|
+
warnings.warn(
|
|
2594
|
+
f"Sequence length mismatch for variant at {chrom}:{pos}. "
|
|
2595
|
+
f"Expected {seq_len}, got {len(window_seq)}"
|
|
2596
|
+
)
|
|
2597
|
+
# Truncate or pad as needed
|
|
2598
|
+
if len(window_seq) < seq_len:
|
|
2599
|
+
window_seq += "N" * (seq_len - len(window_seq))
|
|
2600
|
+
else:
|
|
2601
|
+
window_seq = window_seq[:seq_len]
|
|
2602
|
+
|
|
2603
|
+
if encode:
|
|
2604
|
+
sequences.append(encode_seq(window_seq, encoder))
|
|
2605
|
+
else:
|
|
2606
|
+
sequences.append(
|
|
2607
|
+
(
|
|
2608
|
+
chrom,
|
|
2609
|
+
max(0, genomic_pos - half_len),
|
|
2610
|
+
max(0, genomic_pos - half_len) + seq_len,
|
|
2611
|
+
window_seq,
|
|
2612
|
+
)
|
|
2613
|
+
)
|
|
2614
|
+
|
|
2615
|
+
# Yield each chunk immediately
|
|
        if encode and sequences:
            if TORCH_AVAILABLE:
                sequences_result = torch.stack(sequences)
            else:
                sequences_result = np.stack(sequences)
        else:
            sequences_result = sequences

        yield (sequences_result, metadata_df)

    # Process BND variants after standard variants (if any).
    # BND variants are yielded as dual references.
    if breakend_pairs:
        bnd_left_refs, bnd_right_refs, bnd_metadata = _generate_bnd_ref_sequences(
            breakend_pairs, reference, seq_len, encode, encoder
        )

        if len(bnd_left_refs) > 0 or len(bnd_right_refs) > 0:
            bnd_metadata_df = (
                pd.DataFrame(bnd_metadata) if bnd_metadata else pd.DataFrame()
            )

            # For ref sequences, dual references are returned as a tuple. This
            # differs from get_alt_sequences, which returns fusion sequences.
            # BND ref sequences are already stacked by _generate_bnd_ref_sequences.
            bnd_left_result = bnd_left_refs
            bnd_right_result = bnd_right_refs

            # Yield dual references as a tuple (left_refs, right_refs)
            yield ((bnd_left_result, bnd_right_result), bnd_metadata_df)
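
A minimal consumer sketch (editor's illustration, not part of the package), assuming the generator above is `get_ref_sequences` exported at package level and that the reference/VCF paths are placeholders. Standard chunks arrive as a stacked tensor/array (or a list of tuples when `encode=False`), while BND chunks arrive as a `(left_refs, right_refs)` tuple, so an `isinstance` check distinguishes them:

```python
import supremo_lite as sl  # assumes a package-level re-export

for seqs, metadata in sl.get_ref_sequences("hg38.fa", "variants.vcf", seq_len=1000):
    if isinstance(seqs, tuple):
        # BND chunk: one reference window per side of each breakend pair
        left_refs, right_refs = seqs
        print(f"BND chunk: {len(metadata)} breakend records")
    else:
        # Standard chunk: one reference window per variant
        print(f"standard chunk: {len(metadata)} variants")
```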
def get_alt_ref_sequences(
    reference_fn,
    variants_fn,
    seq_len,
    encode=True,
    n_chunks=1,
    encoder=None,
    auto_map_chromosomes=False,
):
    """
    Create both reference and variant sequence windows for alt/ref ratio calculations.

    Maintains backward compatibility while supporting BND variants with dual references.
    This wrapper function calls both get_ref_sequences and get_alt_sequences to return
    matching pairs of reference and variant sequences for computing ratios.

    Args:
        reference_fn: Path to reference genome file or dictionary-like object
        variants_fn: Path to VCF file (string) or DataFrame with variant data.
            For DataFrames, the position column can be 'pos' or 'pos1'; otherwise
            the second column is assumed to hold the position.
        seq_len: Length of the sequence window
        encode: Return sequences as one-hot encoded numpy arrays (default: True)
        n_chunks: Number of chunks to split variants into (default: 1)
        encoder: Optional custom encoder function
        auto_map_chromosomes: Automatically map chromosome names between VCF and
            reference when they don't match exactly (default: False)

    Yields:
        Tuple containing (alt_sequences, ref_sequences, metadata_df):

        For standard variants:
        - alt_sequences: Variant sequences with mutations applied
        - ref_sequences: Reference sequences without mutations
        - metadata_df: Variant metadata (pandas DataFrame)

        For BND variants:
        - alt_sequences: Fusion sequences from breakend pairs
        - ref_sequences: Tuple of (left_ref_sequences, right_ref_sequences)
        - metadata_df: BND metadata with orientation and mate information

    Metadata DataFrame columns:
        Standard fields (all variants):
        - chrom: Chromosome name (str)
        - window_start: Window start position, 0-based (int)
        - window_end: Window end position, 0-based exclusive (int)
        - variant_pos0: Variant position, 0-based (int)
        - variant_pos1: Variant position, 1-based VCF standard (int)
        - ref: Reference allele (str)
        - alt: Alternate allele (str)
        - variant_type: Variant classification (str)
          Examples: 'SNV', 'INS', 'DEL', 'MNV', 'SV_INV', 'SV_DUP', 'SV_BND'

        Additional field for symbolic alleles (<INV>, <DUP>, etc.):
        - sym_variant_end: END position from INFO field, 1-based (int, optional)

        BND-specific fields:
        - mate_chrom: Mate breakend chromosome (str)
        - mate_pos: Mate breakend position, 1-based (int)
        - orientation_1: First breakend orientation (str)
        - orientation_2: Second breakend orientation (str)
        - fusion_name: Fusion sequence identifier (str, optional)

    Raises:
        ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't match
    """
    # Get generators for both reference and variant sequences.
    # These already handle variant loading, chromosome matching, and chunking consistently.
    ref_gen = get_ref_sequences(
        reference_fn,
        variants_fn,
        seq_len,
        encode,
        n_chunks,
        encoder,
        auto_map_chromosomes,
    )
    alt_gen = get_alt_sequences(
        reference_fn,
        variants_fn,
        seq_len,
        encode,
        n_chunks,
        encoder,
        auto_map_chromosomes,
    )

    # Process chunks from both generators. Both yield chunks in the same order:
    # 1. Standard variant chunks first (if any) - maintains existing behavior
    # 2. BND variant chunks last (if any) - new dual reference structure
    for (ref_chunk, ref_metadata), (alt_chunk, _alt_metadata) in zip(ref_gen, alt_gen):
        # For standard variants: preserve existing behavior exactly.
        # For BND variants: ref_chunk will be a (left_refs, right_refs) tuple,
        # so the caller can detect BND chunks by checking if ref_chunk is a tuple.
        yield (alt_chunk, ref_chunk, ref_metadata)
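
A short usage sketch (editor's illustration, not part of the package): computing per-variant alt/ref prediction ratios. `model` is a hypothetical stand-in scorer and the file paths are placeholders; BND chunks are skipped here because their dual references need side-by-side handling:

```python
import numpy as np
import supremo_lite as sl  # assumes a package-level re-export

def model(batch):
    # Stand-in scorer: sums one-hot entries per sequence (demonstration only)
    return np.asarray([np.asarray(seq).sum() for seq in batch])

for alt_seqs, ref_seqs, metadata in sl.get_alt_ref_sequences(
    "hg38.fa", "variants.vcf", seq_len=1000
):
    if isinstance(ref_seqs, tuple):
        continue  # BND chunk: (left_refs, right_refs) handled elsewhere
    ratios = model(alt_seqs) / model(ref_seqs)
    metadata = metadata.assign(alt_ref_ratio=ratios)
```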
def get_pam_disrupting_alt_sequences(
    reference_fn,
    variants_fn,
    seq_len,
    max_pam_distance,
    pam_sequence="NGG",
    encode=True,
    n_chunks=1,
    encoder=None,
    auto_map_chromosomes=False,
):
    """
    Generate sequences for variants that disrupt PAM sites.

    This function identifies variants that disrupt existing PAM sites in the reference
    genome and generates sequence pairs for each disrupting variant. Works like
    get_alt_ref_sequences() but filtered to only PAM-disrupting variants.

    Args:
        reference_fn: Path to reference genome file or dictionary-like object
        variants_fn: Path to variants file or DataFrame
        seq_len: Length of sequence windows
        max_pam_distance: Maximum distance from variant to PAM site
        pam_sequence: PAM sequence pattern (default: 'NGG' for SpCas9).
            Supports all IUPAC degenerate nucleotide codes:
            N (any), R (A/G), Y (C/T), W (A/T), S (C/G), M (A/C),
            K (G/T), B (C/G/T), D (A/G/T), H (A/C/T), V (A/C/G)
        encode: Return sequences as one-hot encoded numpy arrays (default: True)
        n_chunks: Number of chunks to split variants for processing (default: 1)
        encoder: Optional custom encoding function
        auto_map_chromosomes: Automatically map chromosome names between VCF and
            reference when they don't match exactly (default: False)

    Yields:
        Tuple containing (alt_sequences, ref_sequences, metadata_df):
        - alt_sequences: Variant sequences with mutations applied
        - ref_sequences: Reference sequences without mutations
        - metadata_df: Variant metadata (pandas DataFrame) with PAM-specific columns

    Metadata DataFrame columns:
        Standard fields:
        - chrom: Chromosome name (str)
        - window_start: Window start position, 0-based (int)
        - window_end: Window end position, 0-based exclusive (int)
        - variant_pos0: Variant position, 0-based (int)
        - variant_pos1: Variant position, 1-based VCF standard (int)
        - ref: Reference allele (str)
        - alt: Alternate allele (str)
        - variant_type: Variant classification (str)

        PAM-specific fields:
        - pam_site_pos: 0-based start position of PAM site in window (int)
        - pam_ref_sequence: PAM sequence in reference (str)
        - pam_alt_sequence: PAM sequence after variant (str)
        - pam_distance: Distance from variant to PAM start (int)

    Raises:
        ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't match

    Example:
        >>> # Process all PAM-disrupting variants at once
        >>> gen = get_pam_disrupting_alt_sequences(ref, vcf, seq_len=50,
        ...                                        max_pam_distance=10, n_chunks=1)
        >>> alt_seqs, ref_seqs, metadata = next(gen)
        >>>
        >>> # Or iterate through chunks
        >>> for alt_seqs, ref_seqs, metadata in get_pam_disrupting_alt_sequences(
        ...         ref, vcf, seq_len=50, max_pam_distance=10, n_chunks=5):
        ...     predictions = model.predict(alt_seqs, ref_seqs)
    """
    # Helper function to find PAM sites in a sequence
    def _find_pam_sites(sequence, pam_pattern):
        """Find all PAM site start positions in a sequence using IUPAC codes.

        Supports IUPAC degenerate nucleotide codes in the pattern
        (N, R, Y, W, S, M, K, B, D, H, V); pattern wildcards match the
        corresponding bases in the sequence. A sequence 'N' (used for
        padding or unknown bases) matches any pattern base.
        """
        sites = []
        seq_upper = sequence.upper()
        pat_upper = pam_pattern.upper()

        for i in range(len(seq_upper) - len(pat_upper) + 1):
            match = True
            for j in range(len(pat_upper)):
                seq_base = seq_upper[i + j]
                pat_base = pat_upper[j]

                # Sequence 'N' (padding or unknown) matches any pattern base
                if seq_base == "N":
                    continue

                # Get allowed bases for this pattern position
                allowed_bases = IUPAC_CODES.get(pat_base, pat_base)

                # Strip brackets from a regex character class like "[AG]"
                if allowed_bases.startswith("["):
                    allowed_bases = allowed_bases[1:-1]

                # Check if the sequence base is among the allowed bases
                if seq_base not in allowed_bases:
                    match = False
                    break

            if match:
                sites.append(i)

        return sites
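
For reference, the same matching can be expressed as a regular expression built from the module's IUPAC_CODES table, whose values are already regex character classes. A minimal standalone sketch (editor's illustration, not the package's implementation; unlike the helper above it does not treat a sequence 'N' as matching any pattern base):

```python
import re

# Trimmed copy of the module's IUPAC_CODES regex fragments
IUPAC = {"A": "A", "C": "C", "G": "G", "T": "T",
         "R": "[AG]", "Y": "[CT]", "N": "[ACGT]"}

def find_pam_sites_regex(sequence, pam_pattern="NGG"):
    """Return 0-based start positions of (possibly overlapping) PAM matches."""
    pattern = "".join(IUPAC.get(b, b) for b in pam_pattern.upper())
    # A zero-width lookahead reports overlapping sites, like the scan above
    return [m.start() for m in re.finditer(f"(?={pattern})", sequence.upper())]

print(find_pam_sites_regex("TTAGGCGGTA"))  # [2, 5]: "AGG" at 2, "CGG" at 5
```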
    # Load reference and variants
    reference = _load_reference(reference_fn)
    variants = _load_variants(variants_fn)

    # Get all chromosome names and apply chromosome matching
    ref_chroms = set(reference.keys())
    vcf_chroms = set(variants["chrom"].unique())

    # Use chromosome matching to handle name mismatches
    mapping, unmatched = match_chromosomes_with_report(
        ref_chroms, vcf_chroms, verbose=True, auto_map_chromosomes=auto_map_chromosomes
    )

    # Apply chromosome name mapping to variants
    if mapping:
        variants = apply_chromosome_mapping(variants, mapping)

    # Filter variants to find those that disrupt PAM sites
    pam_disrupting_variants_list = []
    pam_metadata_list = []

    # Process each variant to identify PAM disruption
    for _, var in variants.iterrows():
        chrom = var["chrom"]
        pos = var["pos1"]  # 1-based position

        # Get reference sequence for this chromosome
        if chrom not in reference:
            warnings.warn(
                f"Chromosome {chrom} not found in reference. Skipping variant at {chrom}:{pos}."
            )
            continue

        ref_seq = str(reference[chrom])
        chrom_length = len(ref_seq)

        # Convert to 0-based position
        genomic_pos = pos - 1

        # Calculate window boundaries centered on variant start
        half_len = seq_len // 2
        window_start = genomic_pos - half_len
        window_end = window_start + seq_len

        # Check if variant extends past window boundaries
        ref_allele = var.get("ref", "")
        alt_allele = var.get("alt", "")
        variant_length = max(len(ref_allele), len(alt_allele))
        variant_end = genomic_pos + variant_length

        if variant_end > window_end:
            overflow = variant_end - window_end
            warnings.warn(
                f"Variant at {chrom}:{pos} extends {overflow} bp beyond the "
                f"requested window (length: {seq_len} bp). This may affect "
                f"PAM site detection accuracy.",
                UserWarning,
            )

        # Handle edge cases for reference sequence PAM detection
        if window_start < 0:
            left_pad = -window_start
            ref_window_start = 0
        else:
            left_pad = 0
            ref_window_start = window_start

        if window_end > chrom_length:
            right_pad = window_end - chrom_length
            ref_window_end = chrom_length
        else:
            right_pad = 0
            ref_window_end = window_end

        # Extract window from reference for PAM detection
        ref_window_seq = ref_seq[ref_window_start:ref_window_end]

        # Add padding for PAM detection
        if left_pad > 0:
            ref_window_seq = "N" * left_pad + ref_window_seq
        if right_pad > 0:
            ref_window_seq = ref_window_seq + "N" * right_pad

        # Find PAM sites in the reference sequence window
        ref_pam_sites = _find_pam_sites(ref_window_seq, pam_sequence)

        # Calculate variant position in padded window
        variant_pos_in_window = left_pad + (genomic_pos - ref_window_start)

        # Filter PAM sites that are within max_pam_distance of the variant
        nearby_ref_pam_sites = [
            p for p in ref_pam_sites if abs(p - variant_pos_in_window) <= max_pam_distance
        ]

        # Skip if no nearby PAM sites
        if not nearby_ref_pam_sites:
            continue

        # Create a temporary applicator with just this variant
        single_var_df = pd.DataFrame([var])
        temp_applicator = VariantApplicator(ref_seq, single_var_df)

        # Apply the variant to get the full modified chromosome
        modified_chrom, _stats = temp_applicator.apply_variants()

        # Extract window from modified chromosome
        actual_start = max(window_start, 0)
        actual_end = min(window_end, len(modified_chrom))

        modified_window = modified_chrom[actual_start:actual_end]

        # Add padding
        if left_pad > 0:
            modified_window = "N" * left_pad + modified_window
        if right_pad > 0:
            modified_window = modified_window + "N" * right_pad

        # Ensure correct length
        if len(modified_window) != seq_len:
            if len(modified_window) < seq_len:
                modified_window += "N" * (seq_len - len(modified_window))
            else:
                modified_window = modified_window[:seq_len]

        # Find PAM sites in the modified (alternate) sequence to detect
        # PAMs that survive the variant or form anew
        alt_pam_sites = _find_pam_sites(modified_window, pam_sequence)

        # Filter to nearby PAM sites in the alternate sequence
        nearby_alt_pam_sites = [
            p for p in alt_pam_sites if abs(p - variant_pos_in_window) <= max_pam_distance
        ]

        # Identify which reference PAM sites are truly disrupted.
        # Different logic for SNVs vs INDELs:
        # - SNV: PAM disrupted if the exact PAM sequence changes at that position
        # - INDEL: PAM disrupted ONLY if no PAM exists in ALT (even at a shifted
        #   position); if the INDEL creates or shifts a PAM, it is NOT disrupting.
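        #   Worked example (editor's note), with pam_sequence="NGG":
        #   - SNV: ref window "..TAGGC.." matches NGG via "AGG"; a G>C change at
        #     the first G gives "..TACGC..", which no longer matches at that
        #     offset -> counted as disrupted.
        #   - DEL: deleting one base just upstream shifts "AGG" left by one; a
        #     nearby NGG match still exists in ALT -> counted as maintained.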

        ref_allele = var.get("ref", "")
        alt_allele = var.get("alt", "")
        is_indel = (
            len(ref_allele) != len(alt_allele)
            or ref_allele == "-"
            or alt_allele == "-"
        )

        truly_disrupted_pam_sites = []

        if is_indel:
            # For INDELs: check if the PAM still exists anywhere nearby (allowing for shifts)
            for ref_pam_pos in nearby_ref_pam_sites:
                pam_still_exists = False
                for alt_pam_pos in nearby_alt_pam_sites:
                    # Allow for positional shifts due to the INDEL: if a PAM
                    # exists within one pattern length, consider it maintained
                    if abs(ref_pam_pos - alt_pam_pos) <= len(pam_sequence):
                        pam_still_exists = True
                        break

                if not pam_still_exists:
                    truly_disrupted_pam_sites.append(ref_pam_pos)
        else:
            # For SNVs: check if the PAM sequence at the exact position has changed
            for ref_pam_pos in nearby_ref_pam_sites:
                # Extract the PAM-length slice from both ref and alt at this position
                ref_pam_seq = ref_window_seq[ref_pam_pos : ref_pam_pos + len(pam_sequence)]
                alt_pam_seq = modified_window[ref_pam_pos : ref_pam_pos + len(pam_sequence)]

                # Check whether the ALT slice still matches the PAM pattern.
                # Delegating to _find_pam_sites honors all IUPAC codes, not
                # just the 'N' wildcard.
                alt_matches_pattern = _find_pam_sites(alt_pam_seq, pam_sequence) == [0]

                # If the ALT no longer matches the PAM pattern, it's disrupted
                if not alt_matches_pattern:
                    truly_disrupted_pam_sites.append(ref_pam_pos)

        # Only proceed if there are truly disrupted PAM sites. If all reference
        # PAMs are maintained (possibly at shifted positions for INDELs), skip
        # this variant.
        if not truly_disrupted_pam_sites:
            continue

        # For each disrupted PAM site, create a metadata entry
        for pam_site_pos in truly_disrupted_pam_sites:
            # Extract PAM sequences
            ref_pam_seq = ref_window_seq[pam_site_pos : pam_site_pos + len(pam_sequence)]
            alt_pam_seq = modified_window[pam_site_pos : pam_site_pos + len(pam_sequence)]

            # Calculate distance from variant to PAM
            pam_distance = abs(pam_site_pos - variant_pos_in_window)

            # Store the variant (duplicated if it disrupts multiple PAMs)
            pam_disrupting_variants_list.append(var)

            # Store PAM-specific metadata
            pam_metadata_list.append(
                {
                    "pam_site_pos": pam_site_pos,
                    "pam_ref_sequence": ref_pam_seq,
                    "pam_alt_sequence": alt_pam_seq,
                    "pam_distance": pam_distance,
                }
            )

    # If no PAM-disrupting variants were found, yield nothing
    if not pam_disrupting_variants_list:
        return

    # Create a DataFrame with the filtered PAM-disrupting variants
    filtered_variants_df = pd.DataFrame(pam_disrupting_variants_list).reset_index(drop=True)
    pam_metadata_df = pd.DataFrame(pam_metadata_list)

    # Call get_alt_ref_sequences with the filtered variants
    row_offset = 0
    for alt_seqs, ref_seqs, base_metadata in get_alt_ref_sequences(
        reference_fn,
        filtered_variants_df,
        seq_len,
        encode,
        n_chunks,
        encoder,
        auto_map_chromosomes,
    ):
        # Merge PAM-specific metadata with the base metadata. There is one
        # entry per disrupted PAM site, so slice the PAM metadata to this
        # chunk's rows (aligning by row count keeps n_chunks > 1 correct).
        n_rows = len(base_metadata)
        chunk_pam_metadata = pam_metadata_df.iloc[
            row_offset : row_offset + n_rows
        ].reset_index(drop=True)
        row_offset += n_rows

        enriched_metadata = pd.concat(
            [base_metadata.reset_index(drop=True), chunk_pam_metadata], axis=1
        )

        # Yield the chunk with enriched metadata
        yield (alt_seqs, ref_seqs, enriched_metadata)
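
A tiny end-to-end run (editor's illustration, not part of the package, and hypothetical where it leans on internals): the docstring above says reference_fn may be a dictionary-like object and variants_fn a DataFrame, so an in-memory SNV that breaks the only NGG PAM can be checked directly:

```python
import pandas as pd
from supremo_lite import get_pam_disrupting_alt_sequences  # assumes re-export

# "AGG" (an NGG match) sits at 0-based position 8 of this toy chromosome
reference = {"chr1": "ACGTACGTAGGTACGTACGT"}
# G>C at 1-based position 10 hits the first G of the PAM
variants = pd.DataFrame([{"chrom": "chr1", "pos1": 10, "ref": "G", "alt": "C"}])

for alt_seqs, ref_seqs, meta in get_pam_disrupting_alt_sequences(
    reference, variants, seq_len=12, max_pam_distance=5, encode=False
):
    print(meta[["pam_ref_sequence", "pam_alt_sequence", "pam_distance"]])
```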