speconsense 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,606 @@
1
+ """MSA-based variant merging for speconsense-summarize.
2
+
3
+ Provides functions for finding and merging compatible variants within HAC groups
4
+ using exhaustive subset evaluation with SPOA multiple sequence alignment.
5
+ """
6
+
7
import itertools
import logging
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

from speconsense.types import ConsensusInfo, OverlapMergeInfo

from .analysis import (
    MAX_MSA_MERGE_VARIANTS,  # Kept for backward compatibility
    analyze_msa_columns,
    analyze_msa_columns_overlap_aware,
    compute_merge_batch_size,
    run_spoa_msa,
)
from .iupac import merge_bases_to_iupac, primers_are_same
22
+
23
+
24
def generate_all_subsets_by_size(variants: List[ConsensusInfo]) -> List[Tuple[int, ...]]:
    """
    Enumerate every non-empty subset of variant indices, largest total first.

    The exhaustive enumeration guarantees that the caller can find the
    globally optimal merge when the number of variants is small
    (<= MAX_MSA_MERGE_VARIANTS).

    Args:
        variants: List of variants to generate subsets from

    Returns:
        List of index tuples, sorted by combined cluster size descending
    """
    member_sizes = [v.size for v in variants]

    # Score every non-empty index combination, sweeping from the full set
    # down to singletons.
    scored_subsets = [
        (sum(member_sizes[i] for i in combo), combo)
        for width in range(len(variants), 0, -1)
        for combo in itertools.combinations(range(len(variants)), width)
    ]

    # Largest combined cluster size first; the stable sort keeps generation
    # order among equal totals, matching the wider-subset-first sweep above.
    scored_subsets = sorted(scored_subsets, key=lambda pair: pair[0], reverse=True)

    return [combo for _, combo in scored_subsets]
55
+
56
+
57
def is_compatible_subset(variant_stats: dict, args, prior_positions: Optional[Dict[str, int]] = None) -> bool:
    """
    Check if variant statistics are within merge limits.

    By default, homopolymer indels are ignored (treated as compatible) to match
    adjusted-identity homopolymer normalization semantics where AAA ~ AAAA.
    Only structural indels count against the limits.

    When --disable-homopolymer-equivalence is set, homopolymer indels are treated
    the same as structural indels and count against merge limits.

    Args:
        variant_stats: Statistics from MSA analysis (snp_count, indel counts, etc.)
        args: Command-line arguments with merge parameters (merge_snp,
            disable_homopolymer_equivalence, merge_indel_length,
            merge_position_count)
        prior_positions: Optional dict with cumulative counts from prior merge
            rounds {'snp_count': N, 'indel_count': M} - these are added to
            current stats when checking limits for iterative merging

    Returns:
        True if the subset fits within all configured merge limits.
    """
    if prior_positions is None:
        prior_positions = {'snp_count': 0, 'indel_count': 0}

    # SNP differences are only mergeable when explicitly enabled.
    if variant_stats['snp_count'] > 0 and not args.merge_snp:
        return False

    # Determine which indels to count based on homopolymer equivalence setting
    if args.disable_homopolymer_equivalence:
        # Count both structural and homopolymer indels; the length limit is
        # checked against the longer of the two categories.
        indel_count = variant_stats['structural_indel_count'] + variant_stats['homopolymer_indel_count']
        indel_length = max(variant_stats['structural_indel_length'],
                           variant_stats['homopolymer_indel_length'])
    else:
        # Only count structural indels (homopolymer indels ignored)
        indel_count = variant_stats['structural_indel_count']
        indel_length = variant_stats['structural_indel_length']

    # Check indel limits (a limit of 0 means indel merging is disabled)
    if indel_count > 0:
        if args.merge_indel_length == 0:
            return False
        if indel_length > args.merge_indel_length:
            return False

    # Check total position count (including prior merge rounds).
    # .get() keeps this robust to partially-populated prior_positions dicts.
    total_positions = (variant_stats['snp_count'] + prior_positions.get('snp_count', 0) +
                       indel_count + prior_positions.get('indel_count', 0))
    if total_positions > args.merge_position_count:
        return False

    return True
107
+
108
+
109
def create_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
    """
    Build a size-weighted majority consensus from an MSA of same-primer variants.

    Each alignment column is decided by cluster-size-weighted voting:
    - If bases outvote gaps, the column is kept; disagreeing bases collapse
      into an IUPAC ambiguity code and count as a SNP position.
    - Otherwise the column is dropped.

    All gaps (including terminal ones) participate in voting, because the
    variants share the same primers and should span the same region.

    Args:
        aligned_seqs: MSA records with gaps as '-', one per variant
        variants: ConsensusInfo objects in the same order as aligned_seqs
            (used for size weighting and merged metadata)

    Returns:
        ConsensusInfo with the merged consensus sequence
    """
    merged_bases = []
    ambiguous_positions = 0
    num_columns = len(aligned_seqs[0].seq)

    for col in range(num_columns):
        # Size-weighted tally of each symbol in this column (exact symbol
        # match only — existing IUPAC codes are not expanded here).
        tally = defaultdict(int)
        for row, record in enumerate(aligned_seqs):
            tally[str(record.seq[col]).upper()] += variants[row].size

        gap_weight = tally.get('-', 0)
        base_weight = {sym: w for sym, w in tally.items() if sym != '-'}

        if sum(base_weight.values()) <= gap_weight:
            # Gap majority (ties included): omit this column.
            continue

        if len(base_weight) == 1:
            merged_bases.append(next(iter(base_weight)))
        else:
            # Disagreement: represent all observed bases as one IUPAC code.
            merged_bases.append(merge_bases_to_iupac(set(base_weight)))
            ambiguous_positions += 1

    combined_size = sum(v.size for v in variants)
    combined_ric = sum(v.ric for v in variants)

    # Flatten RiC history so merges-of-merges still trace to original values.
    ric_history = []
    for v in variants:
        ric_history.extend(v.raw_ric if v.raw_ric else [v.ric])

    # Flatten length history the same way.
    len_history = []
    for v in variants:
        len_history.extend(v.raw_len if v.raw_len else [len(v.sequence)])

    is_multi = len(variants) > 1
    ric_history = sorted(ric_history, reverse=True) if is_multi else None
    len_history = sorted(len_history, reverse=True) if is_multi else None

    # The merged record inherits naming and identity metadata from the
    # largest contributing variant.
    anchor = max(variants, key=lambda v: v.size)

    return ConsensusInfo(
        sample_name=anchor.sample_name,
        cluster_id=anchor.cluster_id,
        sequence=''.join(merged_bases),
        ric=combined_ric,
        size=combined_size,
        file_path=anchor.file_path,
        snp_count=ambiguous_positions if ambiguous_positions > 0 else None,
        primers=anchor.primers,
        raw_ric=ric_history,
        raw_len=len_history,
        rid=anchor.rid,  # Preserve identity metrics from largest variant
        rid_min=anchor.rid_min,
    )
203
+
204
+
205
def create_overlap_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
    """
    Generate consensus from MSA where sequences may have different lengths.

    For overlap merging (primer pools with different endpoints):
    - In overlap region: Use size-weighted majority voting
    - In non-overlap regions: Keep content from whichever sequence(s) have it

    This produces a consensus spanning the union of all input sequences.

    Args:
        aligned_seqs: MSA sequences with gaps as '-' (same order as variants)
        variants: Original ConsensusInfo objects (for size weighting)

    Returns:
        ConsensusInfo with merged consensus sequence spanning full length
    """
    consensus_seq = []
    snp_count = 0
    alignment_length = len(aligned_seqs[0].seq)

    # Find content region for each sequence: the (first, last) alignment
    # columns holding a real base, i.e. excluding terminal gaps.
    content_regions = []
    for seq in aligned_seqs:
        seq_str = str(seq.seq)
        # NOTE(review): an all-gap row would yield first_base=0, last_base=len-1
        # here — assumed not to occur in valid SPOA output.
        first_base = next((i for i, c in enumerate(seq_str) if c != '-'), 0)
        last_base = alignment_length - 1 - next(
            (i for i, c in enumerate(reversed(seq_str)) if c != '-'), 0
        )
        content_regions.append((first_base, last_base))

    # Calculate overlap region: columns covered by ALL sequences
    # (latest start through earliest end).
    overlap_start = max(start for start, _ in content_regions)
    overlap_end = min(end for _, end in content_regions)

    # Process each column
    for col_idx in range(alignment_length):
        column = [str(seq.seq[col_idx]) for seq in aligned_seqs]

        # Determine which sequences have content at this position
        # (within their own content region, even if the symbol is a gap)
        seqs_with_content = []
        for i, (start, end) in enumerate(content_regions):
            if start <= col_idx <= end:
                seqs_with_content.append(i)

        if not seqs_with_content:
            # No sequence has content here (shouldn't happen in valid MSA)
            continue

        # Check if we're in the overlap region
        in_overlap = overlap_start <= col_idx <= overlap_end

        if in_overlap:
            # Overlap region: use size-weighted majority voting (same rules
            # as same-length consensus: base majority keeps the column)
            votes_with_size = [(column[i], variants[i].size) for i in seqs_with_content]

            votes = defaultdict(int)
            for base, size in votes_with_size:
                votes[base.upper()] += size

            gap_votes = votes.get('-', 0)
            base_votes = {b: v for b, v in votes.items() if b != '-'}
            total_base_votes = sum(base_votes.values())

            # Strict > means a gap/base tie drops the column.
            if total_base_votes > gap_votes:
                if len(base_votes) == 1:
                    consensus_seq.append(list(base_votes.keys())[0])
                else:
                    # Disagreement: collapse to an IUPAC ambiguity code.
                    represented_bases = set(base_votes.keys())
                    iupac_code = merge_bases_to_iupac(represented_bases)
                    consensus_seq.append(iupac_code)
                    snp_count += 1
            # else: majority wants gap in overlap, omit position
        else:
            # Non-overlap region: keep content from available sequences
            # (don't let gap votes from sequences that don't extend here remove content)
            bases_only = [column[i] for i in seqs_with_content if column[i] != '-']

            if bases_only:
                # Weight by size for consistency
                votes = defaultdict(int)
                for i in seqs_with_content:
                    if column[i] != '-':
                        votes[column[i].upper()] += variants[i].size

                if len(votes) == 1:
                    consensus_seq.append(list(votes.keys())[0])
                else:
                    represented_bases = set(votes.keys())
                    iupac_code = merge_bases_to_iupac(represented_bases)
                    consensus_seq.append(iupac_code)
                    snp_count += 1

    # Create merged ConsensusInfo
    consensus_sequence = ''.join(consensus_seq)
    total_size = sum(v.size for v in variants)
    total_ric = sum(v.ric for v in variants)

    # Collect RiC values, preserving any prior merge history
    raw_ric_values = []
    for v in variants:
        if v.raw_ric:
            raw_ric_values.extend(v.raw_ric)  # Flatten prior merge history
        else:
            raw_ric_values.append(v.ric)
    raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None

    # Collect lengths, preserving any prior merge history
    raw_len_values = []
    for v in variants:
        if v.raw_len:
            raw_len_values.extend(v.raw_len)  # Flatten prior merge history
        else:
            raw_len_values.append(len(v.sequence))
    raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None

    # Use name from largest variant
    largest_variant = max(variants, key=lambda v: v.size)

    return ConsensusInfo(
        sample_name=largest_variant.sample_name,
        cluster_id=largest_variant.cluster_id,
        sequence=consensus_sequence,
        ric=total_ric,
        size=total_size,
        file_path=largest_variant.file_path,
        snp_count=snp_count if snp_count > 0 else None,
        primers=largest_variant.primers,
        raw_ric=raw_ric_values,
        raw_len=raw_len_values,
        rid=largest_variant.rid,  # Preserve identity metrics from largest variant
        rid_min=largest_variant.rid_min,
    )
338
+
339
+
340
def merge_group_with_msa(variants: List[ConsensusInfo], args) -> Tuple[List[ConsensusInfo], Dict, int, List[OverlapMergeInfo]]:
    """
    Find largest mergeable subset of variants using MSA-based evaluation with exhaustive search.

    Algorithm:
    1. Process variants in batches of up to MAX_MSA_MERGE_VARIANTS
    2. For each batch, run SPOA MSA once
    3. Exhaustively evaluate ALL subsets by total size (descending)
    4. Merge the best compatible subset found
    5. Remove merged variants and repeat with remaining
    6. When overlap mode is enabled, iterate the entire process on merged results
       until no more merges happen (handles prefix+suffix+full scenarios)

    This approach guarantees optimal results when N <= MAX_MSA_MERGE_VARIANTS.
    For N > MAX, processes top MAX per round (potentially suboptimal globally).

    Iterative merging (overlap mode only):
    - After first pass, merged results are fed back for another round
    - Cumulative SNP/indel counts are tracked across rounds
    - Continues until no merges occur in a round

    Args:
        variants: List of ConsensusInfo from HAC group
        args: Command-line arguments with merge parameters

    Returns:
        (merged_variants, merge_traceability, potentially_suboptimal, overlap_merges) where:
        - merged_variants is list of merged ConsensusInfo objects
        - traceability maps merged names to original cluster names
        - potentially_suboptimal is 1 if group had >MAX variants, 0 otherwise
        - overlap_merges is list of OverlapMergeInfo for quality reporting
    """
    # Single variant: nothing to merge.
    if len(variants) == 1:
        return variants, {}, 0, []

    # Compute batch size based on effort and group size
    effort = getattr(args, 'merge_effort_value', 10)  # Default to balanced
    batch_size = compute_merge_batch_size(len(variants), effort)

    # Track if this group is potentially suboptimal (too many variants for global optimum)
    potentially_suboptimal = 1 if len(variants) > batch_size else 0

    all_traceability = {}
    overlap_merges = []  # Track overlap merge events for quality reporting

    # For iterative merging in overlap mode, we may need multiple rounds
    current_variants = variants
    iteration = 0
    max_iterations = 10  # Safety limit to prevent infinite loops

    while iteration < max_iterations:
        iteration += 1

        # Sort variants by size (largest first)
        remaining_variants = sorted(current_variants, key=lambda v: v.size, reverse=True)
        merged_results = []
        merges_this_iteration = 0

        # Inner loop: drain the pool, one merge (or one pass-through) per pass.
        while remaining_variants:
            # Take up to batch_size candidates (dynamically computed based on effort and group size)
            candidates = remaining_variants[:batch_size]

            # Apply size ratio filter if enabled (relative to largest in batch).
            # The largest candidate always survives (ratio 1.0), so the batch
            # can never become empty here.
            if args.merge_min_size_ratio > 0:
                largest_size = candidates[0].size
                filtered_candidates = [v for v in candidates
                                       if (v.size / largest_size) >= args.merge_min_size_ratio]
                if len(filtered_candidates) < len(candidates):
                    filtered_count = len(candidates) - len(filtered_candidates)
                    logging.debug(f"Filtered out {filtered_count} variants with size ratio < {args.merge_min_size_ratio} relative to largest (size={largest_size})")
                candidates = filtered_candidates

            # Single candidate - just pass through
            if len(candidates) == 1:
                merged_results.append(candidates[0])
                remaining_variants.remove(candidates[0])
                continue

            if iteration > 1:
                logging.debug(f"Iteration {iteration}: Evaluating {len(candidates)} variants "
                              f"(batch_size={batch_size}) for merging")
            else:
                logging.debug(f"Evaluating {len(candidates)} variants (batch_size={batch_size}, "
                              f"effort={effort}) for merging (exhaustive subset search)")

            # Determine if overlap mode should be used for this merge batch
            # Same primers -> use global mode (chimeras have same primers but different lengths)
            # Different primers -> use overlap mode (legitimate primer pool variation)
            all_same_primers = all(
                primers_are_same(candidates[0].primers, v.primers)
                for v in candidates[1:]
            ) if len(candidates) > 1 else True
            use_overlap_mode = args.min_merge_overlap > 0 and not all_same_primers

            if args.min_merge_overlap > 0 and all_same_primers and len(candidates) > 1:
                # Log when primer constraint prevents overlap merging
                primer_str = ','.join(candidates[0].primers) if candidates[0].primers else 'unknown'
                logging.debug(f"Same primers [{primer_str}] detected - using global alignment instead of overlap")

            # Run SPOA MSA on candidates
            # Use local alignment mode (0) for overlap merging to get clean terminal gaps
            # Use global alignment mode (1) for standard same-length merging
            sequences = [v.sequence for v in candidates]
            spoa_mode = 0 if use_overlap_mode else 1
            aligned_seqs = run_spoa_msa(sequences, alignment_mode=spoa_mode)

            logging.debug(f"Generated MSA with length {len(aligned_seqs[0].seq)}")

            # Generate ALL subsets sorted by total size (exhaustive search)
            all_subsets = generate_all_subsets_by_size(candidates)

            logging.debug(f"Evaluating {len(all_subsets)} candidate subsets")

            # Find first (largest) compatible subset. Singleton subsets are
            # included, so a variant can "merge with itself" and pass through.
            merged_this_round = False
            for subset_indices in all_subsets:
                subset_variants = [candidates[i] for i in subset_indices]
                subset_aligned = [aligned_seqs[i] for i in subset_indices]

                # Analyze MSA for this subset
                if use_overlap_mode:
                    # Use overlap-aware analysis for primer pool scenarios
                    original_lengths = [len(v.sequence) for v in subset_variants]
                    variant_stats = analyze_msa_columns_overlap_aware(
                        subset_aligned, args.min_merge_overlap, original_lengths
                    )

                    # Check overlap requirement (capped at the shortest input
                    # so a short sequence can still fully-overlap merge)
                    shorter_len = min(original_lengths)
                    effective_threshold = min(args.min_merge_overlap, shorter_len)
                    if variant_stats['overlap_bp'] < effective_threshold:
                        # Insufficient overlap - skip this subset
                        continue
                else:
                    # Use standard analysis
                    variant_stats = analyze_msa_columns(subset_aligned)

                # Calculate cumulative positions from input sequences (for iterative merging)
                # Each sequence may carry positions from prior merges
                prior_snps = sum(v.snp_count or 0 for v in subset_variants)
                prior_indels = sum(v.merge_indel_count or 0 for v in subset_variants)
                prior_positions = {'snp_count': prior_snps, 'indel_count': prior_indels}

                # Check compatibility against merge limits (including cumulative positions)
                if is_compatible_subset(variant_stats, args, prior_positions):
                    # Only log "mergeable subset" message for actual merges (>1 variant)
                    if len(subset_indices) > 1:
                        # Build detailed variant description
                        parts = []
                        if variant_stats['snp_count'] > 0:
                            parts.append(f"{variant_stats['snp_count']} SNPs")
                        if variant_stats['structural_indel_count'] > 0:
                            parts.append(f"{variant_stats['structural_indel_count']} structural indels")
                        if variant_stats['homopolymer_indel_count'] > 0:
                            parts.append(f"{variant_stats['homopolymer_indel_count']} homopolymer indels")

                        variant_desc = ", ".join(parts) if parts else "identical sequences"
                        iter_prefix = f"Iteration {iteration}: " if iteration > 1 else ""
                        if use_overlap_mode:
                            # Include prefix/suffix extension info for overlap merges
                            prefix_bp = variant_stats.get('prefix_bp', 0)
                            suffix_bp = variant_stats.get('suffix_bp', 0)
                            logging.info(f"{iter_prefix}Found mergeable subset of {len(subset_indices)} variants "
                                         f"(overlap={variant_stats.get('overlap_bp', 'N/A')}bp, "
                                         f"prefix={prefix_bp}bp, suffix={suffix_bp}bp): {variant_desc}")

                            # DEBUG: Show span details for each sequence in the merge
                            content_regions = variant_stats.get('content_regions', [])
                            if content_regions:
                                spans = [f"seq{i+1}=({s},{e})" for i, (s, e) in enumerate(content_regions)]
                                logging.debug(f"Merge spans: {', '.join(spans)}")
                        else:
                            logging.info(f"{iter_prefix}Found mergeable subset of {len(subset_indices)} variants: {variant_desc}")

                    # Calculate total positions for cumulative tracking
                    # Total = prior positions from input sequences + new positions from this merge
                    if args.disable_homopolymer_equivalence:
                        this_merge_indels = variant_stats['structural_indel_count'] + variant_stats['homopolymer_indel_count']
                    else:
                        this_merge_indels = variant_stats['structural_indel_count']
                    total_snps = prior_snps + variant_stats['snp_count']
                    total_indels = prior_indels + this_merge_indels

                    # Create merged consensus
                    if len(subset_indices) == 1:
                        # Single variant - use directly, preserving raw_ric and other metadata
                        merged_consensus = subset_variants[0]
                    elif use_overlap_mode:
                        # Use overlap-aware consensus generation
                        merged_consensus = create_overlap_consensus_from_msa(
                            subset_aligned, subset_variants
                        )
                    else:
                        merged_consensus = create_consensus_from_msa(
                            subset_aligned, subset_variants
                        )

                    # Update merged consensus with cumulative position counts for iterative tracking
                    if len(subset_indices) > 1:
                        merged_consensus = merged_consensus._replace(
                            snp_count=total_snps if total_snps > 0 else None,
                            merge_indel_count=total_indels if total_indels > 0 else None
                        )

                    # Track merge provenance - expand any intermediate merges
                    # so we always trace back to the original cluster names
                    original_clusters = []
                    for v in subset_variants:
                        if v.sample_name in all_traceability:
                            # This variant was itself merged, expand to its originals
                            original_clusters.extend(all_traceability[v.sample_name])
                        else:
                            original_clusters.append(v.sample_name)
                    traceability = {
                        merged_consensus.sample_name: original_clusters
                    }
                    all_traceability.update(traceability)

                    # Track overlap merge for quality reporting
                    if use_overlap_mode and len(subset_indices) > 1:
                        # Extract specimen name (remove cluster suffix like -c1)
                        specimen = merged_consensus.sample_name.rsplit('-c', 1)[0] if '-c' in merged_consensus.sample_name else merged_consensus.sample_name
                        overlap_merges.append(OverlapMergeInfo(
                            specimen=specimen,
                            iteration=iteration,
                            input_clusters=[v.sample_name for v in subset_variants],
                            input_lengths=[len(v.sequence) for v in subset_variants],
                            input_rics=[v.ric for v in subset_variants],
                            overlap_bp=variant_stats.get('overlap_bp', 0),
                            prefix_bp=variant_stats.get('prefix_bp', 0),
                            suffix_bp=variant_stats.get('suffix_bp', 0),
                            output_length=len(merged_consensus.sequence)
                        ))

                    # Add merged consensus to results
                    merged_results.append(merged_consensus)

                    # Remove merged variants from remaining pool
                    for v in subset_variants:
                        if v in remaining_variants:
                            remaining_variants.remove(v)

                    merged_this_round = True
                    if len(subset_indices) > 1:
                        merges_this_iteration += 1
                    break

            # If no merge found, keep largest variant as-is and continue
            if not merged_this_round:
                logging.debug(f"No compatible merge found for largest variant (size={candidates[0].size})")
                merged_results.append(candidates[0])
                remaining_variants.remove(candidates[0])

        # Check if we should do another iteration (overlap mode only)
        if args.min_merge_overlap > 0 and merges_this_iteration > 0 and len(merged_results) > 1:
            # More merges might be possible with the new merged sequences
            # Cumulative positions are tracked per-sequence via snp_count and merge_indel_count
            logging.debug(f"Iteration {iteration} complete: {merges_this_iteration} merges, "
                          f"{len(merged_results)} variants remaining, trying another round")
            current_variants = merged_results
        else:
            # No more iterations needed
            if iteration > 1:
                logging.debug(f"Iterative merging complete after {iteration} iterations")
            break

    return merged_results, all_traceability, potentially_suboptimal, overlap_merges