speconsense-0.7.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
speconsense/msa.py ADDED
#!/usr/bin/env python3
"""
MSA (Multiple Sequence Alignment) Analysis Module for Speconsense.

This module contains functions and data structures for analyzing MSA output from SPOA,
including:
- Homopolymer-normalized error detection
- Positional variation analysis
- Variant position detection and phasing support
- IUPAC ambiguity code generation

These functions were extracted from core.py to improve code organization and testability.
"""

from collections import defaultdict
import logging
from typing import List, Set, Tuple, Optional, Dict, NamedTuple

from adjusted_identity import score_alignment, AdjustmentParams
import numpy as np
from Bio import SeqIO


# IUPAC nucleotide ambiguity codes mapping
# Maps sets of nucleotides to their corresponding IUPAC code
IUPAC_CODES = {
    frozenset(['A']): 'A',
    frozenset(['C']): 'C',
    frozenset(['G']): 'G',
    frozenset(['T']): 'T',
    frozenset(['A', 'G']): 'R',
    frozenset(['C', 'T']): 'Y',
    frozenset(['G', 'C']): 'S',
    frozenset(['A', 'T']): 'W',
    frozenset(['G', 'T']): 'K',
    frozenset(['A', 'C']): 'M',
    frozenset(['C', 'G', 'T']): 'B',
    frozenset(['A', 'G', 'T']): 'D',
    frozenset(['A', 'C', 'T']): 'H',
    frozenset(['A', 'C', 'G']): 'V',
    frozenset(['A', 'C', 'G', 'T']): 'N',
}
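
# Usage sketch (illustrative values): the frozenset keys make the lookup
# order-independent, and .get(..., 'N') gives a safe fallback for base sets
# the table does not cover, such as anything involving a gap.
assert IUPAC_CODES[frozenset(['G', 'A'])] == 'R'           # A or G -> purine code
assert IUPAC_CODES.get(frozenset(['A', '-']), 'N') == 'N'  # gap sets are not coded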


class ErrorPosition(NamedTuple):
    """An error at a specific position in the MSA."""
    msa_position: int  # 0-indexed position in MSA alignment
    error_type: str  # 'sub', 'ins', or 'del'


class ReadAlignment(NamedTuple):
    """Alignment result for a single read against consensus."""
    read_id: str
    aligned_sequence: str  # Gapped sequence from MSA
    read_length: int

    # Raw metrics (count all differences including homopolymer length)
    edit_distance: int
    num_insertions: int
    num_deletions: int
    num_substitutions: int
    error_positions: List[ErrorPosition]  # Detailed error information

    # Homopolymer-normalized metrics (exclude homopolymer extensions)
    normalized_edit_distance: int  # Edit distance excluding homopolymer length differences
    normalized_error_positions: List[ErrorPosition]  # Only non-homopolymer errors
    score_aligned: str  # Scoring string from adjusted-identity ('|'=match, '='=homopolymer, ' '=error)


class PositionStats(NamedTuple):
    """Statistics for a single position in the MSA."""
    msa_position: int  # Position in MSA (0-indexed)
    consensus_position: Optional[int]  # Position in consensus (None for insertion columns)
    coverage: int
    error_count: int
    error_rate: float
    sub_count: int
    ins_count: int
    del_count: int
    consensus_nucleotide: str  # Base in consensus at this MSA position (or '-' for insertion)
    base_composition: Dict[str, int]  # Raw base counts, e.g. {'A': 50, 'C': 3, 'G': 45, 'T': 2, '-': 0}
    homopolymer_composition: Dict[str, int]  # HP extension counts per base, e.g. {'A': 5, 'G': 2}


class MSAResult(NamedTuple):
    """Result from SPOA multiple sequence alignment.

    Attributes:
        consensus: Ungapped consensus sequence
        msa_string: Raw MSA in FASTA format (for file writing)
        alignments: Parsed read alignments with gapped sequences
        msa_to_consensus_pos: Mapping from MSA position to consensus position
    """
    consensus: str
    msa_string: str
    alignments: List[ReadAlignment]
    msa_to_consensus_pos: Dict[int, Optional[int]]


# ============================================================================
# MSA Analysis Functions
# ============================================================================

def parse_score_aligned_for_errors(
    score_aligned: str,
    read_aligned: str,
    consensus_aligned: str
) -> List[ErrorPosition]:
    """
    Parse a score_aligned string to extract non-homopolymer errors.

    The score_aligned string from adjusted-identity uses these codes:
    - '|' : Exact match (not an error)
    - '=' : Ambiguous match or homopolymer extension (not counted as an error)
    - ' ' : Substitution or indel (IS an error)
    - '.' : End-trimmed position (not counted)

    Args:
        score_aligned: Scoring string from adjusted-identity
        read_aligned: Aligned read sequence with gaps
        consensus_aligned: Aligned consensus sequence with gaps

    Returns:
        List of ErrorPosition for positions marked as errors (excluding homopolymer extensions)
    """
    normalized_errors = []

    for msa_pos, (score_char, read_base, cons_base) in enumerate(
        zip(score_aligned, read_aligned, consensus_aligned)
    ):
        # Skip matches, homopolymer extensions, and end-trimmed positions
        if score_char in ('|', '=', '.'):
            continue

        # This is a real error (substitution or indel) - classify it
        if read_base == '-' and cons_base != '-':
            error_type = 'del'
        elif read_base != '-' and cons_base == '-':
            error_type = 'ins'
        elif read_base != cons_base:
            error_type = 'sub'
        else:
            # Both gaps or identical - should not happen when score_char indicates an error
            continue

        normalized_errors.append(ErrorPosition(msa_pos, error_type))

    return normalized_errors
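
# Usage sketch with toy, hand-built inputs (real score strings come from
# adjusted-identity via score_alignment): the read has a homopolymer extension
# at column 2 (scored '=') and a true deletion at column 5 (scored ' ');
# only the deletion is reported as a normalized error.
_read_aln = "AAAGT-C"
_cons_aln = "AA-GTAC"
_score    = "||=|| |"
print(parse_score_aligned_for_errors(_score, _read_aln, _cons_aln))
# [ErrorPosition(msa_position=5, error_type='del')]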


def extract_alignments_from_msa(
    msa_string: str,
    enable_homopolymer_normalization: bool = True
) -> Tuple[List[ReadAlignment], str, Dict[int, Optional[int]]]:
    """
    Extract read alignments from an MSA string with optional homopolymer normalization.

    The MSA contains aligned sequences; the consensus record is identified by a
    header containing "Consensus". This function compares each read to the
    consensus at each aligned position.

    Error classification (raw metrics):
    - Both '-': Not an error (read doesn't cover this position)
    - Read '-', consensus base: Deletion (missing base in read)
    - Read base, consensus '-': Insertion (extra base in read)
    - Different bases: Substitution
    - Same base: Match (not an error)

    When enable_homopolymer_normalization=True, also computes normalized metrics that
    exclude homopolymer length differences, using the adjusted-identity library.

    IMPORTANT: Errors are reported at MSA positions, not consensus positions.
    This avoids ambiguity when multiple insertion columns map to the same consensus position.

    Args:
        msa_string: MSA content in FASTA format
        enable_homopolymer_normalization: If True, compute homopolymer-normalized metrics

    Returns:
        Tuple of:
        - list of ReadAlignment objects (with both raw and normalized metrics)
        - consensus sequence without gaps
        - mapping from MSA position to consensus position (None for insertion columns)
    """
    from io import StringIO

    # Adjustment parameters for homopolymer normalization:
    # only normalize homopolymers (single-base repeats), no other adjustments
    HOMOPOLYMER_ADJUSTMENT_PARAMS = AdjustmentParams(
        normalize_homopolymers=True,
        handle_iupac_overlap=False,
        normalize_indels=False,
        end_skip_distance=0,
        max_repeat_motif_length=1  # Single-base repeats only
    )

    # Parse MSA
    msa_handle = StringIO(msa_string)
    records = list(SeqIO.parse(msa_handle, 'fasta'))

    if not records:
        logging.warning("No sequences found in MSA string")
        return [], "", {}

    # Find the consensus sequence
    consensus_record = None
    read_records = []

    for record in records:
        if 'Consensus' in record.description or 'Consensus' in record.id:
            consensus_record = record
        else:
            read_records.append(record)

    if consensus_record is None:
        logging.warning("No consensus sequence found in MSA string")
        return [], "", {}

    consensus_aligned = str(consensus_record.seq).upper()
    msa_length = len(consensus_aligned)

    # Build mapping from MSA position to consensus position (excluding gaps);
    # insertion columns (consensus has '-') map to None
    msa_to_consensus_pos = {}
    consensus_pos = 0
    for msa_pos in range(msa_length):
        if consensus_aligned[msa_pos] != '-':
            msa_to_consensus_pos[msa_pos] = consensus_pos
            consensus_pos += 1
        else:
            # Insertion column - no consensus position
            msa_to_consensus_pos[msa_pos] = None

    # Get consensus without gaps for the return value
    consensus_ungapped = consensus_aligned.replace('-', '')

    # Process each read
    alignments = []

    for read_record in read_records:
        read_aligned = str(read_record.seq).upper()

        if len(read_aligned) != msa_length:
            logging.warning(f"Read {read_record.id} length mismatch with MSA length")
            continue

        # Compare read to consensus at each position
        error_positions = []
        num_insertions = 0
        num_deletions = 0
        num_substitutions = 0

        for msa_pos in range(msa_length):
            read_base = read_aligned[msa_pos]
            cons_base = consensus_aligned[msa_pos]

            # Skip if both are gaps (read doesn't cover this position)
            if read_base == '-' and cons_base == '-':
                continue

            # Classify the error type and record it at the MSA position
            if read_base == '-' and cons_base != '-':
                # Deletion (missing base in read)
                error_positions.append(ErrorPosition(msa_pos, 'del'))
                num_deletions += 1
            elif read_base != '-' and cons_base == '-':
                # Insertion (extra base in read)
                error_positions.append(ErrorPosition(msa_pos, 'ins'))
                num_insertions += 1
            elif read_base != cons_base:
                # Substitution (different bases)
                error_positions.append(ErrorPosition(msa_pos, 'sub'))
                num_substitutions += 1
            # else: match, no error

        # Calculate edit distance and read length
        edit_distance = num_insertions + num_deletions + num_substitutions
        read_length = len(read_aligned.replace('-', ''))  # Length without gaps

        # Compute homopolymer-normalized metrics if enabled
        if enable_homopolymer_normalization:
            try:
                # Use adjusted-identity to get homopolymer-normalized scoring.
                # IMPORTANT: seq1=read, seq2=consensus. The score_aligned visualization
                # is asymmetric and shows HP extensions from seq1's (the READ's) perspective.
                # This is what we want, since we're identifying which READ bases are extensions.
                result = score_alignment(
                    read_aligned,       # seq1 - the read
                    consensus_aligned,  # seq2 - the consensus
                    HOMOPOLYMER_ADJUSTMENT_PARAMS
                )

                # Parse the score_aligned string to extract normalized errors
                normalized_error_positions = parse_score_aligned_for_errors(
                    result.score_aligned,
                    read_aligned,
                    consensus_aligned
                )

                normalized_edit_distance = result.mismatches
                score_aligned_str = result.score_aligned

            except Exception as e:
                # If normalization fails, fall back to raw metrics
                logging.warning(f"Homopolymer normalization failed for read {read_record.id}: {e}")
                normalized_edit_distance = edit_distance
                normalized_error_positions = error_positions
                score_aligned_str = ""
        else:
            # Homopolymer normalization disabled - use raw metrics
            normalized_edit_distance = edit_distance
            normalized_error_positions = error_positions
            score_aligned_str = ""

        # Create the alignment object with both raw and normalized metrics
        alignment = ReadAlignment(
            read_id=read_record.id,
            aligned_sequence=read_aligned,  # Store gapped sequence
            read_length=read_length,
            # Raw metrics
            edit_distance=edit_distance,
            num_insertions=num_insertions,
            num_deletions=num_deletions,
            num_substitutions=num_substitutions,
            error_positions=error_positions,
            # Normalized metrics
            normalized_edit_distance=normalized_edit_distance,
            normalized_error_positions=normalized_error_positions,
            score_aligned=score_aligned_str
        )
        alignments.append(alignment)

    return alignments, consensus_ungapped, msa_to_consensus_pos
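
# Usage sketch: a minimal hand-written MSA in the format this function expects
# (one record whose header contains "Consensus", plus read records). With
# normalization disabled, the normalized metrics simply mirror the raw ones.
_toy_msa = (
    ">Consensus\n"
    "ACGT-ACGT\n"
    ">read1\n"
    "ACGTTACGT\n"  # one inserted T at MSA column 4
    ">read2\n"
    "ACGT-ACGA\n"  # one substitution at the last column
    ">read3\n"
    "ACGT-ACGT\n"  # matches the consensus exactly
)
_alns, _cons, _pos_map = extract_alignments_from_msa(
    _toy_msa, enable_homopolymer_normalization=False
)
print(_cons)                       # ACGTACGT (gaps removed)
print(_alns[0].num_insertions)     # 1
print(_alns[1].num_substitutions)  # 1
print(_pos_map[4])                 # None (insertion column)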


def analyze_positional_variation(alignments: List[ReadAlignment], consensus_aligned: str,
                                 msa_to_consensus_pos: Dict[int, Optional[int]]) -> List[PositionStats]:
    """
    Analyze error rates at each position in the MSA with homopolymer tracking.

    Uses normalized error positions and base composition to identify true variants
    while tracking homopolymer length differences separately. For each position:
    - base_composition: Raw counts of each base observed
    - homopolymer_composition: Counts of bases that are homopolymer extensions (score_aligned='=')

    Downstream variant detection uses effective counts (raw - HP) to identify true
    biological variants while ignoring diversity due solely to homopolymer variation.

    IMPORTANT: All analysis is performed in MSA space (not consensus space).
    This correctly handles insertion columns, whose MSA positions don't
    correspond to any consensus position.

    Args:
        alignments: List of read alignments (with normalized metrics)
        consensus_aligned: Consensus sequence (gapped, from MSA)
        msa_to_consensus_pos: Mapping from MSA position to consensus position

    Returns:
        List of PositionStats for each MSA position with normalized base composition
    """
    msa_length = len(consensus_aligned)

    # Error frequency matrix in MSA space.
    # For each MSA position: [sub_count, ins_count, del_count, total_coverage]
    error_matrix = np.zeros((msa_length, 4), dtype=int)

    # Base composition matrix in MSA space (raw counts)
    base_composition_matrix = [
        {'A': 0, 'C': 0, 'G': 0, 'T': 0, '-': 0}
        for _ in range(msa_length)
    ]

    # Homopolymer composition matrix in MSA space.
    # Tracks bases that are homopolymer extensions (score_aligned='=')
    homopolymer_composition_matrix = [
        {'A': 0, 'C': 0, 'G': 0, 'T': 0, '-': 0}
        for _ in range(msa_length)
    ]

    # Process alignments to count errors at MSA positions
    for alignment in alignments:
        # Count this read as coverage for all MSA positions
        # (alignments span the full MSA)
        error_matrix[:, 3] += 1  # coverage

        # Add errors at specific MSA positions using normalized errors
        # (excludes homopolymer extensions)
        for error_pos in alignment.normalized_error_positions:
            msa_pos = error_pos.msa_position
            if 0 <= msa_pos < msa_length:
                if error_pos.error_type == 'sub':
                    error_matrix[msa_pos, 0] += 1
                elif error_pos.error_type == 'ins':
                    error_matrix[msa_pos, 1] += 1
                elif error_pos.error_type == 'del':
                    error_matrix[msa_pos, 2] += 1

        # Extract base composition from the aligned sequence with homopolymer normalization
        read_aligned = alignment.aligned_sequence
        if len(read_aligned) != msa_length:
            continue

        # Track which base each read has at each MSA position:
        # raw base composition, plus separate tracking of homopolymer extensions
        for msa_pos in range(msa_length):
            read_base = read_aligned[msa_pos]

            # Track raw base composition
            if read_base in ['A', 'C', 'G', 'T', '-']:
                base_composition_matrix[msa_pos][read_base] += 1
            else:
                # Treat N or other ambiguous bases as a gap
                base_composition_matrix[msa_pos]['-'] += 1

            # Additionally track whether this base is a homopolymer extension.
            # NOTE: score_aligned is from the READ's perspective (seq1), which is what
            # we want, since we're asking whether this particular READ base is an HP extension.
            if alignment.score_aligned and msa_pos < len(alignment.score_aligned):
                if alignment.score_aligned[msa_pos] == '=':
                    # Homopolymer extension - track separately
                    if read_base in ['A', 'C', 'G', 'T', '-']:
                        homopolymer_composition_matrix[msa_pos][read_base] += 1
                    else:
                        homopolymer_composition_matrix[msa_pos]['-'] += 1

    # Calculate statistics for each MSA position
    position_stats = []

    for msa_pos in range(msa_length):
        sub_count = error_matrix[msa_pos, 0]
        ins_count = error_matrix[msa_pos, 1]
        del_count = error_matrix[msa_pos, 2]
        coverage = error_matrix[msa_pos, 3]

        # Total error events
        error_count = sub_count + ins_count + del_count
        error_rate = error_count / coverage if coverage > 0 else 0.0

        # Consensus position (None for insertion columns)
        cons_pos = msa_to_consensus_pos[msa_pos]

        # Consensus nucleotide at this MSA position
        cons_nucleotide = consensus_aligned[msa_pos]

        # Base composition for this MSA position (raw counts)
        base_comp = base_composition_matrix[msa_pos].copy()

        # Homopolymer extension composition for this MSA position
        hp_comp = homopolymer_composition_matrix[msa_pos].copy()

        position_stats.append(PositionStats(
            msa_position=msa_pos,
            consensus_position=cons_pos,
            coverage=coverage,
            error_count=error_count,
            error_rate=error_rate,
            sub_count=sub_count,
            ins_count=ins_count,
            del_count=del_count,
            consensus_nucleotide=cons_nucleotide,
            base_composition=base_comp,
            homopolymer_composition=hp_comp
        ))

    return position_stats
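
# Usage sketch, continuing the toy example above: pass the alignments and the
# *gapped* consensus back in to get per-column statistics. At the last column,
# two reads carry T and one carries A.
_stats = analyze_positional_variation(_alns, "ACGT-ACGT", _pos_map)
print(_stats[8].coverage, _stats[8].sub_count)  # 3 1
print(_stats[8].base_composition)  # {'A': 1, 'C': 0, 'G': 0, 'T': 2, '-': 0}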


def is_variant_position_with_composition(
    position_stats: PositionStats,
    min_variant_frequency: float = 0.10,
    min_variant_count: int = 5
) -> Tuple[bool, List[str], str]:
    """
    Identify variant positions using simple frequency and count thresholds.

    This function determines whether a position shows systematic variation (a true
    biological variant) rather than scattered sequencing errors. Homopolymer
    extensions are excluded from consideration - diversity due solely to
    homopolymer length variation is not considered a true variant.

    Criteria for variant detection:
    1. At least one alternative allele must have frequency >= min_variant_frequency
    2. That allele must have count >= min_variant_count
    3. Counts are adjusted by subtracting homopolymer extension counts

    Args:
        position_stats: Position statistics including base composition
        min_variant_frequency: Minimum alternative allele frequency (default: 0.10 for 10%)
        min_variant_count: Minimum alternative allele read count (default: 5 reads)

    Returns:
        Tuple of (is_variant, variant_bases, reason)
        - is_variant: True if this position requires cluster separation
        - variant_bases: List of alternative bases meeting criteria (e.g., ['G', 'T'])
        - reason: Explanation of the decision for logging/debugging
    """
    base_composition = position_stats.base_composition
    hp_composition = position_stats.homopolymer_composition

    # Check we have composition data
    if not base_composition or sum(base_composition.values()) == 0:
        return False, [], "No composition data available"

    # Calculate effective counts by subtracting homopolymer extensions.
    # This excludes diversity that's purely due to HP length variation.
    effective_composition = {}
    for base in ['A', 'C', 'G', 'T', '-']:
        raw_count = base_composition.get(base, 0)
        hp_count = hp_composition.get(base, 0) if hp_composition else 0
        effective_count = raw_count - hp_count
        if effective_count > 0:
            effective_composition[base] = effective_count

    # Check we still have composition data after the HP adjustment
    if not effective_composition or sum(effective_composition.values()) == 0:
        return False, [], "No composition data after HP adjustment"

    effective_total = sum(effective_composition.values())

    sorted_bases = sorted(
        effective_composition.items(),
        key=lambda x: x[1],
        reverse=True
    )

    if len(sorted_bases) < 2:
        return False, [], "No alternative alleles observed (after HP adjustment)"

    # Check each alternative allele (skip the majority base at index 0,
    # which is assumed to be the consensus allele)
    variant_bases = []
    variant_details = []

    for base, count in sorted_bases[1:]:
        freq = count / effective_total if effective_total > 0 else 0

        # Must meet both frequency and count thresholds
        if freq >= min_variant_frequency and count >= min_variant_count:
            variant_bases.append(base)
            variant_details.append(f"{base}:{count}/{effective_total}({freq:.1%})")

    if variant_bases:
        return True, variant_bases, f"Variant alleles: {', '.join(variant_details)}"

    # Debug: Check whether this would be a variant WITHOUT HP normalization.
    # This helps identify cases where the HP adjustment incorrectly eliminates variants.
    raw_total = sum(base_composition.get(b, 0) for b in ['A', 'C', 'G', 'T', '-'])
    raw_sorted = sorted(
        [(b, base_composition.get(b, 0)) for b in ['A', 'C', 'G', 'T', '-'] if base_composition.get(b, 0) > 0],
        key=lambda x: x[1],
        reverse=True
    )
    if len(raw_sorted) >= 2:
        for base, count in raw_sorted[1:]:
            freq = count / raw_total if raw_total > 0 else 0
            if freq >= min_variant_frequency and count >= min_variant_count:
                # Would be a variant without HP normalization!
                logging.debug(
                    f"HP normalization eliminated variant at MSA pos {position_stats.msa_position}: "
                    f"raw {base}:{count}/{raw_total}({freq:.1%}) meets threshold, "
                    f"but effective composition={effective_composition}, "
                    f"raw={base_composition}, hp={hp_composition}"
                )
                break

    return False, [], "No variants detected (after HP adjustment)"
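
# Usage sketch (hypothetical counts): at one column, 40 reads support G and 12
# support T, but 4 of the T calls are explained as homopolymer extensions, so
# the effective T count is 8 of 48 (~17%) - enough to flag a variant under the
# default thresholds.
_ps = PositionStats(
    msa_position=100, consensus_position=97, coverage=52,
    error_count=8, error_rate=8 / 52, sub_count=8, ins_count=0, del_count=0,
    consensus_nucleotide='G',
    base_composition={'A': 0, 'C': 0, 'G': 40, 'T': 12, '-': 0},
    homopolymer_composition={'T': 4},
)
print(is_variant_position_with_composition(_ps))
# (True, ['T'], 'Variant alleles: T:8/48(16.7%)')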


def call_iupac_ambiguities(
    consensus: str,
    alignments: List[ReadAlignment],
    msa_to_consensus_pos: Dict[int, Optional[int]],
    min_variant_frequency: float = 0.10,
    min_variant_count: int = 5
) -> Tuple[str, int, List[Dict]]:
    """
    Replace consensus bases at variant positions with IUPAC ambiguity codes.

    Analyzes positional variation in the MSA and identifies positions where
    significant variation remains after phasing. At these positions, the
    consensus base is replaced with the appropriate IUPAC code representing
    all variant alleles that meet the threshold criteria.

    Uses the same thresholds as phasing to ensure consistency. Homopolymer
    length variation is excluded (only true nucleotide variants are considered).

    Args:
        consensus: Ungapped consensus sequence from SPOA
        alignments: List of ReadAlignment objects from MSA
        msa_to_consensus_pos: Mapping from MSA position to consensus position
        min_variant_frequency: Minimum alternative allele frequency (default: 0.10)
        min_variant_count: Minimum alternative allele read count (default: 5)

    Returns:
        Tuple of:
        - Modified consensus sequence with IUPAC codes at variant positions
        - Count of IUPAC positions introduced
        - List of dicts with details about each IUPAC position:
          {
              'consensus_position': int,
              'original_base': str,
              'iupac_code': str,
              'variant_bases': List[str],
              'base_composition': Dict[str, int]
          }
    """
    if not consensus or not alignments:
        return consensus, 0, []

    # Reconstruct consensus_aligned from consensus and msa_to_consensus_pos
    # (same pattern as detect_variant_positions)
    msa_length = max(msa_to_consensus_pos.keys()) + 1 if msa_to_consensus_pos else 0
    if msa_length == 0:
        return consensus, 0, []

    consensus_aligned = []
    for msa_pos in range(msa_length):
        cons_pos = msa_to_consensus_pos.get(msa_pos)
        if cons_pos is not None and cons_pos < len(consensus):
            consensus_aligned.append(consensus[cons_pos])
        else:
            consensus_aligned.append('-')
    consensus_aligned_str = ''.join(consensus_aligned)

    # Analyze positional variation
    position_stats = analyze_positional_variation(alignments, consensus_aligned_str, msa_to_consensus_pos)

    # Build the list of positions to replace
    iupac_positions = []

    for pos_stat in position_stats:
        # Skip insertion columns (no consensus position)
        if pos_stat.consensus_position is None:
            continue

        # Check whether this position has significant variation
        is_variant, variant_bases, reason = is_variant_position_with_composition(
            pos_stat, min_variant_frequency, min_variant_count
        )

        if not is_variant:
            continue

        # Filter out gaps from variant bases (we can only represent nucleotide ambiguities)
        nucleotide_variants = [b for b in variant_bases if b in 'ACGT']

        if not nucleotide_variants:
            # Only gaps met the threshold - skip this position
            continue

        # Get the consensus base at this position
        cons_pos = pos_stat.consensus_position
        consensus_base = consensus[cons_pos] if cons_pos < len(consensus) else None

        if consensus_base is None or consensus_base not in 'ACGT':
            continue

        # Build the set of all significant bases (consensus + variants)
        all_bases = set(nucleotide_variants)
        all_bases.add(consensus_base)

        # Look up the IUPAC code
        iupac_code = IUPAC_CODES.get(frozenset(all_bases), 'N')

        # Only record if we actually need an ambiguity code (more than one base)
        if len(all_bases) > 1:
            iupac_positions.append({
                'consensus_position': cons_pos,
                'original_base': consensus_base,
                'iupac_code': iupac_code,
                'variant_bases': nucleotide_variants,
                'base_composition': pos_stat.base_composition
            })

    if not iupac_positions:
        return consensus, 0, []

    # Build the modified consensus
    consensus_list = list(consensus)
    for pos_info in iupac_positions:
        cons_pos = pos_info['consensus_position']
        consensus_list[cons_pos] = pos_info['iupac_code']

    modified_consensus = ''.join(consensus_list)

    return modified_consensus, len(iupac_positions), iupac_positions
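
# Usage sketch, reusing the toy MSA objects from extract_alignments_from_msa
# above, with min_variant_count lowered to 1 so that three reads are enough to
# trigger a call (the default of 5 suits real cluster sizes). read2's terminal
# A against the consensus T yields a W (A/T) ambiguity.
_new_cons, _n, _details = call_iupac_ambiguities(
    _cons, _alns, _pos_map,
    min_variant_frequency=0.10,
    min_variant_count=1,
)
print(_new_cons)  # ACGTACGW
print(_n, _details[0]['iupac_code'])  # 1 W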


def calculate_within_cluster_error(
    haplotype_groups: Dict[str, Set[str]],
    read_alleles: Dict[str, Dict[int, str]],
    phasing_positions: Set[int],
    all_variant_positions: Set[int]
) -> float:
    """Calculate within-cluster error for a given haplotype grouping.

    Measures the average variation at ALL variant positions within each haplotype,
    including positions used for phasing. This ensures fair comparison across
    different candidate position sets and captures heterogeneity introduced by
    reassignment of non-qualifying haplotypes.

    Lower error indicates more homogeneous clusters.

    Args:
        haplotype_groups: Dict mapping allele_combo -> set of read_ids
        read_alleles: Dict mapping read_id -> {msa_position -> allele}
        phasing_positions: Set of MSA positions used for phasing (kept for API compatibility)
        all_variant_positions: Set of all variant MSA positions (error measured at all of these)

    Returns:
        Weighted average error rate across haplotypes (0.0 = perfect, 1.0 = maximum error)
    """
    # Measure error at ALL variant positions, not just non-phased ones.
    # This ensures fair comparison across candidate position sets and captures
    # heterogeneity introduced by reassignment at phasing positions.
    measured_positions = all_variant_positions

    if not measured_positions or not haplotype_groups:
        return 0.0

    total_weighted_error = 0.0
    total_reads = 0

    for combo, read_ids in haplotype_groups.items():
        if not read_ids:
            continue

        haplotype_error = 0.0
        positions_counted = 0

        for pos in measured_positions:
            # Count alleles at this position for reads in this haplotype
            allele_counts = defaultdict(int)
            for read_id in read_ids:
                allele = read_alleles.get(read_id, {}).get(pos, '-')
                allele_counts[allele] += 1

            if not allele_counts:
                continue

            # Find consensus (most common) allele
            total_at_pos = sum(allele_counts.values())
            max_count = max(allele_counts.values())

            # Error rate = fraction of reads NOT matching consensus
            error_at_pos = (total_at_pos - max_count) / total_at_pos
            haplotype_error += error_at_pos
            positions_counted += 1

        # Average error across all variant positions for this haplotype
        if positions_counted > 0:
            mean_haplotype_error = haplotype_error / positions_counted
            total_weighted_error += mean_haplotype_error * len(read_ids)
            total_reads += len(read_ids)

    if total_reads == 0:
        return 0.0

    return total_weighted_error / total_reads
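
# Usage sketch (toy data): two haplotype groups split on position 10; within
# the larger group one read disagrees at position 20. Expected error:
# group A mean = (0 + 1/3) / 2 = 1/6 over 3 reads, group G mean = 0 over
# 1 read, so the weighted average is ~0.125.
_read_alleles = {
    'r1': {10: 'A', 20: 'C'},
    'r2': {10: 'A', 20: 'C'},
    'r3': {10: 'A', 20: 'T'},
    'r4': {10: 'G', 20: 'C'},
}
_groups = {'A': {'r1', 'r2', 'r3'}, 'G': {'r4'}}
print(calculate_within_cluster_error(
    _groups, _read_alleles,
    phasing_positions={10},
    all_variant_positions={10, 20},
))  # ~0.125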


def filter_qualifying_haplotypes(
    combo_to_reads: Dict[str, Set[str]],
    total_reads: int,
    min_count: int,
    min_frequency: float
) -> Tuple[Dict[str, Set[str]], Dict[str, Set[str]]]:
    """Filter haplotypes to those meeting count and frequency thresholds.

    Args:
        combo_to_reads: Dict mapping allele_combo -> set of read_ids
        total_reads: Total number of reads for frequency calculation
        min_count: Minimum read count threshold
        min_frequency: Minimum frequency threshold (0.0 to 1.0)

    Returns:
        Tuple of (qualifying_combos, non_qualifying_combos)
    """
    qualifying = {}
    non_qualifying = {}
    for combo, reads in combo_to_reads.items():
        count = len(reads)
        freq = count / total_reads if total_reads > 0 else 0
        if count >= min_count and freq >= min_frequency:
            qualifying[combo] = reads
        else:
            non_qualifying[combo] = reads
    return qualifying, non_qualifying
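
# Usage sketch (toy data; the 'A|C' combo-key format is just illustrative -
# keys are whatever combo encoding the caller uses). With 4 reads, min_count=2
# and min_frequency=0.2, the single-read haplotype fails both thresholds.
_combos = {'A|C': {'r1', 'r2', 'r3'}, 'G|T': {'r4'}}
_ok, _rest = filter_qualifying_haplotypes(_combos, total_reads=4,
                                          min_count=2, min_frequency=0.2)
print(sorted(_ok))    # ['A|C']
print(sorted(_rest))  # ['G|T']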


def group_reads_by_single_position(
    read_alleles: Dict[str, Dict[int, str]],
    position: int,
    read_ids: Set[str]
) -> Dict[str, Set[str]]:
    """Group a subset of reads by their allele at a single position.

    Args:
        read_alleles: Dict mapping read_id -> {msa_position -> allele}
        position: MSA position to group by
        read_ids: Subset of read IDs to consider

    Returns:
        Dict mapping allele -> set of read_ids
    """
    allele_to_reads = defaultdict(set)
    for read_id in read_ids:
        allele = read_alleles.get(read_id, {}).get(position, '-')
        allele_to_reads[allele].add(read_id)
    return dict(allele_to_reads)
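
# Usage sketch (toy data): reads with no recorded call at the position fall
# into the '-' bucket. (Dict key order may vary.)
_alleles = {'r1': {7: 'A'}, 'r2': {7: 'G'}, 'r3': {}}
print(group_reads_by_single_position(_alleles, 7, {'r1', 'r2', 'r3'}))
# {'A': {'r1'}, 'G': {'r2'}, '-': {'r3'}}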