speconsense-0.7.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,780 @@
+ """MSA analysis and quality assessment for speconsense-summarize.
+
+ Provides functions for analyzing multiple sequence alignments, detecting outliers,
+ identifying indel events, and assessing cluster quality.
+ """
+
+ import math
+ import os
+ import re
+ import logging
+ import subprocess
+ import tempfile
+ from typing import List, Dict, Optional, Tuple, NamedTuple
+ from io import StringIO
+
+ import edlib
+ import numpy as np
+ from Bio import SeqIO
+ from Bio.SeqRecord import SeqRecord
+ from Bio.Seq import Seq
+
+ from speconsense.types import ConsensusInfo
+ from speconsense.msa import (
+     extract_alignments_from_msa,
+     analyze_positional_variation,
+ )
+
+ from .iupac import IUPAC_EQUIV
+
+
+ # Maximum number of variants to evaluate for MSA-based merging (legacy constant)
+ # Batch size is now dynamically computed based on --merge-effort and group size.
+ # This constant is kept for backward compatibility and as the default MAX_MERGE_BATCH.
+ MAX_MSA_MERGE_VARIANTS = 8
+
+ # Merge effort batch size limits
+ MIN_MERGE_BATCH = 4
+ MAX_MERGE_BATCH = 8
+
+
+ def compute_merge_batch_size(group_size: int, effort: int) -> int:
+     """Compute batch size for a group based on effort level.
+
+     Uses the formula B = E + 1 - floor(log2(V)), clamped to
+     [MIN_MERGE_BATCH, MAX_MERGE_BATCH]. This keeps the expected number of
+     evaluations near 2^E per group.
+
+     Args:
+         group_size: Number of variants in the HAC group
+         effort: Merge effort level (6-14, default 10)
+
+     Returns:
+         Batch size between MIN_MERGE_BATCH and MAX_MERGE_BATCH,
+         or 1 for singleton groups
+     """
+     if group_size <= 1:
+         return 1
+
+     log_v = int(math.log2(group_size))
+     batch = effort + 1 - log_v
+
+     return max(MIN_MERGE_BATCH, min(MAX_MERGE_BATCH, batch))
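+ # Worked examples of the batch-size formula above (values illustrative only):
+ #   group_size=2,  effort=10 -> floor(log2(2))=1  -> 10 + 1 - 1 = 10 -> clamped to 8
+ #   group_size=64, effort=10 -> floor(log2(64))=6 -> 10 + 1 - 6 = 5  -> within [4, 8]
+ #   group_size=64, effort=6  -> 6 + 1 - 6 = 1     -> clamped to 4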
+
+
+ class ClusterQualityData(NamedTuple):
+     """Quality metrics for a cluster (no visualization matrix)."""
+     consensus_seq: str
+     position_error_rates: List[float]  # Per-position error rates (0-1) in consensus space
+     position_error_counts: List[int]  # Per-position error counts in consensus space
+     read_identities: List[float]  # Per-read identity scores (0-1)
+     position_stats: Optional[List] = None  # Detailed PositionStats for debugging (optional)
+
+
+ def identify_outliers(final_consensus: List, all_raw_consensuses: List, source_folder: str) -> Dict:
+     """Identify sequences with low read identity using statistical outlier detection.
+
+     Flags sequences whose mean read identity (rid) falls below (mean - 2*std) for
+     the dataset. This identifies the roughly 2.5% lowest values, which may warrant review.
+
+     Note: rid_min (minimum read identity) is not used because single outlier reads
+     don't significantly impact consensus quality. Positional analysis better captures
+     systematic issues like mixed clusters or variants.
+
+     Args:
+         final_consensus: List of final consensus sequences
+         all_raw_consensuses: List of all raw consensus sequences (unused, kept for API compatibility)
+         source_folder: Source directory (unused, kept for API compatibility)
+
+     Returns:
+         Dictionary of the form:
+             {
+                 'statistical_outliers': list of (cons, rid) tuples,
+                 'no_issues': list of consensus sequences with good quality,
+                 'global_stats': {'mean_rid', 'std_rid', 'stat_threshold_rid'}
+             }
+     """
+     # Calculate global statistics for all sequences with identity metrics
+     all_rids = []
+
+     for cons in final_consensus:
+         if cons.rid is not None:
+             all_rids.append(cons.rid)
+
+     # Calculate mean and std for statistical outlier detection
+     mean_rid = np.mean(all_rids) if all_rids else 1.0
+     std_rid = np.std(all_rids) if len(all_rids) > 1 else 0.0
+
+     # Threshold for statistical outliers (2 standard deviations below the mean)
+     stat_threshold_rid = mean_rid - 2 * std_rid
+
+     # Categorize sequences
+     statistical = []
+     no_issues = []
+
+     for cons in final_consensus:
+         rid = cons.rid if cons.rid is not None else 1.0
+
+         if rid < stat_threshold_rid:
+             statistical.append((cons, rid))
+         else:
+             no_issues.append(cons)
+
+     return {
+         'statistical_outliers': statistical,
+         'no_issues': no_issues,
+         'global_stats': {
+             'mean_rid': mean_rid,
+             'std_rid': std_rid,
+             'stat_threshold_rid': stat_threshold_rid
+         }
+     }
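+ # Illustrative threshold arithmetic (numbers assumed, not from real data): with
+ # mean_rid = 0.995 and std_rid = 0.002, stat_threshold_rid = 0.995 - 2 * 0.002
+ # = 0.991, so any consensus with rid < 0.991 is reported as a statistical outlier.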
+
+
+ def analyze_positional_identity_outliers(
+     consensus_info,
+     source_folder: str,
+     min_variant_frequency: float,
+     min_variant_count: int
+ ) -> Optional[Dict]:
+     """Analyze positional error rates and identify high-error positions.
+
+     Args:
+         consensus_info: ConsensusInfo object for the sequence
+         source_folder: Source directory containing the cluster_debug folder
+         min_variant_frequency: Global threshold for flagging positions (from metadata)
+         min_variant_count: Minimum variant count for phasing (from metadata)
+
+     Returns:
+         Dictionary with positional analysis:
+             {
+                 'num_outlier_positions': int,
+                 'mean_outlier_error_rate': float,  # Mean error rate across outlier positions only
+                 'total_nucleotide_errors': int,    # Sum of error counts at outlier positions
+                 'outlier_threshold': float,
+                 'outlier_positions': list of (position, error_rate, error_count) tuples
+             }
+         Returns None if the MSA file is not found or the analysis fails.
+
+     Note: Error rates already exclude homopolymer length differences due to
+     homopolymer normalization in analyze_positional_variation().
+     """
+     # Skip analysis for low-RiC sequences (insufficient data for meaningful statistics).
+     # Need at least 2 * min_variant_count reads to confidently phase two variants.
+     min_ric_threshold = 2 * min_variant_count
+     if consensus_info.ric < min_ric_threshold:
+         logging.debug(f"Skipping positional analysis for {consensus_info.sample_name}: "
+                       f"RiC {consensus_info.ric} < {min_ric_threshold}")
+         return None
+
+     # Construct the path to the MSA file
+     debug_dir = os.path.join(source_folder, "cluster_debug")
+
+     # Try to find the MSA file.
+     # MSA files use the original cluster naming (e.g., "specimen-c1"),
+     # not the summarized naming (e.g., "specimen-1.v1").
+     msa_file = None
+
+     # Extract the specimen name and cluster ID:
+     # consensus_info.sample_name might be "specimen-1.v1" (summarized), while
+     # consensus_info.cluster_id should be "-c1" (original cluster).
+     # If sample_name is "ONT01.23-...-1.v1" and cluster_id is "-c1",
+     # we need to reconstruct "ONT01.23-...-c1".
+     sample_name = consensus_info.sample_name
+     cluster_id = consensus_info.cluster_id
+
+     # Remove any HAC group/variant suffix from sample_name to get the specimen base.
+     # Pattern: "-\d+\.v\d+" (e.g., "-1.v1")
+     specimen_base = re.sub(r'-\d+\.v\d+$', '', sample_name)
+
+     # Reconstruct the original cluster name
+     original_cluster_name = f"{specimen_base}{cluster_id}"
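+     # Worked example of the reconstruction above (names assumed for illustration):
+     # sample_name "ONT01.23-sampleX-1.v2" with cluster_id "-c3" gives
+     # specimen_base "ONT01.23-sampleX" and original_cluster_name
+     # "ONT01.23-sampleX-c3"; with RiC 25 the lookup below would target
+     # "<source_folder>/cluster_debug/ONT01.23-sampleX-c3-RiC25-msa.fasta".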
+
+     # Look for the MSA file with the correct extension
+     msa_fasta = os.path.join(debug_dir, f"{original_cluster_name}-RiC{consensus_info.ric}-msa.fasta")
+     if os.path.exists(msa_fasta):
+         msa_file = msa_fasta
+
+     if not msa_file:
+         logging.debug(f"No MSA file found for {original_cluster_name}")
+         return None
+
+     # Analyze cluster quality using the shared positional analysis from speconsense.msa
+     quality_data = analyze_cluster_quality(msa_file, consensus_info.sequence)
+
+     if not quality_data or not quality_data.position_error_rates:
+         logging.debug(f"Failed to analyze cluster quality for {original_cluster_name}")
+         return None
+
+     position_error_rates = quality_data.position_error_rates
+     position_error_counts = quality_data.position_error_counts
+     position_stats = quality_data.position_stats
+
+     # Use the global min_variant_frequency as the threshold:
+     # positions above it could be undetected/unphased variants.
+     threshold = min_variant_frequency
+     outlier_positions = [
+         (i, rate, count)
+         for i, (rate, count) in enumerate(zip(position_error_rates, position_error_counts))
+         if rate > threshold
+     ]
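+     # For instance (values assumed): with min_variant_frequency = 0.2, a position
+     # where 3 of 10 reads disagree with the consensus (rate 0.3) is flagged as an
+     # outlier position, while 1 of 10 (rate 0.1) is not.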
+
+     # Build detailed outlier info including base composition
+     outlier_details = []
+     if position_stats:
+         for i, rate, count in outlier_positions:
+             if i < len(position_stats):
+                 ps = position_stats[i]
+                 outlier_details.append({
+                     'consensus_position': ps.consensus_position,
+                     'msa_position': ps.msa_position,
+                     'error_rate': rate,
+                     'error_count': count,
+                     'coverage': ps.coverage,
+                     'consensus_nucleotide': ps.consensus_nucleotide,
+                     'base_composition': dict(ps.base_composition),
+                     'homopolymer_composition': dict(ps.homopolymer_composition) if ps.homopolymer_composition else {},
+                     'sub_count': ps.sub_count,
+                     'ins_count': ps.ins_count,
+                     'del_count': ps.del_count,
+                 })
+
+     # Calculate statistics for outlier positions only
+     if outlier_positions:
+         mean_outlier_error = np.mean([rate for _, rate, _ in outlier_positions])
+         total_nucleotide_errors = sum(count for _, _, count in outlier_positions)
+     else:
+         mean_outlier_error = 0.0
+         total_nucleotide_errors = 0
+
+     return {
+         'num_outlier_positions': len(outlier_positions),
+         'mean_outlier_error_rate': mean_outlier_error,
+         'total_nucleotide_errors': total_nucleotide_errors,
+         'outlier_threshold': threshold,
+         'outlier_positions': outlier_positions,
+         'outlier_details': outlier_details,
+         'consensus_seq': quality_data.consensus_seq,
+         'ric': consensus_info.ric,
+     }
+
+
+ def run_spoa_msa(sequences: List[str], alignment_mode: int = 1) -> List:
+     """
+     Run SPOA to create a multiple sequence alignment.
+
+     Args:
+         sequences: List of DNA sequence strings
+         alignment_mode: SPOA alignment mode:
+             0 = local (Smith-Waterman) - best for overlap merging
+             1 = global (Needleman-Wunsch) - default, for same-length sequences
+             2 = semi-global - alternative for overlap merging
+
+     Returns:
+         List of SeqRecord objects with aligned sequences (including gaps)
+     """
+     with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as temp_input:
+         try:
+             # Write sequences to a temporary file
+             records = [
+                 SeqRecord(Seq(seq), id=f"seq{i}", description="")
+                 for i, seq in enumerate(sequences)
+             ]
+             SeqIO.write(records, temp_input, "fasta")
+             temp_input.flush()
+
+             # Run SPOA with alignment output (-r 2) and the specified alignment mode
+             result = subprocess.run(
+                 ['spoa', temp_input.name, '-r', '2', '-l', str(alignment_mode)],
+                 capture_output=True,
+                 text=True,
+                 check=True
+             )
+
+             # Parse aligned sequences from SPOA output
+             aligned_sequences = []
+             lines = result.stdout.strip().split('\n')
+             current_id = None
+             current_seq = []
+
+             for line in lines:
+                 if line.startswith('>'):
+                     if current_id is not None:
+                         # Skip the consensus sequence (usually last)
+                         if not current_id.startswith('Consensus'):
+                             aligned_sequences.append(SeqRecord(
+                                 Seq(''.join(current_seq)),
+                                 id=current_id,
+                                 description=""
+                             ))
+                     current_id = line[1:]
+                     current_seq = []
+                 elif line.strip():
+                     current_seq.append(line.strip())
+
+             # Add the last sequence (if not consensus)
+             if current_id is not None and not current_id.startswith('Consensus'):
+                 aligned_sequences.append(SeqRecord(
+                     Seq(''.join(current_seq)),
+                     id=current_id,
+                     description=""
+                 ))
+
+             return aligned_sequences
+
+         finally:
+             if os.path.exists(temp_input.name):
+                 os.unlink(temp_input.name)
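+ # Minimal usage sketch for run_spoa_msa (assumes the `spoa` binary is on PATH;
+ # sequences are illustrative):
+ #     aligned = run_spoa_msa(["ACGTACGT", "ACGTTACGT"], alignment_mode=1)
+ #     for rec in aligned:
+ #         print(rec.id, rec.seq)  # aligned rows padded with '-' gaps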
+
+
+ def identify_indel_events(aligned_seqs: List, alignment_length: int) -> List[Tuple[int, int]]:
+     """
+     Identify consecutive runs of indel columns (events).
+
+     An indel event is a maximal consecutive run of columns that mix gaps and
+     bases. Each event represents a single biological insertion or deletion.
+
+     Args:
+         aligned_seqs: List of aligned sequences from SPOA
+         alignment_length: Length of the alignment
+
+     Returns:
+         List of (start_col, end_col) tuples, where end_col is inclusive
+     """
+     events = []
+     in_event = False
+     start_col = None
+
+     for col_idx in range(alignment_length):
+         column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
+         has_gap = '-' in column
+         has_bases = any(c != '-' for c in column)
+
+         # Indel column: a mix of gaps and bases
+         if has_gap and has_bases:
+             if not in_event:
+                 # Start a new event
+                 in_event = True
+                 start_col = col_idx
+         else:
+             # Not an indel column (either all gaps or all bases)
+             if in_event:
+                 # End the current event
+                 events.append((start_col, col_idx - 1))
+                 in_event = False
+
+     # Handle an event that extends to the end of the alignment
+     if in_event:
+         events.append((start_col, alignment_length - 1))
+
+     return events
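+ # Toy example (alignment assumed for illustration):
+ #     seq0: ACG--TAC
+ #     seq1: ACGGTTAC
+ # Columns 3 and 4 mix gaps and bases, so the function returns [(3, 4)].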
+
+
+ def is_homopolymer_event(aligned_seqs: List, start_col: int, end_col: int) -> bool:
+     """
+     Classify a complete indel event as homopolymer or structural.
+
+     An event is homopolymer if:
+     1. All bases in the event region (across all sequences, all columns) are identical
+     2. At least one flanking solid column has all sequences showing the same base
+
+     This matches adjusted-identity semantics where AAA ~ AAAA.
+
+     Examples:
+         Homopolymer: ATAAA--GC vs ATAAAAAGC (event has all A's, flanked by A)
+         Structural:  ATAA-GC vs ATG-AGC (event has A, flanked by A vs G)
+         Structural:  ATC--GC vs ATCATGC (event has A and T - not homopolymer)
+
+     Args:
+         aligned_seqs: List of aligned sequences from SPOA
+         start_col: First column of the indel event (inclusive)
+         end_col: Last column of the indel event (inclusive)
+
+     Returns:
+         True if homopolymer event, False if structural
+     """
+     # Extract all bases from the event region (excluding gaps)
+     bases_in_event = set()
+     for col_idx in range(start_col, end_col + 1):
+         column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
+         bases_in_event.update(c for c in column if c != '-')
+
+     # Must have exactly one base type across the entire event
+     if len(bases_in_event) != 1:
+         return False
+
+     event_base = list(bases_in_event)[0]
+     alignment_length = len(aligned_seqs[0].seq)
+
+     # Check flanking columns for matching homopolymer context.
+     # A valid flanking column must:
+     # 1. Not be an indel column (all sequences have bases, no gaps)
+     # 2. Have all bases matching the event base
+
+     # Check the left flank
+     if start_col > 0:
+         left_col = start_col - 1
+         left_column = [str(seq.seq[left_col]) for seq in aligned_seqs]
+         left_bases = set(c for c in left_column if c != '-')
+         left_has_gap = '-' in left_column
+
+         if not left_has_gap and left_bases == {event_base}:
+             return True
+
+     # Check the right flank
+     if end_col < alignment_length - 1:
+         right_col = end_col + 1
+         right_column = [str(seq.seq[right_col]) for seq in aligned_seqs]
+         right_bases = set(c for c in right_column if c != '-')
+         right_has_gap = '-' in right_column
+
+         if not right_has_gap and right_bases == {event_base}:
+             return True
+
+     # No valid homopolymer flanking found
+     return False
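+ # Tracing the docstring's homopolymer example (0-indexed columns):
+ #     ATAAA--GC
+ #     ATAAAAAGC
+ # The event spans columns 5-6 and its only base is 'A'; the left-flank column 4
+ # is all 'A' with no gaps, so the event classifies as a homopolymer indel.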
+
+
+ def analyze_msa_columns(aligned_seqs: List) -> dict:
+     """
+     Analyze aligned sequences to count SNPs and indels.
+
+     Distinguishes between structural indels (real insertions/deletions) and
+     homopolymer indels (length differences in homopolymer runs like AAA vs AAAA).
+
+     Uses event-based classification: consecutive indel columns are grouped into
+     events, and each complete event is classified as homopolymer or structural.
+
+     Important: All gaps (including terminal gaps) count as variant positions
+     since variants within a group share the same primers.
+
+     Returns dict with:
+         'snp_count': number of columns with >1 distinct base and no gaps
+         'structural_indel_count': number of structural indel events
+         'structural_indel_length': length of the longest structural indel event
+         'homopolymer_indel_count': number of homopolymer indel events
+         'homopolymer_indel_length': length of the longest homopolymer indel event
+         'indel_count': total indel events (for backward compatibility)
+         'max_indel_length': max indel event length (for backward compatibility)
+     """
+     alignment_length = len(aligned_seqs[0].seq)
+
+     # Step 1: Count SNPs
+     snp_count = 0
+     for col_idx in range(alignment_length):
+         column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
+         unique_bases = set(c for c in column if c != '-')
+         has_gap = '-' in column
+
+         # SNP position: multiple different bases with NO gaps
+         # (columns with gaps are indels, not SNPs)
+         if len(unique_bases) > 1 and not has_gap:
+             snp_count += 1
+
+     # Step 2: Identify indel events (consecutive runs of indel columns)
+     indel_events = identify_indel_events(aligned_seqs, alignment_length)
+
+     # Step 3: Classify each event as homopolymer or structural
+     structural_events = []
+     homopolymer_events = []
+
+     for start_col, end_col in indel_events:
+         if is_homopolymer_event(aligned_seqs, start_col, end_col):
+             homopolymer_events.append((start_col, end_col))
+         else:
+             structural_events.append((start_col, end_col))
+
+     # Step 4: Calculate statistics.
+     # Count is the number of events (not columns)
+     structural_indel_count = len(structural_events)
+     homopolymer_indel_count = len(homopolymer_events)
+
+     # Length is the size of the longest event
+     structural_indel_length = max((end - start + 1 for start, end in structural_events), default=0)
+     homopolymer_indel_length = max((end - start + 1 for start, end in homopolymer_events), default=0)
+
+     # Backward compatibility: total events and max length
+     total_indel_count = structural_indel_count + homopolymer_indel_count
+     max_indel_length = max(structural_indel_length, homopolymer_indel_length)
+
+     return {
+         'snp_count': snp_count,
+         'structural_indel_count': structural_indel_count,
+         'structural_indel_length': structural_indel_length,
+         'homopolymer_indel_count': homopolymer_indel_count,
+         'homopolymer_indel_length': homopolymer_indel_length,
+         'indel_count': total_indel_count,  # Backward compatibility
+         'max_indel_length': max_indel_length  # Backward compatibility
+     }
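+ # Toy walkthrough (alignment assumed): for rows "ACGTA-CT" and "ACCTAACT",
+ # column 2 (G vs C, no gaps) is one SNP, and column 5 ('-' vs 'A') is one indel
+ # event whose only base 'A' matches the all-'A' flank at column 4, so it counts
+ # as a homopolymer indel:
+ #     {'snp_count': 1, 'structural_indel_count': 0,
+ #      'homopolymer_indel_count': 1, 'homopolymer_indel_length': 1, ...}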
+
+
+ def analyze_msa_columns_overlap_aware(aligned_seqs: List, min_overlap_bp: int,
+                                       original_lengths: List[int]) -> dict:
+     """
+     Analyze MSA columns, distinguishing terminal gaps from structural indels.
+
+     Terminal gaps (from length differences at sequence ends) are NOT counted
+     as structural indels when sequences have sufficient overlap in their
+     shared region. This enables merging sequences from primer pools with
+     different endpoints.
+
+     Args:
+         aligned_seqs: List of aligned sequences from SPOA
+         min_overlap_bp: Minimum overlap required (0 to disable overlap mode)
+         original_lengths: Original ungapped sequence lengths
+
+     Returns dict with:
+         'snp_count': SNPs in the overlap region
+         'structural_indel_count': structural indels in the overlap region only
+         'structural_indel_length': length of the longest structural indel
+         'homopolymer_indel_count': homopolymer indels (anywhere)
+         'homopolymer_indel_length': length of the longest homopolymer indel
+         'terminal_gap_columns': number of terminal gap columns (not counted as structural)
+         'overlap_bp': size of the overlap region in base pairs
+         'prefix_bp': extension before the overlap region (for logging)
+         'suffix_bp': extension after the overlap region (for logging)
+         'content_regions': list of (start, end) tuples per sequence (for span logging)
+         'indel_count': total events (backward compatibility)
+         'max_indel_length': max event length (backward compatibility)
+     """
+     alignment_length = len(aligned_seqs[0].seq)
+
+     # Step 1: Find the content region for each sequence (first non-gap to last non-gap)
+     content_regions = []  # List of (start, end) tuples
+     for seq in aligned_seqs:
+         seq_str = str(seq.seq)
+         # Find the first and last non-gap positions
+         first_base = next((i for i, c in enumerate(seq_str) if c != '-'), 0)
+         last_base = alignment_length - 1 - next(
+             (i for i, c in enumerate(reversed(seq_str)) if c != '-'), 0
+         )
+         content_regions.append((first_base, last_base))
+
+     # Step 2: Calculate the overlap region (intersection of all content regions)
+     overlap_start = max(start for start, _ in content_regions)
+     overlap_end = min(end for _, end in content_regions)
+
+     # Calculate the union region (for prefix/suffix extension reporting)
+     union_start = min(start for start, _ in content_regions)
+     union_end = max(end for _, end in content_regions)
+     prefix_bp = overlap_start - union_start
+     suffix_bp = union_end - overlap_end
+
+     # Calculate the actual overlap in base pairs (count only columns where all sequences have bases)
+     overlap_bp = 0
+     if overlap_end >= overlap_start:
+         for col_idx in range(overlap_start, overlap_end + 1):
+             column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
+             if all(c != '-' for c in column):
+                 overlap_bp += 1
+
+     # Determine the effective threshold for containment cases
+     shorter_len = min(original_lengths)
+     effective_threshold = min(min_overlap_bp, shorter_len)
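+     # Containment example (numbers assumed): if min_overlap_bp = 300 but the
+     # shorter sequence is only 250 bp, a full-length 250 bp overlap should still
+     # qualify, so the effective threshold drops to 250.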
+
+     # Step 3: Count SNPs only within the overlap region
+     snp_count = 0
+     for col_idx in range(overlap_start, overlap_end + 1):
+         column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
+         unique_bases = set(c for c in column if c != '-')
+         has_gap = '-' in column
+
+         # SNP position: multiple different bases with NO gaps
+         if len(unique_bases) > 1 and not has_gap:
+             snp_count += 1
+
+     # Step 4: Identify indel events, but only count those within the overlap region
+     indel_events = identify_indel_events(aligned_seqs, alignment_length)
+
+     # Step 5: Classify each event and determine whether it lies in the overlap region
+     structural_events = []
+     homopolymer_events = []
+     terminal_gap_columns = 0
+
+     for start_col, end_col in indel_events:
+         # Check if this event is entirely within the overlap region
+         is_in_overlap = (start_col >= overlap_start and end_col <= overlap_end)
+
+         # Check if this is a terminal gap event (at the boundary of a content region)
+         is_terminal = False
+         for seq_start, seq_end in content_regions:
+             # Terminal if the event is adjacent to or outside a sequence's content region
+             if end_col < seq_start or start_col > seq_end:
+                 is_terminal = True
+                 break
+             # Also terminal if the event is at the very edge of content
+             if start_col == seq_start or end_col == seq_end:
+                 # Check if the gaps in this event come from a sequence's terminal region
+                 for col_idx in range(start_col, end_col + 1):
+                     column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
+                     for i, (s, e) in enumerate(content_regions):
+                         if col_idx < s or col_idx > e:
+                             if column[i] == '-':
+                                 is_terminal = True
+                                 break
+                 if is_terminal:
+                     break
+
+         if is_terminal and overlap_bp >= effective_threshold:
+             # Terminal gap from a length difference - don't count as structural
+             terminal_gap_columns += (end_col - start_col + 1)
+         elif is_homopolymer_event(aligned_seqs, start_col, end_col):
+             homopolymer_events.append((start_col, end_col))
+         else:
+             # Only count as structural if within the overlap region
+             if is_in_overlap:
+                 structural_events.append((start_col, end_col))
+             else:
+                 # Outside the overlap - this is a terminal gap
+                 terminal_gap_columns += (end_col - start_col + 1)
+
+     # Step 6: Calculate statistics
+     structural_indel_count = len(structural_events)
+     homopolymer_indel_count = len(homopolymer_events)
+
+     structural_indel_length = max((end - start + 1 for start, end in structural_events), default=0)
+     homopolymer_indel_length = max((end - start + 1 for start, end in homopolymer_events), default=0)
+
+     # Backward compatibility
+     total_indel_count = structural_indel_count + homopolymer_indel_count
+     max_indel_length = max(structural_indel_length, homopolymer_indel_length)
+
+     return {
+         'snp_count': snp_count,
+         'structural_indel_count': structural_indel_count,
+         'structural_indel_length': structural_indel_length,
+         'homopolymer_indel_count': homopolymer_indel_count,
+         'homopolymer_indel_length': homopolymer_indel_length,
+         'terminal_gap_columns': terminal_gap_columns,
+         'overlap_bp': overlap_bp,
+         'prefix_bp': prefix_bp,
+         'suffix_bp': suffix_bp,
+         'content_regions': content_regions,
+         'indel_count': total_indel_count,  # Backward compatibility
+         'max_indel_length': max_indel_length  # Backward compatibility
+     }
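+ # Sketch of the overlap bookkeeping (toy alignment, assumed):
+ #     seq0: ----ACGTACGT    content region (4, 11)
+ #     seq1: TTGGACGTACGT    content region (0, 11)
+ # The overlap region is columns 4-11 (overlap_bp = 8). The leading gap run in
+ # seq0 is a terminal event, so with min_overlap_bp <= 8 its 4 columns go to
+ # terminal_gap_columns rather than the structural indel count.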
+
+
+ def analyze_cluster_quality(
+     msa_file: str,
+     consensus_seq: str,
+     max_reads: Optional[int] = None
+ ) -> Optional[ClusterQualityData]:
+     """
+     Analyze cluster quality using speconsense.msa's analyze_positional_variation().
+
+     Uses the canonical positional analysis from speconsense.msa to ensure consistent
+     treatment of homopolymer length differences across the pipeline.
+
+     Args:
+         msa_file: Path to the MSA FASTA file
+         consensus_seq: Ungapped consensus sequence
+         max_reads: Maximum reads to include (for downsampling large clusters)
+
+     Returns:
+         ClusterQualityData with position error rates and read identities, or None on failure
+     """
+     if not os.path.exists(msa_file):
+         logging.debug(f"MSA file not found: {msa_file}")
+         return None
+
+     # Load the MSA file content
+     try:
+         with open(msa_file, 'r') as f:
+             msa_string = f.read()
+     except Exception as e:
+         logging.debug(f"Failed to read MSA file {msa_file}: {e}")
+         return None
+
+     # Extract alignments from the MSA with homopolymer normalization.
+     # This returns ReadAlignment objects with a score_aligned field.
+     alignments, msa_consensus, msa_to_consensus_pos = extract_alignments_from_msa(
+         msa_string,
+         enable_homopolymer_normalization=True
+     )
+
+     if not alignments:
+         logging.debug(f"No alignments found in MSA: {msa_file}")
+         return None
+
+     # Verify the consensus matches: the passed-in consensus_seq may be trimmed (shorter)
+     # and contain IUPAC codes, while the MSA consensus is untrimmed (longer) without them.
+     # Use edlib in HW mode to check that the trimmed consensus is contained within the MSA consensus.
+     if msa_consensus and msa_consensus != consensus_seq:
+         # edlib HW mode (semi-global) finds consensus_seq within msa_consensus.
+         # This handles primer trimming (length difference) and IUPAC codes (via equivalencies).
+         result = edlib.align(consensus_seq, msa_consensus, mode="HW", task="distance",
+                              additionalEqualities=IUPAC_EQUIV)
+         edit_distance = result["editDistance"]
+         if edit_distance > 0:  # Any edits indicate a real mismatch
+             logging.warning(f"Consensus mismatch in MSA file: {msa_file}")
+             logging.warning(f"  MSA length: {len(msa_consensus)}, consensus length: {len(consensus_seq)}, edit distance: {edit_distance}")
+
+     # Use the passed-in consensus (with IUPAC codes) as authoritative for quality analysis,
+     # since it reflects the actual output sequence.
+     consensus_length = len(consensus_seq)
+
+     if consensus_length == 0:
+         logging.debug(f"Empty consensus sequence: {msa_file}")
+         return None
+
+     # Get the aligned consensus row by parsing the MSA string
+     msa_handle = StringIO(msa_string)
+     records = list(SeqIO.parse(msa_handle, 'fasta'))
+     consensus_aligned = None
+     for record in records:
+         if 'Consensus' in record.description or 'Consensus' in record.id:
+             consensus_aligned = str(record.seq).upper()
+             break
+
+     if consensus_aligned is None:
+         logging.debug(f"No consensus found in MSA: {msa_file}")
+         return None
+
+     # Downsample reads if needed
+     if max_reads and len(alignments) > max_reads:
+         # Sort by read identity (normalized edit distance) and keep the worst and best
+         # halves, giving a representative sample that shows the quality range.
+         read_identities_temp = []
+         for alignment in alignments:
+             # Use normalized edit distance for the identity calculation
+             identity = 1.0 - (alignment.normalized_edit_distance / consensus_length) if consensus_length > 0 else 0.0
+             read_identities_temp.append((identity, alignment))
+
+         # Sort by identity
+         read_identities_temp.sort(key=lambda x: x[0])
+
+         # Take the worst half and the best half
+         n_worst = max_reads // 2
+         n_best = max_reads - n_worst
+         sampled = read_identities_temp[:n_worst] + read_identities_temp[-n_best:]
+
+         alignments = [alignment for _, alignment in sampled]
+         logging.debug(f"Downsampled {len(read_identities_temp)} reads to {len(alignments)} for analysis")
+
+     # Use the canonical positional analysis from speconsense.msa
+     position_stats = analyze_positional_variation(alignments, consensus_aligned, msa_to_consensus_pos)
+
+     # Extract position error rates and counts for consensus positions only (skip insertion columns)
+     consensus_position_stats = [ps for ps in position_stats if ps.consensus_position is not None]
+     # Sort by consensus position to ensure correct order
+     consensus_position_stats.sort(key=lambda ps: ps.consensus_position)
+     position_error_rates = [ps.error_rate for ps in consensus_position_stats]
+     position_error_counts = [ps.error_count for ps in consensus_position_stats]
+
+     # Calculate per-read identities from the alignments
+     read_identities = []
+     for alignment in alignments:
+         # Use normalized edit distance for the identity calculation
+         identity = 1.0 - (alignment.normalized_edit_distance / consensus_length) if consensus_length > 0 else 0.0
+         read_identities.append(identity)
+
+     return ClusterQualityData(
+         consensus_seq=consensus_seq,
+         position_error_rates=position_error_rates,
+         position_error_counts=position_error_counts,
+         read_identities=read_identities,
+         position_stats=consensus_position_stats
+     )
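+ # Minimal usage sketch (path and RiC value assumed for illustration):
+ #     quality = analyze_cluster_quality(
+ #         "cluster_debug/specimen-c1-RiC25-msa.fasta", consensus_seq, max_reads=200)
+ #     if quality is not None:
+ #         worst_read_identity = min(quality.read_identities)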