speconsense-0.7.2-py3-none-any.whl

@@ -0,0 +1,696 @@
+ """Module-level worker functions for ProcessPoolExecutor.
+ 
+ These must be at module level to be picklable for multiprocessing.
+ Includes standalone versions of the functions workers need, plus the
+ configuration classes passed to them.
+ """
+ 
+ import logging
+ import os
+ import subprocess
+ import tempfile
+ from io import StringIO
+ from typing import Dict, List, Optional, Set, Tuple
+ 
+ import edlib
+ import numpy as np
+ from Bio import SeqIO
+ 
+ from speconsense.msa import (
+     MSAResult,
+     ReadAlignment,
+     analyze_positional_variation,
+     call_iupac_ambiguities,
+     calculate_within_cluster_error,
+     extract_alignments_from_msa,
+     filter_qualifying_haplotypes,
+     group_reads_by_single_position,
+     is_variant_position_with_composition,
+ )
+ 
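
The docstring's pickling requirement is easy to trip over: `ProcessPoolExecutor` serializes the callable and its arguments with `pickle`, and only module-level functions pickle by qualified name. A minimal sketch of the failure mode this module's layout avoids (names here are illustrative, not part of the package):

```python
from concurrent.futures import ProcessPoolExecutor

def square(x: int) -> int:  # module-level: picklable by qualified name
    return x * x

def main() -> None:
    def cube(x: int) -> int:  # nested: cannot be pickled
        return x ** 3

    with ProcessPoolExecutor(max_workers=2) as pool:
        print(list(pool.map(square, range(4))))  # [0, 1, 4, 9]
        # pool.map(cube, range(4)) would fail to pickle `cube`,
        # which is why every worker below lives at module level.

if __name__ == "__main__":
    main()
```
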
+ 
+ # Configuration classes for parallel processing
+ 
+ class ClusterProcessingConfig:
+     """Configuration for parallel cluster processing.
+ 
+     Passed to worker processes to avoid needing to pickle the entire SpecimenClusterer.
+     """
+     __slots__ = ['outlier_identity_threshold', 'enable_secondpass_phasing',
+                  'disable_homopolymer_equivalence', 'min_variant_frequency', 'min_variant_count']
+ 
+     def __init__(self, outlier_identity_threshold: Optional[float],
+                  enable_secondpass_phasing: bool,
+                  disable_homopolymer_equivalence: bool,
+                  min_variant_frequency: float,
+                  min_variant_count: int):
+         self.outlier_identity_threshold = outlier_identity_threshold
+         self.enable_secondpass_phasing = enable_secondpass_phasing
+         self.disable_homopolymer_equivalence = disable_homopolymer_equivalence
+         self.min_variant_frequency = min_variant_frequency
+         self.min_variant_count = min_variant_count
+ 
+ 
+ class ConsensusGenerationConfig:
+     """Configuration for parallel final consensus generation.
+ 
+     Passed to worker processes to avoid needing to pickle the entire SpecimenClusterer.
+     """
+     __slots__ = ['max_sample_size', 'enable_iupac_calling', 'min_ambiguity_frequency',
+                  'min_ambiguity_count', 'disable_homopolymer_equivalence', 'primers']
+ 
+     def __init__(self, max_sample_size: int,
+                  enable_iupac_calling: bool,
+                  min_ambiguity_frequency: float,
+                  min_ambiguity_count: int,
+                  disable_homopolymer_equivalence: bool,
+                  primers: Optional[List[Tuple[str, str]]] = None):
+         self.max_sample_size = max_sample_size
+         self.enable_iupac_calling = enable_iupac_calling
+         self.min_ambiguity_frequency = min_ambiguity_frequency
+         self.min_ambiguity_count = min_ambiguity_count
+         self.disable_homopolymer_equivalence = disable_homopolymer_equivalence
+         self.primers = primers
+ 
+ 
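
Both classes exist to keep the per-task pickle payload small: rather than shipping the whole SpecimenClusterer (and everything it references) to each worker, only a handful of scalars crosses the process boundary. A hedged sketch of building and round-tripping such a config (the threshold values are illustrative, not package defaults):

```python
import pickle

# Values invented for illustration; real defaults come from the CLI.
config = ClusterProcessingConfig(
    outlier_identity_threshold=0.85,
    enable_secondpass_phasing=True,
    disable_homopolymer_equivalence=False,
    min_variant_frequency=0.2,
    min_variant_count=5,
)

# __slots__ classes have no __dict__, but pickle protocol 2+ still
# serializes them, and the payload stays tiny per submitted task.
restored = pickle.loads(pickle.dumps(config))
assert restored.min_variant_count == 5
```
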
+ # Worker functions
+ 
+ def _run_spoa_worker(args: Tuple[int, Dict[str, str], bool]) -> Tuple[int, Optional[MSAResult]]:
+     """Worker function for parallel SPOA execution.
+ 
+     Must be at module level for ProcessPoolExecutor pickling.
+ 
+     Args:
+         args: Tuple of (cluster_idx, sampled_seqs, disable_homopolymer_equivalence)
+ 
+     Returns:
+         Tuple of (cluster_idx, MSAResult or None)
+     """
+     cluster_idx, sampled_seqs, disable_homopolymer_equivalence = args
+ 
+     if not sampled_seqs:
+         return cluster_idx, None
+ 
+     try:
+         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.fasta') as f:
+             for read_id, seq in sorted(sampled_seqs.items()):
+                 f.write(f">{read_id}\n{seq}\n")
+             temp_input = f.name
+ 
+         cmd = [
+             "spoa", temp_input,
+             "-r", "2", "-l", "1", "-m", "5", "-n", "-4", "-g", "-8", "-e", "-6",
+         ]
+         try:
+             result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+         finally:
+             # Remove the temporary FASTA even if SPOA fails
+             os.unlink(temp_input)
+ 
+         enable_normalization = not disable_homopolymer_equivalence
+         alignments, consensus, msa_to_consensus_pos = extract_alignments_from_msa(
+             result.stdout, enable_homopolymer_normalization=enable_normalization
+         )
+ 
+         if not consensus:
+             return cluster_idx, None
+ 
+         return cluster_idx, MSAResult(
+             consensus=consensus,
+             msa_string=result.stdout,
+             alignments=alignments,
+             msa_to_consensus_pos=msa_to_consensus_pos
+         )
+     except subprocess.CalledProcessError as e:
+         logging.error(f"SPOA worker failed for cluster {cluster_idx}: return code {e.returncode}")
+         return cluster_idx, None
+     except Exception as e:
+         logging.error(f"SPOA worker failed for cluster {cluster_idx}: {e}")
+         return cluster_idx, None
+ 
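
The worker echoes `cluster_idx` back because results from a process pool need not complete in submission order. A sketch of the dispatch pattern this return shape supports; the executor wiring is illustrative (the actual call sites live elsewhere in speconsense):

```python
from concurrent.futures import ProcessPoolExecutor, as_completed

def run_all_spoa(clusters, disable_hp_equivalence=False):
    """clusters: list of dicts mapping read_id -> sequence (illustrative)."""
    results = {}
    with ProcessPoolExecutor() as pool:
        futures = [
            pool.submit(_run_spoa_worker, (idx, seqs, disable_hp_equivalence))
            for idx, seqs in enumerate(clusters)
        ]
        for future in as_completed(futures):   # arbitrary completion order
            cluster_idx, msa_result = future.result()
            results[cluster_idx] = msa_result  # the echoed idx restores the mapping
    return results
```
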
+ 
+ def _run_spoa_for_cluster_worker(sequences: Dict[str, str],
+                                  disable_homopolymer_equivalence: bool) -> Optional[MSAResult]:
+     """Run SPOA for a set of sequences. Used by cluster processing workers."""
+     if not sequences:
+         return None
+ 
+     try:
+         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.fasta') as f:
+             for read_id, seq in sorted(sequences.items()):
+                 f.write(f">{read_id}\n{seq}\n")
+             temp_input = f.name
+ 
+         cmd = [
+             "spoa", temp_input,
+             "-r", "2", "-l", "1", "-m", "5", "-n", "-4", "-g", "-8", "-e", "-6",
+         ]
+         try:
+             result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+         finally:
+             # Remove the temporary FASTA even if SPOA fails
+             os.unlink(temp_input)
+ 
+         enable_normalization = not disable_homopolymer_equivalence
+         alignments, consensus, msa_to_consensus_pos = extract_alignments_from_msa(
+             result.stdout, enable_homopolymer_normalization=enable_normalization
+         )
+ 
+         if not consensus:
+             return None
+ 
+         return MSAResult(
+             consensus=consensus,
+             msa_string=result.stdout,
+             alignments=alignments,
+             msa_to_consensus_pos=msa_to_consensus_pos
+         )
+     except Exception:
+         return None
+ 
+ 
+ def _identify_outlier_reads_standalone(alignments: List[ReadAlignment], consensus_seq: str,
+                                        sampled_ids: Set[str], threshold: float) -> Tuple[Set[str], Set[str]]:
+     """Identify outlier reads below identity threshold. Standalone version for workers."""
+     if not alignments or not consensus_seq:
+         return sampled_ids, set()
+ 
+     try:
+         keep_ids = set()
+         outlier_ids = set()
+         consensus_length = len(consensus_seq)
+         if consensus_length == 0:
+             return sampled_ids, set()
+ 
+         for alignment in alignments:
+             error_rate = alignment.normalized_edit_distance / consensus_length
+             identity = 1.0 - error_rate
+             if identity >= threshold:
+                 keep_ids.add(alignment.read_id)
+             else:
+                 outlier_ids.add(alignment.read_id)
+ 
+         return keep_ids, outlier_ids
+     except Exception:
+         return sampled_ids, set()
+ 
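
The identity metric used here (and in `_calculate_read_identity_standalone` below) is simply one minus the length-normalized edit distance. A worked example with invented numbers:

```python
# A 600 bp consensus; a read whose (homopolymer-normalized) edit
# distance to the consensus is 30 has:
consensus_length = 600
normalized_edit_distance = 30

error_rate = normalized_edit_distance / consensus_length  # 0.05
identity = 1.0 - error_rate                               # 0.95

# With threshold=0.90 the read is kept; with threshold=0.96 it is
# flagged as an outlier and removed before consensus regeneration.
assert 0.90 <= identity < 0.96
```
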
+ 
+ def _calculate_read_identity_standalone(alignments: List[ReadAlignment],
+                                         consensus_seq: str) -> Tuple[Optional[float], Optional[float]]:
+     """Calculate mean and minimum read identity. Standalone version for workers."""
+     if not alignments or not consensus_seq:
+         return None, None
+ 
+     try:
+         consensus_length = len(consensus_seq)
+         if consensus_length == 0:
+             return None, None
+ 
+         identities = []
+         for alignment in alignments:
+             error_rate = alignment.normalized_edit_distance / consensus_length
+             identity = 1.0 - error_rate
+             identities.append(identity)
+ 
+         if not identities:
+             return None, None
+ 
+         return np.mean(identities), np.min(identities)
+     except Exception:
+         return None, None
+ 
+ 
+ def _detect_variant_positions_standalone(alignments: List[ReadAlignment], consensus_seq: str,
+                                          msa_to_consensus_pos: Dict[int, Optional[int]],
+                                          min_variant_frequency: float,
+                                          min_variant_count: int) -> List[Dict]:
+     """Detect variant positions in the MSA. Standalone version for workers."""
+     if not alignments or not consensus_seq:
+         return []
+ 
+     try:
+         # Project the consensus into MSA coordinates: gap columns get '-'
+         msa_length = len(alignments[0].aligned_sequence)
+         consensus_aligned = []
+         for msa_pos in range(msa_length):
+             cons_pos = msa_to_consensus_pos.get(msa_pos)
+             if cons_pos is not None:
+                 consensus_aligned.append(consensus_seq[cons_pos])
+             else:
+                 consensus_aligned.append('-')
+         consensus_aligned = ''.join(consensus_aligned)
+ 
+         position_stats = analyze_positional_variation(alignments, consensus_aligned, msa_to_consensus_pos)
+ 
+         variant_positions = []
+         for pos_stat in position_stats:
+             is_variant, variant_bases, reason = is_variant_position_with_composition(
+                 pos_stat,
+                 min_variant_frequency=min_variant_frequency,
+                 min_variant_count=min_variant_count
+             )
+             if is_variant:
+                 variant_positions.append({
+                     'msa_position': pos_stat.msa_position,
+                     'consensus_position': pos_stat.consensus_position,
+                     'coverage': pos_stat.coverage,
+                     'variant_bases': variant_bases,
+                     'base_composition': pos_stat.base_composition,
+                     'homopolymer_composition': pos_stat.homopolymer_composition,
+                     'error_rate': pos_stat.error_rate,
+                     'reason': reason
+                 })
+ 
+         return variant_positions
+     except Exception:
+         return []
+ 
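
The `msa_to_consensus_pos` mapping is what lets workers translate between gapped MSA columns and ungapped consensus coordinates. A toy example of the projection step above (alignment invented for illustration):

```python
# Consensus "ACGT" aligned into a 6-column MSA "A-CG-T":
consensus_seq = "ACGT"                        # ungapped consensus
msa_to_consensus_pos = {0: 0, 1: None, 2: 1, 3: 2, 4: None, 5: 3}

cols = []
for msa_pos in range(6):
    cons_pos = msa_to_consensus_pos[msa_pos]
    cols.append('-' if cons_pos is None else consensus_seq[cons_pos])

assert ''.join(cols) == "A-CG-T"              # gap columns map to None
```
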
+ 
+ def _recursive_phase_cluster_standalone(
+     read_ids: Set[str],
+     read_sequences: Dict[str, str],
+     path: List[str],
+     depth: int,
+     config: ClusterProcessingConfig
+ ) -> Tuple[List[Tuple[List[str], str, Set[str]]], Set[str]]:
+     """Recursively phase a cluster. Standalone version for workers."""
+     total_reads = len(read_ids)
+ 
+     # Base case: cluster too small to split further
+     if total_reads < config.min_variant_count * 2:
+         leaf_seqs = {rid: read_sequences[rid] for rid in read_ids}
+         result = _run_spoa_for_cluster_worker(leaf_seqs, config.disable_homopolymer_equivalence)
+         consensus = result.consensus if result else ""
+         return [(path, consensus, read_ids)], set()
+ 
+     # Generate MSA
+     cluster_seqs = {rid: read_sequences[rid] for rid in read_ids}
+     result = _run_spoa_for_cluster_worker(cluster_seqs, config.disable_homopolymer_equivalence)
+ 
+     if result is None:
+         return [(path, "", read_ids)], set()
+ 
+     consensus = result.consensus
+     alignments = result.alignments
+     msa_to_consensus_pos = result.msa_to_consensus_pos
+ 
+     # Detect variants
+     variant_positions = _detect_variant_positions_standalone(
+         alignments, consensus, msa_to_consensus_pos,
+         config.min_variant_frequency, config.min_variant_count
+     )
+ 
+     if not variant_positions:
+         return [(path, consensus, read_ids)], set()
+ 
+     # Parse the MSA to recover the aligned consensus row
+     msa_handle = StringIO(result.msa_string)
+     records = list(SeqIO.parse(msa_handle, 'fasta'))
+ 
+     consensus_aligned = None
+     for record in records:
+         if 'Consensus' in record.description or 'Consensus' in record.id:
+             consensus_aligned = str(record.seq).upper()
+             break
+ 
+     if not consensus_aligned:
+         return [(path, consensus, read_ids)], set()
+ 
+     # Build read-to-alignment mapping
+     read_to_alignment = {a.read_id: a for a in alignments}
+ 
+     # Extract each read's allele at every variant position
+     variant_msa_positions = sorted([v['msa_position'] for v in variant_positions])
+     read_to_position_alleles = {}
+ 
+     for read_id in read_ids:
+         alignment = read_to_alignment.get(read_id)
+         if not alignment:
+             continue
+ 
+         aligned_seq = alignment.aligned_sequence
+         score_aligned = alignment.score_aligned
+ 
+         position_alleles = {}
+         for msa_pos in variant_msa_positions:
+             if msa_pos < len(aligned_seq):
+                 allele = aligned_seq[msa_pos]
+                 # A '=' in the score string marks a consensus-equivalent
+                 # match, so count the read as carrying the consensus base
+                 if score_aligned and msa_pos < len(score_aligned):
+                     if score_aligned[msa_pos] == '=':
+                         allele = consensus_aligned[msa_pos]
+                 position_alleles[msa_pos] = allele
+             else:
+                 position_alleles[msa_pos] = '-'
+ 
+         read_to_position_alleles[read_id] = position_alleles
+ 
+     # Find the single variant position that best splits the reads
+     best_pos = None
+     best_error = float('inf')
+     best_qualifying = None
+     best_non_qualifying = None
+     all_positions = set(variant_msa_positions)
+ 
+     for pos in variant_msa_positions:
+         allele_groups = group_reads_by_single_position(
+             read_to_position_alleles, pos, set(read_to_position_alleles.keys())
+         )
+         qualifying, non_qualifying = filter_qualifying_haplotypes(
+             allele_groups, total_reads, config.min_variant_count, config.min_variant_frequency
+         )
+         if len(qualifying) < 2:
+             continue
+ 
+         error = calculate_within_cluster_error(qualifying, read_to_position_alleles, {pos}, all_positions)
+         if error < best_error:
+             best_error = error
+             best_pos = pos
+             best_qualifying = qualifying
+             best_non_qualifying = non_qualifying
+ 
+     if best_pos is None:
+         return [(path, consensus, read_ids)], set()
+ 
+     # Collect deferred reads (alleles that did not qualify as haplotypes)
+     all_deferred = set()
+     for allele, reads in best_non_qualifying.items():
+         all_deferred.update(reads)
+ 
+     # Recurse into each qualifying allele group
+     all_leaves = []
+     for allele, sub_read_ids in sorted(best_qualifying.items()):
+         new_path = path + [allele]
+         sub_leaves, sub_deferred = _recursive_phase_cluster_standalone(
+             sub_read_ids, read_sequences, new_path, depth + 1, config
+         )
+         all_leaves.extend(sub_leaves)
+         all_deferred.update(sub_deferred)
+ 
+     return all_leaves, all_deferred
+ 
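
Each recursion level appends the winning allele to `path`, so a leaf's path records the chain of splits that produced it; `_phase_reads_by_variants_standalone` later joins it into a label. A toy illustration (values invented):

```python
# Two successive splits: first on an A/G site, then (within the 'A'
# branch) on a C/T site. Leaf paths and their joined labels:
leaves = [
    (['A', 'C'], "ACGT...", {'read1', 'read4'}),
    (['A', 'T'], "ACGA...", {'read2'}),
    (['G'],      "GCGT...", {'read3', 'read5'}),
]
for path, consensus, reads in leaves:
    print('-'.join(path), len(reads))   # "A-C 2", "A-T 1", "G 2"
```
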
+ 
+ def _phase_reads_by_variants_standalone(
+     cluster_read_ids: Set[str],
+     sequences: Dict[str, str],
+     variant_positions: List[Dict],
+     config: ClusterProcessingConfig
+ ) -> List[Tuple[Optional[str], Set[str]]]:
+     """Phase reads into haplotypes. Standalone version for workers.
+ 
+     Returns a list of (allele_combo, read_ids) pairs; allele_combo is None
+     when the cluster is kept intact.
+     """
+     if not variant_positions:
+         return [(None, cluster_read_ids)]
+ 
+     try:
+         read_sequences = {rid: sequences[rid] for rid in cluster_read_ids if rid in sequences}
+         if not read_sequences:
+             return [(None, cluster_read_ids)]
+ 
+         logging.debug(f"Recursive phasing with MSA regeneration: {len(variant_positions)} initial variants, {len(read_sequences)} reads")
+ 
+         leaves, deferred = _recursive_phase_cluster_standalone(
+             set(read_sequences.keys()), read_sequences, [], 0, config
+         )
+ 
+         # No split found: keep the cluster intact
+         if len(leaves) <= 1 and not deferred:
+             return [(None, cluster_read_ids)]
+ 
+         if len(leaves) == 1:
+             return [(None, cluster_read_ids)]
+ 
+         logging.debug(f"Recursive phasing: {len(leaves)} leaf haplotypes, {len(deferred)} deferred reads")
+ 
+         # Reassign deferred reads to the nearest leaf consensus
+         if deferred:
+             leaf_reads_updated = {tuple(path): set(reads) for path, consensus, reads in leaves}
+             leaf_consensuses = {tuple(path): consensus for path, consensus, reads in leaves}
+ 
+             for read_id in deferred:
+                 if read_id not in read_sequences:
+                     continue
+ 
+                 read_seq = read_sequences[read_id]
+                 min_distance = float('inf')
+                 nearest_path = None
+ 
+                 for path_tuple, consensus in leaf_consensuses.items():
+                     if not consensus:
+                         continue
+                     result = edlib.align(read_seq, consensus)
+                     distance = result['editDistance']
+                     if distance < min_distance:
+                         min_distance = distance
+                         nearest_path = path_tuple
+ 
+                 if nearest_path is not None:
+                     leaf_reads_updated[nearest_path].add(read_id)
+ 
+             # Log and decide on phasing
+             total_phased = sum(len(reads) for reads in leaf_reads_updated.values())
+             if total_phased < 2:
+                 return [(None, cluster_read_ids)]
+ 
+             logging.debug(f"Phasing decision: SPLITTING cluster into {len(leaf_reads_updated)} haplotypes")
+ 
+             return [('-'.join(path), reads) for path, reads in sorted(leaf_reads_updated.items(), key=lambda x: -len(x[1]))]
+         else:
+             logging.debug(f"Phasing decision: SPLITTING cluster into {len(leaves)} haplotypes")
+             return [('-'.join(path), reads) for path, consensus, reads in sorted(leaves, key=lambda x: -len(x[2]))]
+ 
+     except Exception as e:
+         logging.warning(f"Failed to phase reads: {e}")
+         return [(None, cluster_read_ids)]
+ 
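
Deferred reads (those whose allele never formed a qualifying haplotype) are rescued by global edit distance against each leaf consensus: `edlib.align`'s default NW mode returns a dict whose `editDistance` field drives the nearest-leaf choice. A minimal, self-contained illustration:

```python
import edlib

leaf_consensuses = {('A',): "ACGTACGTAC", ('G',): "GCGTACGTAC"}
deferred_read = "ACGTACGTAT"             # one mismatch vs the 'A' leaf

nearest, best = None, float('inf')
for path, consensus in leaf_consensuses.items():
    distance = edlib.align(deferred_read, consensus)['editDistance']
    if distance < best:
        best, nearest = distance, path

assert nearest == ('A',) and best == 1   # joins the closer haplotype
```
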
+ 
+ def _process_cluster_worker(args) -> Tuple[List[Dict], Set[str]]:
+     """Worker function for parallel cluster processing.
+ 
+     Must be at module level for ProcessPoolExecutor pickling.
+ 
+     Args:
+         args: Tuple of (initial_idx, cluster_ids, sequences, qualities, config)
+ 
+     Returns:
+         Tuple of (subclusters, discarded_read_ids)
+     """
+     initial_idx, cluster_ids, sequences, qualities, config = args
+ 
+     subclusters = []
+     discarded_ids = set()
+ 
+     # Sort by quality (descending), breaking ties by read ID
+     sorted_ids = sorted(cluster_ids, key=lambda x: (-qualities.get(x, 0), x))
+ 
+     # Generate consensus and MSA
+     cluster_seqs = {seq_id: sequences[seq_id] for seq_id in sorted_ids}
+     result = _run_spoa_for_cluster_worker(cluster_seqs, config.disable_homopolymer_equivalence)
+ 
+     if result is None:
+         logging.warning(f"Initial cluster {initial_idx}: Failed to generate consensus, skipping")
+         # Track these reads as discarded since we couldn't generate a consensus
+         discarded_ids.update(cluster_ids)
+         return subclusters, discarded_ids
+ 
+     consensus = result.consensus
+     msa = result.msa_string
+     alignments = result.alignments
+     msa_to_consensus_pos = result.msa_to_consensus_pos
+ 
+     _calculate_read_identity_standalone(alignments, consensus)
+ 
+     cluster = set(cluster_ids)
+ 
+     # Outlier removal
+     if config.outlier_identity_threshold is not None:
+         keep_ids, outlier_ids = _identify_outlier_reads_standalone(
+             alignments, consensus, cluster, config.outlier_identity_threshold
+         )
+ 
+         if outlier_ids:
+             # Special case: a 2-read cluster with 1 outlier has no majority,
+             # so split it into two single-read subclusters
+             if len(cluster) == 2 and len(outlier_ids) == 1:
+                 logging.debug(f"Initial cluster {initial_idx}: Split 2-read cluster due to 1 outlier")
+                 for read_id in cluster:
+                     subclusters.append({
+                         'read_ids': {read_id},
+                         'initial_cluster_num': initial_idx,
+                         'allele_combo': 'single-read-split'
+                     })
+                 return subclusters, discarded_ids
+ 
+             logging.debug(f"Initial cluster {initial_idx}: Removing {len(outlier_ids)}/{len(cluster)} outlier reads, "
+                           f"regenerating consensus")
+ 
+             discarded_ids.update(outlier_ids)
+             cluster = cluster - outlier_ids
+ 
+             # Regenerate consensus without the outliers
+             sorted_ids_filtered = sorted(cluster, key=lambda x: (-qualities.get(x, 0), x))
+             cluster_seqs = {seq_id: sequences[seq_id] for seq_id in sorted_ids_filtered}
+             result = _run_spoa_for_cluster_worker(cluster_seqs, config.disable_homopolymer_equivalence)
+ 
+             if result is not None:
+                 consensus = result.consensus
+                 msa = result.msa_string
+                 alignments = result.alignments
+                 msa_to_consensus_pos = result.msa_to_consensus_pos
+                 _calculate_read_identity_standalone(alignments, consensus)
+ 
+     # Detect variants
+     variant_positions = []
+     if consensus and alignments and config.enable_secondpass_phasing:
+         variant_positions = _detect_variant_positions_standalone(
+             alignments, consensus, msa_to_consensus_pos,
+             config.min_variant_frequency, config.min_variant_count
+         )
+ 
+         if variant_positions:
+             logging.debug(f"Initial cluster {initial_idx}: Detected {len(variant_positions)} variant positions")
+ 
+     # Phase reads into subclusters (one per haplotype)
+     phased_haplotypes = _phase_reads_by_variants_standalone(
+         cluster, sequences, variant_positions, config
+     )
+ 
+     for haplotype_idx, (allele_combo, haplotype_reads) in enumerate(phased_haplotypes):
+         subclusters.append({
+             'read_ids': haplotype_reads,
+             'initial_cluster_num': initial_idx,
+             'allele_combo': allele_combo
+         })
+ 
+     return subclusters, discarded_ids
+ 
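
A sketch of how the first-pass workers might be fanned out and their two-part results merged. The executor wiring is illustrative (the real orchestration lives in SpecimenClusterer, which may also subset `sequences` per task rather than pickling the full dict), but the argument tuple and return shape match the docstring above:

```python
from concurrent.futures import ProcessPoolExecutor

def process_clusters(initial_clusters, sequences, qualities, config):
    """initial_clusters: list of read-ID collections, one per rough cluster."""
    all_subclusters, all_discarded = [], set()
    work = [
        (idx, list(ids), sequences, qualities, config)
        for idx, ids in enumerate(initial_clusters)
    ]
    with ProcessPoolExecutor() as pool:
        for subclusters, discarded in pool.map(_process_cluster_worker, work):
            all_subclusters.extend(subclusters)  # dicts with 'read_ids',
            all_discarded |= discarded           # 'initial_cluster_num', 'allele_combo'
    return all_subclusters, all_discarded
```
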
+ 
+ def _trim_primers_standalone(sequence: str, primers: Optional[List[Tuple[str, str]]]) -> Tuple[str, List[str]]:
+     """Trim primers from the start and end of a sequence. Standalone version for workers."""
+     if not primers:
+         return sequence, []
+ 
+     found_primers = []
+     trimmed_seq = sequence
+ 
+     # Look for a primer at the 5' end
+     best_start_dist = float('inf')
+     best_start_primer = None
+     best_start_end = None
+ 
+     for primer_name, primer_seq in primers:
+         k = len(primer_seq) // 4  # Allow ~25% errors
+         search_region = sequence[:len(primer_seq) * 2]
+ 
+         # mode="HW" finds the primer anywhere within the search region
+         result = edlib.align(primer_seq, search_region, task="path", mode="HW", k=k)
+ 
+         if result["editDistance"] != -1:
+             dist = result["editDistance"]
+             if dist < best_start_dist:
+                 best_start_dist = dist
+                 best_start_primer = primer_name
+                 best_start_end = result["locations"][0][1] + 1
+ 
+     if best_start_primer:
+         found_primers.append(f"5'-{best_start_primer}")
+         trimmed_seq = trimmed_seq[best_start_end:]
+ 
+     # Look for a primer at the 3' end
+     best_end_dist = float('inf')
+     best_end_primer = None
+     best_end_start = None
+ 
+     for primer_name, primer_seq in primers:
+         k = len(primer_seq) // 4  # Allow ~25% errors
+         search_region = sequence[-len(primer_seq) * 2:]
+ 
+         result = edlib.align(primer_seq, search_region, task="path", mode="HW", k=k)
+ 
+         if result["editDistance"] != -1:
+             dist = result["editDistance"]
+             if dist < best_end_dist:
+                 best_end_dist = dist
+                 best_end_primer = primer_name
+                 # Map the hit from search-region coordinates into trimmed_seq
+                 # coordinates (the tail is shared as long as trimmed_seq is
+                 # at least as long as the search region)
+                 base_pos = len(trimmed_seq) - len(search_region)
+                 best_end_start = base_pos + result["locations"][0][0]
+ 
+     if best_end_primer:
+         found_primers.append(f"3'-{best_end_primer}")
+         trimmed_seq = trimmed_seq[:best_end_start]
+ 
+     return trimmed_seq, found_primers
+ 
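
The trimming relies on edlib's infix ("HW") mode, which lets the primer float anywhere inside the search window, with `k` capping the allowed edit distance at roughly a quarter of the primer length. A self-contained illustration (sequences invented):

```python
import edlib

primer = "ACGTACGTACGTACGTACGT"                        # 20 bp -> k = 5
read_start = "TT" + "ACGTACGTACGTACGTACGA" + "GGGCCC"  # copy with 1 mismatch

k = len(primer) // 4
result = edlib.align(primer, read_start[:len(primer) * 2],
                     task="path", mode="HW", k=k)

assert result["editDistance"] == 1      # within the error budget
start, end = result["locations"][0]     # end coordinate is inclusive
trimmed = read_start[end + 1:]          # keep the sequence after the primer
print(start, end, trimmed)              # 2 21 GGGCCC
```
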
+ 
+ def _generate_cluster_consensus_worker(args) -> Dict:
+     """Worker function for parallel final consensus generation.
+ 
+     Must be at module level for ProcessPoolExecutor pickling.
+ 
+     Args:
+         args: Tuple of (final_idx, cluster_read_ids, sequences, qualities, config)
+             - final_idx: 1-based cluster index
+             - cluster_read_ids: Set of read IDs in this cluster
+             - sequences: Dict mapping read_id -> sequence
+             - qualities: Dict mapping read_id -> mean quality score
+             - config: ConsensusGenerationConfig
+ 
+     Returns:
+         Dict with all computed results for this cluster
+     """
+     final_idx, cluster_read_ids, sequences, qualities, config = args
+ 
+     cluster = cluster_read_ids
+     actual_size = len(cluster)
+ 
+     # Sort all cluster reads by quality for consistent output ordering
+     sorted_cluster_ids = sorted(
+         cluster,
+         key=lambda x: (-qualities.get(x, 0), x)
+     )
+ 
+     # Sample sequences for final consensus generation if needed
+     if len(cluster) > config.max_sample_size:
+         sorted_sampled_ids = sorted_cluster_ids[:config.max_sample_size]
+         sampled_ids = set(sorted_sampled_ids)
+     else:
+         sorted_sampled_ids = sorted_cluster_ids
+         sampled_ids = cluster
+ 
+     # Generate final consensus and MSA
+     sampled_seqs = {seq_id: sequences[seq_id] for seq_id in sorted_sampled_ids}
+     result = _run_spoa_for_cluster_worker(sampled_seqs, config.disable_homopolymer_equivalence)
+ 
+     # Defaults in case consensus generation failed
+     rid, rid_min = None, None
+     consensus = None
+     msa = None
+     iupac_count = 0
+     trimmed_consensus = None
+     found_primers = None
+ 
+     if result is not None:
+         consensus = result.consensus
+         msa = result.msa_string
+         alignments = result.alignments
+         # Calculate final identity metrics (mean and minimum)
+         rid, rid_min = _calculate_read_identity_standalone(alignments, consensus)
+ 
+     if consensus:
+         # Apply IUPAC ambiguity calling for unphased variant positions
+         if config.enable_iupac_calling:
+             consensus, iupac_count, iupac_details = call_iupac_ambiguities(
+                 consensus=consensus,
+                 alignments=result.alignments,
+                 msa_to_consensus_pos=result.msa_to_consensus_pos,
+                 min_variant_frequency=config.min_ambiguity_frequency,
+                 min_variant_count=config.min_ambiguity_count
+             )
+ 
+         # Perform primer trimming
+         if config.primers:
+             trimmed_consensus, found_primers = _trim_primers_standalone(consensus, config.primers)
+ 
+     return {
+         'final_idx': final_idx,
+         'cluster': cluster,
+         'actual_size': actual_size,
+         'consensus': consensus,
+         'trimmed_consensus': trimmed_consensus,
+         'found_primers': found_primers,
+         'rid': rid,
+         'rid_min': rid_min,
+         'msa': msa,
+         'sampled_ids': sampled_ids,
+         'sorted_cluster_ids': sorted_cluster_ids,
+         'sorted_sampled_ids': sorted_sampled_ids,
+         'iupac_count': iupac_count
+     }
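
A closing sketch of how the result dicts might be consumed: one task per final cluster, submitted with a shared ConsensusGenerationConfig, with outputs ordered by the 1-based `final_idx`. As before, the driver shown is illustrative rather than speconsense's actual orchestration code:

```python
from concurrent.futures import ProcessPoolExecutor

def generate_all_consensuses(final_clusters, sequences, qualities, config):
    """final_clusters: list of read-ID sets, one per final cluster."""
    work = [
        (idx + 1, read_ids, sequences, qualities, config)  # 1-based index
        for idx, read_ids in enumerate(final_clusters)
    ]
    with ProcessPoolExecutor() as pool:
        results = list(pool.map(_generate_cluster_consensus_worker, work))

    for r in sorted(results, key=lambda r: r['final_idx']):
        seq = r['trimmed_consensus'] or r['consensus']
        if seq:  # skip clusters where SPOA failed (consensus is None)
            print(f">cluster_{r['final_idx']} size={r['actual_size']}")
            print(seq)
```
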