speconsense 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
speconsense/core/workers.py
@@ -0,0 +1,696 @@
"""Module-level worker functions for ProcessPoolExecutor.

These must be at module level to be picklable for multiprocessing.
Includes standalone versions of functions used by workers and config classes.
"""

import logging
import os
import subprocess
import tempfile
from io import StringIO
from typing import Dict, List, Optional, Set, Tuple

import edlib
import numpy as np
from Bio import SeqIO

from speconsense.msa import (
    MSAResult,
    ReadAlignment,
    analyze_positional_variation,
    call_iupac_ambiguities,
    calculate_within_cluster_error,
    extract_alignments_from_msa,
    filter_qualifying_haplotypes,
    group_reads_by_single_position,
    is_variant_position_with_composition,
)


# Configuration classes for parallel processing

class ClusterProcessingConfig:
    """Configuration for parallel cluster processing.

    Passed to worker processes to avoid needing to pickle the entire SpecimenClusterer.
    """
    __slots__ = ['outlier_identity_threshold', 'enable_secondpass_phasing',
                 'disable_homopolymer_equivalence', 'min_variant_frequency', 'min_variant_count']

    def __init__(self, outlier_identity_threshold: Optional[float],
                 enable_secondpass_phasing: bool,
                 disable_homopolymer_equivalence: bool,
                 min_variant_frequency: float,
                 min_variant_count: int):
        self.outlier_identity_threshold = outlier_identity_threshold
        self.enable_secondpass_phasing = enable_secondpass_phasing
        self.disable_homopolymer_equivalence = disable_homopolymer_equivalence
        self.min_variant_frequency = min_variant_frequency
        self.min_variant_count = min_variant_count


class ConsensusGenerationConfig:
    """Configuration for parallel final consensus generation.

    Passed to worker processes to avoid needing to pickle the entire SpecimenClusterer.
    """
    __slots__ = ['max_sample_size', 'enable_iupac_calling', 'min_ambiguity_frequency',
                 'min_ambiguity_count', 'disable_homopolymer_equivalence', 'primers']

    def __init__(self, max_sample_size: int,
                 enable_iupac_calling: bool,
                 min_ambiguity_frequency: float,
                 min_ambiguity_count: int,
                 disable_homopolymer_equivalence: bool,
                 primers: Optional[List[Tuple[str, str]]] = None):
        self.max_sample_size = max_sample_size
        self.enable_iupac_calling = enable_iupac_calling
        self.min_ambiguity_frequency = min_ambiguity_frequency
        self.min_ambiguity_count = min_ambiguity_count
        self.disable_homopolymer_equivalence = disable_homopolymer_equivalence
        self.primers = primers
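Both config classes carry only plain, picklable fields (enforced by __slots__), so the parent process can hand them to worker processes without pickling the whole SpecimenClusterer. A minimal construction sketch; the parameter values and the ITS primer pair shown are illustrative placeholders, not package defaults:

# Illustrative values only, not speconsense defaults.
cluster_config = ClusterProcessingConfig(
    outlier_identity_threshold=0.85,
    enable_secondpass_phasing=True,
    disable_homopolymer_equivalence=False,
    min_variant_frequency=0.2,
    min_variant_count=3,
)
consensus_config = ConsensusGenerationConfig(
    max_sample_size=500,
    enable_iupac_calling=True,
    min_ambiguity_frequency=0.2,
    min_ambiguity_count=3,
    disable_homopolymer_equivalence=False,
    primers=[("ITS1F", "CTTGGTCATTTAGAGGAAGTAA"), ("ITS4", "TCCTCCGCTTATTGATATGC")],
)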
# Worker functions

def _run_spoa_worker(args: Tuple[int, Dict[str, str], bool]) -> Tuple[int, Optional[MSAResult]]:
    """Worker function for parallel SPOA execution.

    Must be at module level for ProcessPoolExecutor pickling.

    Args:
        args: Tuple of (cluster_idx, sampled_seqs, disable_homopolymer_equivalence)

    Returns:
        Tuple of (cluster_idx, MSAResult or None)
    """
    cluster_idx, sampled_seqs, disable_homopolymer_equivalence = args

    if not sampled_seqs:
        return cluster_idx, None

    try:
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.fasta') as f:
            for read_id, seq in sorted(sampled_seqs.items()):
                f.write(f">{read_id}\n{seq}\n")
            temp_input = f.name

        cmd = [
            "spoa", temp_input,
            "-r", "2", "-l", "1", "-m", "5", "-n", "-4", "-g", "-8", "-e", "-6",
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        os.unlink(temp_input)

        enable_normalization = not disable_homopolymer_equivalence
        alignments, consensus, msa_to_consensus_pos = extract_alignments_from_msa(
            result.stdout, enable_homopolymer_normalization=enable_normalization
        )

        if not consensus:
            return cluster_idx, None

        return cluster_idx, MSAResult(
            consensus=consensus,
            msa_string=result.stdout,
            alignments=alignments,
            msa_to_consensus_pos=msa_to_consensus_pos
        )
    except subprocess.CalledProcessError as e:
        logging.error(f"SPOA worker failed for cluster {cluster_idx}: return code {e.returncode}")
        return cluster_idx, None
    except Exception as e:
        logging.error(f"SPOA worker failed for cluster {cluster_idx}: {e}")
        return cluster_idx, None
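Because the worker takes a single picklable args tuple and returns the cluster index alongside its result, fan-out from the parent process is a plain executor.map. A minimal sketch, assuming clusters is a caller-side list of {read_id: sequence} dicts (not part of this module):

from concurrent.futures import ProcessPoolExecutor

# Assumed input: one {read_id: sequence} dict per cluster, prepared by the caller.
tasks = [(idx, seqs, False) for idx, seqs in enumerate(clusters)]

msa_results = {}
with ProcessPoolExecutor(max_workers=4) as pool:
    for cluster_idx, msa in pool.map(_run_spoa_worker, tasks):
        if msa is not None:
            msa_results[cluster_idx] = msa  # MSAResult: consensus, MSA string, per-read alignments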
def _run_spoa_for_cluster_worker(sequences: Dict[str, str],
                                 disable_homopolymer_equivalence: bool) -> Optional[MSAResult]:
    """Run SPOA for a set of sequences. Used by cluster processing worker."""
    if not sequences:
        return None

    try:
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.fasta') as f:
            for read_id, seq in sorted(sequences.items()):
                f.write(f">{read_id}\n{seq}\n")
            temp_input = f.name

        cmd = [
            "spoa", temp_input,
            "-r", "2", "-l", "1", "-m", "5", "-n", "-4", "-g", "-8", "-e", "-6",
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        os.unlink(temp_input)

        enable_normalization = not disable_homopolymer_equivalence
        alignments, consensus, msa_to_consensus_pos = extract_alignments_from_msa(
            result.stdout, enable_homopolymer_normalization=enable_normalization
        )

        if not consensus:
            return None

        return MSAResult(
            consensus=consensus,
            msa_string=result.stdout,
            alignments=alignments,
            msa_to_consensus_pos=msa_to_consensus_pos
        )
    except Exception:
        return None


def _identify_outlier_reads_standalone(alignments: List[ReadAlignment], consensus_seq: str,
                                       sampled_ids: Set[str], threshold: float) -> Tuple[Set[str], Set[str]]:
    """Identify outlier reads below identity threshold. Standalone version for workers."""
    if not alignments or not consensus_seq:
        return sampled_ids, set()

    try:
        keep_ids = set()
        outlier_ids = set()
        consensus_length = len(consensus_seq)
        if consensus_length == 0:
            return sampled_ids, set()

        for alignment in alignments:
            error_rate = alignment.normalized_edit_distance / consensus_length
            identity = 1.0 - error_rate
            if identity >= threshold:
                keep_ids.add(alignment.read_id)
            else:
                outlier_ids.add(alignment.read_id)

        return keep_ids, outlier_ids
    except Exception:
        return sampled_ids, set()


def _calculate_read_identity_standalone(alignments: List[ReadAlignment],
                                        consensus_seq: str) -> Tuple[Optional[float], Optional[float]]:
    """Calculate read identity metrics. Standalone version for workers."""
    if not alignments or not consensus_seq:
        return None, None

    try:
        consensus_length = len(consensus_seq)
        if consensus_length == 0:
            return None, None

        identities = []
        for alignment in alignments:
            error_rate = alignment.normalized_edit_distance / consensus_length
            identity = 1.0 - error_rate
            identities.append(identity)

        if not identities:
            return None, None

        return np.mean(identities), np.min(identities)
    except Exception:
        return None, None
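Both helpers above score each aligned read as identity = 1 - normalized_edit_distance / consensus_length; the first partitions reads against a threshold, the second reduces the same values to mean and minimum (the rid / rid_min metrics returned later by the consensus worker). A toy worked example with invented numbers:

# Toy numbers for illustration: a 600 bp consensus and three reads with edit distances 6, 12, 60.
consensus_length = 600
edit_distances = [6, 12, 60]
identities = [1.0 - d / consensus_length for d in edit_distances]     # [0.99, 0.98, 0.90]

mean_identity = sum(identities) / len(identities)                     # ~0.957 -> "rid"
min_identity = min(identities)                                        # 0.90   -> "rid_min"
outliers = [i for i, ident in enumerate(identities) if ident < 0.95]  # [2]: third read fails a 0.95 threshold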
def _detect_variant_positions_standalone(alignments: List[ReadAlignment], consensus_seq: str,
                                         msa_to_consensus_pos: Dict[int, Optional[int]],
                                         min_variant_frequency: float,
                                         min_variant_count: int) -> List[Dict]:
    """Detect variant positions in MSA. Standalone version for workers."""
    if not alignments or not consensus_seq:
        return []

    try:
        msa_length = len(alignments[0].aligned_sequence)
        consensus_aligned = []
        for msa_pos in range(msa_length):
            cons_pos = msa_to_consensus_pos.get(msa_pos)
            if cons_pos is not None:
                consensus_aligned.append(consensus_seq[cons_pos])
            else:
                consensus_aligned.append('-')
        consensus_aligned = ''.join(consensus_aligned)

        position_stats = analyze_positional_variation(alignments, consensus_aligned, msa_to_consensus_pos)

        variant_positions = []
        for pos_stat in position_stats:
            is_variant, variant_bases, reason = is_variant_position_with_composition(
                pos_stat,
                min_variant_frequency=min_variant_frequency,
                min_variant_count=min_variant_count
            )
            if is_variant:
                variant_positions.append({
                    'msa_position': pos_stat.msa_position,
                    'consensus_position': pos_stat.consensus_position,
                    'coverage': pos_stat.coverage,
                    'variant_bases': variant_bases,
                    'base_composition': pos_stat.base_composition,
                    'homopolymer_composition': pos_stat.homopolymer_composition,
                    'error_rate': pos_stat.error_rate,
                    'reason': reason
                })

        return variant_positions
    except Exception:
        return []


def _recursive_phase_cluster_standalone(
    read_ids: Set[str],
    read_sequences: Dict[str, str],
    path: List[str],
    depth: int,
    config: ClusterProcessingConfig
) -> Tuple[List[Tuple[List[str], str, Set[str]]], Set[str]]:
    """Recursively phase a cluster. Standalone version for workers."""
    total_reads = len(read_ids)

    # Base case: cluster too small
    if total_reads < config.min_variant_count * 2:
        leaf_seqs = {rid: read_sequences[rid] for rid in read_ids}
        result = _run_spoa_for_cluster_worker(leaf_seqs, config.disable_homopolymer_equivalence)
        consensus = result.consensus if result else ""
        return [(path, consensus, read_ids)], set()

    # Generate MSA
    cluster_seqs = {rid: read_sequences[rid] for rid in read_ids}
    result = _run_spoa_for_cluster_worker(cluster_seqs, config.disable_homopolymer_equivalence)

    if result is None:
        return [(path, "", read_ids)], set()

    consensus = result.consensus
    alignments = result.alignments
    msa_to_consensus_pos = result.msa_to_consensus_pos

    # Detect variants
    variant_positions = _detect_variant_positions_standalone(
        alignments, consensus, msa_to_consensus_pos,
        config.min_variant_frequency, config.min_variant_count
    )

    if not variant_positions:
        return [(path, consensus, read_ids)], set()

    # Parse MSA for consensus_aligned
    msa_handle = StringIO(result.msa_string)
    records = list(SeqIO.parse(msa_handle, 'fasta'))

    consensus_aligned = None
    for record in records:
        if 'Consensus' in record.description or 'Consensus' in record.id:
            consensus_aligned = str(record.seq).upper()
            break

    if not consensus_aligned:
        return [(path, consensus, read_ids)], set()

    # Build read to alignment mapping
    read_to_alignment = {a.read_id: a for a in alignments}

    # Extract alleles at variant positions
    variant_msa_positions = sorted([v['msa_position'] for v in variant_positions])
    read_to_position_alleles = {}

    for read_id in read_ids:
        alignment = read_to_alignment.get(read_id)
        if not alignment:
            continue

        aligned_seq = alignment.aligned_sequence
        score_aligned = alignment.score_aligned

        position_alleles = {}
        for msa_pos in variant_msa_positions:
            if msa_pos < len(aligned_seq):
                allele = aligned_seq[msa_pos]
                if score_aligned and msa_pos < len(score_aligned):
                    if score_aligned[msa_pos] == '=':
                        allele = consensus_aligned[msa_pos]
                position_alleles[msa_pos] = allele
            else:
                position_alleles[msa_pos] = '-'

        read_to_position_alleles[read_id] = position_alleles

    # Find best split
    best_pos = None
    best_error = float('inf')
    best_qualifying = None
    best_non_qualifying = None
    all_positions = set(variant_msa_positions)

    for pos in variant_msa_positions:
        allele_groups = group_reads_by_single_position(
            read_to_position_alleles, pos, set(read_to_position_alleles.keys())
        )
        qualifying, non_qualifying = filter_qualifying_haplotypes(
            allele_groups, total_reads, config.min_variant_count, config.min_variant_frequency
        )
        if len(qualifying) < 2:
            continue

        error = calculate_within_cluster_error(qualifying, read_to_position_alleles, {pos}, all_positions)
        if error < best_error:
            best_error = error
            best_pos = pos
            best_qualifying = qualifying
            best_non_qualifying = non_qualifying

    if best_pos is None:
        return [(path, consensus, read_ids)], set()

    # Collect deferred reads
    all_deferred = set()
    for allele, reads in best_non_qualifying.items():
        all_deferred.update(reads)

    # Recurse
    all_leaves = []
    for allele, sub_read_ids in sorted(best_qualifying.items()):
        new_path = path + [allele]
        sub_leaves, sub_deferred = _recursive_phase_cluster_standalone(
            sub_read_ids, read_sequences, new_path, depth + 1, config
        )
        all_leaves.extend(sub_leaves)
        all_deferred.update(sub_deferred)

    return all_leaves, all_deferred


def _phase_reads_by_variants_standalone(
    cluster_read_ids: Set[str],
    sequences: Dict[str, str],
    variant_positions: List[Dict],
    config: ClusterProcessingConfig
) -> List[Tuple[str, Set[str]]]:
    """Phase reads into haplotypes. Standalone version for workers."""
    if not variant_positions:
        return [(None, cluster_read_ids)]

    try:
        read_sequences = {rid: sequences[rid] for rid in cluster_read_ids if rid in sequences}
        if not read_sequences:
            return [(None, cluster_read_ids)]

        logging.debug(f"Recursive phasing with MSA regeneration: {len(variant_positions)} initial variants, {len(read_sequences)} reads")

        leaves, deferred = _recursive_phase_cluster_standalone(
            set(read_sequences.keys()), read_sequences, [], 0, config
        )

        if len(leaves) <= 1 and not deferred:
            if leaves:
                path, consensus, reads = leaves[0]
                if not path:
                    return [(None, cluster_read_ids)]
            return [(None, cluster_read_ids)]

        if len(leaves) == 1:
            return [(None, cluster_read_ids)]

        logging.debug(f"Recursive phasing: {len(leaves)} leaf haplotypes, {len(deferred)} deferred reads")

        # Reassign deferred reads
        if deferred:
            leaf_reads_updated = {tuple(path): set(reads) for path, consensus, reads in leaves}
            leaf_consensuses = {tuple(path): consensus for path, consensus, reads in leaves}

            for read_id in deferred:
                if read_id not in read_sequences:
                    continue

                read_seq = read_sequences[read_id]
                min_distance = float('inf')
                nearest_path = None

                for path_tuple, consensus in leaf_consensuses.items():
                    if not consensus:
                        continue
                    result = edlib.align(read_seq, consensus)
                    distance = result['editDistance']
                    if distance < min_distance:
                        min_distance = distance
                        nearest_path = path_tuple

                if nearest_path is not None:
                    leaf_reads_updated[nearest_path].add(read_id)

            # Log and decide on phasing
            total_phased = sum(len(reads) for reads in leaf_reads_updated.values())
            if total_phased < 2:
                return [(None, cluster_read_ids)]

            logging.debug(f"Phasing decision: SPLITTING cluster into {len(leaf_reads_updated)} haplotypes")

            return [('-'.join(path), reads) for path, reads in sorted(leaf_reads_updated.items(), key=lambda x: -len(x[1]))]
        else:
            logging.debug(f"Phasing decision: SPLITTING cluster into {len(leaves)} haplotypes")
            return [('-'.join(path), reads) for path, consensus, reads in sorted(leaves, key=lambda x: -len(x[2]))]

    except Exception as e:
        logging.warning(f"Failed to phase reads: {e}")
        return [(None, cluster_read_ids)]


def _process_cluster_worker(args) -> Tuple[List[Dict], Set[str]]:
    """Worker function for parallel cluster processing.

    Must be at module level for ProcessPoolExecutor pickling.

    Args:
        args: Tuple of (initial_idx, cluster_ids, sequences, qualities, config)

    Returns:
        Tuple of (subclusters, discarded_read_ids)
    """
    initial_idx, cluster_ids, sequences, qualities, config = args

    subclusters = []
    discarded_ids = set()

    # Sort by quality
    sorted_ids = sorted(cluster_ids, key=lambda x: (-qualities.get(x, 0), x))

    # Generate consensus and MSA
    cluster_seqs = {seq_id: sequences[seq_id] for seq_id in sorted_ids}
    result = _run_spoa_for_cluster_worker(cluster_seqs, config.disable_homopolymer_equivalence)

    if result is None:
        logging.warning(f"Initial cluster {initial_idx}: Failed to generate consensus, skipping")
        # Track these reads as discarded since we couldn't generate consensus
        discarded_ids.update(cluster_ids)
        return subclusters, discarded_ids

    consensus = result.consensus
    msa = result.msa_string
    alignments = result.alignments
    msa_to_consensus_pos = result.msa_to_consensus_pos

    _calculate_read_identity_standalone(alignments, consensus)

    cluster = set(cluster_ids)

    # Outlier removal
    if config.outlier_identity_threshold is not None:
        keep_ids, outlier_ids = _identify_outlier_reads_standalone(
            alignments, consensus, cluster, config.outlier_identity_threshold
        )

        if outlier_ids:
            if len(cluster) == 2 and len(outlier_ids) == 1:
                logging.debug(f"Initial cluster {initial_idx}: Split 2-read cluster due to 1 outlier")
                for read_id in cluster:
                    subclusters.append({
                        'read_ids': {read_id},
                        'initial_cluster_num': initial_idx,
                        'allele_combo': 'single-read-split'
                    })
                return subclusters, discarded_ids

            logging.debug(f"Initial cluster {initial_idx}: Removing {len(outlier_ids)}/{len(cluster)} outlier reads, "
                          f"regenerating consensus")

            discarded_ids.update(outlier_ids)
            cluster = cluster - outlier_ids

            # Regenerate consensus
            sorted_ids_filtered = sorted(cluster, key=lambda x: (-qualities.get(x, 0), x))
            cluster_seqs = {seq_id: sequences[seq_id] for seq_id in sorted_ids_filtered}
            result = _run_spoa_for_cluster_worker(cluster_seqs, config.disable_homopolymer_equivalence)

            if result is not None:
                consensus = result.consensus
                msa = result.msa_string
                alignments = result.alignments
                msa_to_consensus_pos = result.msa_to_consensus_pos
                _calculate_read_identity_standalone(alignments, consensus)

    # Detect variants
    variant_positions = []
    if consensus and alignments and config.enable_secondpass_phasing:
        variant_positions = _detect_variant_positions_standalone(
            alignments, consensus, msa_to_consensus_pos,
            config.min_variant_frequency, config.min_variant_count
        )

        if variant_positions:
            logging.debug(f"Initial cluster {initial_idx}: Detected {len(variant_positions)} variant positions")

    # Phase reads
    phased_haplotypes = _phase_reads_by_variants_standalone(
        cluster, sequences, variant_positions, config
    )

    for haplotype_idx, (allele_combo, haplotype_reads) in enumerate(phased_haplotypes):
        subclusters.append({
            'read_ids': haplotype_reads,
            'initial_cluster_num': initial_idx,
            'allele_combo': allele_combo
        })

    return subclusters, discarded_ids


def _trim_primers_standalone(sequence: str, primers: Optional[List[Tuple[str, str]]]) -> Tuple[str, List[str]]:
    """Trim primers from start and end of sequence. Standalone version for workers."""
    if not primers:
        return sequence, []

    found_primers = []
    trimmed_seq = sequence

    # Look for primer at 5' end
    best_start_dist = float('inf')
    best_start_primer = None
    best_start_end = None

    for primer_name, primer_seq in primers:
        k = len(primer_seq) // 4  # Allow ~25% errors
        search_region = sequence[:len(primer_seq) * 2]

        result = edlib.align(primer_seq, search_region, task="path", mode="HW", k=k)

        if result["editDistance"] != -1:
            dist = result["editDistance"]
            if dist < best_start_dist:
                best_start_dist = dist
                best_start_primer = primer_name
                best_start_end = result["locations"][0][1] + 1

    if best_start_primer:
        found_primers.append(f"5'-{best_start_primer}")
        trimmed_seq = trimmed_seq[best_start_end:]

    # Look for primer at 3' end
    best_end_dist = float('inf')
    best_end_primer = None
    best_end_start = None

    for primer_name, primer_seq in primers:
        k = len(primer_seq) // 4  # Allow ~25% errors
        search_region = sequence[-len(primer_seq) * 2:]

        result = edlib.align(primer_seq, search_region, task="path", mode="HW", k=k)

        if result["editDistance"] != -1:
            dist = result["editDistance"]
            if dist < best_end_dist:
                best_end_dist = dist
                best_end_primer = primer_name
                base_pos = len(trimmed_seq) - len(search_region)
                best_end_start = base_pos + result["locations"][0][0]

    if best_end_primer:
        found_primers.append(f"3'-{best_end_primer}")
        trimmed_seq = trimmed_seq[:best_end_start]

    return trimmed_seq, found_primers
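The trimmer searches each end of the consensus with edlib in infix ("HW") mode, tolerating roughly 25% mismatches per primer, and reports which primers it removed. A minimal usage sketch with placeholder primer names and sequences:

# Placeholder primers and consensus, for illustration only.
primers = [("fwd", "ACGTACGTACGTACGTACGT"), ("rev", "CCTTGGAACCTTGGAACCTT")]
consensus = "ACGTACGTACGTACGTACGT" + "TTAGCATTAGGC" * 20 + "CCTTGGAACCTTGGAACCTT"

trimmed, found = _trim_primers_standalone(consensus, primers)
# found   -> ["5'-fwd", "3'-rev"]   (best hit at each end within the error budget)
# trimmed -> the 240 bp insert with both primer copies removed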
def _generate_cluster_consensus_worker(args) -> Dict:
    """Worker function for parallel final consensus generation.

    Must be at module level for ProcessPoolExecutor pickling.

    Args:
        args: Tuple of (final_idx, cluster_read_ids, sequences, qualities, config)
            - final_idx: 1-based cluster index
            - cluster_read_ids: Set of read IDs in this cluster
            - sequences: Dict mapping read_id -> sequence
            - qualities: Dict mapping read_id -> mean quality score
            - config: ConsensusGenerationConfig

    Returns:
        Dict with all computed results for this cluster
    """
    final_idx, cluster_read_ids, sequences, qualities, config = args

    cluster = cluster_read_ids
    actual_size = len(cluster)

    # Sort all cluster reads by quality for consistent output ordering
    sorted_cluster_ids = sorted(
        cluster,
        key=lambda x: (-qualities.get(x, 0), x)
    )

    # Sample sequences for final consensus generation if needed
    if len(cluster) > config.max_sample_size:
        sorted_sampled_ids = sorted_cluster_ids[:config.max_sample_size]
        sampled_ids = set(sorted_sampled_ids)
    else:
        sorted_sampled_ids = sorted_cluster_ids
        sampled_ids = cluster

    # Generate final consensus and MSA
    sampled_seqs = {seq_id: sequences[seq_id] for seq_id in sorted_sampled_ids}
    result = _run_spoa_for_cluster_worker(sampled_seqs, config.disable_homopolymer_equivalence)

    # Calculate final identity metrics
    rid, rid_min = None, None
    consensus = None
    msa = None
    iupac_count = 0
    trimmed_consensus = None
    found_primers = None

    if result is not None:
        consensus = result.consensus
        msa = result.msa_string
        alignments = result.alignments
        rid, rid_min = _calculate_read_identity_standalone(alignments, consensus)

    if consensus:
        # Apply IUPAC ambiguity calling for unphased variant positions
        if config.enable_iupac_calling:
            consensus, iupac_count, iupac_details = call_iupac_ambiguities(
                consensus=consensus,
                alignments=result.alignments,
                msa_to_consensus_pos=result.msa_to_consensus_pos,
                min_variant_frequency=config.min_ambiguity_frequency,
                min_variant_count=config.min_ambiguity_count
            )

        # Perform primer trimming
        if config.primers:
            trimmed_consensus, found_primers = _trim_primers_standalone(consensus, config.primers)

    return {
        'final_idx': final_idx,
        'cluster': cluster,
        'actual_size': actual_size,
        'consensus': consensus,
        'trimmed_consensus': trimmed_consensus,
        'found_primers': found_primers,
        'rid': rid,
        'rid_min': rid_min,
        'msa': msa,
        'sampled_ids': sampled_ids,
        'sorted_cluster_ids': sorted_cluster_ids,
        'sorted_sampled_ids': sorted_sampled_ids,
        'iupac_count': iupac_count
    }
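A minimal sketch of how the parent process might drive this final stage and consume the result dict; final_clusters, sequences, qualities, and consensus_config are assumed caller-side inputs (consensus_config being a ConsensusGenerationConfig like the one sketched earlier), and the output formatting is illustrative only:

from concurrent.futures import ProcessPoolExecutor

# Assumed inputs: final_clusters is a list of read-ID sets; sequences/qualities map read_id -> data.
tasks = [
    (idx + 1, read_ids, sequences, qualities, consensus_config)  # final_idx is 1-based per the docstring
    for idx, read_ids in enumerate(final_clusters)
]

with ProcessPoolExecutor(max_workers=4) as pool:
    for res in pool.map(_generate_cluster_consensus_worker, tasks):
        seq = res['trimmed_consensus'] or res['consensus']  # prefer the primer-trimmed sequence
        if seq:
            print(f">cluster_{res['final_idx']} size={res['actual_size']} "
                  f"rid={res['rid']} iupac={res['iupac_count']}")
            print(seq)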