speconsense-0.7.2-py3-none-any.whl
This diff represents the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
speconsense/summarize/merging.py
@@ -0,0 +1,606 @@
"""MSA-based variant merging for speconsense-summarize.
|
|
2
|
+
|
|
3
|
+
Provides functions for finding and merging compatible variants within HAC groups
|
|
4
|
+
using exhaustive subset evaluation with SPOA multiple sequence alignment.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import itertools
|
|
8
|
+
import logging
|
|
9
|
+
from typing import List, Tuple, Dict
|
|
10
|
+
from collections import defaultdict
|
|
11
|
+
|
|
12
|
+
from speconsense.types import ConsensusInfo, OverlapMergeInfo
|
|
13
|
+
|
|
14
|
+
from .iupac import merge_bases_to_iupac, primers_are_same
|
|
15
|
+
from .analysis import (
|
|
16
|
+
run_spoa_msa,
|
|
17
|
+
analyze_msa_columns,
|
|
18
|
+
analyze_msa_columns_overlap_aware,
|
|
19
|
+
MAX_MSA_MERGE_VARIANTS, # Kept for backward compatibility
|
|
20
|
+
compute_merge_batch_size,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def generate_all_subsets_by_size(variants: List[ConsensusInfo]) -> List[Tuple[int, ...]]:
|
|
25
|
+
"""
|
|
26
|
+
Generate all possible non-empty subsets of variant indices.
|
|
27
|
+
Returns subsets in descending order by total cluster size.
|
|
28
|
+
|
|
29
|
+
This exhaustive approach guarantees finding the globally optimal merge
|
|
30
|
+
when the number of variants is small (<= MAX_MSA_MERGE_VARIANTS).
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
variants: List of variants to generate subsets from
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
List of tuples of indices, sorted by total size descending
|
|
37
|
+
"""
|
|
38
|
+
n = len(variants)
|
|
39
|
+
sizes = [v.size for v in variants]
|
|
40
|
+
|
|
41
|
+
# Build list of (total_size, subset_indices) tuples
|
|
42
|
+
candidates = []
|
|
43
|
+
|
|
44
|
+
# Generate all non-empty subsets
|
|
45
|
+
for r in range(n, 0, -1): # From largest to smallest subset size
|
|
46
|
+
for indices in itertools.combinations(range(n), r):
|
|
47
|
+
total_size = sum(sizes[i] for i in indices)
|
|
48
|
+
candidates.append((total_size, indices))
|
|
49
|
+
|
|
50
|
+
# Sort by total size descending
|
|
51
|
+
candidates.sort(reverse=True, key=lambda x: x[0])
|
|
52
|
+
|
|
53
|
+
# Return just the subset indices
|
|
54
|
+
return [subset for _, subset in candidates]
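
# Worked example (editor's sketch, not part of the package API): for three
# variants with cluster sizes [5, 3, 1], subsets come back ordered by total
# size, so the full set is always evaluated before any pair or singleton:
#
#   generate_all_subsets_by_size(variants)
#   # -> [(0, 1, 2),   # total size 9
#   #     (0, 1),      # total size 8
#   #     (0, 2),      # total size 6
#   #     (0,),        # total size 5
#   #     (1, 2),      # total size 4
#   #     (1,),        # total size 3
#   #     (2,)]        # total size 1
#
# With n variants this enumerates 2**n - 1 subsets, which is why callers cap
# each batch via compute_merge_batch_size().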


def is_compatible_subset(variant_stats: dict, args, prior_positions: dict = None) -> bool:
    """
    Check if variant statistics are within merge limits.

    By default, homopolymer indels are ignored (treated as compatible) to match
    adjusted-identity homopolymer normalization semantics where AAA ~ AAAA.
    Only structural indels count against the limits.

    When --disable-homopolymer-equivalence is set, homopolymer indels are treated
    the same as structural indels and count against merge limits.

    Args:
        variant_stats: Statistics from MSA analysis (snp_count, indel counts, etc.)
        args: Command-line arguments with merge parameters
        prior_positions: Optional dict with cumulative counts from prior merge rounds
                         {'snp_count': N, 'indel_count': M} - these are added to
                         current stats when checking limits for iterative merging
    """
    if prior_positions is None:
        prior_positions = {'snp_count': 0, 'indel_count': 0}

    # Check SNP limit
    if variant_stats['snp_count'] > 0 and not args.merge_snp:
        return False

    # Determine which indels to count based on homopolymer equivalence setting
    if args.disable_homopolymer_equivalence:
        # Count both structural and homopolymer indels
        indel_count = variant_stats['structural_indel_count'] + variant_stats['homopolymer_indel_count']
        indel_length = max(variant_stats['structural_indel_length'],
                           variant_stats['homopolymer_indel_length'])
    else:
        # Only count structural indels (homopolymer indels ignored)
        indel_count = variant_stats['structural_indel_count']
        indel_length = variant_stats['structural_indel_length']

    # Check indel limits
    if indel_count > 0:
        if args.merge_indel_length == 0:
            return False
        if indel_length > args.merge_indel_length:
            return False

    # Check total position count (including prior merge rounds)
    total_positions = (variant_stats['snp_count'] + prior_positions['snp_count'] +
                       indel_count + prior_positions['indel_count'])
    if total_positions > args.merge_position_count:
        return False

    return True
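
# Hedged usage sketch (editor's illustration; this Namespace is hypothetical
# and simply mirrors the merge flags referenced in the docstring above):
#
#   from argparse import Namespace
#   args = Namespace(merge_snp=True, merge_indel_length=3,
#                    merge_position_count=5,
#                    disable_homopolymer_equivalence=False)
#   stats = {'snp_count': 2,
#            'structural_indel_count': 1, 'structural_indel_length': 2,
#            'homopolymer_indel_count': 4, 'homopolymer_indel_length': 1}
#   is_compatible_subset(stats, args)
#   # -> True: 2 SNPs + 1 structural indel = 3 positions <= 5, the indel
#   #    length 2 <= 3, and the 4 homopolymer indels are ignored by default.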


def create_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
    """
    Generate consensus from MSA using size-weighted majority voting.

    At each position:
    - Weight each variant by cluster size
    - Choose majority representation (base vs gap)
    - For multiple bases, generate IUPAC code representing all variants

    Important: All gaps (including terminal) count as variant positions
    since variants share the same primers.

    Args:
        aligned_seqs: MSA sequences with gaps as '-'
        variants: Original ConsensusInfo objects (for size weighting)

    Returns:
        ConsensusInfo with merged consensus sequence
    """
    consensus_seq = []
    snp_count = 0
    alignment_length = len(aligned_seqs[0].seq)

    for col_idx in range(alignment_length):
        column = [str(seq.seq[col_idx]) for seq in aligned_seqs]

        # Weight each base/gap by cluster size
        votes_with_size = [(base, variants[i].size) for i, base in enumerate(column)]

        # Count size-weighted votes (EXACT match only, no IUPAC expansion)
        votes = defaultdict(int)
        for base, size in votes_with_size:
            votes[base.upper()] += size

        # Separate gap votes from base votes
        gap_votes = votes.get('-', 0)
        base_votes = {b: v for b, v in votes.items() if b != '-'}

        # Determine if position should be included
        total_base_votes = sum(base_votes.values())

        if total_base_votes > gap_votes:
            # Majority wants a base - include position
            if len(base_votes) == 1:
                # Single base - no ambiguity
                consensus_seq.append(list(base_votes.keys())[0])
            else:
                # Multiple bases - generate IUPAC code (expanding any existing IUPAC codes)
                represented_bases = set(base_votes.keys())
                iupac_code = merge_bases_to_iupac(represented_bases)
                consensus_seq.append(iupac_code)
                snp_count += 1
        # else: majority wants gap, omit position

    # Create merged ConsensusInfo
    consensus_sequence = ''.join(consensus_seq)
    total_size = sum(v.size for v in variants)
    total_ric = sum(v.ric for v in variants)

    # Collect RiC values, preserving any prior merge history
    raw_ric_values = []
    for v in variants:
        if v.raw_ric:
            raw_ric_values.extend(v.raw_ric)  # Flatten prior merge history
        else:
            raw_ric_values.append(v.ric)
    raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None

    # Collect lengths, preserving any prior merge history
    raw_len_values = []
    for v in variants:
        if v.raw_len:
            raw_len_values.extend(v.raw_len)  # Flatten prior merge history
        else:
            raw_len_values.append(len(v.sequence))
    raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None

    # Use name from largest variant
    largest_variant = max(variants, key=lambda v: v.size)

    return ConsensusInfo(
        sample_name=largest_variant.sample_name,
        cluster_id=largest_variant.cluster_id,
        sequence=consensus_sequence,
        ric=total_ric,
        size=total_size,
        file_path=largest_variant.file_path,
        snp_count=snp_count if snp_count > 0 else None,
        primers=largest_variant.primers,
        raw_ric=raw_ric_values,
        raw_len=raw_len_values,
        rid=largest_variant.rid,  # Preserve identity metrics from largest variant
        rid_min=largest_variant.rid_min,
    )
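
# Worked example (editor's sketch): with variants of sizes 5, 3 and 2, a
# column [C, T, -] has base votes 5 + 3 = 8 vs gap votes 2, so the position
# is kept and {C, T} collapses to the IUPAC code Y (snp_count += 1). A column
# [-, A, A] has base votes 5 vs gap votes 5; the strict '>' test fails, so
# the tie goes to the gap and the position is dropped.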


def create_overlap_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
    """
    Generate consensus from MSA where sequences may have different lengths.

    For overlap merging (primer pools with different endpoints):
    - In overlap region: Use size-weighted majority voting
    - In non-overlap regions: Keep content from whichever sequence(s) have it

    This produces a consensus spanning the union of all input sequences.

    Args:
        aligned_seqs: MSA sequences with gaps as '-'
        variants: Original ConsensusInfo objects (for size weighting)

    Returns:
        ConsensusInfo with merged consensus sequence spanning full length
    """
    consensus_seq = []
    snp_count = 0
    alignment_length = len(aligned_seqs[0].seq)

    # Find content region for each sequence
    content_regions = []
    for seq in aligned_seqs:
        seq_str = str(seq.seq)
        first_base = next((i for i, c in enumerate(seq_str) if c != '-'), 0)
        last_base = alignment_length - 1 - next(
            (i for i, c in enumerate(reversed(seq_str)) if c != '-'), 0
        )
        content_regions.append((first_base, last_base))

    # Calculate overlap region
    overlap_start = max(start for start, _ in content_regions)
    overlap_end = min(end for _, end in content_regions)

    # Process each column
    for col_idx in range(alignment_length):
        column = [str(seq.seq[col_idx]) for seq in aligned_seqs]

        # Determine which sequences have content at this position
        seqs_with_content = []
        for i, (start, end) in enumerate(content_regions):
            if start <= col_idx <= end:
                seqs_with_content.append(i)

        if not seqs_with_content:
            # No sequence has content here (shouldn't happen in valid MSA)
            continue

        # Check if we're in the overlap region
        in_overlap = overlap_start <= col_idx <= overlap_end

        if in_overlap:
            # Overlap region: use size-weighted majority voting (like original)
            votes_with_size = [(column[i], variants[i].size) for i in seqs_with_content]

            votes = defaultdict(int)
            for base, size in votes_with_size:
                votes[base.upper()] += size

            gap_votes = votes.get('-', 0)
            base_votes = {b: v for b, v in votes.items() if b != '-'}
            total_base_votes = sum(base_votes.values())

            if total_base_votes > gap_votes:
                if len(base_votes) == 1:
                    consensus_seq.append(list(base_votes.keys())[0])
                else:
                    represented_bases = set(base_votes.keys())
                    iupac_code = merge_bases_to_iupac(represented_bases)
                    consensus_seq.append(iupac_code)
                    snp_count += 1
            # else: majority wants gap in overlap, omit position
        else:
            # Non-overlap region: keep content from available sequences
            # (don't let gap votes from sequences that don't extend here remove content)
            bases_only = [column[i] for i in seqs_with_content if column[i] != '-']

            if bases_only:
                # Weight by size for consistency
                votes = defaultdict(int)
                for i in seqs_with_content:
                    if column[i] != '-':
                        votes[column[i].upper()] += variants[i].size

                if len(votes) == 1:
                    consensus_seq.append(list(votes.keys())[0])
                else:
                    represented_bases = set(votes.keys())
                    iupac_code = merge_bases_to_iupac(represented_bases)
                    consensus_seq.append(iupac_code)
                    snp_count += 1

    # Create merged ConsensusInfo
    consensus_sequence = ''.join(consensus_seq)
    total_size = sum(v.size for v in variants)
    total_ric = sum(v.ric for v in variants)

    # Collect RiC values, preserving any prior merge history
    raw_ric_values = []
    for v in variants:
        if v.raw_ric:
            raw_ric_values.extend(v.raw_ric)  # Flatten prior merge history
        else:
            raw_ric_values.append(v.ric)
    raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None

    # Collect lengths, preserving any prior merge history
    raw_len_values = []
    for v in variants:
        if v.raw_len:
            raw_len_values.extend(v.raw_len)  # Flatten prior merge history
        else:
            raw_len_values.append(len(v.sequence))
    raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None

    # Use name from largest variant
    largest_variant = max(variants, key=lambda v: v.size)

    return ConsensusInfo(
        sample_name=largest_variant.sample_name,
        cluster_id=largest_variant.cluster_id,
        sequence=consensus_sequence,
        ric=total_ric,
        size=total_size,
        file_path=largest_variant.file_path,
        snp_count=snp_count if snp_count > 0 else None,
        primers=largest_variant.primers,
        raw_ric=raw_ric_values,
        raw_len=raw_len_values,
        rid=largest_variant.rid,
        rid_min=largest_variant.rid_min,
    )
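
# Worked example (editor's sketch): two primer-pool variants aligned as
#
#   seq1 (size 6):  ACGTACGT----
#   seq2 (size 4):  ----ACGTTTTT
#
# have content_regions (0, 7) and (4, 11), so the overlap spans columns 4-7
# and is resolved by weighted voting. Columns 0-3 and 8-11 lie outside the
# overlap, where the other sequence's terminal gaps carry no votes, so the
# merged consensus keeps the union: ACGTACGTTTTT.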


def merge_group_with_msa(variants: List[ConsensusInfo], args) -> Tuple[List[ConsensusInfo], Dict, int, List[OverlapMergeInfo]]:
    """
    Find largest mergeable subset of variants using MSA-based evaluation with exhaustive search.

    Algorithm:
    1. Process variants in batches of up to MAX_MSA_MERGE_VARIANTS
    2. For each batch, run SPOA MSA once
    3. Exhaustively evaluate ALL subsets by total size (descending)
    4. Merge the best compatible subset found
    5. Remove merged variants and repeat with remaining
    6. When overlap mode is enabled, iterate the entire process on merged results
       until no more merges happen (handles prefix+suffix+full scenarios)

    This approach guarantees optimal results when N <= MAX_MSA_MERGE_VARIANTS.
    For N > MAX, processes top MAX per round (potentially suboptimal globally).

    Iterative merging (overlap mode only):
    - After first pass, merged results are fed back for another round
    - Cumulative SNP/indel counts are tracked across rounds
    - Continues until no merges occur in a round

    Args:
        variants: List of ConsensusInfo from HAC group
        args: Command-line arguments with merge parameters

    Returns:
        (merged_variants, merge_traceability, potentially_suboptimal, overlap_merges) where:
        - merged_variants is list of merged ConsensusInfo objects
        - traceability maps merged names to original cluster names
        - potentially_suboptimal is 1 if group had >MAX variants, 0 otherwise
        - overlap_merges is list of OverlapMergeInfo for quality reporting
    """
    if len(variants) == 1:
        return variants, {}, 0, []

    # Compute batch size based on effort and group size
    effort = getattr(args, 'merge_effort_value', 10)  # Default to balanced
    batch_size = compute_merge_batch_size(len(variants), effort)

    # Track if this group is potentially suboptimal (too many variants for global optimum)
    potentially_suboptimal = 1 if len(variants) > batch_size else 0

    all_traceability = {}
    overlap_merges = []  # Track overlap merge events for quality reporting

    # For iterative merging in overlap mode, we may need multiple rounds
    current_variants = variants
    iteration = 0
    max_iterations = 10  # Safety limit to prevent infinite loops

    while iteration < max_iterations:
        iteration += 1

        # Sort variants by size (largest first)
        remaining_variants = sorted(current_variants, key=lambda v: v.size, reverse=True)
        merged_results = []
        merges_this_iteration = 0

        while remaining_variants:
            # Take up to batch_size candidates (dynamically computed based on effort and group size)
            candidates = remaining_variants[:batch_size]

            # Apply size ratio filter if enabled (relative to largest in batch)
            if args.merge_min_size_ratio > 0:
                largest_size = candidates[0].size
                filtered_candidates = [v for v in candidates
                                       if (v.size / largest_size) >= args.merge_min_size_ratio]
                if len(filtered_candidates) < len(candidates):
                    filtered_count = len(candidates) - len(filtered_candidates)
                    logging.debug(f"Filtered out {filtered_count} variants with size ratio < {args.merge_min_size_ratio} relative to largest (size={largest_size})")
                candidates = filtered_candidates

            # Single candidate - just pass through
            if len(candidates) == 1:
                merged_results.append(candidates[0])
                remaining_variants.remove(candidates[0])
                continue

            if iteration > 1:
                logging.debug(f"Iteration {iteration}: Evaluating {len(candidates)} variants "
                              f"(batch_size={batch_size}) for merging")
            else:
                logging.debug(f"Evaluating {len(candidates)} variants (batch_size={batch_size}, "
                              f"effort={effort}) for merging (exhaustive subset search)")

            # Determine if overlap mode should be used for this merge batch
            # Same primers -> use global mode (chimeras have same primers but different lengths)
            # Different primers -> use overlap mode (legitimate primer pool variation)
            all_same_primers = all(
                primers_are_same(candidates[0].primers, v.primers)
                for v in candidates[1:]
            ) if len(candidates) > 1 else True
            use_overlap_mode = args.min_merge_overlap > 0 and not all_same_primers

            if args.min_merge_overlap > 0 and all_same_primers and len(candidates) > 1:
                # Log when primer constraint prevents overlap merging
                primer_str = ','.join(candidates[0].primers) if candidates[0].primers else 'unknown'
                logging.debug(f"Same primers [{primer_str}] detected - using global alignment instead of overlap")

            # Run SPOA MSA on candidates
            # Use local alignment mode (0) for overlap merging to get clean terminal gaps
            # Use global alignment mode (1) for standard same-length merging
            sequences = [v.sequence for v in candidates]
            spoa_mode = 0 if use_overlap_mode else 1
            aligned_seqs = run_spoa_msa(sequences, alignment_mode=spoa_mode)

            logging.debug(f"Generated MSA with length {len(aligned_seqs[0].seq)}")

            # Generate ALL subsets sorted by total size (exhaustive search)
            all_subsets = generate_all_subsets_by_size(candidates)

            logging.debug(f"Evaluating {len(all_subsets)} candidate subsets")

            # Find first (largest) compatible subset
            merged_this_round = False
            for subset_indices in all_subsets:
                subset_variants = [candidates[i] for i in subset_indices]
                subset_aligned = [aligned_seqs[i] for i in subset_indices]

                # Analyze MSA for this subset
                if use_overlap_mode:
                    # Use overlap-aware analysis for primer pool scenarios
                    original_lengths = [len(v.sequence) for v in subset_variants]
                    variant_stats = analyze_msa_columns_overlap_aware(
                        subset_aligned, args.min_merge_overlap, original_lengths
                    )

                    # Check overlap requirement
                    shorter_len = min(original_lengths)
                    effective_threshold = min(args.min_merge_overlap, shorter_len)
                    if variant_stats['overlap_bp'] < effective_threshold:
                        # Insufficient overlap - skip this subset
                        continue
                else:
                    # Use standard analysis
                    variant_stats = analyze_msa_columns(subset_aligned)

                # Calculate cumulative positions from input sequences (for iterative merging)
                # Each sequence may carry positions from prior merges
                prior_snps = sum(v.snp_count or 0 for v in subset_variants)
                prior_indels = sum(v.merge_indel_count or 0 for v in subset_variants)
                prior_positions = {'snp_count': prior_snps, 'indel_count': prior_indels}

                # Check compatibility against merge limits (including cumulative positions)
                if is_compatible_subset(variant_stats, args, prior_positions):
                    # Only log "mergeable subset" message for actual merges (>1 variant)
                    if len(subset_indices) > 1:
                        # Build detailed variant description
                        parts = []
                        if variant_stats['snp_count'] > 0:
                            parts.append(f"{variant_stats['snp_count']} SNPs")
                        if variant_stats['structural_indel_count'] > 0:
                            parts.append(f"{variant_stats['structural_indel_count']} structural indels")
                        if variant_stats['homopolymer_indel_count'] > 0:
                            parts.append(f"{variant_stats['homopolymer_indel_count']} homopolymer indels")

                        variant_desc = ", ".join(parts) if parts else "identical sequences"
                        iter_prefix = f"Iteration {iteration}: " if iteration > 1 else ""
                        if use_overlap_mode:
                            # Include prefix/suffix extension info for overlap merges
                            prefix_bp = variant_stats.get('prefix_bp', 0)
                            suffix_bp = variant_stats.get('suffix_bp', 0)
                            logging.info(f"{iter_prefix}Found mergeable subset of {len(subset_indices)} variants "
                                         f"(overlap={variant_stats.get('overlap_bp', 'N/A')}bp, "
                                         f"prefix={prefix_bp}bp, suffix={suffix_bp}bp): {variant_desc}")

                            # DEBUG: Show span details for each sequence in the merge
                            content_regions = variant_stats.get('content_regions', [])
                            if content_regions:
                                spans = [f"seq{i+1}=({s},{e})" for i, (s, e) in enumerate(content_regions)]
                                logging.debug(f"Merge spans: {', '.join(spans)}")
                        else:
                            logging.info(f"{iter_prefix}Found mergeable subset of {len(subset_indices)} variants: {variant_desc}")

                    # Calculate total positions for cumulative tracking
                    # Total = prior positions from input sequences + new positions from this merge
                    if args.disable_homopolymer_equivalence:
                        this_merge_indels = variant_stats['structural_indel_count'] + variant_stats['homopolymer_indel_count']
                    else:
                        this_merge_indels = variant_stats['structural_indel_count']
                    total_snps = prior_snps + variant_stats['snp_count']
                    total_indels = prior_indels + this_merge_indels

                    # Create merged consensus
                    if len(subset_indices) == 1:
                        # Single variant - use directly, preserving raw_ric and other metadata
                        merged_consensus = subset_variants[0]
                    elif use_overlap_mode:
                        # Use overlap-aware consensus generation
                        merged_consensus = create_overlap_consensus_from_msa(
                            subset_aligned, subset_variants
                        )
                    else:
                        merged_consensus = create_consensus_from_msa(
                            subset_aligned, subset_variants
                        )

                    # Update merged consensus with cumulative position counts for iterative tracking
                    if len(subset_indices) > 1:
                        merged_consensus = merged_consensus._replace(
                            snp_count=total_snps if total_snps > 0 else None,
                            merge_indel_count=total_indels if total_indels > 0 else None
                        )

                    # Track merge provenance - expand any intermediate merges
                    # so we always trace back to the original cluster names
                    original_clusters = []
                    for v in subset_variants:
                        if v.sample_name in all_traceability:
                            # This variant was itself merged, expand to its originals
                            original_clusters.extend(all_traceability[v.sample_name])
                        else:
                            original_clusters.append(v.sample_name)
                    traceability = {
                        merged_consensus.sample_name: original_clusters
                    }
                    all_traceability.update(traceability)

                    # Track overlap merge for quality reporting
                    if use_overlap_mode and len(subset_indices) > 1:
                        # Extract specimen name (remove cluster suffix like -c1)
                        specimen = merged_consensus.sample_name.rsplit('-c', 1)[0] if '-c' in merged_consensus.sample_name else merged_consensus.sample_name
                        overlap_merges.append(OverlapMergeInfo(
                            specimen=specimen,
                            iteration=iteration,
                            input_clusters=[v.sample_name for v in subset_variants],
                            input_lengths=[len(v.sequence) for v in subset_variants],
                            input_rics=[v.ric for v in subset_variants],
                            overlap_bp=variant_stats.get('overlap_bp', 0),
                            prefix_bp=variant_stats.get('prefix_bp', 0),
                            suffix_bp=variant_stats.get('suffix_bp', 0),
                            output_length=len(merged_consensus.sequence)
                        ))

                    # Add merged consensus to results
                    merged_results.append(merged_consensus)

                    # Remove merged variants from remaining pool
                    for v in subset_variants:
                        if v in remaining_variants:
                            remaining_variants.remove(v)

                    merged_this_round = True
                    if len(subset_indices) > 1:
                        merges_this_iteration += 1
                    break

            # If no merge found, keep largest variant as-is and continue
            if not merged_this_round:
                logging.debug(f"No compatible merge found for largest variant (size={candidates[0].size})")
                merged_results.append(candidates[0])
                remaining_variants.remove(candidates[0])

        # Check if we should do another iteration (overlap mode only)
        if args.min_merge_overlap > 0 and merges_this_iteration > 0 and len(merged_results) > 1:
            # More merges might be possible with the new merged sequences
            # Cumulative positions are tracked per-sequence via snp_count and merge_indel_count
            logging.debug(f"Iteration {iteration} complete: {merges_this_iteration} merges, "
                          f"{len(merged_results)} variants remaining, trying another round")
            current_variants = merged_results
        else:
            # No more iterations needed
            if iteration > 1:
                logging.debug(f"Iterative merging complete after {iteration} iterations")
            break

    return merged_results, all_traceability, potentially_suboptimal, overlap_merges
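
# Minimal end-to-end sketch (editor's illustration; the Namespace below is
# hypothetical - real callers build these values from the
# speconsense-summarize CLI options):
#
#   from argparse import Namespace
#   args = Namespace(merge_snp=True, merge_indel_length=3,
#                    merge_position_count=5, merge_min_size_ratio=0.0,
#                    min_merge_overlap=0,
#                    disable_homopolymer_equivalence=False)
#   merged, trace, suboptimal, overlaps = merge_group_with_msa(hac_group, args)
#   # merged:     one ConsensusInfo per surviving (possibly merged) variant
#   # trace:      {merged sample_name: [original cluster names]}
#   # suboptimal: 1 if the group exceeded the exhaustive-search batch size
#   # overlaps:   OverlapMergeInfo records for quality reporting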