speconsense-0.7.2-py3-none-any.whl
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
speconsense/summarize/analysis.py

@@ -0,0 +1,780 @@
"""MSA analysis and quality assessment for speconsense-summarize.
|
|
2
|
+
|
|
3
|
+
Provides functions for analyzing multiple sequence alignments, detecting outliers,
|
|
4
|
+
identifying indel events, and assessing cluster quality.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import logging
|
|
10
|
+
import subprocess
|
|
11
|
+
import tempfile
|
|
12
|
+
from typing import List, Dict, Optional, Tuple, NamedTuple
|
|
13
|
+
from io import StringIO
|
|
14
|
+
|
|
15
|
+
import edlib
|
|
16
|
+
import numpy as np
|
|
17
|
+
from Bio import SeqIO
|
|
18
|
+
from Bio.SeqRecord import SeqRecord
|
|
19
|
+
from Bio.Seq import Seq
|
|
20
|
+
|
|
21
|
+
from speconsense.types import ConsensusInfo
|
|
22
|
+
from speconsense.msa import (
|
|
23
|
+
extract_alignments_from_msa,
|
|
24
|
+
analyze_positional_variation,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
from .iupac import IUPAC_EQUIV
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Maximum number of variants to evaluate for MSA-based merging (legacy constant)
|
|
31
|
+
# Batch size is now dynamically computed based on --merge-effort and group size.
|
|
32
|
+
# This constant is kept for backward compatibility and as the default MAX_MERGE_BATCH.
|
|
33
|
+
MAX_MSA_MERGE_VARIANTS = 8
|
|
34
|
+
|
|
35
|
+
# Merge effort batch size limits
|
|
36
|
+
MIN_MERGE_BATCH = 4
|
|
37
|
+
MAX_MERGE_BATCH = 8
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def compute_merge_batch_size(group_size: int, effort: int) -> int:
|
|
41
|
+
"""Compute batch size for a group based on effort level.
|
|
42
|
+
|
|
43
|
+
Uses formula: B = E + 1 - log2(V), clamped to [MIN_MERGE_BATCH, MAX_MERGE_BATCH]
|
|
44
|
+
This keeps expected evaluations near 2^E per group.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
group_size: Number of variants in the HAC group
|
|
48
|
+
effort: Merge effort level (6-14, default 10)
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
Batch size between MIN_MERGE_BATCH and MAX_MERGE_BATCH
|
|
52
|
+
"""
|
|
53
|
+
import math
|
|
54
|
+
|
|
55
|
+
if group_size <= 1:
|
|
56
|
+
return 1
|
|
57
|
+
|
|
58
|
+
log_v = int(math.log2(group_size))
|
|
59
|
+
batch = effort + 1 - log_v
|
|
60
|
+
|
|
61
|
+
return max(MIN_MERGE_BATCH, min(MAX_MERGE_BATCH, batch))
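
# Illustrative sketch (not in the original module): batch sizes produced by
# compute_merge_batch_size() at the default effort of 10, following the clamp
# formula above.
#   group_size=2    -> log2(V)=1 -> 10 + 1 - 1 = 10 -> clamped to MAX_MERGE_BATCH (8)
#   group_size=16   -> log2(V)=4 -> 10 + 1 - 4 = 7
#   group_size=100  -> log2(V)=6 -> 10 + 1 - 6 = 5
#   group_size=1000 -> log2(V)=9 -> 10 + 1 - 9 = 2 -> clamped to MIN_MERGE_BATCH (4)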


class ClusterQualityData(NamedTuple):
    """Quality metrics for a cluster (no visualization matrix)."""
    consensus_seq: str
    position_error_rates: List[float]  # Per-position error rates (0-1) in consensus space
    position_error_counts: List[int]  # Per-position error counts in consensus space
    read_identities: List[float]  # Per-read identity scores (0-1)
    position_stats: Optional[List] = None  # Detailed PositionStats for debugging (optional)


def identify_outliers(final_consensus: List, all_raw_consensuses: List, source_folder: str) -> Dict:
    """Identify sequences with low read identity using statistical outlier detection.

    Flags sequences with mean read identity (rid) below (mean - 2*std) for the dataset.
    This identifies the ~2.5% lowest values that may warrant review.

    Note: rid_min (minimum read identity) is not used because single outlier reads
    don't significantly impact consensus quality. Positional analysis better captures
    systematic issues like mixed clusters or variants.

    Args:
        final_consensus: List of final consensus sequences
        all_raw_consensuses: List of all raw consensus sequences (unused, kept for API compatibility)
        source_folder: Source directory (unused, kept for API compatibility)

    Returns:
        Dictionary with:
        {
            'statistical_outliers': List of (cons, rid),
            'no_issues': List of consensus sequences with good quality,
            'global_stats': {'mean_rid', 'std_rid', 'stat_threshold_rid'}
        }
    """
    # Calculate global statistics for all sequences with identity metrics
    all_rids = []

    for cons in final_consensus:
        if cons.rid is not None:
            all_rids.append(cons.rid)

    # Calculate mean and std for statistical outlier detection
    mean_rid = np.mean(all_rids) if all_rids else 1.0
    std_rid = np.std(all_rids) if len(all_rids) > 1 else 0.0

    # Threshold for statistical outliers (2 standard deviations below mean)
    stat_threshold_rid = mean_rid - 2 * std_rid

    # Categorize sequences
    statistical = []
    no_issues = []

    for cons in final_consensus:
        rid = cons.rid if cons.rid is not None else 1.0

        if rid < stat_threshold_rid:
            statistical.append((cons, rid))
        else:
            no_issues.append(cons)

    return {
        'statistical_outliers': statistical,
        'no_issues': no_issues,
        'global_stats': {
            'mean_rid': mean_rid,
            'std_rid': std_rid,
            'stat_threshold_rid': stat_threshold_rid
        }
    }
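
# Illustrative sketch (not in the original module): with ten clusters whose rid
# values are nine at 0.99 and one at 0.90, mean_rid = 0.981 and std_rid = 0.027,
# so stat_threshold_rid = 0.981 - 2*0.027 = 0.927 and only the 0.90 cluster
# lands in 'statistical_outliers'.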


def analyze_positional_identity_outliers(
    consensus_info,
    source_folder: str,
    min_variant_frequency: float,
    min_variant_count: int
) -> Optional[Dict]:
    """Analyze positional error rates and identify high-error positions.

    Args:
        consensus_info: ConsensusInfo object for the sequence
        source_folder: Source directory containing cluster_debug folder
        min_variant_frequency: Global threshold for flagging positions (from metadata)
        min_variant_count: Minimum variant count for phasing (from metadata)

    Returns:
        Dictionary with positional analysis:
        {
            'num_outlier_positions': int,
            'mean_outlier_error_rate': float,  # Mean error rate across outlier positions only
            'total_nucleotide_errors': int,  # Sum of error counts at outlier positions
            'outlier_threshold': float,
            'outlier_positions': List of (position, error_rate, error_count) tuples
        }
        Returns None if MSA file not found or analysis fails

    Note: Error rates already exclude homopolymer length differences due to
    homopolymer normalization in analyze_positional_variation()
    """
    # Skip analysis for low-RiC sequences (insufficient data for meaningful statistics)
    # Need at least 2 * min_variant_count to confidently phase two variants
    min_ric_threshold = 2 * min_variant_count
    if consensus_info.ric < min_ric_threshold:
        logging.debug(f"Skipping positional analysis for {consensus_info.sample_name}: "
                      f"RiC {consensus_info.ric} < {min_ric_threshold}")
        return None

    # Construct path to MSA file
    debug_dir = os.path.join(source_folder, "cluster_debug")

    # Try to find the MSA file
    # MSA files use the original cluster naming (e.g., "specimen-c1")
    # not the summarized naming (e.g., "specimen-1.v1")
    msa_file = None

    # Extract specimen name and cluster ID
    # consensus_info.sample_name might be "specimen-1.v1" (summarized)
    # consensus_info.cluster_id should be "-c1" (original cluster)

    # Build the base name from specimen + cluster_id
    # If sample_name is "ONT01.23-...-1.v1" and cluster_id is "-c1"
    # we need to reconstruct "ONT01.23-...-c1"

    sample_name = consensus_info.sample_name
    cluster_id = consensus_info.cluster_id

    # Remove any HAC group/variant suffix from sample_name to get specimen base
    # Pattern: "-\d+\.v\d+" (e.g., "-1.v1")
    specimen_base = re.sub(r'-\d+\.v\d+$', '', sample_name)

    # Reconstruct original cluster name
    original_cluster_name = f"{specimen_base}{cluster_id}"

    # Look for the MSA file with correct extension
    msa_fasta = os.path.join(debug_dir, f"{original_cluster_name}-RiC{consensus_info.ric}-msa.fasta")
    if os.path.exists(msa_fasta):
        msa_file = msa_fasta

    if not msa_file:
        logging.debug(f"No MSA file found for {original_cluster_name}")
        return None

    # Analyze cluster quality using core.py's positional analysis
    quality_data = analyze_cluster_quality(msa_file, consensus_info.sequence)

    if not quality_data or not quality_data.position_error_rates:
        logging.debug(f"Failed to analyze cluster quality for {original_cluster_name}")
        return None

    position_error_rates = quality_data.position_error_rates
    position_error_counts = quality_data.position_error_counts
    position_stats = quality_data.position_stats

    # Use global min_variant_frequency as threshold
    # Positions above this could be undetected/unphased variants
    threshold = min_variant_frequency
    outlier_positions = [
        (i, rate, count)
        for i, (rate, count) in enumerate(zip(position_error_rates, position_error_counts))
        if rate > threshold
    ]

    # Build detailed outlier info including base composition
    outlier_details = []
    if position_stats:
        for i, rate, count in outlier_positions:
            if i < len(position_stats):
                ps = position_stats[i]
                outlier_details.append({
                    'consensus_position': ps.consensus_position,
                    'msa_position': ps.msa_position,
                    'error_rate': rate,
                    'error_count': count,
                    'coverage': ps.coverage,
                    'consensus_nucleotide': ps.consensus_nucleotide,
                    'base_composition': dict(ps.base_composition),
                    'homopolymer_composition': dict(ps.homopolymer_composition) if ps.homopolymer_composition else {},
                    'sub_count': ps.sub_count,
                    'ins_count': ps.ins_count,
                    'del_count': ps.del_count,
                })

    # Calculate statistics for outlier positions only
    if outlier_positions:
        mean_outlier_error = np.mean([rate for _, rate, _ in outlier_positions])
        total_nucleotide_errors = sum(count for _, _, count in outlier_positions)
    else:
        mean_outlier_error = 0.0
        total_nucleotide_errors = 0

    return {
        'num_outlier_positions': len(outlier_positions),
        'mean_outlier_error_rate': mean_outlier_error,
        'total_nucleotide_errors': total_nucleotide_errors,
        'outlier_threshold': threshold,
        'outlier_positions': outlier_positions,
        'outlier_details': outlier_details,
        'consensus_seq': quality_data.consensus_seq,
        'ric': consensus_info.ric,
    }
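
# Illustrative sketch (not in the original module) of the name reconstruction
# above: with sample_name "ONT01.23-XYZ-1.v1" and cluster_id "-c1", re.sub
# strips the "-1.v1" suffix, giving specimen_base "ONT01.23-XYZ" and
# original_cluster_name "ONT01.23-XYZ-c1"; with ric=42 the file probed is
# cluster_debug/ONT01.23-XYZ-c1-RiC42-msa.fasta.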


def run_spoa_msa(sequences: List[str], alignment_mode: int = 1) -> List:
    """
    Run SPOA to create multiple sequence alignment.

    Args:
        sequences: List of DNA sequence strings
        alignment_mode: SPOA alignment mode:
            0 = local (Smith-Waterman) - best for overlap merging
            1 = global (Needleman-Wunsch) - default, for same-length sequences
            2 = semi-global - alternative for overlap merging

    Returns:
        List of SeqRecord objects with aligned sequences (including gaps)
    """
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as temp_input:
        try:
            # Write sequences to temporary file
            records = [
                SeqRecord(Seq(seq), id=f"seq{i}", description="")
                for i, seq in enumerate(sequences)
            ]
            SeqIO.write(records, temp_input, "fasta")
            temp_input.flush()

            # Run SPOA with alignment output (-r 2) and specified alignment mode
            result = subprocess.run(
                ['spoa', temp_input.name, '-r', '2', '-l', str(alignment_mode)],
                capture_output=True,
                text=True,
                check=True
            )

            # Parse aligned sequences from SPOA output
            aligned_sequences = []
            lines = result.stdout.strip().split('\n')
            current_id = None
            current_seq = []

            for line in lines:
                if line.startswith('>'):
                    if current_id is not None:
                        # Skip consensus sequence (usually last)
                        if not current_id.startswith('Consensus'):
                            aligned_sequences.append(SeqRecord(
                                Seq(''.join(current_seq)),
                                id=current_id,
                                description=""
                            ))
                    current_id = line[1:]
                    current_seq = []
                elif line.strip():
                    current_seq.append(line.strip())

            # Add last sequence (if not consensus)
            if current_id is not None and not current_id.startswith('Consensus'):
                aligned_sequences.append(SeqRecord(
                    Seq(''.join(current_seq)),
                    id=current_id,
                    description=""
                ))

            return aligned_sequences

        finally:
            if os.path.exists(temp_input.name):
                os.unlink(temp_input.name)
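
# Illustrative usage (not in the original module; requires the `spoa` binary on
# PATH): a global alignment of two near-identical variants. All returned
# records share the alignment length, with '-' marking gaps.
#
#   aligned = run_spoa_msa(["ACGTACGTAC", "ACGTACGAC"], alignment_mode=1)
#   for rec in aligned:
#       print(rec.id, rec.seq)  # e.g. seq0 ACGTACGTAC / seq1 ACGTACG-AC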


def identify_indel_events(aligned_seqs: List, alignment_length: int) -> List[Tuple[int, int]]:
    """
    Identify consecutive runs of indel columns (events).

    An indel event is a maximal consecutive run of columns containing gaps.
    Each event represents a single biological insertion or deletion.

    Args:
        aligned_seqs: List of aligned sequences from SPOA
        alignment_length: Length of the alignment

    Returns:
        List of (start_col, end_col) tuples, where end_col is inclusive
    """
    events = []
    in_event = False
    start_col = None

    for col_idx in range(alignment_length):
        column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
        has_gap = '-' in column
        has_bases = any(c != '-' for c in column)

        # Indel column: mix of gaps and bases
        if has_gap and has_bases:
            if not in_event:
                # Start new event
                in_event = True
                start_col = col_idx
        else:
            # Not an indel column (either all gaps or all bases)
            if in_event:
                # End current event
                events.append((start_col, col_idx - 1))
                in_event = False

    # Handle event that extends to end of alignment
    if in_event:
        events.append((start_col, alignment_length - 1))

    return events
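
# Illustrative sketch (not in the original module): for the aligned pair
#   ATA--GCA
#   ATATTGC-
# columns 3-4 form one event (gaps in seq0 opposite bases in seq1) and column 7
# forms a second, so identify_indel_events(seqs, 8) returns [(3, 4), (7, 7)].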


def is_homopolymer_event(aligned_seqs: List, start_col: int, end_col: int) -> bool:
    """
    Classify a complete indel event as homopolymer or structural.

    An event is homopolymer if:
    1. All bases in the event region (across all sequences, all columns) are identical
    2. At least one flanking solid column has all sequences showing the same base

    This matches adjusted-identity semantics where AAA ~ AAAA.

    Examples:
        Homopolymer: ATAAA--GC vs ATAAAAGC (event has all A's, flanked by A)
        Structural: ATAA-GC vs ATG-AGC (event has A, flanked by A vs G)
        Structural: ATC--GC vs ATCATGC (event has A and T - not homopolymer)

    Args:
        aligned_seqs: List of aligned sequences from SPOA
        start_col: First column of the indel event (inclusive)
        end_col: Last column of the indel event (inclusive)

    Returns:
        True if homopolymer event, False if structural
    """
    # Extract all bases from the event region (excluding gaps)
    bases_in_event = set()
    for col_idx in range(start_col, end_col + 1):
        column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
        bases_in_event.update(c for c in column if c != '-')

    # Must have exactly one base type across the entire event
    if len(bases_in_event) != 1:
        return False

    event_base = list(bases_in_event)[0]
    alignment_length = len(aligned_seqs[0].seq)

    # Check flanking columns for matching homopolymer context
    # A valid flanking column must:
    # 1. Not be an indel column (all sequences have bases, no gaps)
    # 2. All bases match the event base

    # Check left flank
    if start_col > 0:
        left_col = start_col - 1
        left_column = [str(seq.seq[left_col]) for seq in aligned_seqs]
        left_bases = set(c for c in left_column if c != '-')
        left_has_gap = '-' in left_column

        if not left_has_gap and left_bases == {event_base}:
            return True

    # Check right flank
    if end_col < alignment_length - 1:
        right_col = end_col + 1
        right_column = [str(seq.seq[right_col]) for seq in aligned_seqs]
        right_bases = set(c for c in right_column if c != '-')
        right_has_gap = '-' in right_column

        if not right_has_gap and right_bases == {event_base}:
            return True

    # No valid homopolymer flanking found
    return False
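
# Illustrative sketch (not in the original module): for the aligned pair
#   ATAAA-GC
#   ATAAAAGC
# the event at column 5 contains only A's and the left flank (column 4) is all
# A's with no gaps, so is_homopolymer_event(seqs, 5, 5) is True; the same
# single-base event flanked by non-matching bases would classify as structural.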


def analyze_msa_columns(aligned_seqs: List) -> dict:
    """
    Analyze aligned sequences to count SNPs and indels.

    Distinguishes between structural indels (real insertions/deletions) and
    homopolymer indels (length differences in homopolymer runs like AAA vs AAAA).

    Uses event-based classification: consecutive indel columns are grouped into
    events, and each complete event is classified as homopolymer or structural.

    Important: All gaps (including terminal gaps) count as variant positions
    since variants within a group share the same primers.

    Returns dict with:
        'snp_count': number of positions with >1 non-gap base
        'structural_indel_count': number of structural indel events
        'structural_indel_length': length of longest structural indel event
        'homopolymer_indel_count': number of homopolymer indel events
        'homopolymer_indel_length': length of longest homopolymer indel event
        'indel_count': total indel events (for backward compatibility)
        'max_indel_length': max indel event length (for backward compatibility)
    """
    alignment_length = len(aligned_seqs[0].seq)

    # Step 1: Count SNPs
    snp_count = 0
    for col_idx in range(alignment_length):
        column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
        unique_bases = set(c for c in column if c != '-')
        has_gap = '-' in column

        # SNP position: multiple different bases with NO gaps
        # Columns with gaps are indels, not SNPs
        if len(unique_bases) > 1 and not has_gap:
            snp_count += 1

    # Step 2: Identify indel events (consecutive runs of indel columns)
    indel_events = identify_indel_events(aligned_seqs, alignment_length)

    # Step 3: Classify each event as homopolymer or structural
    structural_events = []
    homopolymer_events = []

    for start_col, end_col in indel_events:
        if is_homopolymer_event(aligned_seqs, start_col, end_col):
            homopolymer_events.append((start_col, end_col))
        else:
            structural_events.append((start_col, end_col))

    # Step 4: Calculate statistics
    # Count is number of events (not columns)
    structural_indel_count = len(structural_events)
    homopolymer_indel_count = len(homopolymer_events)

    # Length is the size of the longest event
    structural_indel_length = max((end - start + 1 for start, end in structural_events), default=0)
    homopolymer_indel_length = max((end - start + 1 for start, end in homopolymer_events), default=0)

    # Backward compatibility: total events and max length
    total_indel_count = structural_indel_count + homopolymer_indel_count
    max_indel_length = max(structural_indel_length, homopolymer_indel_length)

    return {
        'snp_count': snp_count,
        'structural_indel_count': structural_indel_count,
        'structural_indel_length': structural_indel_length,
        'homopolymer_indel_count': homopolymer_indel_count,
        'homopolymer_indel_length': homopolymer_indel_length,
        'indel_count': total_indel_count,  # Backward compatibility
        'max_indel_length': max_indel_length  # Backward compatibility
    }
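
# Illustrative usage (not in the original module; assumes run_spoa_msa() above
# and the `spoa` binary on PATH):
#
#   aligned = run_spoa_msa([variant_a, variant_b], alignment_mode=1)
#   stats = analyze_msa_columns(aligned)
#   mergeable = (stats['snp_count'] == 0 and stats['structural_indel_count'] == 0)
#
# Treating homopolymer-only differences as mergeable is a sketch of how these
# counts might be consumed, not a quote of the actual calling code.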


def analyze_msa_columns_overlap_aware(aligned_seqs: List, min_overlap_bp: int,
                                      original_lengths: List[int]) -> dict:
    """
    Analyze MSA columns, distinguishing terminal gaps from structural indels.

    Terminal gaps (from length differences at sequence ends) are NOT counted
    as structural indels when sequences have sufficient overlap in their
    shared region. This enables merging sequences from primer pools with
    different endpoints.

    Args:
        aligned_seqs: List of aligned sequences from SPOA
        min_overlap_bp: Minimum overlap required (0 to disable overlap mode)
        original_lengths: Original ungapped sequence lengths

    Returns dict with:
        'snp_count': SNPs in overlap region
        'structural_indel_count': Structural indels in overlap region only
        'structural_indel_length': Length of longest structural indel
        'homopolymer_indel_count': Homopolymer indels (anywhere)
        'homopolymer_indel_length': Length of longest homopolymer indel
        'terminal_gap_columns': Number of terminal gap columns (not counted as structural)
        'overlap_bp': Size of overlap region in base pairs
        'prefix_bp': Extension before overlap region (for logging)
        'suffix_bp': Extension after overlap region (for logging)
        'content_regions': List of (start, end) tuples per sequence (for span logging)
        'indel_count': Total events (backward compatibility)
        'max_indel_length': Max event length (backward compatibility)
    """
    alignment_length = len(aligned_seqs[0].seq)

    # Step 1: Find content region for each sequence (first non-gap to last non-gap)
    content_regions = []  # List of (start, end) tuples
    for seq in aligned_seqs:
        seq_str = str(seq.seq)
        # Find first and last non-gap positions
        first_base = next((i for i, c in enumerate(seq_str) if c != '-'), 0)
        last_base = alignment_length - 1 - next(
            (i for i, c in enumerate(reversed(seq_str)) if c != '-'), 0
        )
        content_regions.append((first_base, last_base))

    # Step 2: Calculate overlap region (intersection of all content regions)
    overlap_start = max(start for start, _ in content_regions)
    overlap_end = min(end for _, end in content_regions)

    # Calculate union region (for prefix/suffix extension reporting)
    union_start = min(start for start, _ in content_regions)
    union_end = max(end for _, end in content_regions)
    prefix_bp = overlap_start - union_start
    suffix_bp = union_end - overlap_end

    # Calculate actual overlap in base pairs (count only columns where all have bases)
    overlap_bp = 0
    if overlap_end >= overlap_start:
        for col_idx in range(overlap_start, overlap_end + 1):
            column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
            if all(c != '-' for c in column):
                overlap_bp += 1

    # Determine effective threshold for containment cases
    shorter_len = min(original_lengths)
    effective_threshold = min(min_overlap_bp, shorter_len)

    # Step 3: Count SNPs only within overlap region
    snp_count = 0
    for col_idx in range(overlap_start, overlap_end + 1):
        column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
        unique_bases = set(c for c in column if c != '-')
        has_gap = '-' in column

        # SNP position: multiple different bases with NO gaps
        if len(unique_bases) > 1 and not has_gap:
            snp_count += 1

    # Step 4: Identify indel events, but only count those within overlap region
    indel_events = identify_indel_events(aligned_seqs, alignment_length)

    # Step 5: Classify each event and determine if it's in overlap region
    structural_events = []
    homopolymer_events = []
    terminal_gap_columns = 0

    for start_col, end_col in indel_events:
        # Check if this event is entirely within the overlap region
        is_in_overlap = (start_col >= overlap_start and end_col <= overlap_end)

        # Check if this is a terminal gap event (at the boundary of a content region)
        is_terminal = False
        for seq_start, seq_end in content_regions:
            # Terminal if event is adjacent to or outside a sequence's content region
            if end_col < seq_start or start_col > seq_end:
                is_terminal = True
                break
            # Also terminal if event is at the very edge of content
            if start_col == seq_start or end_col == seq_end:
                # Check if the gaps in this event are from this sequence's terminal
                for col_idx in range(start_col, end_col + 1):
                    column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
                    for i, (s, e) in enumerate(content_regions):
                        if col_idx < s or col_idx > e:
                            if column[i] == '-':
                                is_terminal = True
                                break
                    if is_terminal:
                        break

        if is_terminal and overlap_bp >= effective_threshold:
            # Terminal gap from length difference - don't count as structural
            terminal_gap_columns += (end_col - start_col + 1)
        elif is_homopolymer_event(aligned_seqs, start_col, end_col):
            homopolymer_events.append((start_col, end_col))
        else:
            # Only count as structural if within overlap region
            if is_in_overlap:
                structural_events.append((start_col, end_col))
            else:
                # Outside overlap - this is a terminal gap
                terminal_gap_columns += (end_col - start_col + 1)

    # Step 6: Calculate statistics
    structural_indel_count = len(structural_events)
    homopolymer_indel_count = len(homopolymer_events)

    structural_indel_length = max((end - start + 1 for start, end in structural_events), default=0)
    homopolymer_indel_length = max((end - start + 1 for start, end in homopolymer_events), default=0)

    # Backward compatibility
    total_indel_count = structural_indel_count + homopolymer_indel_count
    max_indel_length = max(structural_indel_length, homopolymer_indel_length)

    return {
        'snp_count': snp_count,
        'structural_indel_count': structural_indel_count,
        'structural_indel_length': structural_indel_length,
        'homopolymer_indel_count': homopolymer_indel_count,
        'homopolymer_indel_length': homopolymer_indel_length,
        'terminal_gap_columns': terminal_gap_columns,
        'overlap_bp': overlap_bp,
        'prefix_bp': prefix_bp,
        'suffix_bp': suffix_bp,
        'content_regions': content_regions,
        'indel_count': total_indel_count,  # Backward compatibility
        'max_indel_length': max_indel_length  # Backward compatibility
    }
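
# Illustrative sketch (not in the original module) of the containment case
# above: with min_overlap_bp=200 but a shorter ungapped sequence of 150 bp,
# effective_threshold becomes min(200, 150) = 150, so a fully contained
# sequence can still satisfy the overlap requirement and its terminal gaps are
# reported in 'terminal_gap_columns' rather than as structural indels.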


def analyze_cluster_quality(
    msa_file: str,
    consensus_seq: str,
    max_reads: Optional[int] = None
) -> Optional[ClusterQualityData]:
    """
    Analyze cluster quality using core.py's analyze_positional_variation().

    Uses the canonical positional analysis from core.py to ensure consistent
    treatment of homopolymer length differences across the pipeline.

    Args:
        msa_file: Path to MSA FASTA file
        consensus_seq: Ungapped consensus sequence
        max_reads: Maximum reads to include (for downsampling large clusters)

    Returns:
        ClusterQualityData with position error rates and read identities, or None if failed
    """
    if not os.path.exists(msa_file):
        logging.debug(f"MSA file not found: {msa_file}")
        return None

    # Load MSA file content
    try:
        with open(msa_file, 'r') as f:
            msa_string = f.read()
    except Exception as e:
        logging.debug(f"Failed to read MSA file {msa_file}: {e}")
        return None

    # Extract alignments from MSA using core.py function with homopolymer normalization
    # This returns ReadAlignment objects with score_aligned field
    alignments, msa_consensus, msa_to_consensus_pos = extract_alignments_from_msa(
        msa_string,
        enable_homopolymer_normalization=True
    )

    if not alignments:
        logging.debug(f"No alignments found in MSA: {msa_file}")
        return None

    # Verify consensus matches: the passed-in consensus_seq may be trimmed (shorter) with IUPAC codes
    # The MSA consensus is untrimmed (longer) without IUPAC codes
    # Use edlib in HW mode to check if trimmed consensus is contained within MSA consensus
    if msa_consensus and msa_consensus != consensus_seq:
        # Use edlib HW mode (semi-global) to find consensus_seq within msa_consensus
        # This handles primer trimming (length difference) and IUPAC codes (via equivalencies)
        result = edlib.align(consensus_seq, msa_consensus, mode="HW", task="distance",
                             additionalEqualities=IUPAC_EQUIV)
        edit_distance = result["editDistance"]
        if edit_distance > 0:  # Any edits indicate a real mismatch
            logging.warning(f"Consensus mismatch in MSA file: {msa_file}")
            logging.warning(f" MSA length: {len(msa_consensus)}, consensus length: {len(consensus_seq)}, edit distance: {edit_distance}")

    # Use the passed-in consensus (with IUPAC codes) as authoritative for quality analysis
    # This reflects the actual output sequence
    consensus_length = len(consensus_seq)

    if consensus_length == 0:
        logging.debug(f"Empty consensus sequence: {msa_file}")
        return None

    # Get consensus aligned sequence by parsing MSA string
    msa_handle = StringIO(msa_string)
    records = list(SeqIO.parse(msa_handle, 'fasta'))
    consensus_aligned = None
    for record in records:
        if 'Consensus' in record.description or 'Consensus' in record.id:
            consensus_aligned = str(record.seq).upper()
            break

    if consensus_aligned is None:
        logging.debug(f"No consensus found in MSA: {msa_file}")
        return None

    # Downsample reads if needed
    if max_reads and len(alignments) > max_reads:
        # Sort by read identity (using normalized edit distance) and take worst reads first, then best
        # This gives us a representative sample showing the quality range
        read_identities_temp = []
        for alignment in alignments:
            # Use normalized edit distance for identity calculation
            identity = 1.0 - (alignment.normalized_edit_distance / consensus_length) if consensus_length > 0 else 0.0
            read_identities_temp.append((identity, alignment))

        # Sort by identity
        read_identities_temp.sort(key=lambda x: x[0])

        # Take worst half and best half
        n_worst = max_reads // 2
        n_best = max_reads - n_worst
        sampled = read_identities_temp[:n_worst] + read_identities_temp[-n_best:]

        alignments = [alignment for _, alignment in sampled]
        logging.debug(f"Downsampled {len(read_identities_temp)} reads to {len(alignments)} for analysis")

    # Use core.py's canonical positional analysis
    position_stats = analyze_positional_variation(alignments, consensus_aligned, msa_to_consensus_pos)

    # Extract position error rates and counts for consensus positions only (skip insertion columns)
    consensus_position_stats = [ps for ps in position_stats if ps.consensus_position is not None]
    # Sort by consensus position to ensure correct order
    consensus_position_stats.sort(key=lambda ps: ps.consensus_position)
    position_error_rates = [ps.error_rate for ps in consensus_position_stats]
    position_error_counts = [ps.error_count for ps in consensus_position_stats]

    # Calculate per-read identities from alignments
    read_identities = []
    for alignment in alignments:
        # Use normalized edit distance for identity calculation
        identity = 1.0 - (alignment.normalized_edit_distance / consensus_length) if consensus_length > 0 else 0.0
        read_identities.append(identity)

    return ClusterQualityData(
        consensus_seq=consensus_seq,
        position_error_rates=position_error_rates,
        position_error_counts=position_error_counts,
        read_identities=read_identities,
        position_stats=consensus_position_stats
    )
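
# Illustrative usage (not in the original module; the path and RiC value are
# made up): analyzing one cluster's debug MSA against its final consensus.
#
#   quality = analyze_cluster_quality("cluster_debug/specimen-c1-RiC42-msa.fasta",
#                                     consensus_seq, max_reads=500)
#   if quality is not None:
#       worst_read = min(quality.read_identities)
#       hot_spots = [i for i, r in enumerate(quality.position_error_rates) if r > 0.2]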