speconsense-0.7.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
speconsense/msa.py
ADDED
@@ -0,0 +1,813 @@
#!/usr/bin/env python3
"""
MSA (Multiple Sequence Alignment) Analysis Module for Speconsense.

This module contains functions and data structures for analyzing MSA output from SPOA,
including:
- Homopolymer-normalized error detection
- Positional variation analysis
- Variant position detection and phasing support
- IUPAC ambiguity code generation

These functions were extracted from core.py to improve code organization and testability.
"""

from collections import defaultdict
import logging
from typing import List, Set, Tuple, Optional, Dict, NamedTuple

import edlib
from adjusted_identity import score_alignment, AdjustmentParams
import numpy as np
from Bio import SeqIO


# IUPAC nucleotide ambiguity codes mapping
# Maps sets of nucleotides to their corresponding IUPAC code
IUPAC_CODES = {
    frozenset(['A']): 'A',
    frozenset(['C']): 'C',
    frozenset(['G']): 'G',
    frozenset(['T']): 'T',
    frozenset(['A', 'G']): 'R',
    frozenset(['C', 'T']): 'Y',
    frozenset(['G', 'C']): 'S',
    frozenset(['A', 'T']): 'W',
    frozenset(['G', 'T']): 'K',
    frozenset(['A', 'C']): 'M',
    frozenset(['C', 'G', 'T']): 'B',
    frozenset(['A', 'G', 'T']): 'D',
    frozenset(['A', 'C', 'T']): 'H',
    frozenset(['A', 'C', 'G']): 'V',
    frozenset(['A', 'C', 'G', 'T']): 'N',
}


class ErrorPosition(NamedTuple):
    """An error at a specific position in the MSA."""
    msa_position: int  # 0-indexed position in MSA alignment
    error_type: str  # 'sub', 'ins', or 'del'


class ReadAlignment(NamedTuple):
    """Alignment result for a single read against consensus."""
    read_id: str
    aligned_sequence: str  # Gapped sequence from MSA
    read_length: int

    # Raw metrics (count all differences including homopolymer length)
    edit_distance: int
    num_insertions: int
    num_deletions: int
    num_substitutions: int
    error_positions: List[ErrorPosition]  # Detailed error information

    # Homopolymer-normalized metrics (exclude homopolymer extensions)
    normalized_edit_distance: int  # Edit distance excluding homopolymer length differences
    normalized_error_positions: List[ErrorPosition]  # Only non-homopolymer errors
    score_aligned: str  # Scoring string from adjusted-identity ('|'=match, '='=homopolymer, ' '=error)


class PositionStats(NamedTuple):
    """Statistics for a single position in the MSA."""
    msa_position: int  # Position in MSA (0-indexed)
    consensus_position: Optional[int]  # Position in consensus (None for insertion columns)
    coverage: int
    error_count: int
    error_rate: float
    sub_count: int
    ins_count: int
    del_count: int
    consensus_nucleotide: str  # Base in consensus at this MSA position (or '-' for insertion)
    base_composition: Dict[str, int]  # Raw base counts: {A: 50, C: 3, G: 45, T: 2, '-': 0}
    homopolymer_composition: Dict[str, int]  # HP extension counts: {A: 5, G: 2} (base and count)


class MSAResult(NamedTuple):
    """Result from SPOA multiple sequence alignment.

    Attributes:
        consensus: Ungapped consensus sequence
        msa_string: Raw MSA in FASTA format (for file writing)
        alignments: Parsed read alignments with gapped sequences
        msa_to_consensus_pos: Mapping from MSA position to consensus position
    """
    consensus: str
    msa_string: str
    alignments: List[ReadAlignment]
    msa_to_consensus_pos: Dict[int, Optional[int]]


# ============================================================================
# MSA Analysis Functions
# ============================================================================

def parse_score_aligned_for_errors(
    score_aligned: str,
    read_aligned: str,
    consensus_aligned: str
) -> List[ErrorPosition]:
    """
    Parse score_aligned string to extract non-homopolymer errors.

    The score_aligned string from adjusted-identity uses these codes:
    - '|' : Exact match (not an error)
    - '=' : Ambiguous match or homopolymer extension (not counted as error)
    - ' ' : Substitution or indel (IS an error)
    - '.' : End-trimmed position (not counted)

    Args:
        score_aligned: Scoring string from adjusted-identity
        read_aligned: Aligned read sequence with gaps
        consensus_aligned: Aligned consensus sequence with gaps

    Returns:
        List of ErrorPosition for positions marked as errors (excluding homopolymer extensions)
    """
    normalized_errors = []

    for msa_pos, (score_char, read_base, cons_base) in enumerate(
        zip(score_aligned, read_aligned, consensus_aligned)
    ):
        # Skip matches and homopolymer extensions
        if score_char in ('|', '=', '.'):
            continue

        # This is a real error (substitution or indel) - classify it
        if read_base == '-' and cons_base != '-':
            error_type = 'del'
        elif read_base != '-' and cons_base == '-':
            error_type = 'ins'
        elif read_base != cons_base:
            error_type = 'sub'
        else:
            # Both are gaps or identical - should not happen if score_char indicates error
            continue

        normalized_errors.append(ErrorPosition(msa_pos, error_type))

    return normalized_errors

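A minimal usage sketch for the function above (not part of the packaged file; the aligned strings, score string, and expected result are invented for illustration, and the speconsense.msa import path is assumed from the file layout listed above):

    from speconsense.msa import ErrorPosition, parse_score_aligned_for_errors

    # Positions:          012345
    read_aligned      = "ACG-TA"
    consensus_aligned = "ACGATG"
    score_aligned     = "||| | "  # '|' = match, ' ' = error (positions 3 and 5)

    errors = parse_score_aligned_for_errors(score_aligned, read_aligned, consensus_aligned)
    # Position 3 is a gap in the read (deletion), position 5 is a base mismatch (substitution)
    assert errors == [ErrorPosition(3, 'del'), ErrorPosition(5, 'sub')]
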
def extract_alignments_from_msa(
    msa_string: str,
    enable_homopolymer_normalization: bool = True
) -> Tuple[List[ReadAlignment], str, Dict[int, Optional[int]]]:
    """
    Extract read alignments from an MSA string with optional homopolymer normalization.

    The MSA contains aligned sequences where the consensus has header containing "Consensus".
    This function compares each read to the consensus at each aligned position.

    Error classification (raw metrics):
    - Both '-': Not an error (read doesn't cover this position)
    - Read '-', consensus base: Deletion (missing base in read)
    - Read base, consensus '-': Insertion (extra base in read)
    - Different bases: Substitution
    - Same base: Match (not an error)

    When enable_homopolymer_normalization=True, also computes normalized metrics that
    exclude homopolymer length differences using adjusted-identity library.

    IMPORTANT: Errors are reported at MSA positions, not consensus positions.
    This avoids ambiguity when multiple insertion columns map to the same consensus position.

    Args:
        msa_string: MSA content in FASTA format
        enable_homopolymer_normalization: If True, compute homopolymer-normalized metrics

    Returns:
        Tuple of:
        - list of ReadAlignment objects (with both raw and normalized metrics)
        - consensus sequence without gaps
        - mapping from MSA position to consensus position (None for insertion columns)
    """
    from io import StringIO

    # Define adjustment parameters for homopolymer normalization
    # Only normalize homopolymers (single-base repeats), no other adjustments
    HOMOPOLYMER_ADJUSTMENT_PARAMS = AdjustmentParams(
        normalize_homopolymers=True,
        handle_iupac_overlap=False,
        normalize_indels=False,
        end_skip_distance=0,
        max_repeat_motif_length=1  # Single-base repeats only
    )

    # Parse MSA
    msa_handle = StringIO(msa_string)
    records = list(SeqIO.parse(msa_handle, 'fasta'))

    if not records:
        logging.warning("No sequences found in MSA string")
        return [], "", {}

    # Find consensus sequence
    consensus_record = None
    read_records = []

    for record in records:
        if 'Consensus' in record.description or 'Consensus' in record.id:
            consensus_record = record
        else:
            read_records.append(record)

    if consensus_record is None:
        logging.warning("No consensus sequence found in MSA string")
        return [], "", {}

    consensus_aligned = str(consensus_record.seq).upper()
    msa_length = len(consensus_aligned)

    # Build mapping from MSA position to consensus position (excluding gaps)
    # For insertion columns (consensus has '-'), maps to None
    msa_to_consensus_pos = {}
    consensus_pos = 0
    for msa_pos in range(msa_length):
        if consensus_aligned[msa_pos] != '-':
            msa_to_consensus_pos[msa_pos] = consensus_pos
            consensus_pos += 1
        else:
            # Insertion column - no consensus position
            msa_to_consensus_pos[msa_pos] = None

    # Get consensus without gaps for return value
    consensus_ungapped = consensus_aligned.replace('-', '')

    # Process each read
    alignments = []

    for read_record in read_records:
        read_aligned = str(read_record.seq).upper()

        if len(read_aligned) != msa_length:
            logging.warning(f"Read {read_record.id} length mismatch with MSA length")
            continue

        # Compare read to consensus at each position
        error_positions = []
        num_insertions = 0
        num_deletions = 0
        num_substitutions = 0

        for msa_pos in range(msa_length):
            read_base = read_aligned[msa_pos]
            cons_base = consensus_aligned[msa_pos]

            # Skip if both are gaps (read doesn't cover this position)
            if read_base == '-' and cons_base == '-':
                continue

            # Classify error type and record at MSA position
            if read_base == '-' and cons_base != '-':
                # Deletion (missing base in read)
                error_positions.append(ErrorPosition(msa_pos, 'del'))
                num_deletions += 1
            elif read_base != '-' and cons_base == '-':
                # Insertion (extra base in read)
                error_positions.append(ErrorPosition(msa_pos, 'ins'))
                num_insertions += 1
            elif read_base != cons_base:
                # Substitution (different bases)
                error_positions.append(ErrorPosition(msa_pos, 'sub'))
                num_substitutions += 1
            # else: match, no error

        # Calculate edit distance and read length
        edit_distance = num_insertions + num_deletions + num_substitutions
        read_length = len(read_aligned.replace('-', ''))  # Length without gaps

        # Compute homopolymer-normalized metrics if enabled
        if enable_homopolymer_normalization:
            try:
                # Use adjusted-identity to get homopolymer-normalized scoring
                # IMPORTANT: seq1=read, seq2=consensus. The score_aligned visualization
                # is asymmetric and shows HP extensions from seq1's (the READ's) perspective.
                # This is what we want since we're identifying which READ bases are extensions.
                result = score_alignment(
                    read_aligned,       # seq1 - the read
                    consensus_aligned,  # seq2 - the consensus
                    HOMOPOLYMER_ADJUSTMENT_PARAMS
                )

                # Parse score_aligned string to extract normalized errors
                normalized_error_positions = parse_score_aligned_for_errors(
                    result.score_aligned,
                    read_aligned,
                    consensus_aligned
                )

                normalized_edit_distance = result.mismatches
                score_aligned_str = result.score_aligned

            except Exception as e:
                # If normalization fails, fall back to raw metrics
                logging.warning(f"Homopolymer normalization failed for read {read_record.id}: {e}")
                normalized_edit_distance = edit_distance
                normalized_error_positions = error_positions
                score_aligned_str = ""
        else:
            # Homopolymer normalization disabled - use raw metrics
            normalized_edit_distance = edit_distance
            normalized_error_positions = error_positions
            score_aligned_str = ""

        # Create alignment object with both raw and normalized metrics
        alignment = ReadAlignment(
            read_id=read_record.id,
            aligned_sequence=read_aligned,  # Store gapped sequence
            read_length=read_length,
            # Raw metrics
            edit_distance=edit_distance,
            num_insertions=num_insertions,
            num_deletions=num_deletions,
            num_substitutions=num_substitutions,
            error_positions=error_positions,
            # Normalized metrics
            normalized_edit_distance=normalized_edit_distance,
            normalized_error_positions=normalized_error_positions,
            score_aligned=score_aligned_str
        )
        alignments.append(alignment)

    return alignments, consensus_ungapped, msa_to_consensus_pos

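A small usage sketch for the function above (not from the package; the FASTA content, read names, and expected values are invented, and homopolymer normalization is switched off to keep the example dependency-light):

    from speconsense.msa import extract_alignments_from_msa

    # Tiny SPOA-style gapped MSA: one "Consensus" record plus two reads of equal length.
    msa_string = ">Consensus\nACGTACGT\n>read1\nACGTACGT\n>read2\nACG-ACGT\n"

    alignments, consensus, msa_to_cons = extract_alignments_from_msa(
        msa_string, enable_homopolymer_normalization=False
    )
    # read2 has a gap at MSA position 3 where the consensus has a base -> one deletion
    assert consensus == "ACGTACGT"
    assert alignments[1].num_deletions == 1
    assert msa_to_cons[3] == 3  # no insertion columns, so the mapping is identity
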
def analyze_positional_variation(alignments: List[ReadAlignment], consensus_aligned: str,
                                 msa_to_consensus_pos: Dict[int, Optional[int]]) -> List[PositionStats]:
    """
    Analyze error rates at each position in the MSA with homopolymer tracking.

    Uses normalized error positions and base composition to identify true variants
    while tracking homopolymer length differences separately. For each position:
    - base_composition: Raw counts of each base observed
    - homopolymer_composition: Counts of bases that are homopolymer extensions (score_aligned='=')

    Downstream variant detection uses effective counts (raw - HP) to identify true
    biological variants while ignoring diversity due solely to homopolymer variation.

    IMPORTANT: All analysis is performed in MSA space (not consensus space).
    This correctly handles insertion columns where multiple MSA positions
    don't correspond to any consensus position.

    Args:
        alignments: List of read alignments (with normalized metrics)
        consensus_aligned: Consensus sequence (gapped, from MSA)
        msa_to_consensus_pos: Mapping from MSA position to consensus position

    Returns:
        List of PositionStats for each MSA position with normalized base composition
    """
    msa_length = len(consensus_aligned)

    # Build error frequency matrix in MSA space
    # For each MSA position: [sub_count, ins_count, del_count, total_coverage]
    error_matrix = np.zeros((msa_length, 4), dtype=int)

    # Build base composition matrix in MSA space (raw counts)
    base_composition_matrix = [
        {'A': 0, 'C': 0, 'G': 0, 'T': 0, '-': 0}
        for _ in range(msa_length)
    ]

    # Build homopolymer composition matrix in MSA space
    # Tracks bases that are homopolymer extensions (score_aligned='=')
    homopolymer_composition_matrix = [
        {'A': 0, 'C': 0, 'G': 0, 'T': 0, '-': 0}
        for _ in range(msa_length)
    ]

    # Process alignments to count errors at MSA positions
    for read_idx, alignment in enumerate(alignments):
        # Count this read as coverage for all MSA positions
        # Note: alignments span the full MSA
        for msa_pos in range(msa_length):
            error_matrix[msa_pos, 3] += 1  # coverage

        # Add errors at specific MSA positions using normalized errors
        # (excludes homopolymer extensions)
        for error_pos in alignment.normalized_error_positions:
            msa_pos = error_pos.msa_position
            if 0 <= msa_pos < msa_length:
                if error_pos.error_type == 'sub':
                    error_matrix[msa_pos, 0] += 1
                elif error_pos.error_type == 'ins':
                    error_matrix[msa_pos, 1] += 1
                elif error_pos.error_type == 'del':
                    error_matrix[msa_pos, 2] += 1

        # Extract base composition from aligned sequence with homopolymer normalization
        read_aligned = alignment.aligned_sequence
        if len(read_aligned) != msa_length:
            continue

        # Track what base each read has at each MSA position
        # Raw base composition plus separate tracking of homopolymer extensions
        for msa_pos in range(msa_length):
            read_base = read_aligned[msa_pos]

            # Track raw base composition
            if read_base in ['A', 'C', 'G', 'T', '-']:
                base_composition_matrix[msa_pos][read_base] += 1
            else:
                # Treat N or other ambiguous as gap
                base_composition_matrix[msa_pos]['-'] += 1

            # Additionally track if this is a homopolymer extension
            # NOTE: score_aligned is from the READ's perspective (seq1), which is what we want
            # since we're asking whether this particular READ base is an HP extension
            if alignment.score_aligned and msa_pos < len(alignment.score_aligned):
                if alignment.score_aligned[msa_pos] == '=':
                    # Homopolymer extension - track separately
                    if read_base in ['A', 'C', 'G', 'T', '-']:
                        homopolymer_composition_matrix[msa_pos][read_base] += 1
                    else:
                        homopolymer_composition_matrix[msa_pos]['-'] += 1

    # Calculate statistics for each MSA position
    position_stats = []

    for msa_pos in range(msa_length):
        sub_count = error_matrix[msa_pos, 0]
        ins_count = error_matrix[msa_pos, 1]
        del_count = error_matrix[msa_pos, 2]
        coverage = error_matrix[msa_pos, 3]

        # Total error events
        error_count = sub_count + ins_count + del_count
        error_rate = error_count / coverage if coverage > 0 else 0.0

        # Get consensus position (None for insertion columns)
        cons_pos = msa_to_consensus_pos[msa_pos]

        # Get consensus nucleotide at this MSA position
        cons_nucleotide = consensus_aligned[msa_pos]

        # Get base composition for this MSA position (raw counts)
        base_comp = base_composition_matrix[msa_pos].copy()

        # Get homopolymer extension composition for this MSA position
        hp_comp = homopolymer_composition_matrix[msa_pos].copy()

        position_stats.append(PositionStats(
            msa_position=msa_pos,
            consensus_position=cons_pos,
            coverage=coverage,
            error_count=error_count,
            error_rate=error_rate,
            sub_count=sub_count,
            ins_count=ins_count,
            del_count=del_count,
            consensus_nucleotide=cons_nucleotide,
            base_composition=base_comp,
            homopolymer_composition=hp_comp
        ))

    return position_stats

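A self-contained sketch of the per-column statistics (toy data, same invented MSA as the previous sketch; since the consensus row has no gaps, the gapped and ungapped consensus are identical here):

    from speconsense.msa import analyze_positional_variation, extract_alignments_from_msa

    msa_string = ">Consensus\nACGTACGT\n>read1\nACGTACGT\n>read2\nACG-ACGT\n"
    alignments, consensus, msa_to_cons = extract_alignments_from_msa(
        msa_string, enable_homopolymer_normalization=False
    )

    # Statistics are computed in MSA space, on the gapped consensus string.
    position_stats = analyze_positional_variation(alignments, "ACGTACGT", msa_to_cons)
    col3 = position_stats[3]
    assert col3.coverage == 2 and col3.del_count == 1
    assert col3.base_composition == {'A': 0, 'C': 0, 'G': 0, 'T': 1, '-': 1}
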
def is_variant_position_with_composition(
    position_stats: PositionStats,
    min_variant_frequency: float = 0.10,
    min_variant_count: int = 5
) -> Tuple[bool, List[str], str]:
    """
    Identify variant positions using simple frequency and count thresholds.

    This function determines if a position shows systematic variation (true biological
    variant) rather than scattered sequencing errors. Homopolymer extensions are
    excluded from consideration - diversity due solely to homopolymer length variation
    is not considered a true variant.

    Criteria for variant detection:
    1. At least one alternative allele must have frequency >= min_variant_frequency
    2. That allele must have count >= min_variant_count
    3. Counts are adjusted by subtracting homopolymer extension counts

    Args:
        position_stats: Position statistics including base composition
        min_variant_frequency: Minimum alternative allele frequency (default: 0.10 for 10%)
        min_variant_count: Minimum alternative allele read count (default: 5 reads)

    Returns:
        Tuple of (is_variant, variant_bases, reason)
        - is_variant: True if this position requires cluster separation
        - variant_bases: List of alternative bases meeting criteria (e.g., ['G', 'T'])
        - reason: Explanation of decision for logging/debugging
    """
    n = position_stats.coverage
    base_composition = position_stats.base_composition
    hp_composition = position_stats.homopolymer_composition

    # Check we have composition data
    if not base_composition or sum(base_composition.values()) == 0:
        return False, [], "No composition data available"

    # Calculate effective counts by subtracting homopolymer extensions
    # This excludes diversity that's purely due to HP length variation
    effective_composition = {}
    for base in ['A', 'C', 'G', 'T', '-']:
        raw_count = base_composition.get(base, 0)
        hp_count = hp_composition.get(base, 0) if hp_composition else 0
        effective_count = raw_count - hp_count
        if effective_count > 0:
            effective_composition[base] = effective_count

    # Check we have effective composition data after HP adjustment
    if not effective_composition or sum(effective_composition.values()) == 0:
        return False, [], "No composition data after HP adjustment"

    effective_total = sum(effective_composition.values())

    sorted_bases = sorted(
        effective_composition.items(),
        key=lambda x: x[1],
        reverse=True
    )

    if len(sorted_bases) < 2:
        return False, [], "No alternative alleles observed (after HP adjustment)"

    # Check each alternative allele (skip consensus base at index 0)
    variant_bases = []
    variant_details = []

    for base, count in sorted_bases[1:]:
        freq = count / effective_total if effective_total > 0 else 0

        # Must meet both frequency and count thresholds
        if freq >= min_variant_frequency and count >= min_variant_count:
            variant_bases.append(base)
            variant_details.append(f"{base}:{count}/{effective_total}({freq:.1%})")

    if variant_bases:
        return True, variant_bases, f"Variant alleles: {', '.join(variant_details)}"

    # Debug: Check if this would be a variant WITHOUT HP normalization
    # This helps identify cases where HP adjustment incorrectly eliminates variants
    raw_total = sum(base_composition.get(b, 0) for b in ['A', 'C', 'G', 'T', '-'])
    raw_sorted = sorted(
        [(b, base_composition.get(b, 0)) for b in ['A', 'C', 'G', 'T', '-'] if base_composition.get(b, 0) > 0],
        key=lambda x: x[1],
        reverse=True
    )
    if len(raw_sorted) >= 2:
        for base, count in raw_sorted[1:]:
            freq = count / raw_total if raw_total > 0 else 0
            if freq >= min_variant_frequency and count >= min_variant_count:
                # Would be variant without HP normalization!
                logging.debug(
                    f"HP normalization eliminated variant at MSA pos {position_stats.msa_position}: "
                    f"raw {base}:{count}/{raw_total}({freq:.1%}) meets threshold, "
                    f"but effective composition={effective_composition}, "
                    f"raw={base_composition}, hp={hp_composition}"
                )
                break

    return False, [], "No variants detected (after HP adjustment)"

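A worked sketch of the effective-count arithmetic used above (all values invented): raw counts minus homopolymer-extension counts must still clear both the frequency and count thresholds for a position to be flagged.

    from speconsense.msa import PositionStats, is_variant_position_with_composition

    stats = PositionStats(
        msa_position=120, consensus_position=117, coverage=100,
        error_count=40, error_rate=0.40, sub_count=40, ins_count=0, del_count=0,
        consensus_nucleotide='A',
        base_composition={'A': 60, 'C': 0, 'G': 40, 'T': 0, '-': 0},
        homopolymer_composition={'A': 0, 'C': 0, 'G': 0, 'T': 0, '-': 0},
    )

    is_variant, variant_bases, reason = is_variant_position_with_composition(
        stats, min_variant_frequency=0.10, min_variant_count=5
    )
    # Effective counts are A:60, G:40; G has frequency 0.40 and count 40,
    # so both thresholds are met and the position is reported as a variant.
    assert is_variant and variant_bases == ['G']
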
def call_iupac_ambiguities(
    consensus: str,
    alignments: List['ReadAlignment'],
    msa_to_consensus_pos: Dict[int, Optional[int]],
    min_variant_frequency: float = 0.10,
    min_variant_count: int = 5
) -> Tuple[str, int, List[Dict]]:
    """
    Replace consensus bases at variant positions with IUPAC ambiguity codes.

    Analyzes positional variation in the MSA and identifies positions where
    significant variation remains after phasing. At these positions, the
    consensus base is replaced with the appropriate IUPAC code representing
    all variant alleles that meet the threshold criteria.

    Uses the same thresholds as phasing to ensure consistency. Homopolymer
    length variation is excluded (only true nucleotide variants are considered).

    Args:
        consensus: Ungapped consensus sequence from SPOA
        alignments: List of ReadAlignment objects from MSA
        msa_to_consensus_pos: Mapping from MSA position to consensus position
        min_variant_frequency: Minimum alternative allele frequency (default: 0.10)
        min_variant_count: Minimum alternative allele read count (default: 5)

    Returns:
        Tuple of:
        - Modified consensus sequence with IUPAC codes at variant positions
        - Count of IUPAC positions introduced
        - List of dicts with details about each IUPAC position:
          {
              'consensus_position': int,
              'original_base': str,
              'iupac_code': str,
              'variant_bases': List[str],
              'base_composition': Dict[str, int]
          }
    """
    if not consensus or not alignments:
        return consensus, 0, []

    # Reconstruct consensus_aligned from consensus and msa_to_consensus_pos
    # (same pattern as detect_variant_positions)
    msa_length = max(msa_to_consensus_pos.keys()) + 1 if msa_to_consensus_pos else 0
    if msa_length == 0:
        return consensus, 0, []

    consensus_aligned = []
    for msa_pos in range(msa_length):
        cons_pos = msa_to_consensus_pos.get(msa_pos)
        if cons_pos is not None and cons_pos < len(consensus):
            consensus_aligned.append(consensus[cons_pos])
        else:
            consensus_aligned.append('-')
    consensus_aligned_str = ''.join(consensus_aligned)

    # Analyze positional variation
    position_stats = analyze_positional_variation(alignments, consensus_aligned_str, msa_to_consensus_pos)

    # Build list of positions to replace
    iupac_positions = []

    for pos_stat in position_stats:
        # Skip insertion columns (no consensus position)
        if pos_stat.consensus_position is None:
            continue

        # Check if this position has significant variation
        is_variant, variant_bases, reason = is_variant_position_with_composition(
            pos_stat, min_variant_frequency, min_variant_count
        )

        if not is_variant:
            continue

        # Filter out gaps from variant bases (we can only represent nucleotide ambiguities)
        nucleotide_variants = [b for b in variant_bases if b in 'ACGT']

        if not nucleotide_variants:
            # Only gaps met the threshold - skip this position
            continue

        # Get the consensus base at this position
        cons_pos = pos_stat.consensus_position
        consensus_base = consensus[cons_pos] if cons_pos < len(consensus) else None

        if consensus_base is None or consensus_base not in 'ACGT':
            continue

        # Build set of all significant bases (consensus + variants)
        all_bases = set(nucleotide_variants)
        all_bases.add(consensus_base)

        # Look up IUPAC code
        iupac_code = IUPAC_CODES.get(frozenset(all_bases), 'N')

        # Only record if we actually need an ambiguity code (more than one base)
        if len(all_bases) > 1:
            iupac_positions.append({
                'consensus_position': cons_pos,
                'original_base': consensus_base,
                'iupac_code': iupac_code,
                'variant_bases': nucleotide_variants,
                'base_composition': pos_stat.base_composition
            })

    if not iupac_positions:
        return consensus, 0, []

    # Build modified consensus
    consensus_list = list(consensus)
    for pos_info in iupac_positions:
        cons_pos = pos_info['consensus_position']
        consensus_list[cons_pos] = pos_info['iupac_code']

    modified_consensus = ''.join(consensus_list)

    return modified_consensus, len(iupac_positions), iupac_positions

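A small sketch of the IUPAC lookup the function above relies on (import path assumed from the file layout; values come directly from the IUPAC_CODES table defined earlier in this module): the consensus base plus the qualifying variant bases are combined into a frozenset and mapped to a single ambiguity code.

    from speconsense.msa import IUPAC_CODES

    # Consensus 'A' plus a qualifying 'G' variant is written as 'R'.
    assert IUPAC_CODES[frozenset({'A', 'G'})] == 'R'
    # Three-way variation, e.g. consensus 'C' with 'G' and 'T' variants, becomes 'B'.
    assert IUPAC_CODES[frozenset({'C', 'G', 'T'})] == 'B'
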
def calculate_within_cluster_error(
    haplotype_groups: Dict[str, Set[str]],
    read_alleles: Dict[str, Dict[int, str]],
    phasing_positions: Set[int],
    all_variant_positions: Set[int]
) -> float:
    """Calculate within-cluster error for a given haplotype grouping.

    Measures the average variation at ALL variant positions within each haplotype,
    including positions used for phasing. This ensures fair comparison across
    different candidate position sets and captures heterogeneity introduced by
    reassignment of non-qualifying haplotypes.

    Lower error indicates more homogeneous clusters.

    Args:
        haplotype_groups: Dict mapping allele_combo -> set of read_ids
        read_alleles: Dict mapping read_id -> {msa_position -> allele}
        phasing_positions: Set of MSA positions used for phasing (kept for API compatibility)
        all_variant_positions: Set of all variant MSA positions (error measured at all of these)

    Returns:
        Weighted average error rate across haplotypes (0.0 = perfect, 1.0 = maximum error)
    """
    # Measure error at ALL variant positions, not just non-phased ones.
    # This ensures fair comparison across candidate position sets and captures
    # heterogeneity introduced by reassignment at phasing positions.
    measured_positions = all_variant_positions

    if not measured_positions or not haplotype_groups:
        return 0.0

    total_weighted_error = 0.0
    total_reads = 0

    for combo, read_ids in haplotype_groups.items():
        if not read_ids:
            continue

        haplotype_error = 0.0
        positions_counted = 0

        for pos in measured_positions:
            # Count alleles at this position for reads in this haplotype
            allele_counts = defaultdict(int)
            for read_id in read_ids:
                allele = read_alleles.get(read_id, {}).get(pos, '-')
                allele_counts[allele] += 1

            if not allele_counts:
                continue

            # Find consensus (most common) allele
            total_at_pos = sum(allele_counts.values())
            max_count = max(allele_counts.values())

            # Error rate = fraction of reads NOT matching consensus
            error_at_pos = (total_at_pos - max_count) / total_at_pos
            haplotype_error += error_at_pos
            positions_counted += 1

        # Average error across all variant positions for this haplotype
        if positions_counted > 0:
            mean_haplotype_error = haplotype_error / positions_counted
            total_weighted_error += mean_haplotype_error * len(read_ids)
            total_reads += len(read_ids)

    if total_reads == 0:
        return 0.0

    return total_weighted_error / total_reads

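A worked sketch of the weighted error above (read IDs, positions, and alleles invented): each haplotype's error is the average, over measured positions, of the fraction of its reads not matching the haplotype's majority allele, and haplotypes are then weighted by read count.

    from speconsense.msa import calculate_within_cluster_error

    read_alleles = {
        'r1': {10: 'A'}, 'r2': {10: 'A'}, 'r3': {10: 'G'},  # one dissenting read
        'r4': {10: 'C'}, 'r5': {10: 'C'},                   # perfectly homogeneous
    }
    haplotype_groups = {'A': {'r1', 'r2', 'r3'}, 'C': {'r4', 'r5'}}

    error = calculate_within_cluster_error(
        haplotype_groups, read_alleles,
        phasing_positions={10}, all_variant_positions={10},
    )
    # Haplotype 'A': 1 of 3 reads off-consensus -> 1/3; haplotype 'C': 0/2.
    # Weighted by read count: (1/3 * 3 + 0 * 2) / 5 = 0.2
    assert abs(error - 0.2) < 1e-9
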
def filter_qualifying_haplotypes(
    combo_to_reads: Dict[str, Set[str]],
    total_reads: int,
    min_count: int,
    min_frequency: float
) -> Tuple[Dict[str, Set[str]], Dict[str, Set[str]]]:
    """Filter haplotypes to those meeting count and frequency thresholds.

    Args:
        combo_to_reads: Dict mapping allele_combo -> set of read_ids
        total_reads: Total number of reads for frequency calculation
        min_count: Minimum read count threshold
        min_frequency: Minimum frequency threshold (0.0 to 1.0)

    Returns:
        Tuple of (qualifying_combos, non_qualifying_combos)
    """
    qualifying = {}
    non_qualifying = {}
    for combo, reads in combo_to_reads.items():
        count = len(reads)
        freq = count / total_reads if total_reads > 0 else 0
        if count >= min_count and freq >= min_frequency:
            qualifying[combo] = reads
        else:
            non_qualifying[combo] = reads
    return qualifying, non_qualifying


def group_reads_by_single_position(
    read_alleles: Dict[str, Dict[int, str]],
    position: int,
    read_ids: Set[str]
) -> Dict[str, Set[str]]:
    """Group a subset of reads by their allele at a single position.

    Args:
        read_alleles: Dict mapping read_id -> {msa_position -> allele}
        position: MSA position to group by
        read_ids: Subset of read IDs to consider

    Returns:
        Dict mapping allele -> set of read_ids
    """
    allele_to_reads = defaultdict(set)
    for read_id in read_ids:
        allele = read_alleles.get(read_id, {}).get(position, '-')
        allele_to_reads[allele].add(read_id)
    return dict(allele_to_reads)
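Finally, a short sketch of the two grouping helpers above (toy data; read IDs, the position, and thresholds are invented):

    from speconsense.msa import filter_qualifying_haplotypes, group_reads_by_single_position

    read_alleles = {'r1': {42: 'A'}, 'r2': {42: 'A'}, 'r3': {42: 'T'}, 'r4': {}}
    groups = group_reads_by_single_position(read_alleles, 42, {'r1', 'r2', 'r3', 'r4'})
    # r4 has no call at position 42 and falls into the '-' group
    assert groups == {'A': {'r1', 'r2'}, 'T': {'r3'}, '-': {'r4'}}

    qualifying, non_qualifying = filter_qualifying_haplotypes(
        groups, total_reads=4, min_count=2, min_frequency=0.25
    )
    # Only the 'A' group has at least 2 reads and at least 25% of the total
    assert set(qualifying) == {'A'} and set(non_qualifying) == {'T', '-'}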