speconsense 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""IUPAC ambiguity code handling and distance calculations.
|
|
2
|
+
|
|
3
|
+
Provides utilities for working with IUPAC nucleotide codes and calculating
|
|
4
|
+
adjusted identity distances between sequences with homopolymer normalization.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
import edlib
|
|
10
|
+
from adjusted_identity import score_alignment, AdjustmentParams, align_and_score
|
|
11
|
+
|
|
12
|
+
from speconsense.msa import IUPAC_CODES
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Extra symbol equalities passed to edlib through ``additionalEqualities``.
# Every pair lets an IUPAC ambiguity code align as a match against one of the
# plain bases it stands for.  The pairs are generated from a compact
# code -> bases table; the resulting list keeps the order code-by-code,
# base-by-base.
IUPAC_EQUIV = [
    (code, base)
    for code, bases in (
        ("Y", "CT"),
        ("R", "AG"),
        ("N", "ACGT"),
        ("W", "AT"),
        ("M", "AC"),
        ("S", "CG"),
        ("K", "GT"),
        ("B", "CGT"),
        ("D", "AGT"),
        ("H", "ACT"),
        ("V", "ACG"),
    )
    for base in bases
]
|
|
25
|
+
|
|
26
|
+
# One shared scoring configuration so that every sequence comparison in this
# module is scored the same way; it is handed to both score_alignment() and
# align_and_score() further down.
STANDARD_ADJUSTMENT_PARAMS = AdjustmentParams(
    # Homopolymer normalization: enabled.
    normalize_homopolymers=True,
    # IUPAC overlap handling: disabled, i.e. standard IUPAC semantics (Y != M).
    handle_iupac_overlap=False,
    # Indel normalization: disabled.
    normalize_indels=False,
    # No end trimming - sequences have to match end-to-end.
    end_skip_distance=0,
    # Repeat motifs limited to a single base (homopolymers only).
    max_repeat_motif_length=1,
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def primers_are_same(p1: Optional[List[str]], p2: Optional[List[str]]) -> bool:
    """Decide whether two primer annotations describe the same amplicon.

    The answer controls whether overlap-aware merging may be used for a pair
    of sequences.  Identical primers imply the same expected amplicon length,
    so a length difference then points to a chimera rather than to primer
    pool variation.

    Args:
        p1: Primer annotation of the first sequence (e.g., ['ITS1', 'ITS4']).
        p2: Primer annotation of the second sequence.

    Returns:
        True when the primers are the same *or* unknown - callers should use
        the global distance and skip overlap merging.  "Unknown" means either
        annotation is None or empty; this is the conservative choice.
        False when both annotations are present and their primer sets differ
        - overlap-aware distance is allowed.
    """
    if p1 and p2:
        # Both annotations are available: compare as sets, so neither order
        # nor duplicates matter.
        return set(p1) == set(p2)

    # At least one annotation is missing -> conservatively assume "same".
    return True
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def bases_match_with_iupac(base1: str, base2: str) -> bool:
    """Report whether two alignment symbols are compatible under IUPAC rules.

    Identical symbols always match.  A gap ('-') matches nothing except
    another gap.  Any other pair matches when the nucleotide sets returned by
    expand_iupac_code() for the two symbols share at least one member.

    Args:
        base1: First symbol (base, IUPAC ambiguity code, or '-').
        base2: Second symbol (base, IUPAC ambiguity code, or '-').

    Returns:
        True if the symbols are compatible, False otherwise.
    """
    if base1 == base2:
        return True

    # The symbols differ here, so a gap on either side cannot be a match.
    if '-' in (base1, base2):
        return False

    # Compatible when the two expansions are not disjoint.
    first = expand_iupac_code(base1)
    second = expand_iupac_code(base2)
    return not first.isdisjoint(second)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# IUPAC code -> nucleotides it represents.  Built once at import time rather
# than on every expand_iupac_code() call; the values are frozensets so the
# shared table can never be modified through a value handed out to a caller.
_IUPAC_EXPANSION = {
    'A': frozenset('A'),
    'C': frozenset('C'),
    'G': frozenset('G'),
    'T': frozenset('T'),
    'R': frozenset('AG'),
    'Y': frozenset('CT'),
    'S': frozenset('GC'),
    'W': frozenset('AT'),
    'K': frozenset('GT'),
    'M': frozenset('AC'),
    'B': frozenset('CGT'),
    'D': frozenset('AGT'),
    'H': frozenset('ACT'),
    'V': frozenset('ACG'),
    'N': frozenset('ACGT'),
}


def expand_iupac_code(base: str) -> set:
    """Expand an IUPAC code to the nucleotides it stands for.

    The lookup is case-insensitive.

    Args:
        base: A nucleotide or IUPAC ambiguity code.

    Returns:
        A new, mutable set of the nucleotides represented by ``base``.
        Symbols that are not in the table (including the gap character '-')
        yield ``{'N'}`` - the literal symbol 'N', not the four-base expansion
        of N.
    """
    expansion = _IUPAC_EXPANSION.get(base.upper())
    if expansion is None:
        # Unknown symbol: keep the historical fallback value.
        return {'N'}

    # Hand out a copy so callers may freely mutate the result.
    return set(expansion)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def merge_bases_to_iupac(bases: set) -> str:
    """Collapse a collection of symbols into one IUPAC code.

    Every symbol (plain base or IUPAC code) is first expanded with
    expand_iupac_code(); the union of those expansions is then looked up in
    IUPAC_CODES, which is queried with a frozenset of nucleotides.

    Examples:
        {'C', 'Y'} -> 'Y'  (Y=CT, so C+Y = CT = Y)
        {'A', 'R'} -> 'R'  (R=AG, so A+R = AG = R)
        {'C', 'R'} -> 'V'  (R=AG, so C+R = ACG = V)

    Args:
        bases: Symbols to merge.

    Returns:
        The IUPAC code registered for the combined nucleotide set, or 'N'
        when IUPAC_CODES has no entry for it.
    """
    # Union of all expansions in a single expression (empty input -> empty set).
    pooled = set().union(*(expand_iupac_code(symbol) for symbol in bases))
    return IUPAC_CODES.get(frozenset(pooled), 'N')
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def create_variant_summary(primary_seq: str, variant_seq: str) -> str:
    """Describe how a variant sequence differs from the primary sequence.

    The two sequences are aligned with edlib (using IUPAC_EQUIV as additional
    equalities) and the aligned columns are scanned once.  Columns whose
    symbols are incompatible according to bases_match_with_iupac() are
    tallied as substitutions or as indels; a run of consecutive gap-containing
    columns counts as a single indel and is classified by its length
    (1 nt, 2-3 nt, 4+ nt).

    Args:
        primary_seq: Reference sequence.
        variant_seq: Sequence compared against the reference.

    Returns:
        A summary such as
        "3 substitutions, 1 single-nt indel, 1 short (<= 3nt) indel, 2 long indels",
        or a fixed message when the inputs are empty or identical, when the
        alignment fails, or when no differences remain after IUPAC matching.
        Any exception raised while comparing is reported as
        "comparison failed: <message>" instead of propagating.
    """
    if not (primary_seq and variant_seq):
        return "sequences empty - cannot compare"

    if primary_seq == variant_seq:
        return "identical sequences"

    try:
        # IUPAC-aware alignment with the full path requested.
        aln = edlib.align(primary_seq, variant_seq, task="path",
                          additionalEqualities=IUPAC_EQUIV)
        if aln["editDistance"] == -1:
            return "alignment failed"

        # Gapped, column-by-column representation of the alignment.
        nice = edlib.getNiceAlignment(aln, primary_seq, variant_seq)
        if not (nice and nice.get('query_aligned') and nice.get('target_aligned')):
            return f"alignment parsing failed - edit distance {aln['editDistance']}"

        query_row = nice['query_aligned']
        target_row = nice['target_aligned']
        n_cols = len(query_row)

        n_subs = 0
        n_single = 0   # indels of exactly 1 nt
        n_short = 0    # indels of 2-3 nt
        n_long = 0     # indels of 4+ nt

        col = 0
        while col < n_cols:
            q_sym = query_row[col]
            t_sym = target_row[col]

            # Compatible column: nothing to record.
            if bases_match_with_iupac(q_sym, t_sym):
                col += 1
                continue

            # Incompatible and gap-free: a substitution.
            if q_sym != '-' and t_sym != '-':
                n_subs += 1
                col += 1
                continue

            # Start of an indel: extend over all following gap columns.
            run_end = col + 1
            while run_end < n_cols and (query_row[run_end] == '-' or target_row[run_end] == '-'):
                run_end += 1

            run_len = run_end - col
            if run_len == 1:
                n_single += 1
            elif run_len <= 3:
                n_short += 1
            else:
                n_long += 1

            # Resume scanning right after the indel.
            col = run_end

        # Assemble the summary, skipping empty categories and pluralizing.
        tallies = (
            (n_subs, "substitution"),
            (n_single, "single-nt indel"),
            (n_short, "short (<= 3nt) indel"),
            (n_long, "long indel"),
        )
        parts = [
            f"{count} {label}{'s' if count != 1 else ''}"
            for count, label in tallies
            if count > 0
        ]

        if not parts:
            return "identical sequences (IUPAC-compatible)"

        return ", ".join(parts)

    except Exception as e:
        # Best-effort helper: report the failure in the summary text.
        return f"comparison failed: {str(e)}"
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def calculate_adjusted_identity_distance(seq1: str, seq2: str) -> float:
    """Return the adjusted-identity distance between two sequences.

    The sequences are aligned with edlib (IUPAC_EQUIV supplied as additional
    equalities) and the gapped alignment is scored by score_alignment() with
    STANDARD_ADJUSTMENT_PARAMS.

    Args:
        seq1: First sequence.
        seq2: Second sequence.

    Returns:
        ``1.0 - identity`` of the scored alignment.  Shortcuts: 1.0 when
        either sequence is empty or when no usable alignment is obtained,
        0.0 when the two strings are equal.
    """
    if not (seq1 and seq2):
        return 1.0  # maximum distance

    if seq1 == seq2:
        return 0.0

    # IUPAC-aware alignment with the full path requested.
    aln = edlib.align(seq1, seq2, task="path", additionalEqualities=IUPAC_EQUIV)
    if aln["editDistance"] == -1:
        return 1.0

    # The scorer works on the gapped query/target strings.
    nice = edlib.getNiceAlignment(aln, seq1, seq2)
    if not (nice and nice.get('query_aligned') and nice.get('target_aligned')):
        return 1.0

    scored = score_alignment(
        nice['query_aligned'],
        nice['target_aligned'],
        adjustment_params=STANDARD_ADJUSTMENT_PARAMS,
    )

    # Identity -> distance.
    return 1.0 - scored.identity
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def calculate_overlap_aware_distance(seq1: str, seq2: str, min_overlap_bp: int) -> float:
    """Calculate a distance that tolerates partial overlap between sequences.

    The pair is scored with align_and_score() using
    STANDARD_ADJUSTMENT_PARAMS.  If the overlap derived from the reported
    coverages is large enough, the distance of that alignment is returned;
    otherwise the function falls back to the global distance.

    For containment cases where one sequence is shorter than
    ``min_overlap_bp``, the shorter sequence length is used as the effective
    threshold.

    Args:
        seq1, seq2: DNA sequences (may have different lengths).
        min_overlap_bp: Minimum overlap required, in base pairs.

    Returns:
        1.0 if either sequence is empty, 0.0 if the strings are equal.
        Otherwise ``1.0 - identity`` of the align_and_score() result when the
        overlap reaches the effective threshold, else the value of
        calculate_adjusted_identity_distance(seq1, seq2).

    Note: This function calculates distance purely from sequence content.
    Primer-based filtering (to prevent chimera grouping) is expected to be
    applied by the caller, using primers_are_same().
    """
    if not seq1 or not seq2:
        return 1.0  # Maximum distance

    if seq1 == seq2:
        return 0.0

    result = align_and_score(seq1, seq2, STANDARD_ADJUSTMENT_PARAMS)

    len1, len2 = len(seq1), len(seq2)
    shorter_len = min(len1, len2)

    # Overlap in bp: the smaller of the two covered lengths
    # (coverage fraction * sequence length).
    covered_bp = min(result.seq1_coverage * len1, result.seq2_coverage * len2)

    # coverage * length is a float product, so a value that should be an exact
    # base count can land a hair below it (e.g. 1/49*49 == 0.9999999999999999).
    # Plain int() would then under-count by one base and reject a pair that
    # sits exactly on the threshold.  The tolerance is far smaller than one
    # base, so genuinely fractional values still truncate as before.
    overlap_bp = int(covered_bp + 1e-9)

    # Effective threshold: for containment cases, allow the merge if the
    # short sequence is fully covered.
    effective_threshold = min(min_overlap_bp, shorter_len)

    if overlap_bp >= effective_threshold:
        # Sufficient overlap - use the overlap identity for the distance.
        return 1.0 - result.identity

    # Insufficient overlap - fall back to the global distance.
    return calculate_adjusted_identity_distance(seq1, seq2)
|