speconsense 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,294 @@
1
+ """IUPAC ambiguity code handling and distance calculations.
2
+
3
+ Provides utilities for working with IUPAC nucleotide codes and calculating
4
+ adjusted identity distances between sequences with homopolymer normalization.
5
+ """
6
+
7
+ from typing import List, Optional
8
+
9
+ import edlib
10
+ from adjusted_identity import score_alignment, AdjustmentParams, align_and_score
11
+
12
+ from speconsense.msa import IUPAC_CODES
13
+
14
+
15
# Extra character equalities passed to edlib (``additionalEqualities``) so that
# an IUPAC ambiguity code aligns as a match against each plain base it covers.
# Each entry below is (ambiguity code, bases it stands for); the comprehension
# flattens the table into (code, base) pairs.
IUPAC_EQUIV = [
    (code, base)
    for code, bases in (
        ("Y", "CT"),
        ("R", "AG"),
        ("N", "ACGT"),
        ("W", "AT"),
        ("M", "AC"),
        ("S", "CG"),
        ("K", "GT"),
        ("B", "CGT"),
        ("D", "AGT"),
        ("H", "ACT"),
        ("V", "ACG"),
    )
    for base in bases
]
25
+
26
# Shared scoring configuration so every sequence comparison in this module is
# scored the same way. Passed to both score_alignment() and align_and_score().
STANDARD_ADJUSTMENT_PARAMS = AdjustmentParams(
    # Homopolymer normalization is switched on.
    normalize_homopolymers=True,
    # IUPAC overlap handling is off: standard IUPAC semantics apply (Y != M).
    handle_iupac_overlap=False,
    # Indel normalization is off.
    normalize_indels=False,
    # No end trimming - sequences must match end-to-end.
    end_skip_distance=0,
    # Repeat motifs limited to a single base (homopolymers only).
    max_repeat_motif_length=1,
)
35
+
36
+
37
def primers_are_same(p1: Optional[List[str]], p2: Optional[List[str]]) -> bool:
    """Decide whether two primer annotations describe the same amplicon.

    The answer controls whether overlap-aware merging may be used. With
    matching primers the amplicon length should be the same, so a length
    difference points to a chimera rather than primer-pool variation.

    Args:
        p1: Primer annotation of the first sequence (e.g., ['ITS1', 'ITS4']).
        p2: Primer annotation of the second sequence.

    Returns:
        True (use global distance, no overlap merging) when either
        annotation is None or empty - unknown is conservatively treated as
        "same" - or when both contain the same set of primers, regardless
        of order. False (overlap-aware distance allowed) when the primer
        sets differ.
    """
    if p1 and p2:
        # Both annotations are present: compare them as unordered sets.
        return set(p1) == set(p2)

    # At least one annotation is missing -> conservative default.
    return True
61
+
62
+
63
def bases_match_with_iupac(base1: str, base2: str) -> bool:
    """
    Report whether two alignment symbols are compatible under IUPAC rules.

    Identical symbols always match. A gap ('-') matches only another gap.
    Otherwise both symbols are expanded with expand_iupac_code() and they
    match when the two expansions share at least one nucleotide.
    """
    # Fast path: identical symbols (this also covers gap vs. gap).
    if base1 == base2:
        return True

    # A gap paired with anything that is not a gap is never a match.
    if '-' in (base1, base2):
        return False

    # Compatible when the expanded nucleotide sets are not disjoint.
    nucleotides1 = expand_iupac_code(base1)
    nucleotides2 = expand_iupac_code(base2)
    return not nucleotides1.isdisjoint(nucleotides2)
81
+
82
+
83
# IUPAC nucleotide code -> plain bases it represents. Built once at import
# time instead of on every call, because expand_iupac_code() is invoked for
# each differing alignment column.
_IUPAC_EXPANSION = {
    'A': frozenset('A'),
    'C': frozenset('C'),
    'G': frozenset('G'),
    'T': frozenset('T'),
    'R': frozenset('AG'),
    'Y': frozenset('CT'),
    'S': frozenset('GC'),
    'W': frozenset('AT'),
    'K': frozenset('GT'),
    'M': frozenset('AC'),
    'B': frozenset('CGT'),
    'D': frozenset('AGT'),
    'H': frozenset('ACT'),
    'V': frozenset('ACG'),
    'N': frozenset('ACGT'),
}


def expand_iupac_code(base: str) -> set:
    """
    Expand an IUPAC code to its constituent nucleotides.

    The lookup is case-insensitive.

    Args:
        base: A single IUPAC nucleotide symbol (e.g., 'A', 'y', 'N').

    Returns:
        A new, mutable set of the plain bases ('A', 'C', 'G', 'T') the code
        represents. A symbol that is not in the table yields {'N'}; that
        placeholder set shares no member with the expansion of any
        recognised code.
    """
    expansion = _IUPAC_EXPANSION.get(base.upper())
    if expansion is None:
        # Unrecognised symbol: keep the historical {'N'} placeholder.
        return {'N'}

    # Return a copy so callers can mutate the result without touching the
    # shared table.
    return set(expansion)
107
+
108
+
109
def merge_bases_to_iupac(bases: set) -> str:
    """
    Collapse a set of bases (plain or IUPAC-coded) into one IUPAC code.

    Each input symbol is expanded with expand_iupac_code(), the expansions
    are unioned, and the union is looked up in IUPAC_CODES. If that lookup
    finds no entry, 'N' is returned.

    Examples:
        {'C', 'Y'} -> 'Y'  (Y=CT, so C+Y = CT = Y)
        {'A', 'R'} -> 'R'  (R=AG, so A+R = AG = R)
        {'C', 'R'} -> 'V'  (R=AG, so C+R = ACG = V)
    """
    # Union of every symbol's expansion, as a hashable key for the lookup.
    expansions = (expand_iupac_code(symbol) for symbol in bases)
    combined = frozenset().union(*expansions)

    return IUPAC_CODES.get(combined, 'N')
128
+
129
+
130
def create_variant_summary(primary_seq: str, variant_seq: str) -> str:
    """
    Describe how a variant sequence differs from the primary sequence.

    The two sequences are aligned with edlib (using the IUPAC_EQUIV
    equalities) and the aligned columns are walked once. Non-matching
    columns without a gap count as substitutions. A run of consecutive
    columns that contain a gap in either row counts as one indel, binned by
    run length: 1 nt, 2-3 nt, or 4+ nt.

    Returns a summary such as:
    "3 substitutions, 1 single-nt indel, 1 short (<= 3nt) indel, 2 long indels"

    Fixed messages are returned for empty input, identical input, alignment
    failure, and any exception raised during comparison.
    """
    if not (primary_seq and variant_seq):
        return "sequences empty - cannot compare"

    if primary_seq == variant_seq:
        return "identical sequences"

    try:
        # Align with IUPAC-aware equalities and request the full path.
        edlib_result = edlib.align(primary_seq, variant_seq, task="path", additionalEqualities=IUPAC_EQUIV)
        if edlib_result["editDistance"] == -1:
            return "alignment failed"

        # Turn the path into two gapped rows that can be compared column-wise.
        nice = edlib.getNiceAlignment(edlib_result, primary_seq, variant_seq)
        if not nice or not nice.get('query_aligned') or not nice.get('target_aligned'):
            return f"alignment parsing failed - edit distance {edlib_result['editDistance']}"

        query_row = nice['query_aligned']
        target_row = nice['target_aligned']
        n_columns = len(query_row)

        n_substitutions = 0
        n_single = 0   # indel runs of exactly 1 column
        n_short = 0    # indel runs of 2-3 columns
        n_long = 0     # indel runs of 4 or more columns

        col = 0
        while col < n_columns:
            q_sym = query_row[col]
            t_sym = target_row[col]

            # Compatible column (IUPAC-aware): nothing to count.
            if bases_match_with_iupac(q_sym, t_sym):
                col += 1
                continue

            # Mismatch with no gap involved: a substitution.
            if q_sym != '-' and t_sym != '-':
                n_substitutions += 1
                col += 1
                continue

            # Gap column: extend over every following column that has a gap
            # in either row, and treat the whole run as one indel event.
            run_end = col + 1
            while run_end < n_columns and (query_row[run_end] == '-' or target_row[run_end] == '-'):
                run_end += 1
            run_length = run_end - col

            if run_length == 1:
                n_single += 1
            elif run_length <= 3:
                n_short += 1
            else:
                n_long += 1

            # Resume right after the indel run.
            col = run_end

        # Assemble the summary, skipping empty categories and pluralising.
        categories = (
            (n_substitutions, "substitution"),
            (n_single, "single-nt indel"),
            (n_short, "short (<= 3nt) indel"),
            (n_long, "long indel"),
        )
        pieces = [
            f"{count} {label}{'s' if count != 1 else ''}"
            for count, label in categories
            if count > 0
        ]

        if not pieces:
            return "identical sequences (IUPAC-compatible)"

        return ", ".join(pieces)

    except Exception as e:
        # Best-effort helper: report the failure as text instead of raising.
        return f"comparison failed: {e!s}"
214
+
215
+
216
def calculate_adjusted_identity_distance(seq1: str, seq2: str) -> float:
    """Return the adjusted-identity distance between two sequences.

    The sequences are aligned with edlib (using the IUPAC_EQUIV equalities),
    and the gapped alignment rows are scored by score_alignment() with
    STANDARD_ADJUSTMENT_PARAMS.

    Args:
        seq1: First sequence (used as the edlib query).
        seq2: Second sequence (used as the edlib target).

    Returns:
        1.0 - identity of the scored alignment. 0.0 is returned for equal
        strings, and 1.0 when either input is empty or the alignment cannot
        be obtained.
    """
    if not (seq1 and seq2):
        return 1.0  # Nothing to compare: maximum distance

    if seq1 == seq2:
        return 0.0

    # IUPAC-aware alignment, full path requested.
    edlib_result = edlib.align(seq1, seq2, task="path", additionalEqualities=IUPAC_EQUIV)
    if edlib_result["editDistance"] == -1:
        return 1.0

    # Gapped rows are the input format expected by score_alignment().
    rows = edlib.getNiceAlignment(edlib_result, seq1, seq2)
    if not rows:
        return 1.0
    query_row = rows.get('query_aligned')
    target_row = rows.get('target_aligned')
    if not query_row or not target_row:
        return 1.0

    scored = score_alignment(
        query_row,
        target_row,
        adjustment_params=STANDARD_ADJUSTMENT_PARAMS
    )

    # Identity -> distance.
    return 1.0 - scored.identity
243
+
244
+
245
def calculate_overlap_aware_distance(seq1: str, seq2: str, min_overlap_bp: int) -> float:
    """
    Calculate distance that accounts for partial overlaps between sequences.

    When sequences have sufficient overlap, returns the distance of the
    overlap region as scored by align_and_score(). Otherwise falls back to
    the global distance.

    For containment cases where one sequence is shorter than min_overlap_bp,
    uses the shorter sequence length as the effective threshold.

    Args:
        seq1, seq2: DNA sequences (may have different lengths)
        min_overlap_bp: Minimum overlap required in base pairs

    Returns:
        1.0 if either sequence is empty, 0.0 if the strings are equal.
        Otherwise 1.0 - identity of the overlap alignment when the overlap
        is sufficient, else the global distance from
        calculate_adjusted_identity_distance().

    Note: This function calculates distance purely based on sequence content.
    Primer-based filtering (to prevent chimera grouping) is applied at the
    caller level in perform_hac_clustering() using primers_are_same().
    """
    if not seq1 or not seq2:
        return 1.0  # Maximum distance

    if seq1 == seq2:
        return 0.0

    # Scored with the module-wide standard parameters.
    result = align_and_score(seq1, seq2, STANDARD_ADJUSTMENT_PARAMS)

    len1, len2 = len(seq1), len(seq2)

    # Coverage is the fraction of each sequence used in the alignment, so
    # coverage * length recovers the covered base count. Round rather than
    # truncate: the product can land just below the true integer (e.g.
    # 0.29 * 100 == 28.999999999999996), and int() would under-count the
    # overlap by one base and wrongly fail the threshold.
    overlap_bp = round(min(result.seq1_coverage * len1, result.seq2_coverage * len2))

    # For containment, the shorter sequence cannot overlap by more than its
    # own length, so cap the requirement there.
    effective_threshold = min(min_overlap_bp, len1, len2)

    if overlap_bp >= effective_threshold:
        # Sufficient overlap - use overlap identity for distance
        return 1.0 - result.identity

    # Insufficient overlap - fall back to global distance
    return calculate_adjusted_identity_distance(seq1, seq2)