speconsense-0.7.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,226 @@
+ """Vsearch-based candidate finding for scalable sequence comparison."""
+
+ import hashlib
+ import logging
+ import os
+ import subprocess
+ import tempfile
+ from collections import defaultdict
+ from typing import Dict, List, Optional
+
+ from tqdm import tqdm
+
+
+ class VsearchCandidateFinder:
+     """Vsearch-based candidate finding using usearch_global.
+
+     This implementation uses vsearch to quickly find approximate sequence matches,
+     which can then be refined with exact scoring. It is designed for large-scale
+     datasets where O(n^2) pairwise comparisons become infeasible.
+
+     The implementation uses SHA256-based deduplication to reduce the database size
+     when many identical sequences are present.
+     """
+
+     def __init__(self,
+                  batch_size: int = 1000,
+                  num_threads: int = 1):
+         """Initialize VsearchCandidateFinder.
+
+         Args:
+             batch_size: Number of sequences to query per batch
+             num_threads: Number of threads for vsearch (default: 1 for backward compatibility)
+         """
+         self.batch_size = batch_size
+         self.num_threads = num_threads
+         self._db_path: Optional[str] = None
+         self._hash_to_ids: Dict[str, List[str]] = {}
+         self._cache_dir: Optional[str] = None
+
+     @property
+     def name(self) -> str:
+         """Human-readable name of this backend."""
+         return "vsearch"
+
+     @property
+     def is_available(self) -> bool:
+         """Check if vsearch is installed and accessible."""
+         try:
+             result = subprocess.run(
+                 ['vsearch', '--version'],
+                 capture_output=True,
+                 text=True
+             )
+             return result.returncode == 0
+         except FileNotFoundError:
+             return False
+
+     def build_index(self,
+                     sequences: Dict[str, str],
+                     output_dir: str,
+                     cache_id: Optional[str] = None) -> None:
+         """Build vsearch database with SHA256-based deduplication.
+
+         Args:
+             sequences: Dict mapping sequence_id -> sequence_string
+             output_dir: Directory for cache files
+             cache_id: Unique identifier for this cache (e.g., sample name).
+                 If not provided, uses process ID to avoid collisions.
+         """
+         # Use cache_id or PID to ensure parallel instances don't collide
+         unique_id = cache_id if cache_id else str(os.getpid())
+         self._cache_dir = os.path.join(output_dir, f".vsearch_cache_{unique_id}")
+         os.makedirs(self._cache_dir, exist_ok=True)
+
+         self._db_path = os.path.join(self._cache_dir, "sequences.fasta")
+
+         # Deduplicate sequences using hash
+         unique_seqs: Dict[str, tuple] = {}  # hash -> (list of ids, sequence)
+         for seq_id, seq in sorted(sequences.items()):
+             seq_hash = hashlib.sha256(seq.encode()).hexdigest()[:16]
+             if seq_hash not in unique_seqs:
+                 unique_seqs[seq_hash] = ([], seq)
+             unique_seqs[seq_hash][0].append(seq_id)
+
+         # Write deduplicated FASTA
+         with open(self._db_path, 'w') as f:
+             for seq_hash, (ids, seq) in unique_seqs.items():
+                 f.write(f">{seq_hash}\n{seq}\n")
+
+         # Store mapping for result lookup
+         self._hash_to_ids = {
+             seq_hash: ids for seq_hash, (ids, _) in unique_seqs.items()
+         }
+
+         logging.debug(f"Built vsearch index: {len(unique_seqs)} unique sequences "
+                       f"(deduplicated from {len(sequences)} total)")
+
+     def find_candidates(self,
+                         query_ids: List[str],
+                         sequences: Dict[str, str],
+                         min_identity: float,
+                         max_candidates: int) -> Dict[str, List[str]]:
+         """Find candidate matches using vsearch usearch_global.
+
+         Args:
+             query_ids: List of sequence IDs to query
+             sequences: Dict mapping sequence_id -> sequence_string
+             min_identity: Minimum identity threshold (0.0-1.0)
+             max_candidates: Maximum candidates to return per query
+
+         Returns:
+             Dict mapping query_id -> list of candidate target_ids
+         """
+         if not self._db_path or not os.path.exists(self._db_path):
+             raise RuntimeError("Index not built. Call build_index() first.")
+
+         all_results: Dict[str, List[str]] = defaultdict(list)
+
+         # Process in batches with progress bar
+         with tqdm(total=len(query_ids), desc="Finding candidates with vsearch") as pbar:
+             for i in range(0, len(query_ids), self.batch_size):
+                 batch_ids = query_ids[i:i + self.batch_size]
+                 batch_results = self._run_batch(batch_ids, sequences, min_identity, max_candidates)
+
+                 for query_id, candidates in batch_results.items():
+                     all_results[query_id].extend(candidates)
+
+                 pbar.update(len(batch_ids))
+
+         # Validate results - detect likely vsearch failures
+         total_candidates = sum(len(c) for c in all_results.values())
+         seqs_with_candidates = sum(1 for c in all_results.values() if c)
+
+         logging.debug(f"vsearch found {total_candidates} candidates for {len(query_ids)} sequences "
+                       f"({seqs_with_candidates} sequences with ≥1 candidate)")
+
+         # If zero candidates for a large dataset, vsearch likely failed
+         if len(query_ids) > 100 and total_candidates == 0:
+             raise RuntimeError(
+                 f"vsearch returned zero candidates for {len(query_ids)} sequences. "
+                 "This may indicate vsearch was killed due to resource contention. "
+                 "Try running with --threads 1 when using GNU parallel."
+             )
+
+         return dict(all_results)
+
+     def _run_batch(self,
+                    query_ids: List[str],
+                    sequences: Dict[str, str],
+                    min_identity: float,
+                    max_candidates: int) -> Dict[str, List[str]]:
+         """Run vsearch on a single batch of queries.
+
+         Args:
+             query_ids: List of sequence IDs to query
+             sequences: Dict mapping sequence_id -> sequence_string
+             min_identity: Minimum identity threshold
+             max_candidates: Maximum candidates per query
+
+         Returns:
+             Dict mapping query_id -> list of candidate target_ids
+         """
+         # Create temporary query file
+         with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as f:
+             for seq_id in query_ids:
+                 f.write(f">{seq_id}\n{sequences[seq_id]}\n")
+             query_path = f.name
+
+         try:
+             cmd = [
+                 'vsearch',
+                 '--usearch_global', query_path,
+                 '--db', self._db_path,
+                 '--userout', '/dev/stdout',
+                 '--userfields', 'query+target+id',
+                 '--id', str(min_identity),
+                 '--maxaccepts', str(max_candidates),
+                 '--threads', str(self.num_threads),
+                 '--output_no_hits'
+             ]
+
+             result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+
+             # Parse results
+             results: Dict[str, List[str]] = defaultdict(list)
+             for line in result.stdout.strip().split('\n'):
+                 if not line:
+                     continue
+                 parts = line.split('\t')
+                 if len(parts) != 3:
+                     continue
+
+                 query_id, target_hash, identity = parts
+
+                 # Map hash back to original IDs
+                 if target_hash in self._hash_to_ids:
+                     for original_id in self._hash_to_ids[target_hash]:
+                         if original_id != query_id:  # Skip self-matches
+                             results[query_id].append(original_id)
+
+             return dict(results)
+
+         except FileNotFoundError:
+             raise RuntimeError(
+                 "vsearch command not found. Please install vsearch:\n"
+                 "  conda install bioconda::vsearch\n"
+                 "or visit https://github.com/torognes/vsearch for installation instructions."
+             )
+         except subprocess.CalledProcessError as e:
+             logging.error(f"vsearch failed with return code {e.returncode}")
+             logging.error(f"vsearch stderr: {e.stderr}")
+             raise
+
+         finally:
+             # Clean up temporary query file
+             if os.path.exists(query_path):
+                 os.unlink(query_path)
+
+     def cleanup(self) -> None:
+         """Clean up cache directory and temporary files."""
+         if self._cache_dir and os.path.exists(self._cache_dir):
+             import shutil
+             shutil.rmtree(self._cache_dir)
+         self._db_path = None
+         self._hash_to_ids = {}
+         self._cache_dir = None
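
For orientation, here is a minimal usage sketch of the class above. The sequence IDs, sequences, and output directory are hypothetical, and the import path is assumed since this diff does not show the module's filename; treat it as an illustration of the call order (build_index, then find_candidates, then cleanup), not as package documentation.

    # Hypothetical usage sketch; data and paths are made up for illustration.
    finder = VsearchCandidateFinder(batch_size=500, num_threads=4)
    if not finder.is_available:
        raise SystemExit("vsearch is not installed or not on PATH")

    sequences = {
        "read_1": "ACGTACGTACGTACGT",
        "read_2": "ACGTACGTACGTACGT",  # identical to read_1: deduplicated to one DB entry
        "read_3": "ACGTTCGTACGTACGA",
    }
    finder.build_index(sequences, output_dir="/tmp/demo", cache_id="sample_A")
    try:
        candidates = finder.find_candidates(
            query_ids=list(sequences),
            sequences=sequences,
            min_identity=0.9,
            max_candidates=10,
        )
        # candidates["read_1"] would include "read_2" but not "read_1" itself,
        # because both reads hash to the same database record and self-matches
        # are skipped when the hash is expanded back to original IDs.
    finally:
        finder.cleanup()

Note the role of deduplication here: identical sequences share a single SHA256-derived FASTA record, so vsearch searches a smaller database, and each hit is expanded back to all original sequence IDs afterward.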
@@ -0,0 +1,129 @@
+ """
+ Summarize subpackage for speconsense.
+
+ Provides post-processing of speconsense output: HAC variant grouping,
+ MSA-based merging with IUPAC ambiguity codes, and variant selection.
+ """
+
+ # CLI and entry point
+ from .cli import main, parse_arguments, setup_logging, process_single_specimen
+
+ # IUPAC utilities and distance functions
+ from .iupac import (
+     IUPAC_EQUIV,
+     STANDARD_ADJUSTMENT_PARAMS,
+     bases_match_with_iupac,
+     expand_iupac_code,
+     merge_bases_to_iupac,
+     calculate_adjusted_identity_distance,
+     calculate_overlap_aware_distance,
+     create_variant_summary,
+     primers_are_same,
+ )
+
+ # FASTA field classes
+ from .fields import (
+     FastaField,
+     FASTA_FIELDS,
+     FASTA_FIELD_PRESETS,
+     validate_field_registry,
+     parse_fasta_fields,
+     format_fasta_header,
+ )
+
+ # MSA analysis and quality assessment
+ from .analysis import (
+     ClusterQualityData,
+     MAX_MSA_MERGE_VARIANTS,
+     run_spoa_msa,
+     identify_indel_events,
+     is_homopolymer_event,
+     analyze_msa_columns,
+     analyze_msa_columns_overlap_aware,
+     analyze_cluster_quality,
+     identify_outliers,
+     analyze_positional_identity_outliers,
+ )
+
+ # MSA-based variant merging
+ from .merging import (
+     generate_all_subsets_by_size,
+     is_compatible_subset,
+     create_consensus_from_msa,
+     create_overlap_consensus_from_msa,
+     merge_group_with_msa,
+ )
+
+ # HAC clustering and variant selection
+ from .clustering import (
+     perform_hac_clustering,
+     select_variants,
+ )
+
+ # File I/O operations
+ from .io import (
+     parse_consensus_header,
+     load_consensus_sequences,
+     load_metadata_from_json,
+     build_fastq_lookup_table,
+     create_output_structure,
+     write_consensus_fastq,
+     write_specimen_data_files,
+     write_position_debug_file,
+     write_output_files,
+ )
+
+ __all__ = [
+     # CLI
+     "main",
+     "parse_arguments",
+     "setup_logging",
+     "process_single_specimen",
+     # IUPAC
+     "IUPAC_EQUIV",
+     "STANDARD_ADJUSTMENT_PARAMS",
+     "bases_match_with_iupac",
+     "expand_iupac_code",
+     "merge_bases_to_iupac",
+     "calculate_adjusted_identity_distance",
+     "calculate_overlap_aware_distance",
+     "create_variant_summary",
+     "primers_are_same",
+     # Fields
+     "FastaField",
+     "FASTA_FIELDS",
+     "FASTA_FIELD_PRESETS",
+     "validate_field_registry",
+     "parse_fasta_fields",
+     "format_fasta_header",
+     # Analysis
+     "ClusterQualityData",
+     "MAX_MSA_MERGE_VARIANTS",
+     "run_spoa_msa",
+     "identify_indel_events",
+     "is_homopolymer_event",
+     "analyze_msa_columns",
+     "analyze_msa_columns_overlap_aware",
+     "analyze_cluster_quality",
+     "identify_outliers",
+     "analyze_positional_identity_outliers",
+     # Merging
+     "generate_all_subsets_by_size",
+     "is_compatible_subset",
+     "create_consensus_from_msa",
+     "create_overlap_consensus_from_msa",
+     "merge_group_with_msa",
+     # Clustering
+     "perform_hac_clustering",
+     "select_variants",
+     # I/O
+     "parse_consensus_header",
+     "load_consensus_sequences",
+     "load_metadata_from_json",
+     "build_fastq_lookup_table",
+     "create_output_structure",
+     "write_consensus_fastq",
+     "write_specimen_data_files",
+     "write_position_debug_file",
+     "write_output_files",
+ ]
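
A quick note on the re-exports above: everything listed in __all__ can be imported directly from the subpackage rather than from the individual modules. A sketch:

    # Sketch: these imports are equivalent to importing from
    # speconsense.summarize.io, .clustering, and .merging respectively.
    from speconsense.summarize import (
        load_consensus_sequences,
        perform_hac_clustering,
        merge_group_with_msa,
    )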
@@ -0,0 +1,6 @@
+ """Entry point for python -m speconsense.summarize."""
+
+ from .cli import main
+
+ if __name__ == "__main__":
+     main()
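
Because __main__.py simply delegates to cli.main, the summarize stage can be invoked as a module, exactly as the docstring indicates:

    python -m speconsense.summarize

This is equivalent to importing and calling main() from speconsense.summarize.cli.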