speconsense 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
speconsense/scalability/vsearch.py
@@ -0,0 +1,226 @@
"""Vsearch-based candidate finding for scalable sequence comparison."""

import hashlib
import logging
import os
import subprocess
import tempfile
from collections import defaultdict
from typing import Dict, List, Optional

from tqdm import tqdm


class VsearchCandidateFinder:
    """Vsearch-based candidate finding using usearch_global.

    This implementation uses vsearch to quickly find approximate sequence matches,
    which can then be refined with exact scoring. It is designed for large-scale
    datasets where O(n^2) pairwise comparisons become infeasible.

    The implementation uses SHA256-based deduplication to reduce the database size
    when many identical sequences are present.
    """

    def __init__(self,
                 batch_size: int = 1000,
                 num_threads: int = 1):
        """Initialize VsearchCandidateFinder.

        Args:
            batch_size: Number of sequences to query per batch
            num_threads: Number of threads for vsearch (default: 1 for backward compatibility)
        """
        self.batch_size = batch_size
        self.num_threads = num_threads
        self._db_path: Optional[str] = None
        self._hash_to_ids: Dict[str, List[str]] = {}
        self._cache_dir: Optional[str] = None

    @property
    def name(self) -> str:
        """Human-readable name of this backend."""
        return "vsearch"

    @property
    def is_available(self) -> bool:
        """Check if vsearch is installed and accessible."""
        try:
            result = subprocess.run(
                ['vsearch', '--version'],
                capture_output=True,
                text=True
            )
            return result.returncode == 0
        except FileNotFoundError:
            return False

    def build_index(self,
                    sequences: Dict[str, str],
                    output_dir: str,
                    cache_id: Optional[str] = None) -> None:
        """Build vsearch database with SHA256-based deduplication.

        Args:
            sequences: Dict mapping sequence_id -> sequence_string
            output_dir: Directory for cache files
            cache_id: Unique identifier for this cache (e.g., sample name).
                If not provided, uses process ID to avoid collisions.
        """
        # Use cache_id or PID to ensure parallel instances don't collide
        unique_id = cache_id if cache_id else str(os.getpid())
        self._cache_dir = os.path.join(output_dir, f".vsearch_cache_{unique_id}")
        os.makedirs(self._cache_dir, exist_ok=True)

        self._db_path = os.path.join(self._cache_dir, "sequences.fasta")

        # Deduplicate sequences using hash
        unique_seqs: Dict[str, tuple] = {}  # hash -> (list of ids, sequence)
        for seq_id, seq in sorted(sequences.items()):
            seq_hash = hashlib.sha256(seq.encode()).hexdigest()[:16]
            if seq_hash not in unique_seqs:
                unique_seqs[seq_hash] = ([], seq)
            unique_seqs[seq_hash][0].append(seq_id)

        # Write deduplicated FASTA
        with open(self._db_path, 'w') as f:
            for seq_hash, (ids, seq) in unique_seqs.items():
                f.write(f">{seq_hash}\n{seq}\n")

        # Store mapping for result lookup
        self._hash_to_ids = {
            seq_hash: ids for seq_hash, (ids, _) in unique_seqs.items()
        }

        logging.debug(f"Built vsearch index: {len(unique_seqs)} unique sequences "
                      f"(deduplicated from {len(sequences)} total)")

    def find_candidates(self,
                        query_ids: List[str],
                        sequences: Dict[str, str],
                        min_identity: float,
                        max_candidates: int) -> Dict[str, List[str]]:
        """Find candidate matches using vsearch usearch_global.

        Args:
            query_ids: List of sequence IDs to query
            sequences: Dict mapping sequence_id -> sequence_string
            min_identity: Minimum identity threshold (0.0-1.0)
            max_candidates: Maximum candidates to return per query

        Returns:
            Dict mapping query_id -> list of candidate target_ids
        """
        if not self._db_path or not os.path.exists(self._db_path):
            raise RuntimeError("Index not built. Call build_index() first.")

        all_results: Dict[str, List[str]] = defaultdict(list)

        # Process in batches with progress bar
        with tqdm(total=len(query_ids), desc="Finding candidates with vsearch") as pbar:
            for i in range(0, len(query_ids), self.batch_size):
                batch_ids = query_ids[i:i + self.batch_size]
                batch_results = self._run_batch(batch_ids, sequences, min_identity, max_candidates)

                for query_id, candidates in batch_results.items():
                    all_results[query_id].extend(candidates)

                pbar.update(len(batch_ids))

        # Validate results - detect likely vsearch failures
        total_candidates = sum(len(c) for c in all_results.values())
        seqs_with_candidates = sum(1 for c in all_results.values() if c)

        logging.debug(f"vsearch found {total_candidates} candidates for {len(query_ids)} sequences "
                      f"({seqs_with_candidates} sequences with ≥1 candidate)")

        # If zero candidates for a large dataset, vsearch likely failed
        if len(query_ids) > 100 and total_candidates == 0:
            raise RuntimeError(
                f"vsearch returned zero candidates for {len(query_ids)} sequences. "
                "This may indicate vsearch was killed due to resource contention. "
                "Try running with --threads 1 when using GNU parallel."
            )

        return dict(all_results)

    def _run_batch(self,
                   query_ids: List[str],
                   sequences: Dict[str, str],
                   min_identity: float,
                   max_candidates: int) -> Dict[str, List[str]]:
        """Run vsearch on a single batch of queries.

        Args:
            query_ids: List of sequence IDs to query
            sequences: Dict mapping sequence_id -> sequence_string
            min_identity: Minimum identity threshold
            max_candidates: Maximum candidates per query

        Returns:
            Dict mapping query_id -> list of candidate target_ids
        """
        # Create temporary query file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as f:
            for seq_id in query_ids:
                f.write(f">{seq_id}\n{sequences[seq_id]}\n")
            query_path = f.name

        try:
            cmd = [
                'vsearch',
                '--usearch_global', query_path,
                '--db', self._db_path,
                '--userout', '/dev/stdout',
                '--userfields', 'query+target+id',
                '--id', str(min_identity),
                '--maxaccepts', str(max_candidates),
                '--threads', str(self.num_threads),
                '--output_no_hits'
            ]

            result = subprocess.run(cmd, check=True, capture_output=True, text=True)

            # Parse results
            results: Dict[str, List[str]] = defaultdict(list)
            for line in result.stdout.strip().split('\n'):
                if not line:
                    continue
                parts = line.split('\t')
                if len(parts) != 3:
                    continue

                query_id, target_hash, identity = parts

                # Map hash back to original IDs
                if target_hash in self._hash_to_ids:
                    for original_id in self._hash_to_ids[target_hash]:
                        if original_id != query_id:  # Skip self-matches
                            results[query_id].append(original_id)

            return dict(results)

        except FileNotFoundError:
            raise RuntimeError(
                "vsearch command not found. Please install vsearch:\n"
                "  conda install bioconda::vsearch\n"
                "or visit https://github.com/torognes/vsearch for installation instructions."
            )
        except subprocess.CalledProcessError as e:
            logging.error(f"vsearch failed with return code {e.returncode}")
            logging.error(f"vsearch stderr: {e.stderr}")
            raise

        finally:
            # Clean up temporary query file
            if os.path.exists(query_path):
                os.unlink(query_path)

    def cleanup(self) -> None:
        """Clean up cache directory and temporary files."""
        if self._cache_dir and os.path.exists(self._cache_dir):
            import shutil
            shutil.rmtree(self._cache_dir)
        self._db_path = None
        self._hash_to_ids = {}
        self._cache_dir = None
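For orientation, here is a minimal usage sketch of the backend added above. It is not part of the package: the toy sequences, the 0.9 identity threshold, and the cap of 10 candidates per query are illustrative values, and vsearch must be installed on the PATH for any candidates to be returned.

    # Minimal sketch of driving VsearchCandidateFinder (class and methods as in the
    # diff above); sequences and thresholds here are made-up demo values.
    import tempfile

    from speconsense.scalability.vsearch import VsearchCandidateFinder

    sequences = {
        "read_a": "ACGTACGTACGTACGT" * 4,                    # 64 bp demo sequence
        "read_b": "ACGTACGTACGTACGT" * 3 + "ACGTACGTACGTACGA",  # near-identical to read_a
        "read_c": "GGTTAACCGGTTAACC" * 4,                    # unrelated sequence
    }

    finder = VsearchCandidateFinder(batch_size=1000, num_threads=1)
    if finder.is_available:
        workdir = tempfile.mkdtemp()
        finder.build_index(sequences, output_dir=workdir, cache_id="demo")
        candidates = finder.find_candidates(
            query_ids=list(sequences),
            sequences=sequences,
            min_identity=0.9,   # report hits at >= 90% identity
            max_candidates=10,  # cap hits returned per query
        )
        print(candidates)       # e.g. {"read_a": ["read_b"], "read_b": ["read_a"]}
        finder.cleanup()         # removes the .vsearch_cache_demo directory
    else:
        print("vsearch is not installed; skipping")

The candidate lists returned here are only approximate matches above the identity threshold; per the class docstring, callers are expected to refine them with exact scoring.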
speconsense/summarize/__init__.py
@@ -0,0 +1,129 @@
"""
Summarize subpackage for speconsense.

Provides post-processing of speconsense output: HAC variant grouping,
MSA-based merging with IUPAC ambiguity codes, and variant selection.
"""

# CLI and entry point
from .cli import main, parse_arguments, setup_logging, process_single_specimen

# IUPAC utilities and distance functions
from .iupac import (
    IUPAC_EQUIV,
    STANDARD_ADJUSTMENT_PARAMS,
    bases_match_with_iupac,
    expand_iupac_code,
    merge_bases_to_iupac,
    calculate_adjusted_identity_distance,
    calculate_overlap_aware_distance,
    create_variant_summary,
    primers_are_same,
)

# FASTA field classes
from .fields import (
    FastaField,
    FASTA_FIELDS,
    FASTA_FIELD_PRESETS,
    validate_field_registry,
    parse_fasta_fields,
    format_fasta_header,
)

# MSA analysis and quality assessment
from .analysis import (
    ClusterQualityData,
    MAX_MSA_MERGE_VARIANTS,
    run_spoa_msa,
    identify_indel_events,
    is_homopolymer_event,
    analyze_msa_columns,
    analyze_msa_columns_overlap_aware,
    analyze_cluster_quality,
    identify_outliers,
    analyze_positional_identity_outliers,
)

# MSA-based variant merging
from .merging import (
    generate_all_subsets_by_size,
    is_compatible_subset,
    create_consensus_from_msa,
    create_overlap_consensus_from_msa,
    merge_group_with_msa,
)

# HAC clustering and variant selection
from .clustering import (
    perform_hac_clustering,
    select_variants,
)

# File I/O operations
from .io import (
    parse_consensus_header,
    load_consensus_sequences,
    load_metadata_from_json,
    build_fastq_lookup_table,
    create_output_structure,
    write_consensus_fastq,
    write_specimen_data_files,
    write_position_debug_file,
    write_output_files,
)

__all__ = [
    # CLI
    "main",
    "parse_arguments",
    "setup_logging",
    "process_single_specimen",
    # IUPAC
    "IUPAC_EQUIV",
    "STANDARD_ADJUSTMENT_PARAMS",
    "bases_match_with_iupac",
    "expand_iupac_code",
    "merge_bases_to_iupac",
    "calculate_adjusted_identity_distance",
    "calculate_overlap_aware_distance",
    "create_variant_summary",
    "primers_are_same",
    # Fields
    "FastaField",
    "FASTA_FIELDS",
    "FASTA_FIELD_PRESETS",
    "validate_field_registry",
    "parse_fasta_fields",
    "format_fasta_header",
    # Analysis
    "ClusterQualityData",
    "MAX_MSA_MERGE_VARIANTS",
    "run_spoa_msa",
    "identify_indel_events",
    "is_homopolymer_event",
    "analyze_msa_columns",
    "analyze_msa_columns_overlap_aware",
    "analyze_cluster_quality",
    "identify_outliers",
    "analyze_positional_identity_outliers",
    # Merging
    "generate_all_subsets_by_size",
    "is_compatible_subset",
    "create_consensus_from_msa",
    "create_overlap_consensus_from_msa",
    "merge_group_with_msa",
    # Clustering
    "perform_hac_clustering",
    "select_variants",
    # I/O
    "parse_consensus_header",
    "load_consensus_sequences",
    "load_metadata_from_json",
    "build_fastq_lookup_table",
    "create_output_structure",
    "write_consensus_fastq",
    "write_specimen_data_files",
    "write_position_debug_file",
    "write_output_files",
]
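Because this __init__.py exists purely to re-export the subpackage's public API, a quick sanity check of that surface might look like the sketch below. It relies only on the __all__ list shown above and on the package being installed; no function signatures are assumed.

    # Minimal sketch: inspect the re-exported API of speconsense.summarize.
    from speconsense import summarize

    print(len(summarize.__all__))          # number of public names (45 in this listing)
    print("main" in summarize.__all__)     # True: the CLI entry point is re-exported
    print(summarize.main.__module__)       # resolves to speconsense.summarize.cli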