speconsense-0.7.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
speconsense/__init__.py
ADDED
@@ -0,0 +1,16 @@
+"""
+Speconsense: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads.
+
+A Python tool for experimental clustering and consensus generation as an alternative to NGSpeciesID
+in the fungal DNA barcoding pipeline.
+"""
+
+__version__ = "0.7.2"
+__author__ = "Josh Walker"
+__email__ = "joshowalker@yahoo.com"
+
+from .core import main as speconsense_main
+from .summarize import main as summarize_main
+from .synth import main as synth_main
+
+__all__ = ["speconsense_main", "summarize_main", "synth_main", "__version__"]
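The top-level __init__.py simply re-exports the three CLI mains, so a quick install sanity check is to import and drive them directly. A minimal sketch (not part of the package; the FASTQ path and options are illustrative, and the mains parse sys.argv, so arguments are supplied by replacing it):

import sys
from speconsense import __version__, speconsense_main

print(f"speconsense {__version__}")

# Hypothetical input file; speconsense_main() reads sys.argv just like the console script.
sys.argv = ["speconsense", "reads.fastq", "-O", "clusters", "--min-identity", "0.9"]
speconsense_main()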
speconsense/core/__init__.py
ADDED
@@ -0,0 +1,32 @@
+"""
+Core subpackage for speconsense.
+
+Provides clustering and consensus generation for Oxford Nanopore amplicon reads.
+"""
+
+# CLI and entry point
+from .cli import main
+
+# Main class
+from .clusterer import SpecimenClusterer
+
+# Worker functions and config classes (for advanced usage)
+from .workers import (
+    ClusterProcessingConfig,
+    ConsensusGenerationConfig,
+    _run_spoa_worker,
+    _process_cluster_worker,
+    _generate_cluster_consensus_worker,
+    _trim_primers_standalone,
+    _phase_reads_by_variants_standalone,
+)
+
+__all__ = [
+    # CLI
+    "main",
+    # Main class
+    "SpecimenClusterer",
+    # Config classes
+    "ClusterProcessingConfig",
+    "ConsensusGenerationConfig",
+]
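These re-exports define the subpackage's programmatic surface. Mirroring the wiring shown in the speconsense/core/cli.py diff below, a hedged sketch of driving SpecimenClusterer directly might look like the following; keyword and method names are taken from that CLI code, the input path is illustrative, and the omitted constructor keywords are assumed to have defaults:

from Bio import SeqIO
from speconsense.core import SpecimenClusterer

# Illustrative input; the CLI additionally handles primer loading, orientation,
# and metadata output, which this sketch skips.
records = list(SeqIO.parse("sample01.fastq", "fastq"))

clusterer = SpecimenClusterer(
    min_identity=0.9,       # same defaults the CLI exposes
    inflation=4.0,
    min_size=5,
    sample_name="sample01",
    output_dir="clusters",
)
clusterer.add_sequences(records, None)  # second argument carries optional augment records
clusterer.cluster(algorithm="graph")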
speconsense/core/cli.py
ADDED
@@ -0,0 +1,308 @@
+"""CLI and entry point for speconsense core clustering tool."""
+
+import argparse
+import logging
+import os
+import sys
+
+from Bio import SeqIO
+
+try:
+    from speconsense import __version__
+except ImportError:
+    __version__ = "dev"
+
+from speconsense.profiles import (
+    Profile,
+    ProfileError,
+    print_profiles_list,
+)
+
+from .clusterer import SpecimenClusterer
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="MCL-based clustering of nanopore amplicon reads"
+    )
+
+    # Input/Output group
+    io_group = parser.add_argument_group("Input/Output")
+    io_group.add_argument("input_file", help="Input FASTQ file")
+    io_group.add_argument("-O", "--output-dir", default="clusters",
+                          help="Output directory for all files (default: clusters)")
+    io_group.add_argument("--primers", help="FASTA file containing primer sequences (default: looks for primers.fasta in input file directory)")
+    io_group.add_argument("--augment-input", help="Additional FASTQ/FASTA file with sequences recovered after primary demultiplexing (e.g., from specimine)")
+
+    # Clustering group
+    clustering_group = parser.add_argument_group("Clustering")
+    clustering_group.add_argument("--algorithm", type=str, default="graph", choices=["graph", "greedy"],
+                                  help="Clustering algorithm to use (default: graph)")
+    clustering_group.add_argument("--min-identity", type=float, default=0.9,
+                                  help="Minimum sequence identity threshold for clustering (default: 0.9)")
+    clustering_group.add_argument("--inflation", type=float, default=4.0,
+                                  help="MCL inflation parameter (default: 4.0)")
+    clustering_group.add_argument("--k-nearest-neighbors", type=int, default=5,
+                                  help="Number of nearest neighbors for graph construction (default: 5)")
+
+    # Filtering group
+    filtering_group = parser.add_argument_group("Filtering")
+    filtering_group.add_argument("--min-size", type=int, default=5,
+                                 help="Minimum cluster size (default: 5, 0 to disable)")
+    filtering_group.add_argument("--min-cluster-ratio", type=float, default=0.01,
+                                 help="Minimum size ratio between a cluster and the largest cluster (default: 0.01, 0 to disable)")
+    filtering_group.add_argument("--max-sample-size", type=int, default=100,
+                                 help="Maximum cluster size for consensus (default: 100)")
+    filtering_group.add_argument("--outlier-identity", type=float, default=None,
+                                 help="Minimum read-to-consensus identity to keep a read (default: auto). "
+                                      "Reads below this threshold are removed as outliers before final "
+                                      "consensus generation. Auto-calculated as (1 + min_identity) / 2. "
+                                      "This threshold is typically higher than --min-identity because "
+                                      "the consensus is error-corrected through averaging.")
+
+    # Variant Phasing group
+    phasing_group = parser.add_argument_group("Variant Phasing")
+    phasing_group.add_argument("--disable-position-phasing", action="store_true",
+                               help="Disable position-based variant phasing (enabled by default). "
+                                    "MCL graph clustering already separates most variants; this "
+                                    "second pass analyzes MSA positions to phase remaining variants.")
+    phasing_group.add_argument("--min-variant-frequency", type=float, default=0.10,
+                               help="Minimum alternative allele frequency to call variant (default: 0.10 for 10%%)")
+    phasing_group.add_argument("--min-variant-count", type=int, default=5,
+                               help="Minimum alternative allele read count to call variant (default: 5)")
+
+    # Ambiguity Calling group
+    ambiguity_group = parser.add_argument_group("Ambiguity Calling")
+    ambiguity_group.add_argument("--disable-ambiguity-calling", action="store_true",
+                                 help="Disable IUPAC ambiguity code calling for unphased variant positions")
+    ambiguity_group.add_argument("--min-ambiguity-frequency", type=float, default=0.10,
+                                 help="Minimum alternative allele frequency for IUPAC ambiguity calling (default: 0.10 for 10%%)")
+    ambiguity_group.add_argument("--min-ambiguity-count", type=int, default=3,
+                                 help="Minimum alternative allele read count for IUPAC ambiguity calling (default: 3)")
+
+    # Cluster Merging group
+    merging_group = parser.add_argument_group("Cluster Merging")
+    merging_group.add_argument("--disable-cluster-merging", action="store_true",
+                               help="Disable merging of clusters with identical consensus sequences")
+    merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
+                               help="Disable homopolymer equivalence in cluster merging (only merge identical sequences)")
+
+    # Orientation group
+    orient_group = parser.add_argument_group("Orientation")
+    orient_group.add_argument("--orient-mode", choices=["skip", "keep-all", "filter-failed"], default="skip",
+                              help="Sequence orientation mode: skip (default, no orientation), keep-all (orient but keep failed), or filter-failed (orient and remove failed)")
+
+    # Performance group
+    perf_group = parser.add_argument_group("Performance")
+    perf_group.add_argument("--presample", type=int, default=1000,
+                            help="Presample size for initial reads (default: 1000, 0 to disable)")
+    perf_group.add_argument("--scale-threshold", type=int, default=1001,
+                            help="Sequence count threshold for scalable mode (requires vsearch). "
+                                 "Set to 0 to disable. Default: 1001")
+    perf_group.add_argument("--threads", type=int, default=1, metavar="N",
+                            help="Max threads for internal parallelism (vsearch, SPOA). "
+                                 "0=auto-detect, default=1 (safe for parallel workflows).")
+    perf_group.add_argument("--enable-early-filter", action="store_true",
+                            help="Enable early filtering to skip small clusters before variant phasing (improves performance for large datasets)")
+
+    # Debugging group
+    debug_group = parser.add_argument_group("Debugging")
+    debug_group.add_argument("--collect-discards", action="store_true",
+                             help="Write discarded reads (outliers and filtered clusters) to cluster_debug/{sample}-discards.fastq")
+    debug_group.add_argument("--log-level", default="INFO",
+                             choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
+
+    # Version and profile options (default group)
+    parser.add_argument("--version", action="version",
+                        version=f"Speconsense {__version__}",
+                        help="Show program's version number and exit")
+    parser.add_argument("-p", "--profile", metavar="NAME",
+                        help="Load parameter profile (use --list-profiles to see available)")
+    parser.add_argument("--list-profiles", action="store_true",
+                        help="List available profiles and exit")
+
+    # Handle --list-profiles early (before requiring input_file)
+    if '--list-profiles' in sys.argv:
+        print_profiles_list('speconsense')
+        sys.exit(0)
+
+    # First pass: get profile name if specified
+    # We need to detect which args were explicitly provided to not override them
+    pre_args, _ = parser.parse_known_args()
+
+    # Track which arguments were explicitly provided on CLI
+    explicit_args = set()
+    for arg in sys.argv[1:]:
+        if arg.startswith('--') and '=' in arg:
+            explicit_args.add(arg.split('=')[0][2:].replace('-', '_'))
+        elif arg.startswith('--'):
+            explicit_args.add(arg[2:].replace('-', '_'))
+        elif arg.startswith('-') and len(arg) == 2:
+            # Short option - would need to map to long name
+            # For now, we skip this since profile args use long names
+            pass
+
+    # Load and apply profile if specified
+    loaded_profile = None
+    if pre_args.profile:
+        try:
+            loaded_profile = Profile.load(pre_args.profile)
+        except ProfileError as e:
+            print(f"Error: {e}", file=sys.stderr)
+            sys.exit(1)
+
+        # Apply profile values to parser defaults (explicit CLI args will override)
+        for key, value in loaded_profile.speconsense.items():
+            attr_name = key.replace('-', '_')
+            if attr_name not in explicit_args:
+                parser.set_defaults(**{attr_name: value})
+
+    args = parser.parse_args()
+
+    # Setup standard logging
+    log_format = '%(asctime)s - %(levelname)s - %(message)s'
+    logging.basicConfig(
+        level=getattr(logging, args.log_level),
+        format=log_format
+    )
+
+    # Log profile usage after logging is configured
+    if loaded_profile:
+        logging.info(f"Using profile '{loaded_profile.name}': {loaded_profile.description}")
+
+    # Resolve threads: 0 means auto-detect
+    threads = args.threads if args.threads > 0 else os.cpu_count()
+
+    sample = os.path.splitext(os.path.basename(args.input_file))[0]
+    clusterer = SpecimenClusterer(
+        min_identity=args.min_identity,
+        inflation=args.inflation,
+        min_size=args.min_size,
+        min_cluster_ratio=args.min_cluster_ratio,
+        max_sample_size=args.max_sample_size,
+        presample_size=args.presample,
+        k_nearest_neighbors=args.k_nearest_neighbors,
+        sample_name=sample,
+        disable_homopolymer_equivalence=args.disable_homopolymer_equivalence,
+        disable_cluster_merging=args.disable_cluster_merging,
+        output_dir=args.output_dir,
+        outlier_identity_threshold=args.outlier_identity,
+        enable_secondpass_phasing=not args.disable_position_phasing,
+        min_variant_frequency=args.min_variant_frequency,
+        min_variant_count=args.min_variant_count,
+        min_ambiguity_frequency=args.min_ambiguity_frequency,
+        min_ambiguity_count=args.min_ambiguity_count,
+        enable_iupac_calling=not args.disable_ambiguity_calling,
+        scale_threshold=args.scale_threshold,
+        max_threads=threads,
+        early_filter=args.enable_early_filter,
+        collect_discards=args.collect_discards
+    )
+
+    # Log configuration
+    if args.outlier_identity is not None:
+        logging.info(f"Outlier removal enabled: outlier_identity={args.outlier_identity*100:.1f}% (user-specified)")
+    else:
+        # Auto-calculated threshold
+        auto_threshold = (1.0 + args.min_identity) / 2.0
+        logging.info(f"Outlier removal enabled: outlier_identity={auto_threshold*100:.1f}% (auto-calculated from min_identity={args.min_identity*100:.1f}%)")
+
+    if not args.disable_position_phasing:
+        logging.info(f"Position-based variant phasing enabled: min_freq={args.min_variant_frequency:.0%}, "
+                     f"min_count={args.min_variant_count}")
+
+    # Set additional attributes for metadata
+    clusterer.input_file = os.path.abspath(args.input_file)
+    clusterer.augment_input = os.path.abspath(args.augment_input) if args.augment_input else None
+    clusterer.algorithm = args.algorithm
+    clusterer.orient_mode = args.orient_mode
+
+    # Read primary sequences
+    logging.info(f"Reading sequences from {args.input_file}")
+    format = "fasta" if args.input_file.endswith(".fasta") else "fastq"
+    records = list(SeqIO.parse(args.input_file, format))
+    logging.info(f"Loaded {len(records)} primary sequences")
+
+    if len(records) == 0:
+        logging.warning("No sequences found in input file. Nothing to cluster.")
+        sys.exit(0)
+
+    # Load augmented sequences if specified
+    augment_records = None
+    if args.augment_input:
+        # Check if augment input file exists
+        if not os.path.exists(args.augment_input):
+            logging.error(f"Augment input file not found: {args.augment_input}")
+            sys.exit(1)
+
+        logging.info(f"Reading augmented sequences from {args.augment_input}")
+
+        # Auto-detect format like main input
+        augment_format = "fasta" if args.augment_input.endswith(".fasta") else "fastq"
+
+        try:
+            augment_records = list(SeqIO.parse(args.augment_input, augment_format))
+            logging.info(f"Loaded {len(augment_records)} augmented sequences")
+
+            if len(augment_records) == 0:
+                logging.warning(f"No sequences found in augment input file: {args.augment_input}")
+
+            # Add dummy quality scores to FASTA sequences so they can be written as FASTQ later
+            if augment_format == "fasta":
+                for record in augment_records:
+                    if not hasattr(record, 'letter_annotations') or 'phred_quality' not in record.letter_annotations:
+                        # Add dummy quality scores (quality 30 = '?' in FASTQ)
+                        record.letter_annotations = {'phred_quality': [30] * len(record.seq)}
+                logging.debug(f"Added quality scores to {len(augment_records)} FASTA sequences for downstream compatibility")
+
+        except Exception as e:
+            logging.error(f"Failed to read augment input file '{args.augment_input}': {e}")
+            sys.exit(1)
+
+    # Add sequences to clusterer (both primary and augmented)
+    clusterer.add_sequences(records, augment_records)
+
+    if args.primers:
+        clusterer.primers_file = os.path.abspath(args.primers)
+        clusterer.load_primers(args.primers)
+    else:
+        # Look for primers.fasta in the same directory as the input file
+        input_dir = os.path.dirname(os.path.abspath(args.input_file))
+        auto_primer_path = os.path.join(input_dir, "primers.fasta")
+
+        if os.path.exists(auto_primer_path):
+            logging.debug(f"Found primers.fasta in input directory: {auto_primer_path}")
+            clusterer.primers_file = os.path.abspath(auto_primer_path)
+            clusterer.load_primers(auto_primer_path)
+        else:
+            logging.warning("No primer file specified and primers.fasta not found in input directory. Primer trimming will be disabled.")
+            clusterer.primers_file = None
+
+    # Handle sequence orientation based on mode
+    if args.orient_mode != "skip":
+        if hasattr(clusterer, 'forward_primers') and hasattr(clusterer, 'reverse_primers'):
+            failed_sequences = clusterer.orient_sequences()
+
+            # Filter failed sequences if requested
+            if args.orient_mode == "filter-failed" and failed_sequences:
+                logging.info(f"Filtering out {len(failed_sequences)} sequences with failed orientation")
+
+                # Track as discarded and remove from clustering (but keep records for discards file)
+                clusterer.discarded_read_ids.update(failed_sequences)
+                for seq_id in failed_sequences:
+                    del clusterer.sequences[seq_id]
+                # Keep records so they can be written to discards file
+
+                remaining = len(clusterer.sequences)
+                logging.info(f"Continuing with {remaining} successfully oriented sequences")
+        else:
+            logging.warning(f"--orient-mode={args.orient_mode} specified but no primers with position information loaded")
+
+    # Write metadata file for use by post-processing tools
+    clusterer.write_metadata()
+
+    clusterer.cluster(algorithm=args.algorithm)
+    print()
+
+if __name__ == "__main__":
+    main()
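Two details of this CLI are worth calling out. First, with the default --min-identity of 0.9 the auto-calculated outlier threshold is (1 + 0.9) / 2 = 0.95, so reads below 95% identity to their cluster consensus are dropped before the final consensus is built. Second, the profile mechanism relies on a small argparse pattern: parse_known_args() peeks at --profile, the long options literally present in sys.argv are recorded, and profile values are then pushed in via parser.set_defaults() so that explicitly typed flags still win. A stripped-down, self-contained illustration of that precedence (not package code; the option names and profile values are invented for the example):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--min-identity", type=float, default=0.9)
parser.add_argument("--inflation", type=float, default=4.0)

# Pretend a profile wants min_identity=0.95 and inflation=6.0.
profile_values = {"min_identity": 0.95, "inflation": 6.0}

argv = ["--inflation", "2.0"]  # the user explicitly set inflation on the command line
explicit = {a.lstrip("-").split("=")[0].replace("-", "_")
            for a in argv if a.startswith("--")}

for key, value in profile_values.items():
    if key not in explicit:                 # profile only fills options the user left unset
        parser.set_defaults(**{key: value})

args = parser.parse_args(argv)
print(args.min_identity, args.inflation)    # prints: 0.95 2.0 (explicit flag beats profile)

Because parse_args() always prefers values supplied on the command line over defaults, injecting profile values as defaults is exactly what keeps explicit flags authoritative.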