speconsense 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
"""
Speconsense: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads.

A Python tool for experimental clustering and consensus generation as an alternative to NGSpeciesID
in the fungal DNA barcoding pipeline.
"""

# Package metadata; __version__ is the single source of truth for the
# version string (core.cli imports it, falling back to "dev").
__version__ = "0.7.2"
__author__ = "Josh Walker"
__email__ = "joshowalker@yahoo.com"

# Re-export the console-tool entry points so callers can use, e.g.,
# `from speconsense import speconsense_main`.
from .core import main as speconsense_main
from .summarize import main as summarize_main
from .synth import main as synth_main

__all__ = ["speconsense_main", "summarize_main", "synth_main", "__version__"]
speconsense/cli.py ADDED
@@ -0,0 +1,6 @@
1
#!/usr/bin/env python3
"""Thin console wrapper: delegates straight to the core clustering CLI."""

from .core import main

if __name__ == "__main__":
    main()
@@ -0,0 +1,32 @@
1
"""
Core subpackage for speconsense.

Provides clustering and consensus generation for Oxford Nanopore amplicon reads.
"""

# CLI and entry point
from .cli import main

# Main class
from .clusterer import SpecimenClusterer

# Worker functions and config classes (for advanced usage)
# NOTE: the underscore-prefixed workers are imported at package level
# (presumably so multiprocessing can resolve them by qualified name —
# TODO confirm) but are deliberately kept out of __all__ below.
from .workers import (
    ClusterProcessingConfig,
    ConsensusGenerationConfig,
    _run_spoa_worker,
    _process_cluster_worker,
    _generate_cluster_consensus_worker,
    _trim_primers_standalone,
    _phase_reads_by_variants_standalone,
)

# Public API: only the CLI entry point, the main class, and the two
# config classes; private workers are excluded.
__all__ = [
    # CLI
    "main",
    # Main class
    "SpecimenClusterer",
    # Config classes
    "ClusterProcessingConfig",
    "ConsensusGenerationConfig",
]
@@ -0,0 +1,6 @@
1
"""Entry point for python -m speconsense.core."""

from .cli import main

# Executed when the subpackage is run as a module; importing it has no
# side effects beyond binding `main`.
if __name__ == "__main__":
    main()
@@ -0,0 +1,308 @@
1
+ """CLI and entry point for speconsense core clustering tool."""
2
+
3
+ import argparse
4
+ import logging
5
+ import os
6
+ import sys
7
+
8
+ from Bio import SeqIO
9
+
10
try:
    from speconsense import __version__
except ImportError:
    # Running outside an installed package (e.g. a source checkout where
    # the top-level package is not importable): report a "dev" version.
    __version__ = "dev"
14
+
15
+ from speconsense.profiles import (
16
+ Profile,
17
+ ProfileError,
18
+ print_profiles_list,
19
+ )
20
+
21
+ from .clusterer import SpecimenClusterer
22
+
23
+
24
def _detect_format(path):
    """Return the Bio.SeqIO format name for *path*.

    Files ending in ".fasta" are treated as FASTA; everything else is
    assumed to be FASTQ (matches the original extension-only heuristic).
    """
    return "fasta" if path.endswith(".fasta") else "fastq"


def main():
    """Run the speconsense clustering CLI.

    Pipeline: parse arguments (with optional profile defaults), configure
    logging, load primary and optional augmented reads, load primers,
    optionally orient sequences, write run metadata, then cluster and
    generate consensus sequences via SpecimenClusterer.
    """
    parser = argparse.ArgumentParser(
        description="MCL-based clustering of nanopore amplicon reads"
    )

    # Input/Output group
    io_group = parser.add_argument_group("Input/Output")
    io_group.add_argument("input_file", help="Input FASTQ file")
    io_group.add_argument("-O", "--output-dir", default="clusters",
                          help="Output directory for all files (default: clusters)")
    io_group.add_argument("--primers", help="FASTA file containing primer sequences (default: looks for primers.fasta in input file directory)")
    io_group.add_argument("--augment-input", help="Additional FASTQ/FASTA file with sequences recovered after primary demultiplexing (e.g., from specimine)")

    # Clustering group
    clustering_group = parser.add_argument_group("Clustering")
    clustering_group.add_argument("--algorithm", type=str, default="graph", choices=["graph", "greedy"],
                                  help="Clustering algorithm to use (default: graph)")
    clustering_group.add_argument("--min-identity", type=float, default=0.9,
                                  help="Minimum sequence identity threshold for clustering (default: 0.9)")
    clustering_group.add_argument("--inflation", type=float, default=4.0,
                                  help="MCL inflation parameter (default: 4.0)")
    clustering_group.add_argument("--k-nearest-neighbors", type=int, default=5,
                                  help="Number of nearest neighbors for graph construction (default: 5)")

    # Filtering group
    filtering_group = parser.add_argument_group("Filtering")
    filtering_group.add_argument("--min-size", type=int, default=5,
                                 help="Minimum cluster size (default: 5, 0 to disable)")
    filtering_group.add_argument("--min-cluster-ratio", type=float, default=0.01,
                                 help="Minimum size ratio between a cluster and the largest cluster (default: 0.01, 0 to disable)")
    filtering_group.add_argument("--max-sample-size", type=int, default=100,
                                 help="Maximum cluster size for consensus (default: 100)")
    filtering_group.add_argument("--outlier-identity", type=float, default=None,
                                 help="Minimum read-to-consensus identity to keep a read (default: auto). "
                                      "Reads below this threshold are removed as outliers before final "
                                      "consensus generation. Auto-calculated as (1 + min_identity) / 2. "
                                      "This threshold is typically higher than --min-identity because "
                                      "the consensus is error-corrected through averaging.")

    # Variant Phasing group
    phasing_group = parser.add_argument_group("Variant Phasing")
    phasing_group.add_argument("--disable-position-phasing", action="store_true",
                               help="Disable position-based variant phasing (enabled by default). "
                                    "MCL graph clustering already separates most variants; this "
                                    "second pass analyzes MSA positions to phase remaining variants.")
    phasing_group.add_argument("--min-variant-frequency", type=float, default=0.10,
                               help="Minimum alternative allele frequency to call variant (default: 0.10 for 10%%)")
    phasing_group.add_argument("--min-variant-count", type=int, default=5,
                               help="Minimum alternative allele read count to call variant (default: 5)")

    # Ambiguity Calling group
    ambiguity_group = parser.add_argument_group("Ambiguity Calling")
    ambiguity_group.add_argument("--disable-ambiguity-calling", action="store_true",
                                 help="Disable IUPAC ambiguity code calling for unphased variant positions")
    ambiguity_group.add_argument("--min-ambiguity-frequency", type=float, default=0.10,
                                 help="Minimum alternative allele frequency for IUPAC ambiguity calling (default: 0.10 for 10%%)")
    ambiguity_group.add_argument("--min-ambiguity-count", type=int, default=3,
                                 help="Minimum alternative allele read count for IUPAC ambiguity calling (default: 3)")

    # Cluster Merging group
    merging_group = parser.add_argument_group("Cluster Merging")
    merging_group.add_argument("--disable-cluster-merging", action="store_true",
                               help="Disable merging of clusters with identical consensus sequences")
    merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
                               help="Disable homopolymer equivalence in cluster merging (only merge identical sequences)")

    # Orientation group
    orient_group = parser.add_argument_group("Orientation")
    orient_group.add_argument("--orient-mode", choices=["skip", "keep-all", "filter-failed"], default="skip",
                              help="Sequence orientation mode: skip (default, no orientation), keep-all (orient but keep failed), or filter-failed (orient and remove failed)")

    # Performance group
    perf_group = parser.add_argument_group("Performance")
    perf_group.add_argument("--presample", type=int, default=1000,
                            help="Presample size for initial reads (default: 1000, 0 to disable)")
    perf_group.add_argument("--scale-threshold", type=int, default=1001,
                            help="Sequence count threshold for scalable mode (requires vsearch). "
                                 "Set to 0 to disable. Default: 1001")
    perf_group.add_argument("--threads", type=int, default=1, metavar="N",
                            help="Max threads for internal parallelism (vsearch, SPOA). "
                                 "0=auto-detect, default=1 (safe for parallel workflows).")
    perf_group.add_argument("--enable-early-filter", action="store_true",
                            help="Enable early filtering to skip small clusters before variant phasing (improves performance for large datasets)")

    # Debugging group
    debug_group = parser.add_argument_group("Debugging")
    debug_group.add_argument("--collect-discards", action="store_true",
                             help="Write discarded reads (outliers and filtered clusters) to cluster_debug/{sample}-discards.fastq")
    debug_group.add_argument("--log-level", default="INFO",
                             choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])

    # Version and profile options (default group)
    parser.add_argument("--version", action="version",
                        version=f"Speconsense {__version__}",
                        help="Show program's version number and exit")
    parser.add_argument("-p", "--profile", metavar="NAME",
                        help="Load parameter profile (use --list-profiles to see available)")
    parser.add_argument("--list-profiles", action="store_true",
                        help="List available profiles and exit")

    # Handle --list-profiles early (before requiring input_file)
    if '--list-profiles' in sys.argv:
        print_profiles_list('speconsense')
        sys.exit(0)

    # First pass: get profile name if specified
    # We need to detect which args were explicitly provided to not override them
    pre_args, _ = parser.parse_known_args()

    # Track which arguments were explicitly provided on CLI. Profile values
    # below are applied via set_defaults, so anything recorded here wins.
    # Short options must be mapped to their long destinations; otherwise an
    # explicit "-O dir" would be silently overridden by a profile's
    # "output-dir" value (this was a latent bug).
    short_option_dests = {'-O': 'output_dir', '-p': 'profile'}
    explicit_args = set()
    for arg in sys.argv[1:]:
        if arg.startswith('--') and '=' in arg:
            explicit_args.add(arg.split('=')[0][2:].replace('-', '_'))
        elif arg.startswith('--'):
            explicit_args.add(arg[2:].replace('-', '_'))
        elif arg in short_option_dests:
            explicit_args.add(short_option_dests[arg])

    # Load and apply profile if specified
    loaded_profile = None
    if pre_args.profile:
        try:
            loaded_profile = Profile.load(pre_args.profile)
        except ProfileError as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)

        # Apply profile values to parser defaults (explicit CLI args will override)
        for key, value in loaded_profile.speconsense.items():
            attr_name = key.replace('-', '_')
            if attr_name not in explicit_args:
                parser.set_defaults(**{attr_name: value})

    args = parser.parse_args()

    # Setup standard logging
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format=log_format
    )

    # Log profile usage after logging is configured
    if loaded_profile:
        logging.info(f"Using profile '{loaded_profile.name}': {loaded_profile.description}")

    # Resolve threads: 0 means auto-detect
    threads = args.threads if args.threads > 0 else os.cpu_count()

    # Sample name is derived from the input filename (without extension).
    sample = os.path.splitext(os.path.basename(args.input_file))[0]
    clusterer = SpecimenClusterer(
        min_identity=args.min_identity,
        inflation=args.inflation,
        min_size=args.min_size,
        min_cluster_ratio=args.min_cluster_ratio,
        max_sample_size=args.max_sample_size,
        presample_size=args.presample,
        k_nearest_neighbors=args.k_nearest_neighbors,
        sample_name=sample,
        disable_homopolymer_equivalence=args.disable_homopolymer_equivalence,
        disable_cluster_merging=args.disable_cluster_merging,
        output_dir=args.output_dir,
        outlier_identity_threshold=args.outlier_identity,
        enable_secondpass_phasing=not args.disable_position_phasing,
        min_variant_frequency=args.min_variant_frequency,
        min_variant_count=args.min_variant_count,
        min_ambiguity_frequency=args.min_ambiguity_frequency,
        min_ambiguity_count=args.min_ambiguity_count,
        enable_iupac_calling=not args.disable_ambiguity_calling,
        scale_threshold=args.scale_threshold,
        max_threads=threads,
        early_filter=args.enable_early_filter,
        collect_discards=args.collect_discards
    )

    # Log configuration
    if args.outlier_identity is not None:
        logging.info(f"Outlier removal enabled: outlier_identity={args.outlier_identity*100:.1f}% (user-specified)")
    else:
        # Auto-calculated threshold
        auto_threshold = (1.0 + args.min_identity) / 2.0
        logging.info(f"Outlier removal enabled: outlier_identity={auto_threshold*100:.1f}% (auto-calculated from min_identity={args.min_identity*100:.1f}%)")

    if not args.disable_position_phasing:
        logging.info(f"Position-based variant phasing enabled: min_freq={args.min_variant_frequency:.0%}, "
                     f"min_count={args.min_variant_count}")

    # Set additional attributes for metadata
    clusterer.input_file = os.path.abspath(args.input_file)
    clusterer.augment_input = os.path.abspath(args.augment_input) if args.augment_input else None
    clusterer.algorithm = args.algorithm
    clusterer.orient_mode = args.orient_mode

    # Read primary sequences (renamed from `format` to avoid shadowing the
    # builtin; detection factored into _detect_format).
    logging.info(f"Reading sequences from {args.input_file}")
    input_format = _detect_format(args.input_file)
    records = list(SeqIO.parse(args.input_file, input_format))
    logging.info(f"Loaded {len(records)} primary sequences")

    if len(records) == 0:
        logging.warning("No sequences found in input file. Nothing to cluster.")
        sys.exit(0)

    # Load augmented sequences if specified
    augment_records = None
    if args.augment_input:
        # Check if augment input file exists
        if not os.path.exists(args.augment_input):
            logging.error(f"Augment input file not found: {args.augment_input}")
            sys.exit(1)

        logging.info(f"Reading augmented sequences from {args.augment_input}")

        # Auto-detect format like main input
        augment_format = _detect_format(args.augment_input)

        try:
            augment_records = list(SeqIO.parse(args.augment_input, augment_format))
            logging.info(f"Loaded {len(augment_records)} augmented sequences")

            if len(augment_records) == 0:
                logging.warning(f"No sequences found in augment input file: {args.augment_input}")

            # Add dummy quality scores to FASTA sequences so they can be written as FASTQ later
            if augment_format == "fasta":
                for record in augment_records:
                    if not hasattr(record, 'letter_annotations') or 'phred_quality' not in record.letter_annotations:
                        # Add dummy quality scores (quality 30 = '?' in FASTQ)
                        record.letter_annotations = {'phred_quality': [30] * len(record.seq)}
                logging.debug(f"Added quality scores to {len(augment_records)} FASTA sequences for downstream compatibility")

        except Exception as e:
            # Boundary catch: any parse failure aborts the run with a clear message.
            logging.error(f"Failed to read augment input file '{args.augment_input}': {e}")
            sys.exit(1)

    # Add sequences to clusterer (both primary and augmented)
    clusterer.add_sequences(records, augment_records)

    if args.primers:
        clusterer.primers_file = os.path.abspath(args.primers)
        clusterer.load_primers(args.primers)
    else:
        # Look for primers.fasta in the same directory as the input file
        input_dir = os.path.dirname(os.path.abspath(args.input_file))
        auto_primer_path = os.path.join(input_dir, "primers.fasta")

        if os.path.exists(auto_primer_path):
            logging.debug(f"Found primers.fasta in input directory: {auto_primer_path}")
            clusterer.primers_file = os.path.abspath(auto_primer_path)
            clusterer.load_primers(auto_primer_path)
        else:
            logging.warning("No primer file specified and primers.fasta not found in input directory. Primer trimming will be disabled.")
            clusterer.primers_file = None

    # Handle sequence orientation based on mode
    if args.orient_mode != "skip":
        if hasattr(clusterer, 'forward_primers') and hasattr(clusterer, 'reverse_primers'):
            failed_sequences = clusterer.orient_sequences()

            # Filter failed sequences if requested
            if args.orient_mode == "filter-failed" and failed_sequences:
                logging.info(f"Filtering out {len(failed_sequences)} sequences with failed orientation")

                # Track as discarded and remove from clustering (but keep records for discards file)
                clusterer.discarded_read_ids.update(failed_sequences)
                for seq_id in failed_sequences:
                    del clusterer.sequences[seq_id]
                # Keep records so they can be written to discards file

                remaining = len(clusterer.sequences)
                logging.info(f"Continuing with {remaining} successfully oriented sequences")
        else:
            logging.warning(f"--orient-mode={args.orient_mode} specified but no primers with position information loaded")

    # Write metadata file for use by post-processing tools
    clusterer.write_metadata()

    clusterer.cluster(algorithm=args.algorithm)
    print()


if __name__ == "__main__":
    main()