speconsense 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,528 @@
1
+ """Command-line interface for speconsense-summarize.
2
+
3
+ Provides argument parsing, logging setup, and main entry point.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import argparse
9
+ import logging
10
+ import tempfile
11
+ from typing import List, Tuple, Dict
12
+ from collections import defaultdict
13
+
14
# Python 3.8 compatibility: BooleanOptionalAction was added in Python 3.9
if not hasattr(argparse, 'BooleanOptionalAction'):
    class BooleanOptionalAction(argparse.Action):
        """Minimal backport of argparse.BooleanOptionalAction (Python 3.9+)."""

        def __init__(self, option_strings, dest, default=None, required=False, help=None):
            # For every '--flag' form, also register a matching '--no-flag' alias.
            expanded = []
            for opt in option_strings:
                expanded.append(opt)
                if opt.startswith('--'):
                    expanded.append('--no-' + opt[2:])
            super().__init__(option_strings=expanded, dest=dest, nargs=0,
                             default=default, required=required, help=help)

        def __call__(self, parser, namespace, values, option_string=None):
            # The '--no-' prefixed form clears the flag; the plain form sets it.
            setattr(namespace, self.dest, not option_string.startswith('--no-'))
    argparse.BooleanOptionalAction = BooleanOptionalAction
32
+
33
+ from tqdm import tqdm
34
+
35
+ try:
36
+ from speconsense import __version__
37
+ except ImportError:
38
+ # Fallback for when running as a script directly (e.g., in tests)
39
+ __version__ = "dev"
40
+
41
+ from speconsense.profiles import (
42
+ Profile,
43
+ ProfileError,
44
+ print_profiles_list,
45
+ )
46
+ from speconsense.scalability import ScalabilityConfig
47
+ from speconsense.types import ConsensusInfo, OverlapMergeInfo
48
+
49
+ from .fields import parse_fasta_fields
50
+ from .io import (
51
+ load_consensus_sequences,
52
+ build_fastq_lookup_table,
53
+ write_specimen_data_files,
54
+ write_output_files,
55
+ )
56
+ from .clustering import perform_hac_clustering, select_variants
57
+ from .merging import merge_group_with_msa
58
+ from .analysis import MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
59
+
60
+
61
# Merge effort configuration: preset names mapped to numeric effort levels.
MERGE_EFFORT_PRESETS = {
    'fast': 8,
    'balanced': 10,
    'thorough': 12,
}


def parse_merge_effort(spec: str) -> int:
    """Parse merge effort specification into numeric value.

    Args:
        spec: Preset name (fast, balanced, thorough) or numeric 6-14.
            Leading/trailing whitespace and case are ignored.

    Returns:
        Effort level as integer.

    Raises:
        ValueError: If spec is neither a known preset nor an integer in 6-14.
    """
    spec = spec.strip().lower()
    if spec in MERGE_EFFORT_PRESETS:
        return MERGE_EFFORT_PRESETS[spec]
    try:
        value = int(spec)
    except ValueError:
        # Not a preset and not an integer: report the valid options.
        # (The original matched int()'s "invalid literal" message text to
        # tell this case apart from the out-of-range case, which is fragile
        # across Python versions/implementations. Keeping only int() inside
        # the try makes the distinction structural.)
        raise ValueError(
            f"Unknown merge-effort: '{spec}'. "
            f"Use preset (fast, balanced, thorough) or numeric 6-14"
        ) from None
    if 6 <= value <= 14:
        return value
    raise ValueError(f"Numeric merge-effort must be 6-14, got {value}")
96
+
97
+
98
def parse_arguments():
    """Parse command-line arguments for speconsense-summarize.

    Performs a two-pass parse: a first pass with parse_known_args() picks up
    --profile, whose values are then applied as parser defaults so that any
    argument given explicitly on the command line still overrides the profile.
    Also handles --list-profiles early (exits before full parsing) and emits
    deprecation warnings for renamed options.

    Returns:
        argparse.Namespace with all options, plus a `_loaded_profile`
        attribute (the loaded Profile object, or None) used later for logging.
    """
    parser = argparse.ArgumentParser(description="Process Speconsense output with advanced variant handling.")

    # Input/Output group
    io_group = parser.add_argument_group("Input/Output")
    io_group.add_argument("--source", type=str, default="clusters",
                          help="Source directory containing Speconsense output (default: clusters)")
    io_group.add_argument("--summary-dir", type=str, default="__Summary__",
                          help="Output directory for summary files (default: __Summary__)")
    io_group.add_argument("--fasta-fields", type=str, default="default",
                          help="FASTA header fields to output. Can be: "
                               "(1) a preset name (default, minimal, qc, full, id-only), "
                               "(2) comma-separated field names (size, ric, length, rawric, "
                               "snp, rid, rid_min, primers, group, variant), or "
                               "(3) a combination of presets and fields (e.g., minimal,qc or "
                               "minimal,rid). Duplicates removed, order preserved "
                               "left to right. Default: default")

    # Filtering group
    filtering_group = parser.add_argument_group("Filtering")
    filtering_group.add_argument("--min-ric", type=int, default=3,
                                 help="Minimum Reads in Consensus (RiC) threshold (default: 3)")
    filtering_group.add_argument("--min-len", type=int, default=0,
                                 help="Minimum sequence length in bp (default: 0 = disabled)")
    filtering_group.add_argument("--max-len", type=int, default=0,
                                 help="Maximum sequence length in bp (default: 0 = disabled)")

    # Grouping group
    grouping_group = parser.add_argument_group("Grouping")
    # Second option string kept for backward compatibility (deprecation
    # warning emitted below after parsing).
    grouping_group.add_argument("--group-identity", "--variant-group-identity",
                                dest="group_identity", type=float, default=0.9,
                                help="Identity threshold for variant grouping using HAC (default: 0.9)")

    # Merging group
    merging_group = parser.add_argument_group("Merging")
    merging_group.add_argument("--disable-merging", action="store_true",
                               help="Disable all variant merging (skip MSA-based merge evaluation entirely)")
    merging_group.add_argument("--merge-snp", action=argparse.BooleanOptionalAction, default=True,
                               help="Enable SNP-based merging (default: True, use --no-merge-snp to disable)")
    merging_group.add_argument("--merge-indel-length", type=int, default=0,
                               help="Maximum length of individual indels allowed in merging (default: 0 = disabled)")
    merging_group.add_argument("--merge-position-count", type=int, default=2,
                               help="Maximum total SNP+indel positions allowed in merging (default: 2)")
    merging_group.add_argument("--merge-min-size-ratio", type=float, default=0.1,
                               help="Minimum size ratio (smaller/larger) for merging clusters (default: 0.1, 0 to disable)")
    merging_group.add_argument("--min-merge-overlap", type=int, default=200,
                               help="Minimum overlap in bp for merging sequences of different lengths (default: 200, 0 to disable)")
    merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
                               help="Disable homopolymer equivalence in merging (treat AAA vs AAAA as different)")
    merging_group.add_argument("--merge-effort", type=str, default="balanced", metavar="LEVEL",
                               help="Merging effort level: fast (8), balanced (10), thorough (12), "
                                    "or numeric 6-14. Higher values allow larger batch sizes for "
                                    "exhaustive subset search. Default: balanced")

    # Backward compatibility: support old --snp-merge-limit parameter
    parser.add_argument("--snp-merge-limit", type=int, dest="_snp_merge_limit_deprecated",
                        help=argparse.SUPPRESS)  # Hidden but functional

    # Selection group
    selection_group = parser.add_argument_group("Selection")
    selection_group.add_argument("--select-max-groups", "--max-groups",
                                 dest="select_max_groups", type=int, default=-1,
                                 help="Maximum number of groups to output per specimen (default: -1 = all groups)")
    selection_group.add_argument("--select-max-variants", "--max-variants",
                                 dest="select_max_variants", type=int, default=-1,
                                 help="Maximum total variants to output per group (default: -1 = no limit, 0 also means no limit)")
    selection_group.add_argument("--select-strategy", "--variant-selection",
                                 dest="select_strategy", choices=["size", "diversity"], default="size",
                                 help="Variant selection strategy: size or diversity (default: size)")

    # Performance group
    perf_group = parser.add_argument_group("Performance")
    perf_group.add_argument("--scale-threshold", type=int, default=1001,
                            help="Sequence count threshold for scalable mode in HAC clustering (requires vsearch). "
                                 "Set to 0 to disable. Default: 1001")
    perf_group.add_argument("--threads", type=int, default=0, metavar="N",
                            help="Max threads for internal parallelism. "
                                 "0=auto-detect (default), N>0 for explicit count.")

    # Version and profile options (default group)
    parser.add_argument("--log-level", default="INFO",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                        help="Logging level")
    parser.add_argument("--version", action="version",
                        version=f"speconsense-summarize {__version__}",
                        help="Show program's version number and exit")
    parser.add_argument("-p", "--profile", metavar="NAME",
                        help="Load parameter profile (use --list-profiles to see available)")
    parser.add_argument("--list-profiles", action="store_true",
                        help="List available profiles and exit")

    # Handle --list-profiles early (before requiring other args)
    if '--list-profiles' in sys.argv:
        print_profiles_list('speconsense-summarize')
        sys.exit(0)

    # First pass: get profile name if specified
    pre_args, _ = parser.parse_known_args()

    # Track which arguments were explicitly provided on CLI.
    # NOTE(review): only literal '--flag' and '--flag=value' tokens are
    # recognized here; prefix abbreviations and the '-p' short option are
    # not detected, so a profile could silently override such usages.
    explicit_args = set()
    for arg in sys.argv[1:]:
        if arg.startswith('--') and '=' in arg:
            explicit_args.add(arg.split('=')[0][2:].replace('-', '_'))
        elif arg.startswith('--'):
            explicit_args.add(arg[2:].replace('-', '_'))

    # Load and apply profile if specified
    loaded_profile = None
    if pre_args.profile:
        try:
            loaded_profile = Profile.load(pre_args.profile)
        except ProfileError as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)

        # Apply profile values to parser defaults (explicit CLI args will override)
        for key, value in loaded_profile.speconsense_summarize.items():
            attr_name = key.replace('-', '_')
            if attr_name not in explicit_args:
                parser.set_defaults(**{attr_name: value})

    args = parser.parse_args()

    # Store loaded profile for logging later
    args._loaded_profile = loaded_profile

    # Handle backward compatibility for deprecated parameters.
    # The deprecated value wins over --merge-position-count when supplied.
    if args._snp_merge_limit_deprecated is not None:
        if '--snp-merge-limit' in sys.argv:
            logging.warning("--snp-merge-limit is deprecated, use --merge-position-count instead")
        args.merge_position_count = args._snp_merge_limit_deprecated

    if '--variant-group-identity' in sys.argv:
        logging.warning("--variant-group-identity is deprecated, use --group-identity instead")

    if '--max-variants' in sys.argv:
        logging.warning("--max-variants is deprecated, use --select-max-variants instead")

    if '--max-groups' in sys.argv:
        logging.warning("--max-groups is deprecated, use --select-max-groups instead")

    if '--variant-selection' in sys.argv:
        logging.warning("--variant-selection is deprecated, use --select-strategy instead")

    return args
244
+
245
+
246
def setup_logging(log_level: str, log_file: Optional[str] = None) -> Optional[str]:
    """Configure root logging with a console handler and optional file handler.

    Fixes the implicit-Optional annotation on ``log_file`` (PEP 484: a
    ``None`` default requires ``Optional[str]``) and adds a return annotation.

    Args:
        log_level: Name of a logging level attribute (e.g. "INFO", "DEBUG").
        log_file: Optional path for a log file, opened in write mode
            (truncates any existing file).

    Returns:
        The log file path if a file handler was installed, otherwise None.
    """
    # Clear any existing handlers so repeated calls don't duplicate output
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    # Create formatter shared by all handlers
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)

    # Set up root logger
    logger = logging.getLogger()
    logger.setLevel(getattr(logging, log_level))
    logger.addHandler(console_handler)

    # File handler if specified
    if log_file:
        file_handler = logging.FileHandler(log_file, mode='w')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        return log_file

    return None
272
+
273
+
274
def process_single_specimen(file_consensuses: List[ConsensusInfo],
                            args) -> Tuple[List[ConsensusInfo], Dict[str, List[str]], Dict, int, List[OverlapMergeInfo]]:
    """Run the full per-specimen pipeline and return the renamed variants.

    Pipeline (Phase 3 architecture):
      1. HAC clustering separates the consensuses into variant groups
         (primary vs contaminants).
      2. MSA-based merging collapses compatible variants within each group
         (skipped entirely when --disable-merging is set).
      3. Representative variants are selected per group and renamed to
         '<specimen>-<group>.v<n>'.

    Returns:
        Tuple of (final consensus list, merge traceability map, per-group
        naming info, count of size-limited merge groups, overlap merge info).
    """
    if not file_consensuses:
        return [], {}, {}, 0, []

    file_name = os.path.basename(file_consensuses[0].file_path)
    logging.info(f"Processing specimen from file: {file_name}")

    # Phase 1: HAC clustering to separate variant groups (runs before merging).
    threshold = getattr(args, 'scale_threshold', 1001)
    requested_threads = getattr(args, 'threads', 0)
    worker_count = requested_threads if requested_threads > 0 else os.cpu_count()
    scaling = None
    if threshold > 0:
        scaling = ScalabilityConfig(
            enabled=True,
            activation_threshold=threshold,
            max_threads=worker_count
        )

    variant_groups = perform_hac_clustering(
        file_consensuses, args.group_identity, min_overlap_bp=args.min_merge_overlap,
        scalability_config=scaling, output_dir=getattr(args, 'source', '.')
    )

    # Keep only the largest N groups when a cap was requested.
    cap = args.select_max_groups
    if cap > 0 and len(variant_groups) > cap:
        # Rank groups by the size of their largest member, biggest first.
        ranked = sorted(
            variant_groups.items(),
            key=lambda item: max(member.size for member in item[1]),
            reverse=True
        )
        variant_groups = dict(ranked[:cap])
        logging.info(f"Filtered to top {cap} groups by size (from {len(ranked)} total groups)")

    # Phase 2: MSA-based merging within each group.
    merged_groups = {}
    all_merge_traceability = {}
    total_limited_count = 0
    all_overlap_merges = []

    if args.disable_merging:
        # Pass every group through untouched.
        logging.info("Merging disabled - skipping MSA-based merge evaluation")
        merged_groups = dict(variant_groups)
    else:
        for gid, members in variant_groups.items():
            merged, trace, limited, overlaps = merge_group_with_msa(members, args)
            merged_groups[gid] = merged
            all_merge_traceability.update(trace)
            total_limited_count += limited
            all_overlap_merges.extend(overlaps)

    # Phase 3: pick representative variants per group and assign final names.
    final_consensus = []
    naming_info = {}

    # Groups are numbered 1..N in order of their largest member's size.
    ranked_groups = sorted(merged_groups.items(),
                           key=lambda item: max(member.size for member in item[1]),
                           reverse=True)

    for group_idx, (_, members) in enumerate(ranked_groups):
        group_number = group_idx + 1

        chosen = select_variants(members, args.select_max_variants, args.select_strategy, group_number=group_number)

        # Record (old name, new name) pairs for this group.
        renames = []

        for variant_idx, variant in enumerate(chosen):
            # Every variant gets a .v suffix (.v1 is primary, .v2, .v3, ...).
            # rsplit on the LAST '-c' since specimen names may contain '-c'.
            specimen_base = variant.sample_name.rsplit('-c', 1)[0]
            new_name = f"{specimen_base}-{group_number}.v{variant_idx + 1}"

            # _replace keeps every other field while swapping sample_name.
            final_consensus.append(variant._replace(sample_name=new_name))
            renames.append((variant.sample_name, new_name))

        naming_info[group_number] = renames

    logging.info(f"Processed {file_name}: {len(final_consensus)} final variants across {len(merged_groups)} groups")

    return final_consensus, all_merge_traceability, naming_info, total_limited_count, all_overlap_merges
374
+
375
+
376
def main():
    """Main function to process command line arguments and run the summarization.

    Flow: parse args -> validate field/effort specs -> set up logging to a
    temp file -> load and filter consensus sequences -> process each specimen
    file independently (writing its data files immediately) -> write combined
    summary files and quality report -> clean up the temp log file.
    """
    args = parse_arguments()

    # Parse FASTA field specification early (fail fast before any output)
    try:
        fasta_fields = parse_fasta_fields(args.fasta_fields)
    except ValueError as e:
        logging.error(f"Invalid --fasta-fields specification: {e}")
        sys.exit(1)

    # Parse merge effort specification
    try:
        args.merge_effort_value = parse_merge_effort(args.merge_effort)
    except ValueError as e:
        logging.error(f"Invalid --merge-effort: {e}")
        sys.exit(1)

    # Set up logging with temporary log file. The file is created here only
    # to reserve a unique path; it is closed immediately and reopened by the
    # FileHandler inside setup_logging.
    temp_log_file = tempfile.NamedTemporaryFile(mode='w', suffix='.log', delete=False)
    temp_log_file.close()

    setup_logging(args.log_level, temp_log_file.name)

    logging.info(f"speconsense-summarize version {__version__}")
    if args._loaded_profile:
        logging.info(f"Using profile '{args._loaded_profile.name}': {args._loaded_profile.description}")
    logging.info(f"Command: speconsense-summarize {' '.join(sys.argv[1:])}")
    logging.info("")
    logging.info("Starting enhanced speconsense summarization")
    logging.info(f"Parameters:")
    logging.info(f" --source: {args.source}")
    logging.info(f" --summary-dir: {args.summary_dir}")
    logging.info(f" --min-ric: {args.min_ric}")
    logging.info(f" --min-len: {args.min_len}")
    logging.info(f" --max-len: {args.max_len}")
    logging.info(f" --fasta-fields: {args.fasta_fields}")
    logging.info(f" --merge-snp: {args.merge_snp}")
    logging.info(f" --merge-indel-length: {args.merge_indel_length}")
    logging.info(f" --merge-position-count: {args.merge_position_count}")
    logging.info(f" --merge-min-size-ratio: {args.merge_min_size_ratio}")
    logging.info(f" --disable-homopolymer-equivalence: {args.disable_homopolymer_equivalence}")
    logging.info(f" --min-merge-overlap: {args.min_merge_overlap}")
    logging.info(f" --merge-effort: {args.merge_effort} ({args.merge_effort_value})")
    logging.info(f" --group-identity: {args.group_identity}")
    logging.info(f" --select-max-variants: {args.select_max_variants}")
    logging.info(f" --select-max-groups: {args.select_max_groups}")
    logging.info(f" --select-strategy: {args.select_strategy}")
    logging.info(f" --log-level: {args.log_level}")
    logging.info("")
    logging.info("Processing each specimen file independently to organize variants within specimens")

    # Load all consensus sequences (already filtered by RiC/length thresholds)
    consensus_list = load_consensus_sequences(
        args.source, args.min_ric, args.min_len, args.max_len
    )
    if not consensus_list:
        logging.error("No consensus sequences found")
        return

    # Group consensus sequences by input file (one file per specimen)
    file_groups = defaultdict(list)
    for cons in consensus_list:
        file_groups[cons.file_path].append(cons)

    # Create output directories before processing
    os.makedirs(args.summary_dir, exist_ok=True)
    os.makedirs(os.path.join(args.summary_dir, 'FASTQ Files'), exist_ok=True)
    os.makedirs(os.path.join(args.summary_dir, 'variants'), exist_ok=True)
    os.makedirs(os.path.join(args.summary_dir, 'variants', 'FASTQ Files'), exist_ok=True)

    # Build lookup tables once before processing loop
    fastq_lookup = build_fastq_lookup_table(args.source)
    original_consensus_lookup = {cons.sample_name: cons for cons in consensus_list}

    # Process each specimen file independently
    all_final_consensus = []
    all_merge_traceability = {}
    all_naming_info = {}
    all_raw_consensuses = []  # Collect .raw files from all specimens
    all_overlap_merges = []  # Collect overlap merge info for quality reporting
    total_limited_merges = 0

    # Sorted for deterministic processing order across runs
    sorted_file_paths = sorted(file_groups.keys())
    for file_path in tqdm(sorted_file_paths, desc="Processing specimens", unit="specimen"):
        file_consensuses = file_groups[file_path]

        # Process specimen
        final_consensus, merge_traceability, naming_info, limited_count, overlap_merges = process_single_specimen(
            file_consensuses, args
        )

        # Write individual data files immediately
        specimen_raw_consensuses = write_specimen_data_files(
            final_consensus,
            merge_traceability,
            naming_info,
            args.summary_dir,
            os.path.join(args.summary_dir, 'FASTQ Files'),
            fastq_lookup,
            original_consensus_lookup,
            fasta_fields
        )

        # Accumulate results for summary files
        all_final_consensus.extend(final_consensus)
        all_merge_traceability.update(merge_traceability)
        all_raw_consensuses.extend(specimen_raw_consensuses)
        all_overlap_merges.extend(overlap_merges)
        total_limited_merges += limited_count

        # Update naming info with unique keys per specimen (group ids alone
        # would collide across specimens)
        file_name = os.path.basename(file_path)
        for group_id, group_naming in naming_info.items():
            unique_key = f"{file_name}_{group_id}"
            all_naming_info[unique_key] = group_naming

    # Write summary files at end (after all processing).
    # NOTE(review): the temp log path is passed here — presumably copied into
    # the summary dir before the temp file is unlinked below; verify in io.py.
    write_output_files(
        all_final_consensus,
        all_raw_consensuses,
        args.summary_dir,
        temp_log_file.name,
        fasta_fields
    )

    # Write quality report (deferred import to avoid circular dependency)
    from speconsense import quality_report
    quality_report.write_quality_report(
        all_final_consensus,
        all_raw_consensuses,
        args.summary_dir,
        args.source,
        all_overlap_merges,
        args.min_merge_overlap
    )

    logging.info(f"Enhanced summarization completed successfully")
    logging.info(f"Final output: {len(all_final_consensus)} consensus sequences in {args.summary_dir}")

    # Report if any variant groups were potentially suboptimal due to size
    if total_limited_merges > 0:
        logging.info(f"Note: {total_limited_merges} variant group(s) had >{MAX_MSA_MERGE_VARIANTS} variants (results potentially suboptimal)")

    # Clean up temporary log file (best-effort; failure is non-fatal)
    try:
        os.unlink(temp_log_file.name)
    except Exception as e:
        logging.debug(f"Could not clean up temporary log file: {e}")
526
+
527
+ if __name__ == "__main__":
528
+ main()