barcadia-3.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
+ #!/usr/bin/env python3
+ """
+ generate_random_sequences.py
+
+ Generate random DNA sequences for testing validation scripts (variable length sequences supported).
+
+ Example usage: python src/barcadia/tools/generate_random_sequences.py --count 10000 --lengths 12 13
+
+ Output: random DNA sequences (one per line as .txt)
+
+ Optional arguments:
+     --output: output file path (default: test/{count}_random_{min}to{max}bp_sequences.txt)
+
+ Required arguments:
+     --count: number of sequences to generate
+     --lengths: possible lengths for sequences
+ """
+
+ import argparse
+ import random
+ import os
+ import sys
+ import numpy as np
+
+ # Add src directory to Python path for imports
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from config_utils import decode_sequence  # type: ignore
+
+ def generate_random_sequence(length):
+     """Generate a single random DNA sequence of given length as integer array"""
+     return np.random.randint(0, 4, size=length, dtype=np.int8)
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Generate random DNA sequences for testing",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         epilog="Example: python src/barcadia/tools/generate_random_sequences.py --count 10000 --lengths 12 13"
+     )
+
+     parser.add_argument('--count', type=int, required=True,
+                         help='Number of sequences to generate')
+     parser.add_argument('--lengths', nargs='+', type=int, required=True,
+                         help='Possible lengths for sequences')
+     parser.add_argument('--output', type=str,
+                         help='Output file path (default: test/{count}_random_{min}to{max}bp_sequences.txt)')
+
+     args = parser.parse_args()
+
+     # Validate arguments
+     if args.count <= 0:
+         raise ValueError("Count must be > 0")
+
+     for length in args.lengths:
+         if length <= 0:
+             raise ValueError(f"All lengths must be > 0, got {length}")
+
+     # Check if count exceeds maximum possible sequences across all lengths
+     total_max_possible = sum(4 ** length for length in args.lengths)
+     if args.count > total_max_possible:
+         raise ValueError(f"Count ({args.count}) exceeds maximum possible sequences for lengths {args.lengths} ({total_max_possible})")
+
+     # Generate default output path if not specified
+     if args.output is None:
+         os.makedirs('test', exist_ok=True)
+         min_length = min(args.lengths)
+         max_length = max(args.lengths)
+
+         # Handle single length vs range
+         if min_length == max_length:
+             length_range = str(min_length)
+         else:
+             length_range = f"{min_length}to{max_length}"
+
+         args.output = f"test/{args.count}_random_{length_range}bp_sequences.txt"
+
+     # Generate sequences, write to file, and count lengths in one loop
+     length_counts = {}
+     with open(args.output, 'w') as f:
+         for _ in range(args.count):
+             # Randomly choose a length from the provided options
+             chosen_length = random.choice(args.lengths)
+             seq_array = generate_random_sequence(chosen_length)
+
+             # Write to file (convert to DNA string for output)
+             dna_string = decode_sequence(seq_array)
+             f.write(dna_string + '\n')
+
+             # Count sequences by length
+             length_counts[chosen_length] = length_counts.get(chosen_length, 0) + 1
+
+     length_breakdown = ", ".join([f"{count} at length {length}" for length, count in sorted(length_counts.items())])
+
+     print(f"Generated {args.count} random DNA sequences with lengths: {length_breakdown}")
+     print(f"Output written to: {args.output}")
+
+ if __name__ == "__main__":
+     main()
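The sequences are kept as int8 arrays throughout and only turned into letters at write time by config_utils.decode_sequence, which is not part of this diff. Purely as a point of reference, a decoder of that shape might look like the sketch below, assuming the conventional 0=A, 1=C, 2=G, 3=T encoding (the package's actual mapping may differ):

import numpy as np

_BASES = np.array(list("ACGT"))

def decode_sequence(seq_array):
    # Map each integer code (0-3) to a base letter and join into one string.
    # Assumed mapping: 0=A, 1=C, 2=G, 3=T; barcadia's config_utils defines the real one.
    return "".join(_BASES[np.asarray(seq_array, dtype=np.int8)])

# e.g. decode_sequence(np.array([0, 2, 3, 1], dtype=np.int8)) -> "AGTC"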
@@ -0,0 +1,137 @@
+ #!/usr/bin/env python3
+ """
+ memory_benchmark.py
+
+ Simple memory benchmarking utility for tracking maximum memory usage of barcadia commands.
+
+ Example usage:
+     python src/barcadia/tools/memory_benchmark.py barcadia generate --args
+     python src/barcadia/tools/memory_benchmark.py barcadia validate --args
+
+ Output: memory usage report shown in terminal and memory_benchmark_{timestamp}.log file
+
+ Optional arguments:
+     --mem-output-dir: output directory for benchmark logs (default: test)
+
+ Required arguments:
+     command: the barcadia command to benchmark (e.g., generate or validate)
+ """
+
+ import psutil
+ import subprocess
+ import sys
+ import time
+ import os
+ import logging
+ import argparse
+
+ # Add src directory to Python path for imports
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+ from config_utils import setup_logging  # type: ignore
+
+ def benchmark_command(command):
+     """
+     Benchmark memory usage of a barcadia command.
+
+     Args:
+         command: The command to run (e.g., ['barcadia', 'generate', '--count', '1000'])
+
+     Returns:
+         dict: Benchmark results
+     """
+     # Run the command as a subprocess and track its memory
+     start_time = time.time()
+
+     # Start the subprocess; let it inherit stdout/stderr so it can print directly to terminal
+     process = subprocess.Popen(command)
+
+     # Track memory usage of the subprocess
+     peak_memory = 0
+     try:
+         while process.poll() is None:  # While process is still running
+             try:
+                 # Get memory usage of the subprocess
+                 child_process = psutil.Process(process.pid)
+                 memory_bytes = child_process.memory_info().rss
+                 peak_memory = max(peak_memory, memory_bytes)
+                 time.sleep(0.1)  # Sample every 100ms
+             except (psutil.NoSuchProcess, psutil.AccessDenied):
+                 # Process might have finished or we don't have access
+                 break
+
+         # Wait for process to complete
+         process.wait()
+         return_code = process.returncode
+         stdout, stderr = None, None
+
+     except Exception as e:
+         logging.error(f"Error tracking memory: {e}")
+         process.terminate()
+         return None
+
+     duration = time.time() - start_time
+     peak_memory_mb = peak_memory / 1024 / 1024
+
+     if return_code != 0:
+         logging.error(f"Command failed with return code {return_code}")
+         logging.error("Check the command's own log file for details")
+         return None
+
+     return {
+         'command': ' '.join(command),
+         'duration_seconds': duration,
+         'peak_memory_mb': peak_memory_mb,
+         'return_code': return_code,
+         'stdout': stdout,
+         'stderr': stderr
+     }
+
+ def main():
+     """Command-line interface for benchmarking barcadia commands."""
+     parser = argparse.ArgumentParser(
+         description="Benchmark memory usage of barcadia commands",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="Examples:\n"
+                " python src/barcadia/tools/memory_benchmark.py barcadia validate --args\n"
+                " python src/barcadia/tools/memory_benchmark.py barcadia generate --args"
+     )
+
+     parser.add_argument('--mem-output-dir', type=str, default='test',
+                         help='Output directory for benchmark logs (default: test)')
+     parser.add_argument('command', nargs=argparse.REMAINDER,
+                         help='The barcadia command to benchmark (e.g., generate or validate)')
+
+     args = parser.parse_args()
+
+     if not args.command:
+         parser.error("Please provide a barcadia command to benchmark")
+
+     # Setup logging
+     args.output_dir = args.mem_output_dir
+     log_filepath = setup_logging(args, "memory_benchmark")
+
+     # Log benchmark start
+     command_str = ' '.join(args.command)
+     logging.info(f"Starting memory benchmark for: {command_str}")
+     logging.info("-" * 50)
+
+     # Run benchmark
+     result = benchmark_command(args.command)
+
+     if result:
+         # Log benchmark results only
+         logging.info("-" * 50)
+         logging.info("BENCHMARK RESULTS:")
+         logging.info(f"Command: {result['command']}")
+         logging.info(f"Duration: {result['duration_seconds']:.2f} seconds")
+         logging.info(f"Peak Memory: {result['peak_memory_mb']:.2f} MB")
+         logging.info(f"Return Code: {result['return_code']}")
+         logging.info("-" * 50)
+         # Note: the benchmarked command prints its own log file path to the terminal.
+         logging.info(f"Benchmark log file: {log_filepath}")
+     else:
+         logging.error("Benchmark failed!")
+
+ if __name__ == "__main__":
+     main()
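The sampling loop above polls the direct subprocess's RSS every 100 ms, so memory used by any worker processes the command forks (for example, the parallel distance validation in validate_barcodes.py) is not counted. If whole-tree accounting were wanted, one possible extension (a sketch, not part of the package) would sum the parent and its children with psutil:

import psutil

def sample_tree_rss(pid):
    # Combined RSS in bytes of a process and all of its descendants; 0 if it has already exited.
    try:
        parent = psutil.Process(pid)
        total = parent.memory_info().rss
        for child in parent.children(recursive=True):
            try:
                total += child.memory_info().rss
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass
        return total
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return 0

Swapping this in for the memory_info().rss call inside the polling loop would make peak_memory reflect the whole process tree.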
@@ -0,0 +1,393 @@
+ #!/usr/bin/env python3
+ """
+ validate_barcodes.py
+
+ Validate whether provided lists of NGS barcodes satisfy all quality filters (variable length sequences supported).
+
+ Program Overview:
+
+ 1. Load and parse input file(s) and report the length distribution
+ 2. Check if sequences fail biological filters (GC content, homopolymer checks) - reports ALL violations per sequence
+ 3. For sequences passing biological filters, apply intelligent algorithm selection for distance validation (unless --skip-distance flag is enabled):
+     3a. Method selection logic (based on sequences passing biological filters):
+         - Small barcode sets (<10K sequences): Pairwise sequential
+         - Large barcode sets (≥10K sequences) with mixed lengths and/or min_distance > 4: Pairwise parallel (when multiple CPUs available, otherwise sequential)
+         - Large equal-length barcode sets (≥10K sequences) with min_distance ≤ 4: Neighbor enumeration
+     3b. Distance calculation: Hamming distance for equal-length sequences, Levenshtein for mixed lengths
+     3c. Progress logging during validation (every 10 chunks for pairwise parallel, every 10K sequences for neighbor enumeration)
+     3d. Early stopping on first distance violation with detailed reporting
+ 4. Generate comprehensive validation report with violation details
+
+ Input: list(s) of NGS barcodes (one per line as .txt). Multiple files supported, concatenated automatically.
+
+ Output: validation report (validation_report_{timestamp}.txt) and validate_barcodes_{timestamp}.log file
+
+ Optional arguments:
+     --gc-min: minimum GC content (default: 0.4)
+     --gc-max: maximum GC content (default: 0.6)
+     --homopolymer-max: maximum allowed homopolymer repeat length (default: 2)
+     --min-distance: minimum edit distance between barcodes (default: 3)
+     --skip-distance: skip distance validation entirely (default: off)
+     --output-dir: output directory for validation logs and reports (default: test)
+     --cpus: number of CPUs to use for pairwise parallel distance validation (default: all available)
+
+ Required arguments:
+     --input: input file(s) containing NGS barcodes (one per line)
+ """
+
+ import argparse
+ import logging
+ import os
+ import time
+ import multiprocessing as mp
+ import sys
+ from concurrent.futures import ProcessPoolExecutor
+ from datetime import datetime
+
+ # Import utility functions
+ from .config_utils import decode_sequence, setup_logging, ExistingSequenceSet
+ from .filter_utils import validate_filter_arguments, check_gc_content_int, check_homopolymer_int, calculate_distance, select_distance_method, generate_hamming_neighbors
+
+ def validate_biological_filters(seq_array, gc_min, gc_max, homopolymer_max):
+     """Check if sequence passes all biological filters and return all violations"""
+     violations = []
+
+     # Check GC content
+     if not check_gc_content_int(seq_array, gc_min, gc_max):
+         violations.append("GC content outside range")
+
+     # Check homopolymer runs
+     if not check_homopolymer_int(seq_array, homopolymer_max):
+         violations.append("Homopolymer run too long")
+
+     if violations:
+         return False, "; ".join(violations)
+     else:
+         return True, "Passes all filters"
+
+ def log_violation_details(sequences, i, j, violation):
+     """Log distance violation and create violation details with DNA strings for reporting"""
+     logging.info(f"Early stopping: Found distance violation between sequences {violation[0]+1} and {violation[1]+1} (distance={violation[2]})")
+     seq1_str = decode_sequence(sequences[i])
+     seq2_str = decode_sequence(sequences[j])
+     return (violation[0]+1, violation[1]+1, seq1_str, seq2_str, violation[2])
+
+ def validate_distances_neighbor_enum(sequences, min_distance):
+     """Validate using neighbor enumeration - much faster for appropriate cases"""
+     # Build hash set of all sequences for O(1) lookup
+     sequence_set = set(tuple(seq) for seq in sequences)
+     total_sequences = len(sequences)
+
+     # Check each sequence for violations
+     for i, seq in enumerate(sequences):
+         # Progress logging every 10K sequences
+         if i % 10_000 == 0 and i > 0:
+             logging.info(f"Progress: {i:,}/{total_sequences:,} sequences processed "
+                          f"({i/total_sequences*100:.1f}%)")
+
+         seq_array = list(seq)  # Make mutable copy for neighbor generation
+
+         # Generate all neighbors within min_distance
+         for neighbor in generate_hamming_neighbors(seq_array, min_distance - 1):
+             if neighbor in sequence_set and neighbor != tuple(seq):
+                 # Found a violation - get the index of the violating sequence
+                 for j, other_seq in enumerate(sequences):
+                     if j != i and tuple(other_seq) == neighbor:
+                         # Calculate actual distance for reporting
+                         actual_distance = sum(a != b for a, b in zip(seq, other_seq))
+                         violation = (i, j, actual_distance)
+                         violation_details = log_violation_details(sequences, i, j, violation)
+                         sequences_processed = i + 1  # Number of sequences processed when violation found
+                         return True, sequences_processed, violation_details
+
+     # No violations found - processed all sequences
+     return False, total_sequences, None
+
+ def generate_pair_chunk(start_idx, chunk_size, n):
+     """Generate a chunk of pairs lazily starting from start_idx"""
+     pairs_generated = 0
+     current_idx = 0
+
+     for i in range(n):
+         for j in range(i + 1, n):
+             if current_idx >= start_idx:
+                 if pairs_generated >= chunk_size:
+                     return
+                 yield (i, j)
+                 pairs_generated += 1
+             current_idx += 1
+
+ def validate_chunk(sequences_chunk, min_distance):
+     """Worker function for distance validation (can be used sequentially or in parallel)"""
+     pairs_checked = 0
+     n = len(sequences_chunk)
+
+     # Use the lazy pair generator for memory efficiency
+     for i, j in generate_pair_chunk(0, n * (n - 1) // 2, n):
+         distance = calculate_distance(sequences_chunk[i], sequences_chunk[j], min_distance)
+         pairs_checked += 1
+
+         if distance < min_distance:
+             violation = (i, j, distance)
+             violation_details = log_violation_details(sequences_chunk, i, j, violation)
+             return True, pairs_checked, violation_details
+
+     return False, pairs_checked, None
+
+ def validate_distances(sequences, min_distance, method, cpus, chunk_size):
+     """Unified distance validation with method selection and parallel/sequential execution"""
+     # Execute the chosen method
+     if method == "neighbor_enumeration":
+         # Use neighbor enumeration when set up is optimal (no parallelization involved)
+         return validate_distances_neighbor_enum(sequences, min_distance)
+     elif method == "pairwise_sequential" or cpus == 1:
+         # Use sequential for small barcode sets or single CPU
+         return validate_chunk(sequences, min_distance)
+     else:  # method == "pairwise" and cpus > 1
+         # Large dataset with multiple CPUs - always use parallel with pre-calculated chunk size
+         chunks = [sequences[i:i+chunk_size] for i in range(0, len(sequences), chunk_size)]
+
+         with ProcessPoolExecutor(max_workers=cpus) as executor:
+             futures = [
+                 executor.submit(validate_chunk, chunk, min_distance)
+                 for chunk in chunks
+             ]
+
+             # Process results with early stopping
+             total_pairs_checked = 0
+             for future in futures:
+                 early_stopped, pairs_checked, violation_info = future.result()
+                 total_pairs_checked += pairs_checked
+
+                 if early_stopped:
+                     # Cancel remaining futures for early stopping
+                     for f in futures:
+                         f.cancel()
+                     return True, total_pairs_checked, violation_info
+
+         return False, total_pairs_checked, None
+
+ def validate_barcodes_core(sequences, gc_min, gc_max, homopolymer_max, min_distance, has_mixed_lengths, skip_distance, cpus, output_dir, input_file, log_filepath):
+     """Main function to validate input barcode sets against biological filters and distance constraints"""
+     start_time = time.time()
+
+     logging.info(f"Starting barcode validation...")
+     logging.info(f"Filter 1 (within-sequence), GC content: {gc_min:.1%} - {gc_max:.1%}")
+     logging.info(f"Filter 2 (within-sequence), Max homopolymer repeat: {homopolymer_max}")
+     logging.info(f"Filter 3 (between-sequence), Minimum edit distance: {min_distance}")
+
+     # 1. Validate sequences for biological filters
+     valid_sequences = []
+     biological_violations = []
+
+     for i, seq_array in enumerate(sequences):
+         # Check biological filters
+         is_valid, reason = validate_biological_filters(seq_array, gc_min, gc_max, homopolymer_max)
+
+         if is_valid:
+             valid_sequences.append(seq_array)
+         else:
+             # Convert back to DNA string for reporting
+             dna_string = decode_sequence(seq_array)
+             biological_violations.append((i+1, dna_string, reason))
+
+     logging.info(f"Biological filter (GC content and homopolymer repeats) results:")
+     logging.info(f" Passed: {len(valid_sequences)} sequences")
+     logging.info(f" Failed: {len(biological_violations)} sequences")
+
+     # 2. Validate sequences for distance constraints
+     # Calculate total distance pairs for sequences that passed biological filters
+     n = len(valid_sequences)
+     total_pairs = n * (n - 1) // 2
+
+     # Check if we should skip distance validation
+     distance_skipped = False
+     logging.info(f"Distance filter results:")
+     if skip_distance:
+         logging.info(f"Skipping distance validation (--skip-distance flag enabled)")
+         early_stopped = False
+         features_checked = 0
+         distance_skipped = True
+         validation_method = "skipped"
+         violation_info = None
+         logging.info(f" Distance validation skipped")
+     # If not, continue with distance validation
+     else:
+         logging.info(f"Validating distances for sequences that passed biological filters...")
+         # Determine method and calculate chunk size for pairwise method
+         method = select_distance_method(n, min_distance, has_mixed_lengths)
+
+         # Calculate chunk size for pairwise method with multiple CPUs
+         chunk_size = None
+         if method == "pairwise_sequential":
+             logging.info(f"Using sequential pairwise distance checking (small barcode set for sequences that passed biological filters)")
+         elif method == "pairwise" and cpus == 1:
+             logging.info(f"Using sequential pairwise distance checking (1 CPU)")
+         elif method == "pairwise" and cpus > 1:
+             chunk_size = max(100000, total_pairs // (cpus * 10))
+             logging.info(f"Using parallel pairwise distance checking (chunk size: {chunk_size})")
+
+         # Execute validation
+         early_stopped, features_checked, violation_info = validate_distances(valid_sequences, min_distance, method, cpus, chunk_size)
+
+         # Log results
+         if method == "neighbor_enumeration":
+             logging.info(f" Total sequences (that passed biological filters): {n}")
+             logging.info(f" Sequences processed: {features_checked}")
+         else:
+             logging.info(f" Total sequence pairs: {total_pairs:,} (sequences that passed biological filters)")
+             logging.info(f" Pairs checked: {features_checked:,}")
+
+         validation_method = method
+
+     overall_valid = len(valid_sequences) == len(sequences) and not early_stopped
+
+     duration = time.time() - start_time
+
+     logging.info(f"Validation complete!")
+     logging.info(f"Overall validation: {'PASSED' if overall_valid else 'FAILED'}")
+     logging.info(f"Total time: {duration:.2f} seconds")
+
+     # 3. Generate report
+     logging.info(f"Generating report...")
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     report_file = os.path.join(output_dir, f"validation_report_{timestamp}.txt")
+
+     with open(report_file, 'w') as f:
+         f.write("Barcode Validation Report\n")
+         f.write("=" * 50 + "\n\n")
+
+         f.write(f"Input file: {input_file}\n")
+         f.write(f"Total sequences: {len(sequences)}\n\n")
+         f.write("Filter Settings:\n")
+         f.write(f" GC content: {gc_min:.1%} - {gc_max:.1%}\n")
+         f.write(f" Max homopolymer: {homopolymer_max}\n")
+         f.write(f" Minimum distance: {min_distance}\n\n")
+         f.write(f"Biological filter passed: {len(valid_sequences)}\n")
+         f.write(f"Biological filter failed: {len(biological_violations)}\n")
+
+         if distance_skipped:
+             f.write(f"Distance validation: SKIPPED (--skip-distance flag enabled)\n")
+         elif early_stopped:
+             f.write(f"Distance validation: EARLY STOPPED (found first violation)\n")
+             f.write(f" Method used: {validation_method}\n")
+             if validation_method == "neighbor_enumeration":
+                 f.write(f" Sequences processed before stopping: {features_checked:,}\n")
+             else:
+                 f.write(f" Pairs checked before stopping: {features_checked:,}\n")
+         else:
+             f.write(f"Distance validation: PASSED (no violations found)\n")
+             f.write(f" Method used: {validation_method}\n")
+             if validation_method == "neighbor_enumeration":
+                 f.write(f" Total sequences processed: {features_checked:,}\n")
+             else:
+                 f.write(f" Total pairs checked: {features_checked:,}\n")
+         f.write("\n")
+
+         if biological_violations:
+             f.write("Biological Filter (GC content and homopolymer) Violations:\n")
+             f.write("-" * 30 + "\n")
+             for line_num, seq, reason in biological_violations:
+                 f.write(f"Line {line_num}: {seq} - {reason}\n")
+             f.write("\n")
+
+         # Add distance violation details if available
+         if violation_info is not None:
+             f.write("Distance Violations:\n")
+             f.write("-" * 19 + "\n")
+             seq1_line, seq2_line, seq1_str, seq2_str, distance = violation_info
+             f.write(f"Line {seq1_line}: {seq1_str} and Line {seq2_line}: {seq2_str} - distance {distance} (minimum required: {min_distance})\n")
+             f.write("\n")
+
+     # Log file locations
+     if log_filepath:
+         logging.info(f"Log file: {log_filepath}")
+     logging.info(f"Report file: {report_file}")
+
+     # Final output
+     if overall_valid:
+         print("All barcodes are valid!")
+     else:
+         print(f"VALIDATION FAILED!")
+
+ def setup_argument_parser():
+     """Setup and return the argument parser for barcode validation"""
+     parser = argparse.ArgumentParser(
+         description="Validate DNA barcodes against quality filters (GC content, homopolymer repeats, minimum distance)",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     # Required arguments
+     parser.add_argument('--input', type=str, required=True, nargs='+',
+                         help='Input file(s) containing DNA barcodes (one per line)')
+
+     # Output arguments
+     parser.add_argument('--output-dir', type=str, default='test',
+                         help='Output directory for validation logs and reports')
+
+     # Filter arguments with defaults
+     parser.add_argument('--gc-min', type=float, default=0.4,
+                         help='Minimum GC content (as fraction, e.g., 0.4 = 40%%)')
+     parser.add_argument('--gc-max', type=float, default=0.6,
+                         help='Maximum GC content (as fraction, e.g., 0.6 = 60%%)')
+     parser.add_argument('--homopolymer-max', type=int, default=2,
+                         help='Maximum allowed homopolymer repeat length')
+     parser.add_argument('--min-distance', type=int, default=3,
+                         help='Minimum edit distance between sequences')
+
+     # Performance arguments
+     parser.add_argument('--cpus', type=int, default=mp.cpu_count(),
+                         help='Number of CPU cores to use for parallel distance validation')
+
+     # Mode arguments
+     parser.add_argument('--skip-distance', action='store_true',
+                         help='Skip distance validation entirely')
+
+     return parser
+
+ def validate_validator_arguments(args, length_counts):
+     """Validate validator-specific arguments (length, distance, homopolymer) and return has_mixed_lengths flag"""
+     input_length = max(length_counts.keys())
+
+     # Homopolymer repeat x max input length validation
+     if args.homopolymer_max >= input_length:
+         raise ValueError(f"Maximum homopolymer repeat length must be < max input length ({input_length}bp)")
+
+     # Minimum distance x max input length validation
+     if args.min_distance >= input_length:
+         raise ValueError(f"Minimum distance must be < max input length ({input_length}bp)")
+
+     # Check for mixed lengths
+     has_mixed_lengths = len(length_counts) > 1
+
+     return has_mixed_lengths
+
+ def main(argv=None):
+     parser = setup_argument_parser()
+     args = parser.parse_args(argv)
+     log_filepath = setup_logging(args, "validate_barcodes")
+     validate_filter_arguments(args)  # simple validation of filter arguments
+
+     # Load input files using ExistingSequenceSet
+     sequence_set = ExistingSequenceSet.from_input_files(args.input)
+
+     # Validate validator-specific arguments and get mixed lengths flag
+     has_mixed_lengths = validate_validator_arguments(args, sequence_set.length_counts)
+
+     # Validate barcodes
+     validate_barcodes_core(
+         sequences=sequence_set.sequences,
+         gc_min=args.gc_min,
+         gc_max=args.gc_max,
+         homopolymer_max=args.homopolymer_max,
+         min_distance=args.min_distance,
+         has_mixed_lengths=has_mixed_lengths,
+         skip_distance=args.skip_distance,
+         cpus=args.cpus,
+         output_dir=args.output_dir,
+         input_file=args.input,
+         log_filepath=log_filepath
+     )
+
+ if __name__ == "__main__":
+     main(sys.argv[1:])
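For the large equal-length case the validator leans on generate_hamming_neighbors from filter_utils, which is not included in this diff: each barcode's neighbors within Hamming distance min_distance - 1 are enumerated and looked up in the hash set of all barcodes, avoiding the quadratic all-pairs scan. Purely as an illustration of that idea (the packaged implementation may differ), such a generator over the integer-encoded alphabet {0, 1, 2, 3} could look like:

from itertools import combinations, product

def generate_hamming_neighbors(seq, max_dist):
    # Yield every integer-encoded sequence (as a tuple) at Hamming distance 1..max_dist from seq.
    # Illustrative sketch only; the real helper lives in barcadia's filter_utils.
    seq = tuple(seq)
    alphabet = (0, 1, 2, 3)
    for d in range(1, max_dist + 1):
        for positions in combinations(range(len(seq)), d):
            # All ways to place a *different* symbol at each chosen position
            options = [[b for b in alphabet if b != seq[p]] for p in positions]
            for replacements in product(*options):
                neighbor = list(seq)
                for p, b in zip(positions, replacements):
                    neighbor[p] = b
                yield tuple(neighbor)

A length-L barcode has C(L, d) * 3^d neighbors at each distance d, so the neighborhood grows quickly with d, which is consistent with the docstring's rule of selecting neighbor enumeration only when min_distance ≤ 4.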