barcadia 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,613 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ generate_barcodes.py
4
+
5
+ Generate high-performance DNA barcodes for NGS applications using optimized iterative growth algorithm (supports extension from seed sequences and paired mode for dual-indexing).
6
+
7
+ Program Overview:
8
+
9
+ 1. Load seed sequence files as existing pool and report length distribution (will generate from scratch if no seeds are provided)
10
+ 2. Validate input arguments, including whether the requested count is possible for the barcode length and minimum distance given Hamming and Gilbert-Varshamov bounds (when no seeds provided or everything equal-length)
11
+ * Target count is updated given whether seeds are provided and/or paired mode is on
12
+
13
+ 3. Generate a batch of candidate unique random sequences that pass the within-sequence biological filters (i.e., GC content and homopolymer repeats) → step 1/3 of the overall filtering process
14
+ * Batch size is capped at 10,000 for barcode sets larger than 10,000, otherwise enforces 10 batches of minimum size 50
15
+ 4. Conduct two-step between-sequence distance filtering with optimized method selection (neighbor enumeration or pairwise):
16
+ 4a. Filter candidates against existing pool → step 2/3 of the overall filtering process
17
+ * If pairwise, use parallel processing for large datasets (≥10K) with multiple CPUs, sequential for small datasets (<10K) or single CPU
18
+ 4b. Filter remaining candidates against each other within the current batch → step 3/3 of the overall filtering process
19
+ * If pairwise, always proceed sequentially in this step to ensure that the final pool fully satisfies the minimum distance requirement
20
+
21
+ Method selection (applies to both steps, decided once per generation):
22
+ - Small barcode sets (<10K sequences counting seeds if seeds are present): Always use pairwise distance checking
23
+ - Large mixed-length (within seeds and/or between seeds and new barcodes): Always use pairwise distance checking
24
+ - Large equal-length (no seeds or everything of the same length counting seeds): Choose between pairwise and neighbor enumeration based on min_distance
25
+ * Pairwise distance checking: when min_distance > 4 (large number of neighbors to check)
26
+ * Neighbor enumeration: when min_distance <= 4 (limited number of neighbors to check)
27
+ 5. Add the verified batch (candidates that pass all 3 steps of filtering) to pool, repeat until target count is reached
28
+
29
+ 6. Write outputs to files and log results
30
+
31
+ Input: none (or optionally, seed sequence files or paired seed files)
32
+
33
+ Output: barcode list (one per line as .txt; if paired mode: two files with suffixes _paired1.txt and _paired2.txt) and generate_barcodes_{timestamp}.log file
34
+
35
+ Optional arguments:
36
+ --gc-min: minimum GC content (default: 0.4)
37
+ --gc-max: maximum GC content (default: 0.6)
38
+ --homopolymer-max: maximum allowed homopolymer repeat length (default: 2)
39
+ --min-distance: minimum edit distance between barcodes (default: 3)
40
+ --cpus: number of CPU cores to use during the parallel filtering step (default: all available)
41
+ --seeds: seed sequence files (any number of .txt files with one sequence per line, multiple files will be concatenated automatically; if not provided, will generate from scratch; incompatible with --paired mode; default: None)
42
+ --paired: generate paired barcodes (doubles target count, randomly splits output into two equal parts; incompatible with --seeds; default: off)
43
+ --paired-seed1: paired seed sequence file 1 (used only with --paired and --paired-seed2, only one file is accepted, all sequences must be same length and match count/length of --paired-seed2; default: None)
44
+ --paired-seed2: paired seed sequence file 2 (used only with --paired and --paired-seed1, only one file is accepted, all sequences must be same length and match count/length of --paired-seed1; default: None)
45
+ --output-dir: output directory for barcodes and logs (default: test)
46
+ --output-prefix: output filename prefix (adds .txt automatically; if paired mode is on, adds _paired1.txt and _paired2.txt; default: barcodes)
47
+
48
+ Required arguments:
49
+ --count: number of barcodes or barcode pairs to generate
50
+ --length: length of barcodes or barcode pairs to generate
51
+
52
+ NOTE: seed sequences (paired or unpaired) are not validated against intended filters. If necessary, please run validate_barcodes.py first to ensure they pass all the filters before providing them as seeds here.
53
+ """
54
+
55
+ import numpy as np
56
+ import argparse
57
+ import logging
58
+ import math
59
+ import time
60
+ import multiprocessing as mp
61
+ import os
62
+ import random
63
+ import sys
64
+ from concurrent.futures import ProcessPoolExecutor
65
+
66
+ # Import utility functions
67
+ from .config_utils import decode_sequence, setup_logging, ExistingSequenceSet
68
+ from .filter_utils import validate_filter_arguments, check_gc_content_int, check_homopolymer_int, hamming_distance_int, calculate_distance, select_distance_method, generate_hamming_neighbors
69
+
70
def pass_biological_filters(seq_array, gc_min, gc_max, homopolymer_max):
    """Return True when a sequence clears both within-sequence filters.

    The GC-content check runs first; the homopolymer check is evaluated only
    when the GC check succeeds.

    Args:
        seq_array: Integer-encoded sequence, forwarded unchanged to the checks.
        gc_min: Lower GC-content limit handed to ``check_gc_content_int``.
        gc_max: Upper GC-content limit handed to ``check_gc_content_int``.
        homopolymer_max: Limit handed to ``check_homopolymer_int``.

    Returns:
        bool: True only if both checks report success.
    """
    gc_ok = check_gc_content_int(seq_array, gc_min, gc_max)
    # Short-circuit: the homopolymer check is skipped once GC has failed.
    return bool(gc_ok and check_homopolymer_int(seq_array, homopolymer_max))
81
+
82
def generate_random_sequences(count, length, gc_min, gc_max, homopolymer_max):
    """Draw random integer-encoded sequences that pass the within-sequence filters.

    Each attempt draws ``length`` integers in ``[0, 4)`` as ``int8`` and keeps
    the draw only if it passes ``pass_biological_filters`` and has not been
    accepted before in this call.

    Args:
        count: Number of unique sequences wanted.
        length: Number of positions per sequence.
        gc_min, gc_max, homopolymer_max: Forwarded to ``pass_biological_filters``.

    Returns:
        list: Accepted NumPy arrays. May hold fewer than ``count`` entries if
        the attempt budget (``count * 50``) runs out, in which case a warning
        is logged.
    """
    rng = np.random.default_rng()

    # Hard ceiling on draws so a saturated search space cannot loop forever.
    attempt_budget = count * 50

    accepted = []
    seen_keys = set()

    for _ in range(attempt_budget):
        if len(accepted) >= count:
            break

        draw = rng.integers(0, 4, size=length, dtype=np.int8)

        # Biological filters run before the duplicate lookup.
        if not pass_biological_filters(draw, gc_min, gc_max, homopolymer_max):
            continue

        key = tuple(draw)
        if key in seen_keys:
            continue

        seen_keys.add(key)
        accepted.append(draw)

    if len(accepted) < count:
        logging.warning(f"Could only generate {len(accepted)}/{count} unique sequences after {attempt_budget} attempts")

    return accepted
112
+
113
def filter_candidates_neighbor_enum(candidates, existing_pool, min_distance):
    """Keep candidates with no neighbor (radius ``min_distance - 1``) in the pool.

    Intended for the case where every sequence has the same length. The pool
    is turned into a set of tuples once, and each candidate's neighbors from
    ``generate_hamming_neighbors`` are tested for membership in that set.

    Args:
        candidates: Candidate sequences to screen.
        existing_pool: Sequences already accepted.
        min_distance: Required minimum distance to every pool member.

    Returns:
        list: Candidates, in input order, whose neighborhood misses the pool.
    """
    if not candidates:
        return []

    # Built once so every neighbor lookup is a hash probe.
    pool_keys = {tuple(member) for member in existing_pool}
    radius = min_distance - 1

    survivors = []
    for cand in candidates:
        # A fresh list per candidate gives the neighbor generator a mutable copy.
        collides = any(
            neighbor in pool_keys
            for neighbor in generate_hamming_neighbors(list(cand), radius)
        )
        if not collides:
            survivors.append(cand)

    return survivors
137
+
138
def filter_chunk(candidates_chunk, existing_pool, min_distance):
    """Return the candidates that are far enough from every pool member.

    Used both directly (sequential path) and as the unit of work submitted to
    worker processes. A candidate is rejected as soon as one pool member is
    found with ``calculate_distance(...) < min_distance``.

    Args:
        candidates_chunk: Candidates to screen.
        existing_pool: Sequences already accepted.
        min_distance: Required minimum distance; also forwarded to
            ``calculate_distance`` as its third argument.

    Returns:
        list: Surviving candidates in their original order.
    """
    def is_far_from_pool(cand):
        # any() stops at the first pool member that is too close.
        return not any(
            calculate_distance(cand, member, min_distance) < min_distance
            for member in existing_pool
        )

    return [cand for cand in candidates_chunk if is_far_from_pool(cand)]
152
+
153
def filter_candidates(candidates, existing_pool, min_distance, n_cpus, method, chunk_size):
    """Filter candidates against the existing pool using the selected method.

    Dispatch:
      * ``"neighbor_enumeration"``: neighbor enumeration, no parallelism.
      * ``"pairwise_sequential"``, or a CPU count of one or less: sequential
        pairwise check via ``filter_chunk``.
      * anything else (``"pairwise"`` with several CPUs): the candidates are
        split into chunks and each chunk is checked in a worker process.

    Args:
        candidates: Candidate sequences to screen.
        existing_pool: Sequences already accepted.
        min_distance: Required minimum distance to every pool member.
        n_cpus: Number of worker processes for the parallel path. Zero or
            negative values fall back to the sequential path instead of
            failing inside the executor.
        method: Distance-checking method name (see above).
        chunk_size: Candidates per parallel task. When missing or
            non-positive, a size is derived from the batch and ``n_cpus``.

    Returns:
        list: Candidates that passed, in input order.
    """
    if not candidates:
        return []

    if method == "neighbor_enumeration":
        return filter_candidates_neighbor_enum(candidates, existing_pool, min_distance)

    # Non-positive CPU counts cannot drive a process pool; run sequentially.
    single_process = n_cpus is not None and n_cpus <= 1
    if method == "pairwise_sequential" or single_process:
        return filter_chunk(candidates, existing_pool, min_distance)

    # The caller normally pre-computes chunk_size; derive one if it did not,
    # so range() below never receives None or 0 as its step.
    if chunk_size is None or chunk_size < 1:
        chunk_size = max(100, len(candidates) // ((n_cpus or 1) * 4))

    chunks = [candidates[i:i + chunk_size] for i in range(0, len(candidates), chunk_size)]

    valid_candidates = []
    with ProcessPoolExecutor(max_workers=n_cpus) as executor:
        futures = [
            executor.submit(filter_chunk, chunk, existing_pool, min_distance)
            for chunk in chunks
        ]

        # Collect in submission order so the output order matches the input.
        for future in futures:
            valid_candidates.extend(future.result())

    return valid_candidates
179
+
180
def filter_within_batch(valid_candidates, min_distance, method, sequences_needed):
    """Greedily pick candidates from one batch that are mutually far apart.

    Candidates are visited in order. Each is accepted only if it does not
    conflict with the candidates already accepted in this call, and the scan
    stops once ``sequences_needed`` have been accepted.

    Args:
        valid_candidates: Candidates that already passed the pool filter.
        min_distance: Required minimum distance between accepted candidates.
        method: ``"neighbor_enumeration"`` selects the neighbor-set lookup;
            any other value selects the pairwise check.
        sequences_needed: Maximum number of candidates to accept.

    Returns:
        list: Accepted candidates in visiting order.
    """
    accepted = []

    if method == "neighbor_enumeration":
        accepted_keys = set()  # tuple form of everything accepted so far
        radius = min_distance - 1

        for cand in valid_candidates:
            if len(accepted) >= sequences_needed:
                break

            # Conflict if any neighbor of this candidate was already accepted.
            clash = any(
                neighbor in accepted_keys
                for neighbor in generate_hamming_neighbors(list(cand), radius)
            )
            if clash:
                continue

            accepted.append(cand)
            accepted_keys.add(tuple(cand))

        return accepted

    for cand in valid_candidates:
        if len(accepted) >= sequences_needed:
            break

        # Compare only against sequences accepted earlier in this batch.
        too_close = any(
            hamming_distance_int(cand, kept, min_distance) < min_distance
            for kept in accepted
        )
        if not too_close:
            accepted.append(cand)

    return accepted
220
+
221
def generate_barcodes_core(target_count, length, gc_min, gc_max, homopolymer_max, min_distance,
                           n_cpus, seed_pool, is_paired, has_mixed_lengths):
    """Grow a barcode pool batch by batch until the target size is reached.

    Each batch generates random candidates that pass the within-sequence
    filters, filters them against the current pool, filters the survivors
    against each other, and appends what remains to the pool.

    Args:
        target_count: Requested number of barcodes (or pairs in paired mode).
        length: Length of newly generated barcodes.
        gc_min, gc_max: GC-content limits for new barcodes.
        homopolymer_max: Homopolymer limit for new barcodes.
        min_distance: Minimum distance required between any two pool members.
        n_cpus: CPU count used for the parallel pairwise filter.
        seed_pool: Existing sequences to start from (may be empty or None).
        is_paired: When True the target is doubled before adding seeds.
        has_mixed_lengths: Forwarded to ``select_distance_method``.

    Returns:
        list: Every pool member (seeds first, then new sequences) converted
        with ``decode_sequence``.

    Note:
        The loop only ends once the target is reached. When several batches
        in a row add nothing, a warning is logged so that an unreachable
        target is visible in the log; generation itself is not aborted.
    """
    logging.info("Starting barcode generation...")

    # 1. Mode information (paired vs non-paired)
    original_target = target_count
    if is_paired:
        target_count *= 2
        logging.info(f"Mode: Paired (target count {original_target} → {target_count}, doubled)")
    else:
        logging.info(f"Mode: Standard (target count {target_count})")

    # Remember the target before seeds are added, for the log message below.
    target_before_seeds = target_count

    # 2. Initialize the pool from seeds, if any
    selected_pool = []

    if seed_pool:
        seed_count = len(seed_pool)
        selected_pool = list(seed_pool)  # shallow copy; caller's pool is left untouched

        # Seeds count toward the final pool size.
        target_count += seed_count

        if is_paired:
            logging.info(f"Initialized pool with {seed_count} paired seed sequences ({seed_count // 2} pairs)")
            logging.info(f"Adjusted target count: {target_before_seeds} → {target_count} (including {seed_count} paired seed sequences)")
        else:
            logging.info(f"Initialized pool with {seed_count} seed sequences")
            logging.info(f"Adjusted target count: {target_before_seeds} → {target_count} (including {seed_count} seed sequences)")
        logging.warning("Building from seed lists without validation. Assuming that seed lists pass all the filters. Please run validate_barcodes.py to ensure this if necessary.")
    else:
        logging.info("Seeds: None (building from scratch)")

    # 3. Batch size: ten batches (at least 50 each) for small sets, capped at 10,000 for large sets
    if target_count <= 10000:
        batch_size = max(50, target_count // 10)
    else:
        batch_size = 10000

    logging.info(f"Target count: {target_count} barcode sequences of length {length}")
    logging.info(f"Filter 1 (within-sequence), GC content: {gc_min:.1%} - {gc_max:.1%}")
    logging.info(f"Filter 2 (within-sequence), Max homopolymer repeat length: {homopolymer_max}")
    logging.info(f"Filter 3 (between-sequence), Minimum edit distance: {min_distance}")
    logging.info(f"CPUs: {n_cpus}; batch size: {batch_size}")

    # 4. One global decision about the distance method for the whole run
    method = select_distance_method(target_count, min_distance, has_mixed_lengths)

    # Chunk size is only needed for the parallel pairwise path.
    chunk_size = None
    if method == "pairwise_sequential":
        logging.info("Using sequential pairwise distance checking during step 2 (small barcode set)")
    elif method == "pairwise" and n_cpus == 1:
        logging.info("Using sequential pairwise distance checking during step 2 (1 CPU)")
    elif method == "pairwise" and n_cpus > 1:
        chunk_size = max(100, target_count // (n_cpus * 4))
        logging.info(f"Using parallel pairwise distance checking during step 2 (chunk size: {chunk_size})")

    start_time = time.time()
    batch_num = 0
    total_generated = 0
    total_processed = 0

    # Consecutive batches that added nothing; used only for the stall warning.
    stalled_batches = 0
    stall_warning_interval = 10

    while len(selected_pool) < target_count:
        batch_num += 1
        batch_start = time.time()

        # Step 1/3: random candidates that pass the within-sequence filters
        logging.info(f"Batch {batch_num}: (Step 1/3) Generating {batch_size} random candidates that pass within-sequence filters...")
        candidates = generate_random_sequences(batch_size, length, gc_min, gc_max, homopolymer_max)
        total_generated += len(candidates)

        # Step 2/3: distance filter against the existing pool
        logging.info(f"Batch {batch_num}: (Step 2/3) Filtering candidates for distance ≥{min_distance} with existing pool...")
        valid_candidates = filter_candidates(candidates, selected_pool, min_distance, n_cpus, method, chunk_size)

        # Step 3/3: distance filter among the survivors of this batch
        logging.info(f"Batch {batch_num}: (Step 3/3) Filtering candidates for distance ≥{min_distance} within a batch...")
        sequences_needed = target_count - len(selected_pool)
        newly_selected = filter_within_batch(valid_candidates, min_distance, method, sequences_needed)

        selected_pool.extend(newly_selected)
        total_processed += len(candidates)

        batch_time = time.time() - batch_start
        final_pass_rate = len(newly_selected) / len(candidates) * 100 if candidates else 0

        logging.info(f"Batch {batch_num}: Found {len(valid_candidates)} candidates, added {len(newly_selected)} sequences that satisfied all constraints")
        logging.info(f" Final pass rate: {final_pass_rate:.1f}% ({batch_time:.1f}s)")
        logging.info(f"Progress: {len(selected_pool)}/{target_count} sequences ({len(selected_pool)/target_count*100:.1f}%)")

        # Surface a stalled run; without this the loop would spin silently.
        if newly_selected:
            stalled_batches = 0
        else:
            stalled_batches += 1
            if stalled_batches % stall_warning_interval == 0:
                logging.warning(
                    f"No sequences added in the last {stalled_batches} batches; "
                    "the requested count may not be reachable with the current filters"
                )

    total_time = time.time() - start_time
    overall_pass_rate = len(selected_pool) / total_processed * 100 if total_processed > 0 else 0
    # Elapsed time can be exactly 0.0 on coarse clocks; avoid ZeroDivisionError.
    sequences_per_second = len(selected_pool) / total_time if total_time > 0 else float("inf")

    logging.info("Generation complete!")
    logging.info(f"Final count: {len(selected_pool)} sequences")
    logging.info(f"Total candidates processed: {total_processed}")
    logging.info(f"Total time: {total_time:.1f} seconds")
    logging.info(f"Overall pass rate: {overall_pass_rate:.2f}%")
    logging.info(f"Sequences per second: {sequences_per_second:.1f}")

    # Convert the integer-encoded pool back to strings for output.
    return [decode_sequence(seq) for seq in selected_pool]
338
+
339
def write_barcode_outputs(barcodes, args, sequence_set, log_filepath):
    """Write the generated barcodes to disk and report where they went.

    Standard mode writes ``<output_prefix>.txt``. Paired mode writes
    ``<output_prefix>_paired1.txt`` and ``<output_prefix>_paired2.txt``: the
    seed strings are split in half (first half to file 1, second half to
    file 2), and the newly generated barcodes are shuffled and split in half
    after them.

    Args:
        barcodes: Barcode strings, seeds first when seeds were used. The list
            is never modified; shuffling happens on a private copy.
        args: Parsed arguments; ``paired``, ``output_dir`` and
            ``output_prefix`` are read.
        sequence_set: Object whose ``sequences`` attribute holds the seed
            sequences (empty when no seeds were used).
        log_filepath: Path of the log file, reported in the log output.
    """
    has_seeds = bool(sequence_set.sequences)

    if args.paired:
        seed_strings = [decode_sequence(seq) for seq in sequence_set.sequences] if has_seeds else []

        # Seeds occupy the front of `barcodes`; everything after is new.
        # list(...) guarantees a copy, so the shuffle below cannot reorder
        # the caller's list (which happened when there were no seeds).
        num_seeds = len(seed_strings)
        new_barcodes = list(barcodes[num_seeds:])

        seed_split_point = num_seeds // 2
        seed1_strings = seed_strings[:seed_split_point]
        seed2_strings = seed_strings[seed_split_point:]

        random.shuffle(new_barcodes)
        split_point = len(new_barcodes) // 2

        paired1_filepath = os.path.join(args.output_dir, f"{args.output_prefix}_paired1.txt")
        paired2_filepath = os.path.join(args.output_dir, f"{args.output_prefix}_paired2.txt")

        with open(paired1_filepath, 'w', encoding='utf-8') as f:
            f.writelines(barcode + '\n' for barcode in seed1_strings + new_barcodes[:split_point])

        with open(paired2_filepath, 'w', encoding='utf-8') as f:
            f.writelines(barcode + '\n' for barcode in seed2_strings + new_barcodes[split_point:])

        logging.info("Paired files written to:")
        logging.info(f" {paired1_filepath}")
        logging.info(f" {paired2_filepath}")
        logging.info(f"Log file: {log_filepath}")

        if has_seeds:
            print(f"Successfully generated {len(barcodes)} barcodes (paired with seeds)")
        else:
            print(f"Successfully generated {len(barcodes)} barcodes (paired)")
    else:
        output_filepath = os.path.join(args.output_dir, f"{args.output_prefix}.txt")
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.writelines(barcode + '\n' for barcode in barcodes)

        logging.info(f"Output written to: {output_filepath}")
        logging.info(f"Log file: {log_filepath}")

        if has_seeds:
            print(f"Successfully generated {len(barcodes)} barcodes (with seeds)")
        else:
            print(f"Successfully generated {len(barcodes)} barcodes")
397
+
398
def setup_argument_parser():
    """Build the command-line parser for barcode generation.

    The options are declared in a table and registered in order, so the help
    output lists them as: required, output, filter, performance, seed, mode,
    and paired-seed arguments.

    Returns:
        argparse.ArgumentParser: Parser whose help output shows defaults.
    """
    parser = argparse.ArgumentParser(
        description="Generate high-performance DNA barcodes for NGS applications (from scratch or by extending from provided seed sequences)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    option_table = [
        # Required arguments
        ('--count', dict(type=int, required=True,
                         help='Number of barcodes or barcode pairs to generate')),
        ('--length', dict(type=int, required=True,
                          help='Length of barcodes or barcode pairs to generate')),
        # Output arguments
        ('--output-dir', dict(type=str, default='test',
                              help='Output directory for barcodes and logs')),
        ('--output-prefix', dict(type=str, default='barcodes',
                                 help='Output filename prefix (adds .txt automatically; when paired mode is on, adds _paired1.txt and _paired2.txt)')),
        # Filter arguments with defaults
        ('--gc-min', dict(type=float, default=0.4,
                          help='Minimum GC content (as fraction, e.g., 0.4 = 40%%;)')),
        ('--gc-max', dict(type=float, default=0.6,
                          help='Maximum GC content (as fraction, e.g., 0.6 = 60%%;)')),
        ('--homopolymer-max', dict(type=int, default=2,
                                   help='Maximum allowed homopolymer repeat length')),
        ('--min-distance', dict(type=int, default=3,
                                help='Minimum edit distance between barcodes')),
        # Performance arguments
        ('--cpus', dict(type=int, default=mp.cpu_count(),
                        help='Number of CPU cores to use during the parallel filtering step')),
        # Seed arguments
        ('--seeds', dict(nargs='+', type=str, default=[],
                         help='Seed sequence files with existing barcode lists to extend from (any number of .txt files, one sequence per line; multiple files will be concatenated automatically) - incompatible with --paired mode')),
        # Mode arguments
        ('--paired', dict(action='store_true',
                          help='Generate paired barcodes by doubling the target count and randomly splitting the output into two equal parts, saved as _paired1.txt and _paired2.txt. - incompatible with --seeds')),
        # Paired seed arguments (for paired mode only)
        ('--paired-seed1', dict(type=str, default=None,
                                help='Seed sequence file for one side of the paired barcode set to extend from (single .txt, one sequence per line; requires matching file provided via --paired-seed2) - used only with --paired')),
        ('--paired-seed2', dict(type=str, default=None,
                                help='Seed sequence file for the other side of the paired barcode set to extend from (single .txt, one sequence per line; requires matching file provided via --paired-seed1) - used only with --paired')),
    ]

    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser
446
+
447
def validate_seed_arguments(args):
    """Reject seed option combinations that contradict the selected mode.

    Rules:
      * Without ``--paired``, neither ``--paired-seed1`` nor ``--paired-seed2``
        may be given.
      * With ``--paired``, the two paired seed files must be given together
        or not at all, and ``--seeds`` must not be used.

    Args:
        args: Parsed arguments; ``paired``, ``paired_seed1``, ``paired_seed2``
            and ``seeds`` are read.

    Raises:
        ValueError: If any rule above is violated.
    """
    if not args.paired:
        # Standard mode: the paired seed options have no meaning here.
        if not (args.paired_seed1 is None and args.paired_seed2 is None):
            raise ValueError("--paired-seed1 and --paired-seed2 can only be used with --paired mode")
        return

    # Paired mode: exactly-one-of-two is the only invalid paired-seed state.
    seed1_given = args.paired_seed1 is not None
    seed2_given = args.paired_seed2 is not None
    if seed1_given != seed2_given:
        raise ValueError("When using --paired with seeds, both --paired-seed1 and --paired-seed2 must be provided")

    if args.seeds:
        raise ValueError("--seeds argument is incompatible with --paired mode. Use --paired-seed1 and --paired-seed2 instead")
460
+
461
def calculate_hamming_bound(length, min_distance):
    """Return the Hamming (sphere-packing) upper bound for a 4-letter code.

    Computes ``4**length // V(length, t)`` with ``t = (min_distance - 1) // 2``,
    where ``V(n, t) = sum(C(n, i) * 3**i for i in 0..t)`` is the number of
    sequences within Hamming radius ``t`` of a given sequence.

    Args:
        length: Sequence length ``n``.
        min_distance: Minimum distance ``d`` between sequences.

    Returns:
        int: Floor of the bound.
    """
    radius = (min_distance - 1) // 2

    # Each of the C(n, i) position choices admits 3 substitutions per position.
    ball_volume = sum(math.comb(length, i) * 3 ** i for i in range(radius + 1))

    return 4 ** length // ball_volume
479
+
480
def calculate_gv_bound(length, min_distance):
    """Return the Gilbert-Varshamov lower bound for a 4-letter code.

    Computes ``4**length // V(length, d - 1)`` where
    ``V(n, r) = sum(C(n, i) * 3**i for i in 0..r)`` and ``d`` is
    ``min_distance``.

    Args:
        length: Sequence length ``n``.
        min_distance: Minimum distance ``d`` between sequences.

    Returns:
        int: Floor of the bound.
    """
    # range(min_distance) covers radii 0 .. d-1 inclusive.
    ball_volume = sum(math.comb(length, i) * 3 ** i for i in range(min_distance))

    return 4 ** length // ball_volume
497
+
498
def validate_generator_arguments(args, seed_pool, length_counts):
    """Validate generator-specific arguments and report whether lengths are mixed.

    Checks, in order: barcode length, homopolymer limit versus length, minimum
    distance versus the effective length, count, and (for equal-length cases
    only) the requested number of sequences against the Hamming and
    Gilbert-Varshamov bounds.

    In paired mode the generator produces two sequences per requested pair, so
    the bounds check uses twice ``args.count`` (plus any seeds).

    Args:
        args: Parsed arguments; ``length``, ``homopolymer_max``,
            ``min_distance``, ``count`` and (if present) ``paired`` are read.
        seed_pool: Seed sequences; falsy when no seeds were supplied.
        length_counts: Mapping whose keys are the seed sequence lengths.

    Returns:
        bool: True when seeds differ in length among themselves or from
        ``args.length``; False otherwise.

    Raises:
        ValueError: If any argument is out of range, or the requested number
            of sequences exceeds the Hamming bound.
    """
    # 1. Length validation
    if args.length <= 0:
        raise ValueError("Length must be > 0")

    if args.length > 20:
        logging.warning(f"Requested barcode length ({args.length}) exceeds 20bp. Very long barcodes can be synthetically unstable and more error-prone, and this program is optimized for short sequences. It is recommended to generate barcodes of length 20bp or less for expected behavior.")

    # 2. Homopolymer repeat x barcode length validation
    if args.homopolymer_max >= args.length:
        raise ValueError("Maximum homopolymer repeat length must be < new barcode length")

    # 3. Minimum distance x barcode length validation
    effective_length = args.length
    seed_length = None
    has_mixed_lengths = False

    if seed_pool:
        # The longest seed decides the effective length when it exceeds the target.
        seed_length = max(length_counts.keys())
        effective_length = max(args.length, seed_length)

        if args.min_distance >= effective_length:
            raise ValueError(f"Minimum distance must be < {effective_length} (the longer of new barcode length {args.length} and max seed length {seed_length})")

        # Mixed lengths: within the seeds, or between seeds and new barcodes.
        if len(length_counts) > 1 or seed_length != args.length:
            has_mixed_lengths = True
    else:
        if args.min_distance >= effective_length:
            raise ValueError(f"Minimum distance must be < {effective_length} (new barcode length)")

    # 4. Count validation
    if args.count <= 0:
        raise ValueError("Count must be > 0")

    # 5. Count x barcode length x minimum distance validation against the bounds
    logging.info(f"Performing Hamming and Gilbert-Varshamov bounds validation for barcodes of length {args.length} with minimum distance {args.min_distance} to see if the requested count is possible...")

    if not has_mixed_lengths:
        max_possible = calculate_hamming_bound(args.length, args.min_distance)
        min_possible = calculate_gv_bound(args.length, args.min_distance)

        max_formatted = f"{max_possible:,}" if max_possible > 1000 else str(max_possible)
        min_formatted = f"{min_possible:,}" if min_possible > 1000 else str(min_possible)

        logging.info(f"Bounds for barcode length {args.length}, min distance {args.min_distance}: GV (lower) bound = {min_formatted}, Hamming (upper) bound = {max_formatted}")

        # Paired mode generates two sequences per requested pair, all of which
        # share one pool, so the bounds apply to the doubled number.
        is_paired = bool(getattr(args, "paired", False))
        new_sequences = args.count * 2 if is_paired else args.count
        if is_paired:
            requested_desc = f"{new_sequences:,} sequences for {args.count:,} pairs"
        else:
            requested_desc = f"{new_sequences:,}"

        if not seed_pool:
            # Case 1: no seeds, only the new sequences count
            if new_sequences > max_possible:
                raise ValueError(f"Requested count ({requested_desc}) exceeds Hamming (upper) bound ({max_formatted}) for length {args.length}, min distance {args.min_distance}. Please reduce the requested count.")
            elif new_sequences > min_possible:
                logging.warning(f"Requested count ({requested_desc}) exceeds GV (lower) bound ({min_formatted}) - may take longer to generate and could potentially fail")
        else:
            # Case 2: with seeds, seeds plus new sequences must fit
            total_sequences = new_sequences + len(seed_pool)
            if total_sequences > max_possible:
                raise ValueError(f"Total sequences ({total_sequences:,}) exceeds Hamming (upper) bound ({max_formatted}) for length {args.length}, min distance {args.min_distance}. Please reduce the requested count or seed size.")
            elif total_sequences > min_possible:
                logging.warning(f"Total sequences ({total_sequences:,}) exceeds GV (lower) bound ({min_formatted}) - may take longer to generate and could potentially fail")
    else:
        # Mixed lengths: the equal-length bounds do not apply, so skip them.
        logging.warning("Seeds have mixed lengths or different length(s) from target. Hamming and Gilbert-Varshamov bounds validation skipped.")

    return has_mixed_lengths
574
+
575
def main(argv=None):
    """Run the barcode generator from parsed command-line arguments.

    Steps: parse arguments, set up logging, validate the filter arguments,
    load seeds when any seed option was given, validate the generator
    arguments, generate the barcodes, and write the output files.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` lets argparse
            read the process arguments.
    """
    args = setup_argument_parser().parse_args(argv)
    log_filepath = setup_logging(args, "generate_barcodes")
    validate_filter_arguments(args)  # simple validation of filter arguments

    # Start from an empty set; replaced below when seed files are supplied.
    sequence_set = ExistingSequenceSet()

    seed_option_given = args.seeds or args.paired_seed1 or args.paired_seed2
    if seed_option_given:
        validate_seed_arguments(args)
        if args.seeds:
            sequence_set = ExistingSequenceSet.from_unpaired_seeds(args.seeds)
        elif args.paired and args.paired_seed1 and args.paired_seed2:
            sequence_set = ExistingSequenceSet.from_paired_seeds(args.paired_seed1, args.paired_seed2)

    # The mixed-length flag from validation drives the distance-method choice.
    mixed_lengths = validate_generator_arguments(
        args, sequence_set.sequences, sequence_set.length_counts
    )

    generation_settings = {
        "target_count": args.count,
        "length": args.length,
        "gc_min": args.gc_min,
        "gc_max": args.gc_max,
        "homopolymer_max": args.homopolymer_max,
        "min_distance": args.min_distance,
        "n_cpus": args.cpus,
        "seed_pool": sequence_set.sequences,
        "is_paired": args.paired,
        "has_mixed_lengths": mixed_lengths,
    }
    barcodes = generate_barcodes_core(**generation_settings)

    write_barcode_outputs(barcodes, args, sequence_set, log_filepath)
611
+
612
if __name__ == "__main__":
    # Script entry point: hand the command-line arguments (without the
    # program name) to main().
    cli_arguments = sys.argv[1:]
    main(cli_arguments)