barcadia 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- barcadia/__init__.py +20 -0
- barcadia/cli.py +61 -0
- barcadia/config_utils.py +225 -0
- barcadia/filter_utils.py +152 -0
- barcadia/generate_barcodes.py +613 -0
- barcadia/tools/generate_random_sequences.py +98 -0
- barcadia/tools/memory_benchmark.py +139 -0
- barcadia/validate_barcodes.py +393 -0
- barcadia-3.2.0.dist-info/METADATA +466 -0
- barcadia-3.2.0.dist-info/RECORD +14 -0
- barcadia-3.2.0.dist-info/WHEEL +5 -0
- barcadia-3.2.0.dist-info/entry_points.txt +2 -0
- barcadia-3.2.0.dist-info/licenses/LICENSE +202 -0
- barcadia-3.2.0.dist-info/top_level.txt +1 -0
barcadia/generate_barcodes.py (new file, +613 lines):
#!/usr/bin/env python3
"""
generate_barcodes.py

Generate high-performance DNA barcodes for NGS applications using an optimized iterative growth algorithm (supports extension from seed sequences and paired mode for dual-indexing).

Program Overview:

1. Load seed sequence files as existing pool and report length distribution (will generate from scratch if no seeds are provided)
2. Validate input arguments, including whether the requested count is possible for the barcode length and minimum distance given Hamming and Gilbert-Varshamov bounds (when no seeds are provided or everything is equal-length)
   * Target count is updated given whether seeds are provided and/or paired mode is on

3. Generate a batch of candidate unique random sequences that pass the within-sequence biological filters (i.e., GC content and homopolymer repeats) → step 1/3 of the overall filtering process
   * Batch size is capped at 10,000 for barcode sets larger than 10,000, otherwise enforces 10 batches of minimum size 50
4. Conduct two-step between-sequence distance filtering with optimized method selection (neighbor enumeration or pairwise):
   4a. Filter candidates against existing pool → step 2/3 of the overall filtering process
       * If pairwise, use parallel processing for large datasets (≥10K) with multiple CPUs, sequential for small datasets (<10K) or single CPU
   4b. Filter remaining candidates against each other within the current batch → step 3/3 of the overall filtering process
       * If pairwise, always proceed sequentially in this step to ensure that the final pool fully satisfies the minimum distance requirement

Method selection (applies to both steps, decided once per generation):
- Small barcode sets (<10K sequences counting seeds if seeds are present): Always use pairwise distance checking
- Large mixed-length (within seeds and/or between seeds and new barcodes): Always use pairwise distance checking
- Large equal-length (no seeds or everything of the same length counting seeds): Choose between pairwise and neighbor enumeration based on min_distance
   * Pairwise distance checking: when min_distance > 4 (large number of neighbors to check)
   * Neighbor enumeration: when min_distance <= 4 (limited number of neighbors to check)
5. Add the verified batch (candidates that pass all 3 steps of filtering) to pool, repeat until target count is reached

6. Write outputs to files and log results

Input: none (or optionally, seed sequence files or paired seed files)

Output: barcode list (one per line as .txt; if paired mode: two files with suffixes _paired1.txt and _paired2.txt) and generate_barcodes_{timestamp}.log file

Optional arguments:
--gc-min: minimum GC content (default: 0.4)
--gc-max: maximum GC content (default: 0.6)
--homopolymer-max: maximum allowed homopolymer repeat length (default: 2)
--min-distance: minimum edit distance between barcodes (default: 3)
--cpus: number of CPU cores to use during the parallel filtering step (default: all available)
--seeds: seed sequence files (any number of .txt files with one sequence per line, multiple files will be concatenated automatically; if not provided, will generate from scratch; incompatible with --paired mode; default: None)
--paired: generate paired barcodes (doubles target count, randomly splits output into two equal parts; incompatible with --seeds; default: off)
--paired-seed1: paired seed sequence file 1 (used only with --paired and --paired-seed2, only one file is accepted, all sequences must be same length and match count/length of --paired-seed2; default: None)
--paired-seed2: paired seed sequence file 2 (used only with --paired and --paired-seed1, only one file is accepted, all sequences must be same length and match count/length of --paired-seed1; default: None)
--output-dir: output directory for barcodes and logs (default: test)
--output-prefix: output filename prefix (adds .txt automatically; if paired mode is on, adds _paired1.txt and _paired2.txt; default: barcodes)

Required arguments:
--count: number of barcodes or barcode pairs to generate
--length: length of barcodes or barcode pairs to generate

NOTE: seed sequences (paired or unpaired) are not validated against the intended filters. If necessary, please run validate_barcodes.py first to ensure they pass all the filters before providing them as seeds here.
"""

import numpy as np
import argparse
import logging
import math
import time
import multiprocessing as mp
import os
import random
import sys
from concurrent.futures import ProcessPoolExecutor

# Import utility functions
from .config_utils import decode_sequence, setup_logging, ExistingSequenceSet
from .filter_utils import validate_filter_arguments, check_gc_content_int, check_homopolymer_int, hamming_distance_int, calculate_distance, select_distance_method, generate_hamming_neighbors

def pass_biological_filters(seq_array, gc_min, gc_max, homopolymer_max):
    """Check if sequence passes all biological quality filters (works with integer arrays)"""
    # Check GC content
    if not check_gc_content_int(seq_array, gc_min, gc_max):
        return False

    # Check homopolymer repeats
    if not check_homopolymer_int(seq_array, homopolymer_max):
        return False

    return True

def generate_random_sequences(count, length, gc_min, gc_max, homopolymer_max):
    """Generate batch of random DNA sequences passing within-sequence biological filters"""
    sequences = []
    seen_sequences = set()

    # Use NumPy's newer, thread-safe random number generator
    rng = np.random.default_rng()

    # For reasonable batch sizes, the duplicate probability is low enough
    # that we can optimize for speed over perfect duplicate detection
    max_attempts = count * 50  # Prevent infinite loops in saturated spaces
    attempts = 0

    while len(sequences) < count and attempts < max_attempts:
        attempts += 1
        # Generate integer array directly (A=0, T=1, G=2, C=3) with explicit dtype
        seq_array = rng.integers(0, 4, size=length, dtype=np.int8)

        # Apply biological filters first (cheaper than duplicate check)
        if pass_biological_filters(seq_array, gc_min, gc_max, homopolymer_max):
            # Only check duplicates for sequences that pass bio filters
            seq_tuple = tuple(seq_array)
            if seq_tuple not in seen_sequences:
                seen_sequences.add(seq_tuple)
                sequences.append(seq_array)

    if len(sequences) < count:
        logging.warning(f"Could only generate {len(sequences)}/{count} unique sequences after {max_attempts} attempts")

    return sequences

def filter_candidates_neighbor_enum(candidates, existing_pool, min_distance):
    """Filter candidates using neighbor enumeration for equal-length sequences"""
    if not candidates:
        return []

    # Convert existing pool to hash set for O(1) lookup
    # Note: This function is only called when all sequences are guaranteed to be the same length
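    # Added note (not in the original source): for barcode length L and radius r = min_distance - 1,
    # each candidate has sum_{i=0..r} C(L, i) * 3**i sequences within Hamming distance r of it
    # (including itself), e.g. 436 for L=10 and r=2, which is why the module restricts this path
    # to min_distance <= 4.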
    existing_set = set(tuple(seq) for seq in existing_pool)

    valid_candidates = []
    for candidate in candidates:
        seq_array = list(candidate)  # Make mutable copy for neighbor generation

        # Generate all neighbors within min_distance-1 and check for collisions
        is_valid = True
        for neighbor in generate_hamming_neighbors(seq_array, min_distance - 1):
            if neighbor in existing_set:
                is_valid = False
                break

        if is_valid:
            valid_candidates.append(candidate)

    return valid_candidates

def filter_chunk(candidates_chunk, existing_pool, min_distance):
    """Filter a chunk of candidates (helper function for parallel processing/standalone sequential processing)"""
    valid_candidates = []
    for candidate in candidates_chunk:
        # Check if candidate is sufficiently distant from all existing sequences
        is_valid = True
        for existing_seq in existing_pool:
            if calculate_distance(candidate, existing_seq, min_distance) < min_distance:
                is_valid = False
                break

        if is_valid:
            valid_candidates.append(candidate)
    return valid_candidates

def filter_candidates(candidates, existing_pool, min_distance, n_cpus, method, chunk_size):
    """Enhanced filtering with method selection (neighbor enumeration or sequential/parallel pairwise)"""
    if not candidates:
        return []

    if method == "neighbor_enumeration":
        # Use neighbor enumeration when the setup is optimal (no parallelization involved)
        return filter_candidates_neighbor_enum(candidates, existing_pool, min_distance)
    elif method == "pairwise_sequential" or n_cpus == 1:
        # Use sequential for small barcode sets or single CPU
        return filter_chunk(candidates, existing_pool, min_distance)
    else:  # method == "pairwise" and n_cpus > 1
        # Large dataset with multiple CPUs - always use parallel with pre-calculated chunk size
        chunks = [candidates[i:i+chunk_size] for i in range(0, len(candidates), chunk_size)]

        valid_candidates = []
        with ProcessPoolExecutor(max_workers=n_cpus) as executor:
            futures = [
                executor.submit(filter_chunk, chunk, existing_pool, min_distance)
                for chunk in chunks
            ]

            for future in futures:
                valid_candidates.extend(future.result())

        return valid_candidates

def filter_within_batch(valid_candidates, min_distance, method, sequences_needed):
    """Filter candidates within a batch using the specified method, stopping when we have enough sequences"""
    newly_selected = []

    if method == "neighbor_enumeration":
        # Use neighbor enumeration approach
        newly_selected_set = set()  # Hash set for O(1) neighbor lookups

        for candidate in valid_candidates:
            if len(newly_selected) >= sequences_needed:
                break

            seq_array = list(candidate)  # Make mutable copy for neighbor generation

            # Check if any neighbor of this candidate is already in newly_selected
            collision_found = False
            for neighbor in generate_hamming_neighbors(seq_array, min_distance - 1):
                if neighbor in newly_selected_set:
                    collision_found = True
                    break

            if not collision_found:
                newly_selected.append(candidate)
                newly_selected_set.add(tuple(candidate))
    else:
        # Use original pairwise approach
        for candidate in valid_candidates:
            if len(newly_selected) >= sequences_needed:
                break

            valid = True
            # Check against sequences already accepted in THIS batch
            for new_seq in newly_selected:
                if hamming_distance_int(candidate, new_seq, min_distance) < min_distance:
                    valid = False
                    break
            if valid:
                newly_selected.append(candidate)

    return newly_selected

def generate_barcodes_core(target_count, length, gc_min, gc_max, homopolymer_max, min_distance,
                           n_cpus, seed_pool, is_paired, has_mixed_lengths):
    """Main function to generate diverse barcode set using iterative growth"""
    logging.info(f"Starting barcode generation...")

    # 1. First log mode information (paired vs non-paired)
    original_target = target_count
    if is_paired:
        target_count *= 2
        logging.info(f"Mode: Paired (target count {original_target} → {target_count}, doubled)")
    else:
        logging.info(f"Mode: Standard (target count {target_count})")

    # Store the target count before adding seeds
    target_before_seeds = target_count

    # 2. Initialize and log seed information
    selected_pool = []

    if seed_pool:
        seed_count = len(seed_pool)
        selected_pool = seed_pool.copy()

        # Adjust target count to account for seeds
        target_count += seed_count

        # Log seed initialization and target count adjustment
        if is_paired:
            logging.info(f"Initialized pool with {seed_count} paired seed sequences ({seed_count // 2} pairs)")
            logging.info(f"Adjusted target count: {target_before_seeds} → {target_count} (including {seed_count} paired seed sequences)")
        else:
            logging.info(f"Initialized pool with {seed_count} seed sequences")
            logging.info(f"Adjusted target count: {target_before_seeds} → {target_count} (including {seed_count} seed sequences)")
        logging.warning("Building from seed lists without validation. Assuming that seed lists pass all the filters. Please run validate_barcodes.py to ensure this if necessary.")
    else:
        # No seeds for either mode
        logging.info("Seeds: None (building from scratch)")

    # 3. Calculate batch size after seed adjustment: cap at 10,000 for large barcode sets, use 10 batches for small barcode sets (min batch size is 50)
    if target_count <= 10000:
        # Small barcode sets: use 10 batches
        batch_size = max(50, target_count // 10)
    else:
        # Large barcode sets: cap at 10,000 per batch
        batch_size = 10000
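    # Illustrative figures (added note, not in the original source): target_count=3,000 gives
    # batch_size = max(50, 300) = 300 (about 10 batches), while target_count=50,000 is capped at 10,000.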

    logging.info(f"Target count: {target_count} barcode sequences of length {length}")
    logging.info(f"Filter 1 (within-sequence), GC content: {gc_min:.1%} - {gc_max:.1%}")
    logging.info(f"Filter 2 (within-sequence), Max homopolymer repeat length: {homopolymer_max}")
    logging.info(f"Filter 3 (between-sequence), Minimum edit distance: {min_distance}")
    logging.info(f"CPUs: {n_cpus}; batch size: {batch_size}")

    # 4. Make one global decision about which distance method to use
    # Use the shared utility function to determine the method with pre-calculated has_mixed_lengths
    method = select_distance_method(target_count, min_distance, has_mixed_lengths)
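    # Summary of the selection rule described in the module docstring: pairwise for small sets
    # (<10K counting seeds) or mixed lengths; for large equal-length sets, neighbor enumeration
    # when min_distance <= 4 and pairwise otherwise. The actual decision is made in select_distance_method().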

    # Calculate chunk size once for pairwise method with multiple CPUs
    chunk_size = None
    if method == "pairwise_sequential":
        logging.info(f"Using sequential pairwise distance checking during step 2 (small barcode set)")
    elif method == "pairwise" and n_cpus == 1:
        logging.info(f"Using sequential pairwise distance checking during step 2 (1 CPU)")
    elif method == "pairwise" and n_cpus > 1:
        chunk_size = max(100, target_count // (n_cpus * 4))
        logging.info(f"Using parallel pairwise distance checking during step 2 (chunk size: {chunk_size})")

    start_time = time.time()
    batch_num = 0
    total_generated = 0
    total_processed = 0

    while len(selected_pool) < target_count:
        batch_num += 1
        batch_start = time.time()

        # Generate random candidate sequences with biological filtering
        logging.info(f"Batch {batch_num}: (Step 1/3) Generating {batch_size} random candidates that pass within-sequence filters...")
        candidates = generate_random_sequences(batch_size, length, gc_min, gc_max, homopolymer_max)
        total_generated += len(candidates)

        # Filter candidates for distance constraints
        logging.info(f"Batch {batch_num}: (Step 2/3) Filtering candidates for distance ≥{min_distance} with existing pool...")
        valid_candidates = filter_candidates(candidates, selected_pool, min_distance, n_cpus, method, chunk_size)

        # Within-batch distance checking
        logging.info(f"Batch {batch_num}: (Step 3/3) Filtering candidates for distance ≥{min_distance} within a batch...")
        sequences_needed = target_count - len(selected_pool)
        newly_selected = filter_within_batch(valid_candidates, min_distance, method, sequences_needed)

        # Add the verified batch to pool
        selected_pool.extend(newly_selected)
        total_processed += len(candidates)

        # Calculate metrics
        batch_time = time.time() - batch_start
        final_pass_rate = len(newly_selected) / len(candidates) * 100 if candidates else 0

        logging.info(f"Batch {batch_num}: Found {len(valid_candidates)} candidates, added {len(newly_selected)} sequences that satisfied all constraints")
        logging.info(f"  Final pass rate: {final_pass_rate:.1f}% ({batch_time:.1f}s)")
        logging.info(f"Progress: {len(selected_pool)}/{target_count} sequences ({len(selected_pool)/target_count*100:.1f}%)")

        # Check if we've reached target
        if len(selected_pool) >= target_count:
            break

    total_time = time.time() - start_time
    overall_pass_rate = len(selected_pool) / total_processed * 100 if total_processed > 0 else 0

    logging.info(f"Generation complete!")
    logging.info(f"Final count: {len(selected_pool)} sequences")
    logging.info(f"Total candidates processed: {total_processed}")
    logging.info(f"Total time: {total_time:.1f} seconds")
    logging.info(f"Overall pass rate: {overall_pass_rate:.2f}%")
    logging.info(f"Sequences per second: {len(selected_pool)/total_time:.1f}")

    # Convert back to DNA strings
    return [decode_sequence(seq) for seq in selected_pool]

def write_barcode_outputs(barcodes, args, sequence_set, log_filepath):
    """Write barcode outputs to files and log results"""
    if args.paired:
        # Convert seed sequences to DNA strings
        seed_strings = [decode_sequence(seq) for seq in sequence_set.sequences] if sequence_set.sequences else []

        # Calculate how many seeds to skip from barcodes list
        num_seeds = len(seed_strings)
        new_barcodes = barcodes[num_seeds:] if sequence_set.sequences else barcodes

        # Split seed pool in half (empty if no seeds)
        seed_split_point = num_seeds // 2
        seed1_strings = seed_strings[:seed_split_point]
        seed2_strings = seed_strings[seed_split_point:]

        # Split new barcodes into two equal groups
        random.shuffle(new_barcodes)
        split_point = len(new_barcodes) // 2
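        # Illustrative split (added note, not in the original source): with 4 seed sequences and
        # 20 new barcodes, paired1 receives seeds[0:2] plus 10 shuffled new barcodes and paired2
        # receives seeds[2:4] plus the remaining 10.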

        # Write paired files
        paired1_filepath = os.path.join(args.output_dir, f"{args.output_prefix}_paired1.txt")
        paired2_filepath = os.path.join(args.output_dir, f"{args.output_prefix}_paired2.txt")

        with open(paired1_filepath, 'w') as f:
            for barcode in seed1_strings + new_barcodes[:split_point]:
                f.write(barcode + '\n')

        with open(paired2_filepath, 'w') as f:
            for barcode in seed2_strings + new_barcodes[split_point:]:
                f.write(barcode + '\n')

        # Log results
        logging.info(f"Paired files written to:")
        logging.info(f"  {paired1_filepath}")
        logging.info(f"  {paired2_filepath}")
        logging.info(f"Log file: {log_filepath}")

        # Print result
        if sequence_set.sequences:
            print(f"Successfully generated {len(barcodes)} barcodes (paired with seeds)")
        else:
            print(f"Successfully generated {len(barcodes)} barcodes (paired)")
    else:
        # Write single output file
        output_filepath = os.path.join(args.output_dir, f"{args.output_prefix}.txt")
        with open(output_filepath, 'w') as f:
            for barcode in barcodes:
                f.write(barcode + '\n')

        # Log results
        logging.info(f"Output written to: {output_filepath}")
        logging.info(f"Log file: {log_filepath}")

        # Print result
        if sequence_set.sequences:
            print(f"Successfully generated {len(barcodes)} barcodes (with seeds)")
        else:
            print(f"Successfully generated {len(barcodes)} barcodes")

def setup_argument_parser():
    """Setup and return the argument parser for barcode generation"""
    parser = argparse.ArgumentParser(
        description="Generate high-performance DNA barcodes for NGS applications (from scratch or by extending from provided seed sequences)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Required arguments
    parser.add_argument('--count', type=int, required=True,
                        help='Number of barcodes or barcode pairs to generate')
    parser.add_argument('--length', type=int, required=True,
                        help='Length of barcodes or barcode pairs to generate')

    # Output arguments
    parser.add_argument('--output-dir', type=str, default='test',
                        help='Output directory for barcodes and logs')
    parser.add_argument('--output-prefix', type=str, default='barcodes',
                        help='Output filename prefix (adds .txt automatically; when paired mode is on, adds _paired1.txt and _paired2.txt)')

    # Filter arguments with defaults
    parser.add_argument('--gc-min', type=float, default=0.4,
                        help='Minimum GC content (as fraction, e.g., 0.4 = 40%%)')
    parser.add_argument('--gc-max', type=float, default=0.6,
                        help='Maximum GC content (as fraction, e.g., 0.6 = 60%%)')
    parser.add_argument('--homopolymer-max', type=int, default=2,
                        help='Maximum allowed homopolymer repeat length')
    parser.add_argument('--min-distance', type=int, default=3,
                        help='Minimum edit distance between barcodes')

    # Performance arguments
    parser.add_argument('--cpus', type=int, default=mp.cpu_count(),
                        help='Number of CPU cores to use during the parallel filtering step')

    # Seed arguments
    parser.add_argument('--seeds', nargs='+', type=str, default=[],
                        help='Seed sequence files with existing barcode lists to extend from (any number of .txt files, one sequence per line; multiple files will be concatenated automatically) - incompatible with --paired mode')

    # Mode arguments
    parser.add_argument('--paired', action='store_true',
                        help='Generate paired barcodes by doubling the target count and randomly splitting the output into two equal parts, saved as _paired1.txt and _paired2.txt - incompatible with --seeds')

    # Paired seed arguments (for paired mode only)
    parser.add_argument('--paired-seed1', type=str, default=None,
                        help='Seed sequence file for one side of the paired barcode set to extend from (single .txt, one sequence per line; requires matching file provided via --paired-seed2) - used only with --paired')
    parser.add_argument('--paired-seed2', type=str, default=None,
                        help='Seed sequence file for the other side of the paired barcode set to extend from (single .txt, one sequence per line; requires matching file provided via --paired-seed1) - used only with --paired')

    return parser

def validate_seed_arguments(args):
    """Validate seed-related argument combinations"""
    # Validate paired seed arguments
    if args.paired:
        # In paired mode, either both paired seeds or no seeds at all
        if (args.paired_seed1 is not None) != (args.paired_seed2 is not None):
            raise ValueError("When using --paired with seeds, both --paired-seed1 and --paired-seed2 must be provided")
        elif args.seeds:
            raise ValueError("--seeds argument is incompatible with --paired mode. Use --paired-seed1 and --paired-seed2 instead")
    else:
        # In non-paired mode, paired seed arguments should not be used
        if args.paired_seed1 is not None or args.paired_seed2 is not None:
            raise ValueError("--paired-seed1 and --paired-seed2 can only be used with --paired mode")

def calculate_hamming_bound(length, min_distance):
    """Calculate theoretical maximum number of sequences for given length and minimum distance"""

    # Hamming bound: M ≤ 4^n / V(n, t) where t = floor((d-1)/2)
    # V(n, t) is the volume of a Hamming sphere of radius t
    total_sequences = 4 ** length
    t = (min_distance - 1) // 2

    # Calculate volume of Hamming sphere: V(n, t) = sum(C(n,i) * 3^i) for i=0 to t
    sphere_volume = 0
    for i in range(t + 1):
        # C(n, i) = n! / (i! * (n-i)!)
        combinations = math.comb(length, i)
        sphere_volume += combinations * (3 ** i)  # 3 possible mutations per position

    # Hamming bound
    max_sequences = total_sequences // sphere_volume
    return max_sequences
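    # Worked example (added note, not in the original source): for length=10 and min_distance=3,
    # t = 1 and the sphere volume is 1 + 10*3 = 31, so the Hamming (upper) bound is 4**10 // 31 = 33,825.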

def calculate_gv_bound(length, min_distance):
    """Calculate Gilbert-Varshamov bound (lower bound) for given length and minimum distance"""

    # GV bound: M ≥ 4^n / V(n, d-1) where d is the minimum distance
    # V(n, d-1) is the volume of a Hamming sphere of radius d-1
    total_sequences = 4 ** length

    # Calculate volume of Hamming sphere: V(n, d-1) = sum(C(n,i) * 3^i) for i=0 to d-1
    sphere_volume = 0
    for i in range(min_distance):
        # C(n, i) = n! / (i! * (n-i)!)
        combinations = math.comb(length, i)
        sphere_volume += combinations * (3 ** i)  # 3 possible mutations per position

    # GV bound (minimum possible)
    min_sequences = total_sequences // sphere_volume
    return min_sequences
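    # Worked example (added note, not in the original source): for length=10 and min_distance=3,
    # the radius is d-1 = 2 and the sphere volume is 1 + 30 + 405 = 436, so the GV (lower) bound
    # is 4**10 // 436 = 2,404.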

def validate_generator_arguments(args, seed_pool, length_counts):
    """Validate generator-specific arguments (length, count, min_distance, homopolymer_max) and return has_mixed_lengths flag"""
    # 1. Length validation
    if args.length <= 0:
        raise ValueError("Length must be > 0")

    # Length warning for very long barcodes
    if args.length > 20:
        logging.warning(f"Requested barcode length ({args.length}) exceeds 20bp. Very long barcodes can be synthetically unstable and more error-prone, and this program is optimized for short sequences. It is recommended to generate barcodes of length 20bp or less for expected behavior.")

    # 2. Homopolymer repeat x barcode length validation
    if args.homopolymer_max >= args.length:
        raise ValueError("Maximum homopolymer repeat length must be < new barcode length")

    # 3. Minimum distance x barcode length validation
    # Determine effective length for distance validation and check for mixed lengths
    effective_length = args.length
    seed_length = None
    has_mixed_lengths = False  # will return this flag for later use

    if seed_pool:
        # Use length_counts to determine seed lengths
        seed_length = max(length_counts.keys())
        effective_length = max(args.length, seed_length)

        # Distance validation for case with seeds
        if args.min_distance >= effective_length:
            raise ValueError(f"Minimum distance must be < {effective_length} (the longer of new barcode length {args.length} and max seed length {seed_length})")

        # Check for mixed lengths (within seeds or between seeds and target), will return this flag for later use
        if len(length_counts) > 1 or seed_length != args.length:
            has_mixed_lengths = True

    else:
        # Distance validation for case without seeds
        if args.min_distance >= effective_length:
            raise ValueError(f"Minimum distance must be < {effective_length} (new barcode length)")

    # 4. Count validation
    if args.count <= 0:
        raise ValueError("Count must be > 0")

    # 5. Count x barcode length x minimum distance validation given Hamming and Gilbert-Varshamov bounds
    logging.info(f"Performing Hamming and Gilbert-Varshamov bounds validation for barcodes of length {args.length} with minimum distance {args.min_distance} to see if the requested count is possible...")

    # Check bounds only for simple cases (no mixed lengths)
    if not has_mixed_lengths:
        # Calculate Hamming and Gilbert-Varshamov bounds
        max_possible = calculate_hamming_bound(args.length, args.min_distance)
        min_possible = calculate_gv_bound(args.length, args.min_distance)

        # Format large numbers for readability
        max_formatted = f"{max_possible:,}" if max_possible > 1000 else str(max_possible)
        min_formatted = f"{min_possible:,}" if min_possible > 1000 else str(min_possible)

        logging.info(f"Bounds for barcode length {args.length}, min distance {args.min_distance}: GV (lower) bound = {min_formatted}, Hamming (upper) bound = {max_formatted}")

        # Validate against bounds
        if not seed_pool:
            # Case 1: No seeds - simple check
            if args.count > max_possible:
                raise ValueError(f"Requested count ({args.count:,}) exceeds Hamming (upper) bound ({max_formatted}) for length {args.length}, min distance {args.min_distance}. Please reduce the requested count.")
            elif args.count > min_possible:
                logging.warning(f"Requested count ({args.count:,}) exceeds GV (lower) bound ({min_formatted}) - may take longer to generate and could potentially fail")
        else:
            # Case 2: With seeds - check total sequences
            total_sequences = args.count + len(seed_pool)
            if total_sequences > max_possible:
                raise ValueError(f"Total sequences ({total_sequences:,}) exceeds Hamming (upper) bound ({max_formatted}) for length {args.length}, min distance {args.min_distance}. Please reduce the requested count or seed size.")
            elif total_sequences > min_possible:
                logging.warning(f"Total sequences ({total_sequences:,}) exceeds GV (lower) bound ({min_formatted}) - may take longer to generate and could potentially fail")
    else:
        # Complex case: mixed lengths - skip bounds validation
        logging.warning(f"Seeds have mixed lengths or different length(s) from target. Hamming and Gilbert-Varshamov bounds validation skipped.")

    return has_mixed_lengths

def main(argv=None):
    parser = setup_argument_parser()
    args = parser.parse_args(argv)
    log_filepath = setup_logging(args, "generate_barcodes")
    validate_filter_arguments(args)  # simple validation of filter arguments

    # Initialize empty sequence set
    sequence_set = ExistingSequenceSet()

    # Load and validate seeds if any seed arguments are provided
    if args.seeds or args.paired_seed1 or args.paired_seed2:
        validate_seed_arguments(args)
        if args.seeds:
            sequence_set = ExistingSequenceSet.from_unpaired_seeds(args.seeds)
        elif args.paired and args.paired_seed1 and args.paired_seed2:
            sequence_set = ExistingSequenceSet.from_paired_seeds(args.paired_seed1, args.paired_seed2)

    # Perform complex validation on generator-specific arguments and get the has_mixed_lengths flag
    has_mixed_lengths = validate_generator_arguments(args, sequence_set.sequences, sequence_set.length_counts)

    # Generate barcodes
    barcodes = generate_barcodes_core(
        target_count=args.count,
        length=args.length,
        gc_min=args.gc_min,
        gc_max=args.gc_max,
        homopolymer_max=args.homopolymer_max,
        min_distance=args.min_distance,
        n_cpus=args.cpus,
        seed_pool=sequence_set.sequences,
        is_paired=args.paired,
        has_mixed_lengths=has_mixed_lengths
    )

    # Write outputs
    write_barcode_outputs(barcodes, args, sequence_set, log_filepath)

if __name__ == "__main__":
    main(sys.argv[1:])