barcadia 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- barcadia/__init__.py +20 -0
- barcadia/cli.py +61 -0
- barcadia/config_utils.py +225 -0
- barcadia/filter_utils.py +152 -0
- barcadia/generate_barcodes.py +613 -0
- barcadia/tools/generate_random_sequences.py +98 -0
- barcadia/tools/memory_benchmark.py +139 -0
- barcadia/validate_barcodes.py +393 -0
- barcadia-3.2.0.dist-info/METADATA +466 -0
- barcadia-3.2.0.dist-info/RECORD +14 -0
- barcadia-3.2.0.dist-info/WHEEL +5 -0
- barcadia-3.2.0.dist-info/entry_points.txt +2 -0
- barcadia-3.2.0.dist-info/licenses/LICENSE +202 -0
- barcadia-3.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""
generate_random_sequences.py

Generate random DNA sequences for testing validation scripts (variable length sequences supported).

Example usage: python src/barcadia/tools/generate_random_sequences.py --count 10000 --lengths 12 13

Output: random DNA sequences (one per line as .txt)

Optional arguments:
    --output: output file path (default: test/{count}_random_{min}to{max}bp_sequences.txt)

Required arguments:
    --count: number of sequences to generate
    --lengths: possible lengths for sequences
"""

import argparse
import random
import os
import sys
import numpy as np

# Add src directory to Python path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from config_utils import decode_sequence  # type: ignore

def generate_random_sequence(length):
    """Generate a single random DNA sequence of given length as integer array"""
    return np.random.randint(0, 4, size=length, dtype=np.int8)

def main():
    parser = argparse.ArgumentParser(
        description="Generate random DNA sequences for testing",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        epilog="Example: python src/barcadia/tools/generate_random_sequences.py --count 10000 --lengths 12 13"
    )

    parser.add_argument('--count', type=int, required=True,
                        help='Number of sequences to generate')
    parser.add_argument('--lengths', nargs='+', type=int, required=True,
                        help='Possible lengths for sequences')
    parser.add_argument('--output', type=str,
                        help='Output file path (default: test/{count}_random_{min}to{max}bp_sequences.txt)')

    args = parser.parse_args()

    # Validate arguments
    if args.count <= 0:
        raise ValueError("Count must be > 0")

    for length in args.lengths:
        if length <= 0:
            raise ValueError(f"All lengths must be > 0, got {length}")

    # Check if count exceeds maximum possible sequences across all lengths
    total_max_possible = sum(4 ** length for length in args.lengths)
    if args.count > total_max_possible:
        raise ValueError(f"Count ({args.count}) exceeds maximum possible sequences for lengths {args.lengths} ({total_max_possible})")

    # Generate default output path if not specified
    if args.output is None:
        os.makedirs('test', exist_ok=True)
        min_length = min(args.lengths)
        max_length = max(args.lengths)

        # Handle single length vs range
        if min_length == max_length:
            length_range = str(min_length)
        else:
            length_range = f"{min_length}to{max_length}"

        args.output = f"test/{args.count}_random_{length_range}bp_sequences.txt"

    # Generate sequences, write to file, and count lengths in one loop
    length_counts = {}
    with open(args.output, 'w') as f:
        for _ in range(args.count):
            # Randomly choose a length from the provided options
            chosen_length = random.choice(args.lengths)
            seq_array = generate_random_sequence(chosen_length)

            # Write to file (convert to DNA string for output)
            dna_string = decode_sequence(seq_array)
            f.write(dna_string + '\n')

            # Count sequences by length
            length_counts[chosen_length] = length_counts.get(chosen_length, 0) + 1

    length_breakdown = ", ".join([f"{count} at length {length}" for length, count in sorted(length_counts.items())])

    print(f"Generated {args.count} random DNA sequences with lengths: {length_breakdown}")
    print(f"Output written to: {args.output}")

if __name__ == "__main__":
    main()
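
Note: the script above converts integer arrays back to text via decode_sequence from config_utils.py, which is not included in this hunk. Below is a minimal illustrative sketch of that decoding step, assuming a 0/1/2/3 -> A/C/G/T code table; the actual lookup lives in barcadia/config_utils.py and may differ.

    # Illustrative only -- assumes the 0/1/2/3 -> A/C/G/T encoding.
    import numpy as np

    _BASES = np.array(list("ACGT"))

    def decode_sequence_sketch(seq_array):
        """Turn an int8 code array such as [0, 2, 3, 1] into a DNA string."""
        return "".join(_BASES[seq_array])

    print(decode_sequence_sketch(np.array([0, 2, 3, 1], dtype=np.int8)))  # AGTC
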
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
memory_benchmark.py

Simple memory benchmarking utility for tracking maximum memory usage of barcadia commands.

Example usage:
    python src/barcadia/tools/memory_benchmark.py barcadia generate --args
    python src/barcadia/tools/memory_benchmark.py barcadia validate --args

Output: memory usage report shown in terminal and memory_benchmark_{timestamp}.log file

Optional arguments:
    --mem-output-dir: output directory for benchmark logs (default: test)

Required arguments:
    command: the barcadia command to benchmark (e.g., generate or validate)
"""

import psutil
import subprocess
import sys
import time
import os
import logging
import argparse

# Add src directory to Python path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from config_utils import setup_logging  # type: ignore

def benchmark_command(command):
    """
    Benchmark memory usage of a barcadia command.

    Args:
        command: The command to run (e.g., ['barcadia', 'generate', '--count', '1000'])

    Returns:
        dict: Benchmark results
    """
    # Run the command as a subprocess and track its memory
    start_time = time.time()

    # Start the subprocess; let it inherit stdout/stderr so it can print directly to terminal
    process = subprocess.Popen(command)

    # Track memory usage of the subprocess
    peak_memory = 0
    try:
        while process.poll() is None:  # While process is still running
            try:
                # Get memory usage of the subprocess
                child_process = psutil.Process(process.pid)
                memory_bytes = child_process.memory_info().rss
                peak_memory = max(peak_memory, memory_bytes)
                time.sleep(0.1)  # Sample every 100ms
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Process might have finished or we don't have access
                break

        # Wait for process to complete
        process.wait()
        return_code = process.returncode
        stdout, stderr = None, None

    except Exception as e:
        logging.error(f"Error tracking memory: {e}")
        process.terminate()
        return None

    duration = time.time() - start_time
    peak_memory_mb = peak_memory / 1024 / 1024

    if return_code != 0:
        logging.error(f"Command failed with return code {return_code}")
        logging.error("Check the command's own log file for details")
        return None

    return {
        'command': ' '.join(command),
        'duration_seconds': duration,
        'peak_memory_mb': peak_memory_mb,
        'return_code': return_code,
        'stdout': stdout,
        'stderr': stderr
    }

def main():
    """Command-line interface for benchmarking barcadia commands."""
    parser = argparse.ArgumentParser(
        description="Benchmark memory usage of barcadia commands",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Examples:\n"
               "  python src/barcadia/tools/memory_benchmark.py barcadia validate --args\n"
               "  python src/barcadia/tools/memory_benchmark.py barcadia generate --args"
    )

    parser.add_argument('--mem-output-dir', type=str, default='test',
                        help='Output directory for benchmark logs (default: test)')
    parser.add_argument('command', nargs=argparse.REMAINDER,
                        help='The barcadia command to benchmark (e.g., generate or validate)')

    args = parser.parse_args()

    if not args.command:
        parser.error("Please provide a barcadia command to benchmark")

    # Setup logging
    args.output_dir = args.mem_output_dir
    log_filepath = setup_logging(args, "memory_benchmark")

    # Log benchmark start
    command_str = ' '.join(args.command)
    logging.info(f"Starting memory benchmark for: {command_str}")
    logging.info("-" * 50)

    # Run benchmark
    result = benchmark_command(args.command)

    if result:
        # Log benchmark results only
        logging.info("-" * 50)
        logging.info("BENCHMARK RESULTS:")
        logging.info(f"Command: {result['command']}")
        logging.info(f"Duration: {result['duration_seconds']:.2f} seconds")
        logging.info(f"Peak Memory: {result['peak_memory_mb']:.2f} MB")
        logging.info(f"Return Code: {result['return_code']}")
        logging.info("-" * 50)
        # Note: the benchmarked command prints its own log file path to the terminal.
        logging.info(f"Benchmark log file: {log_filepath}")
    else:
        logging.error("Benchmark failed!")

if __name__ == "__main__":
    main()
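
Note: benchmark_command can also be driven directly from Python rather than through the CLI wrapper above. A minimal sketch, reusing the example command from the function's docstring and assuming the barcadia entry point is installed and logging is already configured:

    # Hypothetical direct use of benchmark_command; command taken from the docstring example.
    from barcadia.tools.memory_benchmark import benchmark_command

    result = benchmark_command(['barcadia', 'generate', '--count', '1000'])
    if result is not None:
        print(f"Peak memory: {result['peak_memory_mb']:.2f} MB "
              f"over {result['duration_seconds']:.2f} s")
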
@@ -0,0 +1,393 @@
#!/usr/bin/env python3
"""
validate_barcodes.py

Validate if provided lists of NGS barcodes satisfy all quality filters (variable length sequences supported).

Program Overview:

1. Load and parse input file(s) and report length distribution
2. Check if sequences fail biological filters (GC content, homopolymer checks) - reports ALL violations per sequence
3. For sequences passing biological filters, apply intelligent algorithm selection for distance validation (unless --skip-distance flag is enabled):
    3a. Method selection logic (based on sequences passing biological filters):
        - Small barcode sets (<10K sequences): Pairwise sequential
        - Large barcode sets (≥10K sequences) with mixed lengths and/or min_distance > 4: Pairwise parallel (when multiple CPUs available, otherwise sequential)
        - Large equal-length barcode sets (≥10K sequences) with min_distance ≤ 4: Neighbor enumeration
    3b. Distance calculation: Hamming distance for equal-length sequences, Levenshtein for mixed lengths
    3c. Progress logging during validation (every 10 chunks for pairwise parallel, every 10K sequences for neighbor enumeration)
    3d. Early stopping on first distance violation with detailed reporting
4. Generate comprehensive validation report with violation details

Input: list(s) of NGS barcodes (one per line as .txt). Multiple files supported, concatenated automatically.

Output: validation report (validation_report_{timestamp}.txt) and validate_barcodes_{timestamp}.log file

Optional arguments:
    --gc-min: minimum GC content (default: 0.4)
    --gc-max: maximum GC content (default: 0.6)
    --homopolymer-max: maximum allowed homopolymer repeat length (default: 2)
    --min-distance: minimum edit distance between barcodes (default: 3)
    --skip-distance: skip distance validation entirely (default: off)
    --output-dir: output directory for validation logs and reports (default: test)
    --cpus: number of CPUs to use for pairwise parallel distance validation (default: all available)

Required arguments:
    --input: input file(s) containing NGS barcodes (one per line)
"""

import argparse
import logging
import os
import time
import multiprocessing as mp
import sys
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime

# Import utility functions
from .config_utils import decode_sequence, setup_logging, ExistingSequenceSet
from .filter_utils import validate_filter_arguments, check_gc_content_int, check_homopolymer_int, calculate_distance, select_distance_method, generate_hamming_neighbors

def validate_biological_filters(seq_array, gc_min, gc_max, homopolymer_max):
    """Check if sequence passes all biological filters and return all violations"""
    violations = []

    # Check GC content
    if not check_gc_content_int(seq_array, gc_min, gc_max):
        violations.append("GC content outside range")

    # Check homopolymer runs
    if not check_homopolymer_int(seq_array, homopolymer_max):
        violations.append("Homopolymer run too long")

    if violations:
        return False, "; ".join(violations)
    else:
        return True, "Passes all filters"

def log_violation_details(sequences, i, j, violation):
    """Log distance violation and create violation details with DNA strings for reporting"""
    logging.info(f"Early stopping: Found distance violation between sequences {violation[0]+1} and {violation[1]+1} (distance={violation[2]})")
    seq1_str = decode_sequence(sequences[i])
    seq2_str = decode_sequence(sequences[j])
    return (violation[0]+1, violation[1]+1, seq1_str, seq2_str, violation[2])

def validate_distances_neighbor_enum(sequences, min_distance):
    """Validate using neighbor enumeration - much faster for appropriate cases"""
    # Build hash set of all sequences for O(1) lookup
    sequence_set = set(tuple(seq) for seq in sequences)
    total_sequences = len(sequences)

    # Check each sequence for violations
    for i, seq in enumerate(sequences):
        # Progress logging every 10K sequences
        if i % 10_000 == 0 and i > 0:
            logging.info(f"Progress: {i:,}/{total_sequences:,} sequences processed "
                         f"({i/total_sequences*100:.1f}%)")

        seq_array = list(seq)  # Make mutable copy for neighbor generation

        # Generate all neighbors within min_distance
        for neighbor in generate_hamming_neighbors(seq_array, min_distance - 1):
            if neighbor in sequence_set and neighbor != tuple(seq):
                # Found a violation - get the index of the violating sequence
                for j, other_seq in enumerate(sequences):
                    if j != i and tuple(other_seq) == neighbor:
                        # Calculate actual distance for reporting
                        actual_distance = sum(a != b for a, b in zip(seq, other_seq))
                        violation = (i, j, actual_distance)
                        violation_details = log_violation_details(sequences, i, j, violation)
                        sequences_processed = i + 1  # Number of sequences processed when violation found
                        return True, sequences_processed, violation_details

    # No violations found - processed all sequences
    return False, total_sequences, None

def generate_pair_chunk(start_idx, chunk_size, n):
    """Generate a chunk of pairs lazily starting from start_idx"""
    pairs_generated = 0
    current_idx = 0

    for i in range(n):
        for j in range(i + 1, n):
            if current_idx >= start_idx:
                if pairs_generated >= chunk_size:
                    return
                yield (i, j)
                pairs_generated += 1
            current_idx += 1

def validate_chunk(sequences_chunk, min_distance):
    """Worker function for distance validation (can be used sequentially or in parallel)"""
    pairs_checked = 0
    n = len(sequences_chunk)

    # Use the lazy pair generator for memory efficiency
    for i, j in generate_pair_chunk(0, n * (n - 1) // 2, n):
        distance = calculate_distance(sequences_chunk[i], sequences_chunk[j], min_distance)
        pairs_checked += 1

        if distance < min_distance:
            violation = (i, j, distance)
            violation_details = log_violation_details(sequences_chunk, i, j, violation)
            return True, pairs_checked, violation_details

    return False, pairs_checked, None

def validate_distances(sequences, min_distance, method, cpus, chunk_size):
    """Unified distance validation with method selection and parallel/sequential execution"""
    # Execute the chosen method
    if method == "neighbor_enumeration":
        # Use neighbor enumeration when the setup is optimal (no parallelization involved)
        return validate_distances_neighbor_enum(sequences, min_distance)
    elif method == "pairwise_sequential" or cpus == 1:
        # Use sequential for small barcode sets or single CPU
        return validate_chunk(sequences, min_distance)
    else:  # method == "pairwise" and cpus > 1
        # Large dataset with multiple CPUs - always use parallel with pre-calculated chunk size
        chunks = [sequences[i:i+chunk_size] for i in range(0, len(sequences), chunk_size)]

        with ProcessPoolExecutor(max_workers=cpus) as executor:
            futures = [
                executor.submit(validate_chunk, chunk, min_distance)
                for chunk in chunks
            ]

            # Process results with early stopping
            total_pairs_checked = 0
            for future in futures:
                early_stopped, pairs_checked, violation_info = future.result()
                total_pairs_checked += pairs_checked

                if early_stopped:
                    # Cancel remaining futures for early stopping
                    for f in futures:
                        f.cancel()
                    return True, total_pairs_checked, violation_info

            return False, total_pairs_checked, None

def validate_barcodes_core(sequences, gc_min, gc_max, homopolymer_max, min_distance, has_mixed_lengths, skip_distance, cpus, output_dir, input_file, log_filepath):
    """Main function to validate input barcode sets against biological filters and distance constraints"""
    start_time = time.time()

    logging.info("Starting barcode validation...")
    logging.info(f"Filter 1 (within-sequence), GC content: {gc_min:.1%} - {gc_max:.1%}")
    logging.info(f"Filter 2 (within-sequence), Max homopolymer repeat: {homopolymer_max}")
    logging.info(f"Filter 3 (between-sequence), Minimum edit distance: {min_distance}")

    # 1. Validate sequences for biological filters
    valid_sequences = []
    biological_violations = []

    for i, seq_array in enumerate(sequences):
        # Check biological filters
        is_valid, reason = validate_biological_filters(seq_array, gc_min, gc_max, homopolymer_max)

        if is_valid:
            valid_sequences.append(seq_array)
        else:
            # Convert back to DNA string for reporting
            dna_string = decode_sequence(seq_array)
            biological_violations.append((i+1, dna_string, reason))

    logging.info("Biological filter (GC content and homopolymer repeats) results:")
    logging.info(f"  Passed: {len(valid_sequences)} sequences")
    logging.info(f"  Failed: {len(biological_violations)} sequences")

    # 2. Validate sequences for distance constraints
    # Calculate total distance pairs for sequences that passed biological filters
    n = len(valid_sequences)
    total_pairs = n * (n - 1) // 2

    # Check if we should skip distance validation
    distance_skipped = False
    logging.info("Distance filter results:")
    if skip_distance:
        logging.info("Skipping distance validation (--skip-distance flag enabled)")
        early_stopped = False
        features_checked = 0
        distance_skipped = True
        validation_method = "skipped"
        violation_info = None
        logging.info("  Distance validation skipped")
    # If not, continue with distance validation
    else:
        logging.info("Validating distances for sequences that passed biological filters...")
        # Determine method and calculate chunk size for pairwise method
        method = select_distance_method(n, min_distance, has_mixed_lengths)

        # Calculate chunk size for pairwise method with multiple CPUs
        chunk_size = None
        if method == "pairwise_sequential":
            logging.info("Using sequential pairwise distance checking (small barcode set for sequences that passed biological filters)")
        elif method == "pairwise" and cpus == 1:
            logging.info("Using sequential pairwise distance checking (1 CPU)")
        elif method == "pairwise" and cpus > 1:
            chunk_size = max(100000, total_pairs // (cpus * 10))
            logging.info(f"Using parallel pairwise distance checking (chunk size: {chunk_size})")

        # Execute validation
        early_stopped, features_checked, violation_info = validate_distances(valid_sequences, min_distance, method, cpus, chunk_size)

        # Log results
        if method == "neighbor_enumeration":
            logging.info(f"  Total sequences (that passed biological filters): {n}")
            logging.info(f"  Sequences processed: {features_checked}")
        else:
            logging.info(f"  Total sequence pairs: {total_pairs:,} (sequences that passed biological filters)")
            logging.info(f"  Pairs checked: {features_checked:,}")

        validation_method = method

    overall_valid = len(valid_sequences) == len(sequences) and not early_stopped

    duration = time.time() - start_time

    logging.info("Validation complete!")
    logging.info(f"Overall validation: {'PASSED' if overall_valid else 'FAILED'}")
    logging.info(f"Total time: {duration:.2f} seconds")

    # 3. Generate report
    logging.info("Generating report...")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_file = os.path.join(output_dir, f"validation_report_{timestamp}.txt")

    with open(report_file, 'w') as f:
        f.write("Barcode Validation Report\n")
        f.write("=" * 50 + "\n\n")

        f.write(f"Input file: {input_file}\n")
        f.write(f"Total sequences: {len(sequences)}\n\n")
        f.write("Filter Settings:\n")
        f.write(f"  GC content: {gc_min:.1%} - {gc_max:.1%}\n")
        f.write(f"  Max homopolymer: {homopolymer_max}\n")
        f.write(f"  Minimum distance: {min_distance}\n\n")
        f.write(f"Biological filter passed: {len(valid_sequences)}\n")
        f.write(f"Biological filter failed: {len(biological_violations)}\n")

        if distance_skipped:
            f.write("Distance validation: SKIPPED (--skip-distance flag enabled)\n")
        elif early_stopped:
            f.write("Distance validation: EARLY STOPPED (found first violation)\n")
            f.write(f"  Method used: {validation_method}\n")
            if validation_method == "neighbor_enumeration":
                f.write(f"  Sequences processed before stopping: {features_checked:,}\n")
            else:
                f.write(f"  Pairs checked before stopping: {features_checked:,}\n")
        else:
            f.write("Distance validation: PASSED (no violations found)\n")
            f.write(f"  Method used: {validation_method}\n")
            if validation_method == "neighbor_enumeration":
                f.write(f"  Total sequences processed: {features_checked:,}\n")
            else:
                f.write(f"  Total pairs checked: {features_checked:,}\n")
        f.write("\n")

        if biological_violations:
            f.write("Biological Filter (GC content and homopolymer) Violations:\n")
            f.write("-" * 30 + "\n")
            for line_num, seq, reason in biological_violations:
                f.write(f"Line {line_num}: {seq} - {reason}\n")
            f.write("\n")

        # Add distance violation details if available
        if violation_info is not None:
            f.write("Distance Violations:\n")
            f.write("-" * 19 + "\n")
            seq1_line, seq2_line, seq1_str, seq2_str, distance = violation_info
            f.write(f"Line {seq1_line}: {seq1_str} and Line {seq2_line}: {seq2_str} - distance {distance} (minimum required: {min_distance})\n")
            f.write("\n")

    # Log file locations
    if log_filepath:
        logging.info(f"Log file: {log_filepath}")
    logging.info(f"Report file: {report_file}")

    # Final output
    if overall_valid:
        print("All barcodes are valid!")
    else:
        print("VALIDATION FAILED!")

def setup_argument_parser():
    """Setup and return the argument parser for barcode validation"""
    parser = argparse.ArgumentParser(
        description="Validate DNA barcodes against quality filters (GC content, homopolymer repeats, minimum distance)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Required arguments
    parser.add_argument('--input', type=str, required=True, nargs='+',
                        help='Input file(s) containing DNA barcodes (one per line)')

    # Output arguments
    parser.add_argument('--output-dir', type=str, default='test',
                        help='Output directory for validation logs and reports')

    # Filter arguments with defaults
    parser.add_argument('--gc-min', type=float, default=0.4,
                        help='Minimum GC content (as fraction, e.g., 0.4 = 40%%)')
    parser.add_argument('--gc-max', type=float, default=0.6,
                        help='Maximum GC content (as fraction, e.g., 0.6 = 60%%)')
    parser.add_argument('--homopolymer-max', type=int, default=2,
                        help='Maximum allowed homopolymer repeat length')
    parser.add_argument('--min-distance', type=int, default=3,
                        help='Minimum edit distance between sequences')

    # Performance arguments
    parser.add_argument('--cpus', type=int, default=mp.cpu_count(),
                        help='Number of CPU cores to use for parallel distance validation')

    # Mode arguments
    parser.add_argument('--skip-distance', action='store_true',
                        help='Skip distance validation entirely')

    return parser

def validate_validator_arguments(args, length_counts):
    """Validate validator-specific arguments (length, distance, homopolymer) and return has_mixed_lengths flag"""
    input_length = max(length_counts.keys())

    # Homopolymer repeat x max input length validation
    if args.homopolymer_max >= input_length:
        raise ValueError(f"Maximum homopolymer repeat length must be < max input length ({input_length}bp)")

    # Minimum distance x max input length validation
    if args.min_distance >= input_length:
        raise ValueError(f"Minimum distance must be < max input length ({input_length}bp)")

    # Check for mixed lengths
    has_mixed_lengths = len(length_counts) > 1

    return has_mixed_lengths

def main(argv=None):
    parser = setup_argument_parser()
    args = parser.parse_args(argv)
    log_filepath = setup_logging(args, "validate_barcodes")
    validate_filter_arguments(args)  # simple validation of filter arguments

    # Load input files using ExistingSequenceSet
    sequence_set = ExistingSequenceSet.from_input_files(args.input)

    # Validate validator-specific arguments and get mixed lengths flag
    has_mixed_lengths = validate_validator_arguments(args, sequence_set.length_counts)

    # Validate barcodes
    validate_barcodes_core(
        sequences=sequence_set.sequences,
        gc_min=args.gc_min,
        gc_max=args.gc_max,
        homopolymer_max=args.homopolymer_max,
        min_distance=args.min_distance,
        has_mixed_lengths=has_mixed_lengths,
        skip_distance=args.skip_distance,
        cpus=args.cpus,
        output_dir=args.output_dir,
        input_file=args.input,
        log_filepath=log_filepath
    )

if __name__ == "__main__":
    main(sys.argv[1:])
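
Note: the neighbor-enumeration path above depends on generate_hamming_neighbors from barcadia/filter_utils.py, which is not included in this hunk. The sketch below illustrates the underlying technique under the assumption of a 4-symbol integer alphabet (0-3): enumerate every sequence within Hamming radius min_distance - 1 and test membership in the precomputed hash set. It is a sketch of the idea, not the package's implementation.

    # Sketch of Hamming-neighbor enumeration (illustrative; filter_utils.py may differ).
    from itertools import combinations, product

    def hamming_neighbors_sketch(seq, max_dist, alphabet=(0, 1, 2, 3)):
        """Yield every tuple within Hamming distance 1..max_dist of seq."""
        seq = tuple(seq)
        for dist in range(1, max_dist + 1):
            for positions in combinations(range(len(seq)), dist):
                # Substitute every combination of alternative symbols at the chosen positions
                alternatives = [[s for s in alphabet if s != seq[p]] for p in positions]
                for replacements in product(*alternatives):
                    neighbor = list(seq)
                    for p, r in zip(positions, replacements):
                        neighbor[p] = r
                    yield tuple(neighbor)

    # A violation exists if any neighbor within min_distance - 1 is itself a barcode.
    barcodes = {(0, 1, 2, 3), (0, 1, 2, 0)}   # toy set, Hamming distance 1 apart
    clash = any(n in barcodes for n in hamming_neighbors_sketch((0, 1, 2, 3), 2))
    print(clash)                               # True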