barcadia 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
barcadia/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ Barcadia: High-performance DNA barcode generation and validation for NGS applications.
3
+
4
+ This package provides efficient algorithms for generating and validating DNA barcodes
5
+ with configurable quality filters including GC content, homopolymer repeats, and
6
+ minimum edit distance constraints.
7
+
8
+ Public API:
9
+ generate_barcodes_core: Generate DNA barcodes with iterative growth algorithm
10
+ validate_barcodes_core: Validate DNA barcodes against quality filters
11
+ """
12
+
13
+ # Public API - only expose the core functions
14
+ from .generate_barcodes import generate_barcodes_core
15
+ from .validate_barcodes import validate_barcodes_core
16
+
17
+ __all__ = [
18
+ "generate_barcodes_core",
19
+ "validate_barcodes_core",
20
+ ]
barcadia/cli.py ADDED
@@ -0,0 +1,61 @@
1
+ """
2
+ Unified CLI for Barcadia.
3
+ Usage:
4
+ barcadia generate [options...] -> delegates to barcadia.generate_barcodes.main(argv)
5
+ barcadia validate [options...] -> delegates to barcadia.validate_barcodes.main(argv)
6
+ """
7
+
8
+ import sys
9
+ from importlib.metadata import version
10
+ from . import generate_barcodes as gen
11
+ from . import validate_barcodes as val
12
+
13
+ TOP_USAGE = (
14
+ "Barcadia - A high-performance, memory-efficient toolkit for fast generation and validation of large-scale NGS barcodes\n"
15
+ "\n"
16
+ "Usage:\n"
17
+ " barcadia <command> [options...]\n"
18
+ "\n"
19
+ "Commands:\n"
20
+ " generate Generate high-performance DNA barcodes for NGS applications\n"
21
+ " validate Validate DNA barcodes against quality filters\n"
22
+ "\n"
23
+ "Examples:\n"
24
+ " barcadia --help\n"
25
+ " barcadia generate --help\n"
26
+ " barcadia validate --help\n"
27
+ " barcadia generate --count 1000 --length 12\n"
28
+ " barcadia validate --input test/barcodes.txt\n"
29
+ "\n"
30
+ "Global options:\n"
31
+ " --help, -h Show this help message\n"
32
+ " --version, -v Show version information\n"
33
+ )
34
+
35
def main(argv=None) -> int:
    """
    Entry point for the unified ``barcadia`` command.

    Args:
        argv: Optional list of command-line arguments *without* the program
            name. When None (the default) the arguments are taken from
            ``sys.argv[1:]``, which is what the console script relies on.

    Returns:
        int: Process exit code. 0 for help/version/successful subcommands,
        1 when the installed version cannot be determined, 2 for an unknown
        subcommand, otherwise whatever the delegated subcommand returns
        (a falsy return value is normalised to 0).
    """
    args = sys.argv[1:] if argv is None else list(argv)

    # Handle version flag
    if args and args[0] in {"-v", "--version"}:
        # Local import keeps the file's top-level import block untouched
        from importlib.metadata import PackageNotFoundError

        try:
            print(version("barcadia"))
        except PackageNotFoundError:
            # No distribution metadata (e.g. running from a source checkout):
            # report it cleanly instead of dying with a traceback
            print(
                "barcadia: version unknown (package metadata not found; is it installed?)",
                file=sys.stderr,
            )
            return 1
        return 0

    # No subcommand → show top-level usage on stderr (nothing was asked for)
    if not args:
        print(TOP_USAGE, file=sys.stderr)
        return 0

    # Explicitly requested help is regular output, so it goes to stdout
    # (keeps `barcadia --help | less` and similar pipelines working)
    if args[0] in {"-h", "--help"}:
        print(TOP_USAGE)
        return 0

    cmd, rest = args[0], args[1:]

    if cmd == "generate":
        # gen.main must accept argv: list[str] | None
        return gen.main(rest) or 0

    if cmd == "validate":
        # val.main must accept argv: list[str] | None
        return val.main(rest) or 0

    # Unknown subcommand
    print(f"Unknown subcommand: {cmd}\n\n{TOP_USAGE}", file=sys.stderr)
    return 2


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,225 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ config_utils.py
4
+
5
+ Configuration and DNA-encoding/decoding utility functions for efficient barcode generation and validation.
6
+ """
7
+
8
+ import os
9
+ import logging
10
+ import numpy as np
11
+ from datetime import datetime
12
+
13
+ # DNA encoding constants
14
DNA_BASES = 'ATGC'
# Each base is encoded as its position in DNA_BASES: A=0, T=1, G=2, C=3
DNA_TO_INT = {base: code for code, base in enumerate(DNA_BASES)}
# Reverse lookup used for decoding
INT_TO_DNA = {code: base for base, code in DNA_TO_INT.items()}


def encode_sequence(dna_string):
    """Encode a DNA string as a numpy int8 array of base codes.

    A character that is not a key of DNA_TO_INT raises KeyError.
    """
    codes = list(map(DNA_TO_INT.__getitem__, dna_string))
    return np.array(codes, dtype=np.int8)


def decode_sequence(seq_array):
    """Decode an iterable of base codes back into a DNA string."""
    letters = map(INT_TO_DNA.__getitem__, seq_array)
    return ''.join(letters)
25
+
26
def setup_logging(args, script_name):
    """
    Create the output directory and configure root logging.

    Args:
        args: Object exposing an ``output_dir`` attribute
        script_name: Prefix used for the log file name

    Returns:
        str: Path of the timestamped log file inside ``args.output_dir``
    """
    out_dir = args.output_dir
    os.makedirs(out_dir, exist_ok=True)

    # Log file is named <script_name>_<YYYYmmdd_HHMMSS>.log
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_filepath = os.path.join(out_dir, f"{script_name}_{stamp}.log")

    # One handler for the log file, one for the console
    sinks = [
        logging.FileHandler(log_filepath),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%H:%M:%S',
        handlers=sinks,
    )

    return log_filepath
48
+
49
class ExistingSequenceSet:
    """
    A class to manage existing DNA sequence sets with file operations and validation.

    This class consolidates file reading, existence checking, and sequence management
    for both generation and validation scripts.

    Attributes:
        sequences: List of integer arrays (encoded DNA sequences)
        length_counts: Dictionary mapping sequence length to number of sequences
    """

    def __init__(self, sequences=None, length_counts=None):
        """
        Initialize the sequence set.

        Args:
            sequences: List of integer arrays (encoded DNA sequences)
            length_counts: Dictionary mapping length to count
        """
        self.sequences = sequences or []
        self.length_counts = length_counts or {}

    def _read_files(self, file_paths):
        """
        Internal method to read DNA sequences from files and convert to integer arrays.
        Handles file existence checking and path normalization internally.

        Blank lines are skipped. Lines containing any character outside
        DNA_BASES are skipped with a logged warning.

        Args:
            file_paths: A single path (str or os.PathLike, e.g. pathlib.Path)
                or an iterable of such paths

        Returns:
            tuple: (sequences, length_counts) where sequences are integer arrays

        Raises:
            ValueError: If any file does not exist or no valid sequence was read
        """
        # Normalize file paths: a lone str / PathLike becomes a one-element list.
        # Any other iterable is materialised because it is traversed twice below
        # (existence check, then reading) and its length is reported.
        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]
        else:
            file_paths = list(file_paths)

        # Check that all files exist before reading anything
        for file_path in file_paths:
            if not os.path.exists(file_path):
                raise ValueError(f"File does not exist: {file_path}")

        sequences = []
        length_counts = {}

        for file_path in file_paths:
            file_count = 0
            with open(file_path, 'r') as f:
                for line_num, line in enumerate(f, 1):
                    seq = line.strip()
                    if not seq:  # Skip empty lines
                        continue

                    # Basic validation
                    if not all(base in DNA_BASES for base in seq):
                        logging.warning(f"File {file_path}, line {line_num}: Invalid DNA sequence '{seq}', skipping")
                        continue

                    # Convert to integer array for efficient processing
                    seq_array = encode_sequence(seq)
                    sequences.append(seq_array)

                    # Count length while reading
                    length = len(seq_array)
                    length_counts[length] = length_counts.get(length, 0) + 1

                    file_count += 1

            logging.info(f"Loaded {file_count} sequences from {file_path}")

        if not sequences:
            # str() each entry so PathLike objects can be joined too
            raise ValueError(f"File(s) are empty: {', '.join(str(p) for p in file_paths)}")

        # Generate length info for logging
        if len(length_counts) == 1:
            length_info = f"length {next(iter(length_counts))}"
        else:
            length_breakdown = ", ".join(
                f"{count} at length {length}" for length, count in sorted(length_counts.items())
            )
            length_info = f"mixed lengths: {length_breakdown}"

        logging.info(f"Total loaded: {len(sequences)} sequences from {len(file_paths)} file(s) ({length_info})")

        return sequences, length_counts

    @classmethod
    def from_files(cls, file_paths):
        """
        Create ExistingSequenceSet from files (used by both validation and generation scripts).

        Args:
            file_paths: List of file paths or single file path

        Returns:
            ExistingSequenceSet: Instance with loaded sequences and length counts

        Raises:
            ValueError: Propagated from _read_files (missing file / nothing loaded)
        """
        instance = cls()
        instance.sequences, instance.length_counts = instance._read_files(file_paths)
        return instance

    @classmethod
    def from_input_files(cls, file_paths):
        """
        Create ExistingSequenceSet from input files (used by validation script).

        Args:
            file_paths: List of file paths or single file path

        Returns:
            ExistingSequenceSet: Instance with loaded sequences and length counts
        """
        return cls.from_files(file_paths)

    @classmethod
    def from_unpaired_seeds(cls, file_paths):
        """
        Create ExistingSequenceSet from unpaired seed files (used by generation script).

        Args:
            file_paths: List of file paths or single file path

        Returns:
            ExistingSequenceSet: Instance with loaded sequences and length counts
        """
        return cls.from_files(file_paths)

    @classmethod
    def from_paired_seeds(cls, file1, file2):
        """
        Create ExistingSequenceSet from paired seed files (used by generation script).

        Args:
            file1: Path to first paired seed file
            file2: Path to second paired seed file

        Returns:
            ExistingSequenceSet: Instance with combined sequences and length counts

        Raises:
            ValueError: If a file is missing or yields no sequences, if the two
                files hold different numbers of sequences, if either file mixes
                sequence lengths, or if the two files use different lengths
        """
        instance = cls()

        # Load paired seeds separately
        paired_seed1_pool, seed1_length_counts = instance._read_files([file1])
        paired_seed2_pool, seed2_length_counts = instance._read_files([file2])

        # Validate paired seeds
        # 1. Check that both files have the same number of sequences
        if len(paired_seed1_pool) != len(paired_seed2_pool):
            raise ValueError(f"Paired seed files must have the same number of sequences. "
                             f"Seed1: {len(paired_seed1_pool)} sequences, Seed2: {len(paired_seed2_pool)} sequences")

        # 2. Check that both files have sequences of the same length within the file
        if len(seed1_length_counts) != 1:
            raise ValueError(f"All sequences in paired seed file 1 must be the same length. "
                             f"Found lengths: {sorted(seed1_length_counts.keys())}")
        if len(seed2_length_counts) != 1:
            raise ValueError(f"All sequences in paired seed file 2 must be the same length. "
                             f"Found lengths: {sorted(seed2_length_counts.keys())}")

        # 3. Check that both files have sequences of the same length between the files
        seed1_length = next(iter(seed1_length_counts))
        seed2_length = next(iter(seed2_length_counts))
        if seed1_length != seed2_length:
            raise ValueError(f"Paired seed files must have sequences of the same length. "
                             f"Seed1 length: {seed1_length}, Seed2 length: {seed2_length}")

        # All validations passed - combine both for generation pool
        instance.sequences = paired_seed1_pool + paired_seed2_pool

        # Both files share one length and one count, so the combined count is
        # simply seed1's count doubled
        instance.length_counts = {
            length: count * 2 for length, count in seed1_length_counts.items()
        }

        return instance
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ filter_utils.py
4
+
5
+ Filter-related utility functions with Numba JIT compilation for efficient barcode generation and validation.
6
+ """
7
+
8
+ from numba import jit
9
+ import numpy as np
10
+ import logging
11
+
12
+ # Simple validation of filter arguments used in both generation and validation
13
def validate_filter_arguments(args):
    """
    Validate the filter-related command line arguments.

    Reads ``gc_min``, ``gc_max``, ``homopolymer_max`` and ``min_distance``
    from ``args`` and raises ValueError for the first rule that is violated.
    Returns None when every check passes.
    """
    gc_bounds_invalid = args.gc_min < 0 or args.gc_max > 1 or args.gc_min >= args.gc_max
    if gc_bounds_invalid:
        raise ValueError("GC content bounds must be: 0 ≤ gc_min < gc_max ≤ 1")

    if args.homopolymer_max < 1:
        raise ValueError("Maximum homopolymer repeat length must be ≥ 1")

    if args.min_distance < 1:
        raise ValueError("Minimum edit distance must be ≥ 1")
23
+
24
+ # Biological filter functions
25
@jit(nopython=True, cache=True)
def check_gc_content_int(seq_array, gc_min, gc_max):
    """Return True when the GC fraction of an integer-encoded sequence lies in [gc_min, gc_max]"""
    total = len(seq_array)

    # In the integer encoding G is 2 and C is 3
    gc_total = 0
    for idx in range(total):
        code = seq_array[idx]
        if code == 2 or code == 3:
            gc_total += 1

    fraction = gc_total / total
    return gc_min <= fraction <= gc_max
35
+
36
@jit(nopython=True, cache=True)
def check_homopolymer_int(seq_array, homopolymer_max):
    """Return False as soon as a run of identical bases exceeds homopolymer_max, True otherwise"""
    run_base = seq_array[0]
    run_length = 1

    for pos in range(1, len(seq_array)):
        base = seq_array[pos]
        if base != run_base:
            # A different base starts a new run
            run_base = base
            run_length = 1
            continue

        run_length += 1
        if run_length > homopolymer_max:
            return False  # Run too long: fails check

    return True  # No run exceeded the limit
52
+
53
+ # Distance calculation functions
54
@jit(nopython=True, cache=True)
def hamming_distance_int(seq1, seq2, min_distance):
    """
    Count mismatching positions between two integer-encoded sequences.

    Positions are indexed over the length of seq1, so seq2 is expected to be
    the same length. Counting stops once the running count reaches
    min_distance; the count reached at that point is returned.
    """
    mismatches = 0
    for pos in range(len(seq1)):
        if seq1[pos] == seq2[pos]:
            continue
        mismatches += 1
        if mismatches >= min_distance:
            return mismatches  # Early stopping
    return mismatches
64
+
65
@jit(nopython=True, cache=True)
def levenshtein_distance_int(seq1, seq2, min_distance):
    """
    Levenshtein distance between two integer-encoded sequences, with early stopping.

    The table is filled one row at a time, keeping only the previous row.
    If every entry of a row is already >= min_distance, min_distance itself
    is returned instead of the exact distance.
    """
    n_long = len(seq1)
    n_short = len(seq2)

    # Keep the longer sequence as seq1 so each row has len(shorter) + 1 cells
    if n_long < n_short:
        return levenshtein_distance_int(seq2, seq1, min_distance)

    if n_short == 0:
        return n_long

    # Row 0: distance from the empty prefix of seq1 to each prefix of seq2
    prev_row = np.arange(n_short + 1, dtype=np.int32)

    # Early stopping on the initial row
    if prev_row.min() >= min_distance:
        return min_distance

    for row in range(n_long):
        cur_row = np.zeros(n_short + 1, dtype=np.int32)
        cur_row[0] = row + 1
        for col in range(n_short):
            from_above = prev_row[col + 1] + 1
            from_left = cur_row[col] + 1
            from_diag = prev_row[col] + (seq1[row] != seq2[col])
            cur_row[col + 1] = min(from_above, from_left, from_diag)

        # Early stopping: the smallest entry of this row already reached the threshold
        if cur_row.min() >= min_distance:
            return min_distance

        prev_row = cur_row

    return prev_row[-1]
98
+
99
def calculate_distance(seq1, seq2, min_distance):
    """Dispatch to Hamming distance for equal-length sequences and to Levenshtein distance otherwise"""
    same_length = len(seq1) == len(seq2)
    metric = hamming_distance_int if same_length else levenshtein_distance_int
    return metric(seq1, seq2, min_distance)
105
+
106
def select_distance_method(target_count, min_distance, has_mixed_lengths):
    """
    Pick the distance checking strategy for a barcode set and log the choice.

    Returns one of "pairwise_sequential", "pairwise" or "neighbor_enumeration".

    Decision order:
        1. target_count below 10000           -> "pairwise_sequential"
        2. otherwise, has_mixed_lengths true  -> "pairwise"
        3. otherwise, min_distance above 4    -> "pairwise"
        4. otherwise                          -> "neighbor_enumeration"
    """
    if target_count < 10000:
        method = "pairwise_sequential"
        reason = "Using pairwise distance checking for small barcode set (size < 10K)"
    elif has_mixed_lengths:
        method = "pairwise"
        reason = "Using pairwise distance checking for large mixed-length barcode set (size ≥ 10K)"
    elif min_distance > 4:
        method = "pairwise"
        reason = "Using pairwise distance checking for large equal-length barcode set (size ≥ 10K, min distance > 4)"
    else:
        method = "neighbor_enumeration"
        reason = "Using neighbor enumeration for distance checking for large equal-length barcode set (size ≥ 10K, min distance ≤ 4)"

    logging.info(reason)
    return method
134
+
135
def generate_hamming_neighbors(seq_array, max_distance, current_distance=0):
    """
    Yield substitution variants of seq_array as tuples, recursing up to max_distance substitutions.

    seq_array is modified in place while the generator runs and each
    substitution is undone after its sub-tree has been produced. The same
    tuple can be yielded more than once.
    """
    at_limit = current_distance == max_distance

    # The sequence as it stands is emitted at the depth limit and at every
    # intermediate depth except the untouched starting point
    if at_limit or current_distance > 0:
        yield tuple(seq_array)
    if at_limit:
        return

    next_distance = current_distance + 1
    for position in range(len(seq_array)):
        saved = seq_array[position]
        for replacement in range(4):  # codes for A, T, G, C
            if replacement == saved:
                continue
            seq_array[position] = replacement
            yield from generate_hamming_neighbors(seq_array, max_distance, next_distance)
            seq_array[position] = saved  # backtrack