PyPI - barcadia - Versions diffs - 3.2.0__py3-none-any.whl - Mend

barcadia 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

barcadia/__init__.py +20 -0
barcadia/cli.py +61 -0
barcadia/config_utils.py +225 -0
barcadia/filter_utils.py +152 -0
barcadia/generate_barcodes.py +613 -0
barcadia/tools/generate_random_sequences.py +98 -0
barcadia/tools/memory_benchmark.py +139 -0
barcadia/validate_barcodes.py +393 -0
barcadia-3.2.0.dist-info/METADATA +466 -0
barcadia-3.2.0.dist-info/RECORD +14 -0
barcadia-3.2.0.dist-info/WHEEL +5 -0
barcadia-3.2.0.dist-info/entry_points.txt +2 -0
barcadia-3.2.0.dist-info/licenses/LICENSE +202 -0
barcadia-3.2.0.dist-info/top_level.txt +1 -0

barcadia/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""
+Barcadia: High-performance DNA barcode generation and validation for NGS applications.
+This package provides efficient algorithms for generating and validating DNA barcodes
+with configurable quality filters including GC content, homopolymer repeats, and
+minimum edit distance constraints.
+Public API:
+    generate_barcodes_core: Generate DNA barcodes with iterative growth algorithm
+    validate_barcodes_core: Validate DNA barcodes against quality filters
+"""
+# Public API - only expose the core functions
+from .generate_barcodes import generate_barcodes_core
+from .validate_barcodes import validate_barcodes_core
+__all__ = [
+    "generate_barcodes_core",
+    "validate_barcodes_core",
+]

barcadia/cli.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""
+Unified CLI for Barcadia.
+Usage:
+  barcadia generate [options...]   -> delegates to barcadia.generate_barcodes.main(argv)
+  barcadia validate [options...]   -> delegates to barcadia.validate_barcodes.main(argv)
+"""
+import sys
+from importlib.metadata import version
+from . import generate_barcodes as gen
+from . import validate_barcodes as val
+# Top-level help text. main() prints it when no subcommand is given, when the
+# first argument is -h/--help, and (after an error line) for an unknown
+# subcommand. Options of the individual subcommands are not listed here.
+TOP_USAGE = (
+    "Barcadia - A high-performance, memory-efficient toolkit for fast generation and validation of large-scale NGS barcodes\n"
+    "\n"
+    "Usage:\n"
+    "  barcadia <command> [options...]\n"
+    "\n"
+    "Commands:\n"
+    "  generate    Generate high-performance DNA barcodes for NGS applications\n"
+    "  validate    Validate DNA barcodes against quality filters\n"
+    "\n"
+    "Examples:\n"
+    "  barcadia --help\n"
+    "  barcadia generate --help\n"
+    "  barcadia validate --help\n"
+    "  barcadia generate --count 1000 --length 12\n"
+    "  barcadia validate --input test/barcodes.txt\n"
+    "\n"
+    "Global options:\n"
+    "  --help, -h     Show this help message\n"
+    "  --version, -v  Show version information\n"
+)
+def main() -> int:
+    # Handle version flag
+    if len(sys.argv) >= 2 and sys.argv[1] in {"-v", "--version"}:
+        print(version("barcadia"))
+        return 0
+    # No subcommand → show top-level help
+    if len(sys.argv) < 2 or sys.argv[1] in {"-h", "--help"}:
+        print(TOP_USAGE, file=sys.stderr)
+        return 0
+    cmd, argv = sys.argv[1], sys.argv[2:]
+    if cmd == "generate":
+        # gen.main must accept argv: list[str] | None
+        return gen.main(argv) or 0
+    if cmd == "validate":
+        # val.main must accept argv: list[str] | None
+        return val.main(argv) or 0
+    # Unknown subcommand
+    print(f"Unknown subcommand: {cmd}\n\n{TOP_USAGE}", file=sys.stderr)
+    return 2
+if __name__ == "__main__":
+    sys.exit(main())

barcadia/config_utils.py ADDED Viewed

@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""
+config_utils.py
+Configuration and DNA-encoding/decoding utility functions for efficient barcode generation and validation.
+"""
+import os
+import logging
+import numpy as np
+from datetime import datetime
+# DNA encoding constants
+# Each base is represented by a small integer: A=0, T=1, G=2, C=3.
+# DNA_TO_INT and INT_TO_DNA are inverse lookup tables; DNA_BASES lists the
+# accepted characters (uppercase only).
+DNA_BASES = 'ATGC'
+DNA_TO_INT = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
+INT_TO_DNA = {0: 'A', 1: 'T', 2: 'G', 3: 'C'}
+def encode_sequence(dna_string):
+    """Convert DNA string to integer array"""
+    return np.array([DNA_TO_INT[base] for base in dna_string], dtype=np.int8)
+def decode_sequence(seq_array):
+    """Convert integer array back to DNA string"""
+    return ''.join(INT_TO_DNA[base] for base in seq_array)
+def setup_logging(args, script_name):
+    """Setup logging and create output directory. Returns log filepath."""
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Setup logging with file output
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_filename = f"{script_name}_{timestamp}.log"
+    log_filepath = os.path.join(args.output_dir, log_filename)
+    # Configure logging to both file and console
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s',
+        datefmt='%H:%M:%S',
+        handlers=[
+            logging.FileHandler(log_filepath),
+            logging.StreamHandler()
+        ]
+    )
+    return log_filepath
+class ExistingSequenceSet:
+    """
+    A class to manage existing DNA sequence sets with file operations and validation.
+    This class consolidates file reading, existence checking, and sequence management
+    for both generation and validation scripts.
+    """
+    def __init__(self, sequences=None, length_counts=None):
+        """
+        Initialize the sequence set.
+        Args:
+            sequences: List of integer arrays (encoded DNA sequences)
+            length_counts: Dictionary mapping length to count
+        """
+        self.sequences = sequences or []
+        self.length_counts = length_counts or {}
+    def _read_files(self, file_paths):
+        """
+        Internal method to read DNA sequences from files and convert to integer arrays.
+        Handles file existence checking and path normalization internally.
+        Args:
+            file_paths: List of file paths or single file path
+        Returns:
+            tuple: (sequences, length_counts) where sequences are integer arrays
+        Raises:
+            ValueError: If any file does not exist or files are empty
+        """
+        # Normalize file paths (convert single file to list)
+        if isinstance(file_paths, str):
+            file_paths = [file_paths]
+        # Check that all files exist
+        for file_path in file_paths:
+            if not os.path.exists(file_path):
+                raise ValueError(f"File does not exist: {file_path}")
+        sequences = []
+        length_counts = {}
+        for file_path in file_paths:
+            file_count = 0
+            with open(file_path, 'r') as f:
+                for line_num, line in enumerate(f, 1):
+                    seq = line.strip()
+                    if not seq:  # Skip empty lines
+                        continue
+                    # Basic validation
+                    if not all(base in DNA_BASES for base in seq):
+                        logging.warning(f"File {file_path}, line {line_num}: Invalid DNA sequence '{seq}', skipping")
+                        continue
+                    # Convert to integer array for efficient processing
+                    seq_array = encode_sequence(seq)
+                    sequences.append(seq_array)
+                    # Count length while reading
+                    length = len(seq_array)
+                    length_counts[length] = length_counts.get(length, 0) + 1
+                    file_count += 1
+            logging.info(f"Loaded {file_count} sequences from {file_path}")
+        if not sequences:
+            raise ValueError(f"File(s) are empty: {', '.join(file_paths)}")
+        # Generate length info for logging
+        if len(length_counts) == 1:
+            length_info = f"length {list(length_counts.keys())[0]}"
+        else:
+            length_breakdown = ", ".join([f"{count} at length {length}" for length, count in sorted(length_counts.items())])
+            length_info = f"mixed lengths: {length_breakdown}"
+        logging.info(f"Total loaded: {len(sequences)} sequences from {len(file_paths)} file(s) ({length_info})")
+        return sequences, length_counts
+    @classmethod
+    def from_files(cls, file_paths):
+        """
+        Create ExistingSequenceSet from files (used by both validation and generation scripts).
+        Args:
+            file_paths: List of file paths or single file path
+        Returns:
+            ExistingSequenceSet: Instance with loaded sequences and length counts
+        """
+        instance = cls()
+        sequences, length_counts = instance._read_files(file_paths)
+        instance.sequences = sequences
+        instance.length_counts = length_counts
+        return instance
+    @classmethod
+    def from_input_files(cls, file_paths):
+        """
+        Create ExistingSequenceSet from input files (used by validation script).
+        Args:
+            file_paths: List of file paths or single file path
+        Returns:
+            ExistingSequenceSet: Instance with loaded sequences and length counts
+        """
+        return cls.from_files(file_paths)
+    @classmethod
+    def from_unpaired_seeds(cls, file_paths):
+        """
+        Create ExistingSequenceSet from unpaired seed files (used by generation script).
+        Args:
+            file_paths: List of file paths or single file path
+        Returns:
+            ExistingSequenceSet: Instance with loaded sequences and length counts
+        """
+        return cls.from_files(file_paths)
+    @classmethod
+    def from_paired_seeds(cls, file1, file2):
+        """
+        Create ExistingSequenceSet from paired seed files (used by generation script).
+        Args:
+            file1: Path to first paired seed file
+            file2: Path to second paired seed file
+        Returns:
+            ExistingSequenceSet: Instance with combined sequences and length counts
+        """
+        instance = cls()
+        # Load paired seeds separately
+        paired_seed1_pool, seed1_length_counts = instance._read_files([file1])
+        paired_seed2_pool, seed2_length_counts = instance._read_files([file2])
+        # Validate paired seeds
+        # 1. Check that both files have the same number of sequences
+        if len(paired_seed1_pool) != len(paired_seed2_pool):
+            raise ValueError(f"Paired seed files must have the same number of sequences. "
+                           f"Seed1: {len(paired_seed1_pool)} sequences, Seed2: {len(paired_seed2_pool)} sequences")
+        # 2. Check that both files have sequences of the same length within the file
+        elif len(seed1_length_counts) != 1:
+            raise ValueError(f"All sequences in paired seed file 1 must be the same length. "
+                           f"Found lengths: {sorted(seed1_length_counts.keys())}")
+        elif len(seed2_length_counts) != 1:
+            raise ValueError(f"All sequences in paired seed file 2 must be the same length. "
+                           f"Found lengths: {sorted(seed2_length_counts.keys())}")
+        # 3. Check that both files have sequences of the same length between the files
+        elif list(seed1_length_counts.keys())[0] != list(seed2_length_counts.keys())[0]:
+            raise ValueError(f"Paired seed files must have sequences of the same length. "
+                           f"Seed1 length: {list(seed1_length_counts.keys())[0]}, Seed2 length: {list(seed2_length_counts.keys())[0]}")
+        else:
+            # All validations passed - combine both for generation pool
+            combined_sequences = paired_seed1_pool + paired_seed2_pool
+            # Since paired seeds are validated to have the same length, just use seed1's length counts
+            # and double the count since we have two files
+            combined_length_counts = {}
+            for length, count in seed1_length_counts.items():
+                combined_length_counts[length] = count * 2
+            instance.sequences = combined_sequences
+            instance.length_counts = combined_length_counts
+        return instance

barcadia/filter_utils.py ADDED Viewed

@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""
+filter_utils.py
+Filter-related utility functions with Numba JIT compilation for efficient barcode generation and validation.
+"""
+from numba import jit
+import numpy as np
+import logging
+# Simple validation of filter arguments used in both generation and validation
+def validate_filter_arguments(args):
+    """Validate filter-related command line arguments and raise ValueError if invalid"""
+    if args.gc_min < 0 or args.gc_max > 1 or args.gc_min >= args.gc_max:
+        raise ValueError("GC content bounds must be: 0 ≤ gc_min < gc_max ≤ 1")
+    elif args.homopolymer_max < 1:
+        raise ValueError("Maximum homopolymer repeat length must be ≥ 1")
+    elif args.min_distance < 1:
+        raise ValueError("Minimum edit distance must be ≥ 1")
+# Biological filter functions
+@jit(nopython=True, cache=True)
+def check_gc_content_int(seq_array, gc_min, gc_max):
+    """Check if sequence passes GC content filter (works with integer arrays)
+    Args:
+        seq_array: Integer-encoded sequence; codes 2 and 3 are counted as G/C
+        gc_min: Lower bound on the GC fraction (inclusive)
+        gc_max: Upper bound on the GC fraction (inclusive)
+    Returns:
+        True when gc_min <= (number of G/C codes) / len(seq_array) <= gc_max
+    """
+    # G=2, C=3 in our encoding - count them directly
+    gc_count = 0
+    for base in seq_array:
+        if base == 2 or base == 3:  # G or C
+            gc_count += 1
+    # NOTE: divides by the sequence length; an empty array is not handled here
+    gc_content = gc_count / len(seq_array)
+    return gc_min <= gc_content <= gc_max
+@jit(nopython=True, cache=True)
+def check_homopolymer_int(seq_array, homopolymer_max):
+    """Check for homopolymer repeats longer than homopolymer_max (works with integer arrays)
+    Args:
+        seq_array: Integer-encoded sequence
+        homopolymer_max: Longest run of one repeated code that is still accepted
+    Returns:
+        False as soon as a run of identical consecutive codes exceeds
+        homopolymer_max, True if no such run exists
+    """
+    # NOTE: the first element is read unconditionally; an empty array is not handled here
+    current_base = seq_array[0]
+    current_count = 1
+    for base in seq_array[1:]:
+        if base == current_base:
+            current_count += 1
+            if current_count > homopolymer_max:
+                return False  # Fails check
+        else:
+            # A different base starts a new run
+            current_base = base
+            current_count = 1
+    return True  # Passes check
+# Distance calculation functions
+@jit(nopython=True, cache=True)
+def hamming_distance_int(seq1, seq2, min_distance):
+    """Calculate Hamming distance with early stopping (assumes equal-length sequences, works with integer arrays)
+    Args:
+        seq1: First integer-encoded sequence; its length drives the loop
+        seq2: Second integer-encoded sequence, indexed at the same positions
+        min_distance: Mismatch count at which counting stops
+    Returns:
+        The number of mismatching positions, except that counting stops and the
+        running count is returned as soon as it reaches min_distance. The result is
+        therefore the exact distance only when it is below min_distance.
+    """
+    distance = 0
+    for i in range(len(seq1)):
+        if seq1[i] != seq2[i]:
+            distance += 1
+            if distance >= min_distance:
+                return distance  # Early stopping
+    return distance
+@jit(nopython=True, cache=True)
+def levenshtein_distance_int(seq1, seq2, min_distance):
+    """Calculate Levenshtein distance with early stopping (assumes mixed-length sequences, works with integer arrays)
+    Row-by-row dynamic programming that keeps only the previous and current rows.
+    Args:
+        seq1: First integer-encoded sequence
+        seq2: Second integer-encoded sequence
+        min_distance: Threshold at which the computation may stop early
+    Returns:
+        The exact edit distance when no early stop is triggered. When every entry of
+        a row is >= min_distance the function returns min_distance itself instead of
+        finishing the table, so a result >= min_distance only signals "at least
+        min_distance" and is not necessarily the exact distance.
+    """
+    # Put the longer sequence first so rows have length len(shorter) + 1
+    if len(seq1) < len(seq2):
+        return levenshtein_distance_int(seq2, seq1, min_distance)
+    elif len(seq2) == 0:
+        # Distance to an empty sequence is the length of the other one
+        return len(seq1)
+    # Use numpy arrays for better performance with numba
+    previous_row = np.arange(len(seq2) + 1, dtype=np.int32)
+    # Early stopping: if initial row already exceeds min_distance, return early
+    # (the initial row starts at 0, so this only fires when min_distance <= 0)
+    if previous_row.min() >= min_distance:
+        return min_distance
+    for i in range(len(seq1)):
+        current_row = np.zeros(len(seq2) + 1, dtype=np.int32)
+        current_row[0] = i + 1
+        for j in range(len(seq2)):
+            insertions = previous_row[j + 1] + 1
+            deletions = current_row[j] + 1
+            substitutions = previous_row[j] + (seq1[i] != seq2[j])
+            current_row[j + 1] = min(insertions, deletions, substitutions)
+        # Early stopping: if minimum value in current row >= min_distance,
+        # the final distance will be >= min_distance
+        if current_row.min() >= min_distance:
+            return min_distance
+        previous_row = current_row
+    # Last cell of the final row is the distance between the full sequences
+    return previous_row[-1]
+def calculate_distance(seq1, seq2, min_distance):
+    """Calculate distance between two sequences, using Hamming for equal length, Levenshtein otherwise"""
+    if len(seq1) == len(seq2):
+        return hamming_distance_int(seq1, seq2, min_distance)
+    else:
+        return levenshtein_distance_int(seq1, seq2, min_distance)
+def select_distance_method(target_count, min_distance, has_mixed_lengths):
+    """
+    Determine which distance checking method to use based on barcode set characteristics and log the decision.
+    Returns: "pairwise_sequential", "pairwise", or "neighbor_enumeration"
+    Rules:
+    1. Small barcode sets (<10K sequences counting seeds if seeds are present): Always use pairwise_sequential
+    2. Large sets, mixed-length (within seeds and/or between seeds and new barcodes): Always use pairwise (parallelization determined later)
+    3. Large sets, equal-length (counting seeds): Always use pairwise with large minimum distance (> 4), otherwise use neighbor enumeration
+    """
+    # Rule 1: Small barcode sets, always use pairwise_sequential
+    if target_count < 10000:
+        logging.info(f"Using pairwise distance checking for small barcode set (size < 10K)")
+        return "pairwise_sequential"
+    # Rule 2: Large mixed-length sets, always use pairwise (parallel if multiple CPUs, determined in main generation/validation functions)
+    elif has_mixed_lengths:
+        logging.info(f"Using pairwise distance checking for large mixed-length barcode set (size ≥ 10K)")
+        return "pairwise"
+    # Rule 3: Large equal-length sets with large minimum distance (> 4), always use pairwise (parallel if multiple CPUs, determined in main generation/validation functions)
+    elif min_distance > 4:
+        logging.info(f"Using pairwise distance checking for large equal-length barcode set (size ≥ 10K, min distance > 4)")
+        return "pairwise"
+    else:
+        # Special case - neighbor enumeration for large equal-length sets with small minimum distance (<= 4) (no parallelization involved)
+        logging.info(f"Using neighbor enumeration for distance checking for large equal-length barcode set (size ≥ 10K, min distance ≤ 4)")
+        return "neighbor_enumeration"
+def generate_hamming_neighbors(seq_array, max_distance, current_distance=0):
+    """Generate all Hamming neighbors within max_distance of a sequence"""
+    if current_distance == max_distance:
+        yield tuple(seq_array)
+        return
+    # Yield current sequence if distance > 0
+    if current_distance > 0:
+        yield tuple(seq_array)
+    # Generate neighbors by substitution
+    for i in range(len(seq_array)):
+        original_base = seq_array[i]
+        for new_base in [0, 1, 2, 3]:  # A, T, G, C
+            if new_base != original_base:
+                seq_array[i] = new_base
+                yield from generate_hamming_neighbors(seq_array, max_distance, current_distance + 1)
+        seq_array[i] = original_base  # backtrack