PyPI - py-gbcms - Versions diffs - 2.0.0__py3-none-any.whl - Mend

py-gbcms 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

gbcms/__init__.py +13 -0
gbcms/cli.py +745 -0
gbcms/config.py +98 -0
gbcms/counter.py +1074 -0
gbcms/models.py +295 -0
gbcms/numba_counter.py +394 -0
gbcms/output.py +573 -0
gbcms/parallel.py +129 -0
gbcms/processor.py +293 -0
gbcms/reference.py +86 -0
gbcms/variant.py +390 -0
py_gbcms-2.0.0.dist-info/METADATA +506 -0
py_gbcms-2.0.0.dist-info/RECORD +16 -0
py_gbcms-2.0.0.dist-info/WHEEL +4 -0
py_gbcms-2.0.0.dist-info/entry_points.txt +2 -0
py_gbcms-2.0.0.dist-info/licenses/LICENSE +664 -0

gbcms/models.py ADDED Viewed

@@ -0,0 +1,295 @@
+"""Pydantic models for type-safe configuration and data structures."""
+from enum import IntEnum
+from pathlib import Path
+import numpy as np
+from pydantic import BaseModel, Field, field_validator, model_validator
+class CountType(IntEnum):
+    """Enumeration for different count types."""
+    DP = 0  # Total depth
+    RD = 1  # Reference depth
+    AD = 2  # Alternate depth
+    DPP = 3  # Positive strand depth
+    RDP = 4  # Positive strand reference depth
+    ADP = 5  # Positive strand alternate depth
+    DPF = 6  # Fragment depth
+    RDF = 7  # Fragment reference depth
+    ADF = 8  # Fragment alternate depth
+class BamFileConfig(BaseModel):
+    """Configuration for a single BAM file."""
+    sample_name: str = Field(..., description="Sample name")
+    bam_path: Path = Field(..., description="Path to BAM file")
+    bai_path: Path | None = Field(None, description="Path to BAM index")
+    @field_validator("bam_path")
+    @classmethod
+    def validate_bam_exists(cls, v: Path) -> Path:
+        """Validate BAM file exists."""
+        if not v.exists():
+            raise ValueError(f"BAM file not found: {v}")
+        return v
+    @model_validator(mode="after")
+    def validate_bai(self) -> "BamFileConfig":
+        """Validate BAM index exists."""
+        if self.bai_path is None:
+            # Try to find index
+            bai_path1 = Path(str(self.bam_path).replace(".bam", ".bai"))
+            bai_path2 = Path(f"{self.bam_path}.bai")
+            if bai_path1.exists():
+                self.bai_path = bai_path1
+            elif bai_path2.exists():
+                self.bai_path = bai_path2
+            else:
+                raise ValueError(f"BAM index not found for: {self.bam_path}")
+        return self
+    model_config = {"arbitrary_types_allowed": True}
+class VariantFileConfig(BaseModel):
+    """Configuration for variant files."""
+    file_path: Path = Field(..., description="Path to variant file")
+    file_format: str = Field(..., description="File format (vcf or maf)")
+    @field_validator("file_path")
+    @classmethod
+    def validate_file_exists(cls, v: Path) -> Path:
+        """Validate variant file exists."""
+        if not v.exists():
+            raise ValueError(f"Variant file not found: {v}")
+        return v
+    @field_validator("file_format")
+    @classmethod
+    def validate_format(cls, v: str) -> str:
+        """Validate file format."""
+        if v.lower() not in ["vcf", "maf"]:
+            raise ValueError(f"Invalid format: {v}. Must be 'vcf' or 'maf'")
+        return v.lower()
+    model_config = {"arbitrary_types_allowed": True}
+class QualityFilters(BaseModel):
+    """Quality filtering parameters."""
+    mapping_quality_threshold: int = Field(20, ge=0, description="Mapping quality threshold")
+    base_quality_threshold: int = Field(0, ge=0, description="Base quality threshold")
+    filter_duplicate: bool = Field(True, description="Filter duplicate reads")
+    filter_improper_pair: bool = Field(False, description="Filter improper pairs")
+    filter_qc_failed: bool = Field(False, description="Filter QC failed reads")
+    filter_indel: bool = Field(False, description="Filter reads with indels")
+    filter_non_primary: bool = Field(False, description="Filter non-primary alignments")
+class OutputOptions(BaseModel):
+    """Output configuration options."""
+    output_file: Path = Field(..., description="Output file path")
+    output_maf: bool = Field(False, description="Output in MAF format")
+    output_positive_count: bool = Field(True, description="Output positive strand counts")
+    output_negative_count: bool = Field(False, description="Output negative strand counts")
+    output_fragment_count: bool = Field(False, description="Output fragment counts")
+    fragment_fractional_weight: bool = Field(
+        False, description="Use fractional weights for fragments"
+    )
+    model_config = {"arbitrary_types_allowed": True}
+class PerformanceConfig(BaseModel):
+    """Performance and parallelization configuration."""
+    num_threads: int = Field(1, ge=1, description="Number of threads")
+    max_block_size: int = Field(10000, ge=1, description="Maximum variants per block")
+    max_block_dist: int = Field(100000, ge=1, description="Maximum block distance in bp")
+    use_numba: bool = Field(True, description="Use Numba JIT compilation")
+    @field_validator("backend")
+    @classmethod
+    def validate_backend(cls, v: str) -> str:
+        """Validate backend choice."""
+        valid_backends = ["joblib", "loky", "threading", "multiprocessing"]
+        if v.lower() not in valid_backends:
+            raise ValueError(f"Invalid backend: {v}. Must be one of: {', '.join(valid_backends)}")
+        return v.lower()
+class GetBaseCountsConfig(BaseModel):
+    """Complete configuration for GetBaseCounts with Pydantic validation."""
+    # Input files
+    fasta_file: Path = Field(..., description="Reference FASTA file")
+    bam_files: list[BamFileConfig] = Field(..., description="BAM files to process")
+    variant_files: list[VariantFileConfig] = Field(..., description="Variant files")
+    # Options
+    quality_filters: QualityFilters = Field(
+        default_factory=QualityFilters, description="Quality filtering options"  # type: ignore[arg-type]
+    )
+    output_options: OutputOptions = Field(..., description="Output options")
+    performance: PerformanceConfig = Field(
+        default_factory=PerformanceConfig, description="Performance options"  # type: ignore[arg-type]
+    )
+    # Advanced
+    generic_counting: bool = Field(False, description="Use generic counting algorithm")
+    max_warning_per_type: int = Field(3, ge=0, description="Maximum warnings per type")
+    @field_validator("fasta_file")
+    @classmethod
+    def validate_fasta_exists(cls, v: Path) -> Path:
+        """Validate FASTA file exists."""
+        if not v.exists():
+            raise ValueError(f"FASTA file not found: {v}")
+        fai_file = Path(f"{v}.fai")
+        if not fai_file.exists():
+            raise ValueError(f"FASTA index not found: {fai_file}")
+        return v
+    @model_validator(mode="after")
+    def validate_variant_format_consistency(self) -> "GetBaseCountsConfig":
+        """Validate variant file format consistency."""
+        formats = {vf.file_format for vf in self.variant_files}
+        if len(formats) > 1:
+            raise ValueError("All variant files must be the same format (all VCF or all MAF)")
+        # Check MAF output compatibility
+        if self.output_options.output_maf and "maf" not in formats:
+            raise ValueError("--omaf can only be used with MAF input")
+        return self
+    def get_sample_names(self) -> list[str]:
+        """Get list of sample names in order."""
+        return [bam.sample_name for bam in self.bam_files]
+    def is_maf_input(self) -> bool:
+        """Check if input is MAF format."""
+        return self.variant_files[0].file_format == "maf"
+    def is_vcf_input(self) -> bool:
+        """Check if input is VCF format."""
+        return self.variant_files[0].file_format == "vcf"
+    model_config = {"arbitrary_types_allowed": True}
+class VariantCounts(BaseModel):
+    """Type-safe variant counts structure."""
+    sample_name: str
+    counts: np.ndarray = Field(..., description="Count array")
+    @field_validator("counts")
+    @classmethod
+    def validate_counts_shape(cls, v: np.ndarray) -> np.ndarray:
+        """Validate counts array shape."""
+        if v.shape != (len(CountType),):
+            raise ValueError(f"Counts array must have shape ({len(CountType)},)")
+        return v
+    def get_count(self, count_type: CountType) -> float:
+        """Get count for specific type."""
+        return float(self.counts[count_type])
+    def set_count(self, count_type: CountType, value: float) -> None:
+        """Set count for specific type."""
+        self.counts[count_type] = value
+    model_config = {"arbitrary_types_allowed": True}
+class VariantModel(BaseModel):
+    """Pydantic model for variant with type safety."""
+    chrom: str = Field(..., description="Chromosome")
+    pos: int = Field(..., ge=0, description="Position (0-indexed)")
+    end_pos: int = Field(..., ge=0, description="End position")
+    ref: str = Field(..., min_length=1, description="Reference allele")
+    alt: str = Field(..., min_length=1, description="Alternate allele")
+    # Variant type flags
+    snp: bool = Field(False, description="Is SNP")
+    dnp: bool = Field(False, description="Is DNP")
+    dnp_len: int = Field(0, ge=0, description="DNP length")
+    insertion: bool = Field(False, description="Is insertion")
+    deletion: bool = Field(False, description="Is deletion")
+    # Sample information
+    tumor_sample: str = Field("", description="Tumor sample name")
+    normal_sample: str = Field("", description="Normal sample name")
+    # Annotation
+    gene: str = Field("", description="Gene name")
+    effect: str = Field("", description="Variant effect")
+    # Original MAF coordinates
+    maf_pos: int = Field(0, ge=0, description="Original MAF position")
+    maf_end_pos: int = Field(0, ge=0, description="Original MAF end position")
+    maf_ref: str = Field("", description="Original MAF reference")
+    maf_alt: str = Field("", description="Original MAF alternate")
+    caller: str = Field("", description="Variant caller")
+    # Counts
+    sample_counts: dict[str, VariantCounts] = Field(
+        default_factory=dict, description="Counts per sample"
+    )
+    @model_validator(mode="after")
+    def validate_positions(self) -> "VariantModel":
+        """Validate position consistency."""
+        if self.end_pos < self.pos:
+            raise ValueError(f"End position {self.end_pos} < start position {self.pos}")
+        return self
+    @model_validator(mode="after")
+    def validate_variant_type(self) -> "VariantModel":
+        """Validate variant type flags are consistent."""
+        type_count = sum([self.snp, self.dnp, self.insertion, self.deletion])
+        if type_count == 0:
+            # Auto-detect variant type
+            if len(self.ref) == len(self.alt) == 1:
+                self.snp = True
+            elif len(self.ref) == len(self.alt) > 1:
+                self.dnp = True
+                self.dnp_len = len(self.ref)
+            elif len(self.alt) > len(self.ref):
+                self.insertion = True
+            elif len(self.alt) < len(self.ref):
+                self.deletion = True
+        return self
+    def get_variant_key(self) -> tuple[str, int, str, str]:
+        """Get unique variant key."""
+        return (self.chrom, self.pos, self.ref, self.alt)
+    def initialize_counts(self, sample_names: list[str]) -> None:
+        """Initialize counts for all samples."""
+        for sample in sample_names:
+            if sample not in self.sample_counts:
+                self.sample_counts[sample] = VariantCounts(
+                    sample_name=sample, counts=np.zeros(len(CountType), dtype=np.float32)
+                )
+    def get_count(self, sample: str, count_type: CountType) -> float:
+        """Get count for specific sample and type."""
+        if sample not in self.sample_counts:
+            return 0.0
+        return self.sample_counts[sample].get_count(count_type)
+    model_config = {"arbitrary_types_allowed": True}

gbcms/numba_counter.py ADDED Viewed

@@ -0,0 +1,394 @@
+"""
+Numba-optimized counting functions for high performance.
+This module provides JIT-compiled counting functions that are 50-100x faster
+than the pure Python implementation in `counter.py`. It uses Numba to compile
+Python functions to machine code.
+**When to use this module:**
+- Large datasets (>10K variants)
+- Production workloads
+- When performance is critical
+- Batch processing
+**Performance:** 50-100x faster than `counter.py`
+**Trade-offs:**
+- ✅ Much faster (50-100x)
+- ✅ Parallel processing with prange
+- ✅ Cached compilation
+- ❌ First call is slow (compilation time)
+- ❌ Requires NumPy arrays (not pysam objects)
+- ❌ Harder to debug (compiled code)
+**Key Functions:**
+- count_snp_base(): Single SNP counting (JIT compiled)
+- count_snp_batch(): Batch SNP counting (parallel)
+- filter_alignments_batch(): Vectorized filtering
+- calculate_fragment_counts(): Fragment-level counting
+**Usage:**
+    from gbcms.numba_counter import count_snp_batch
+    import numpy as np
+    # Convert pysam data to NumPy arrays
+    bases = np.array([aln.query_sequence for aln in alignments])
+    quals = np.array([aln.query_qualities for aln in alignments])
+    # Fast batch counting
+    counts = count_snp_batch(bases, quals, positions, ...)
+**Note:** First call will be slow due to JIT compilation. Subsequent calls
+are very fast. Use `cache=True` to cache compiled functions.
+**Alternative:** For small datasets or development, see `counter.py` for
+a pure Python implementation that's easier to debug.
+"""
+import numpy as np
+from numba import jit, prange
+@jit(nopython=True, cache=True)
+def count_snp_base(
+    query_bases: np.ndarray,
+    query_qualities: np.ndarray,
+    reference_positions: np.ndarray,
+    is_reverse: np.ndarray,
+    variant_pos: int,
+    ref_base: str,
+    alt_base: str,
+    base_quality_threshold: int,
+) -> tuple[int, int, int, int, int, int]:
+    """
+    Count SNP bases with Numba JIT compilation.
+    Args:
+        query_bases: Array of query base characters
+        query_qualities: Array of base qualities
+        reference_positions: Array of reference positions
+        is_reverse: Array of strand orientation flags
+        variant_pos: Variant position
+        ref_base: Reference base
+        alt_base: Alternate base
+        base_quality_threshold: Quality threshold
+    Returns:
+        Tuple of (DP, RD, AD, DPP, RDP, ADP)
+    """
+    dp = 0  # Total depth
+    rd = 0  # Reference depth
+    ad = 0  # Alternate depth
+    dpp = 0  # Positive strand depth
+    rdp = 0  # Positive strand reference depth
+    adp = 0  # Positive strand alternate depth
+    n_reads = len(query_bases)
+    for i in range(n_reads):
+        if reference_positions[i] != variant_pos:
+            continue
+        if query_qualities[i] < base_quality_threshold:
+            continue
+        base = query_bases[i]
+        # Count total depth
+        dp += 1
+        if not is_reverse[i]:
+            dpp += 1
+        # Count ref/alt
+        if base == ref_base:
+            rd += 1
+            if not is_reverse[i]:
+                rdp += 1
+        elif base == alt_base:
+            ad += 1
+            if not is_reverse[i]:
+                adp += 1
+    return dp, rd, ad, dpp, rdp, adp
+@jit(nopython=True, cache=True, parallel=True)
+def count_snp_batch(
+    query_bases_list: np.ndarray,
+    query_qualities_list: np.ndarray,
+    reference_positions_list: np.ndarray,
+    is_reverse_list: np.ndarray,
+    variant_positions: np.ndarray,
+    ref_bases: np.ndarray,
+    alt_bases: np.ndarray,
+    base_quality_threshold: int,
+) -> np.ndarray:
+    """
+    Count multiple SNPs in parallel with Numba.
+    Args:
+        query_bases_list: List of query base arrays
+        query_qualities_list: List of quality arrays
+        reference_positions_list: List of position arrays
+        is_reverse_list: List of strand arrays
+        variant_positions: Array of variant positions
+        ref_bases: Array of reference bases
+        alt_bases: Array of alternate bases
+        base_quality_threshold: Quality threshold
+    Returns:
+        Array of counts (n_variants, 6) with columns (DP, RD, AD, DPP, RDP, ADP)
+    """
+    n_variants = len(variant_positions)
+    counts = np.zeros((n_variants, 6), dtype=np.int32)
+    for i in prange(n_variants):
+        dp, rd, ad, dpp, rdp, adp = count_snp_base(
+            query_bases_list[i],
+            query_qualities_list[i],
+            reference_positions_list[i],
+            is_reverse_list[i],
+            variant_positions[i],
+            ref_bases[i],
+            alt_bases[i],
+            base_quality_threshold,
+        )
+        counts[i, 0] = dp
+        counts[i, 1] = rd
+        counts[i, 2] = ad
+        counts[i, 3] = dpp
+        counts[i, 4] = rdp
+        counts[i, 5] = adp
+    return counts
+@jit(nopython=True, cache=True)
+def filter_alignment_numba(
+    is_duplicate: bool,
+    is_proper_pair: bool,
+    is_qcfail: bool,
+    is_secondary: bool,
+    is_supplementary: bool,
+    mapping_quality: int,
+    has_indel: bool,
+    filter_duplicate: bool,
+    filter_improper_pair: bool,
+    filter_qc_failed: bool,
+    filter_non_primary: bool,
+    filter_indel: bool,
+    mapping_quality_threshold: int,
+) -> bool:
+    """
+    Fast alignment filtering with Numba.
+    Returns:
+        True if alignment should be filtered (excluded)
+    """
+    if filter_duplicate and is_duplicate:
+        return True
+    if filter_improper_pair and not is_proper_pair:
+        return True
+    if filter_qc_failed and is_qcfail:
+        return True
+    if filter_non_primary and (is_secondary or is_supplementary):
+        return True
+    if mapping_quality < mapping_quality_threshold:
+        return True
+    if filter_indel and has_indel:
+        return True
+    return False
+@jit(nopython=True, cache=True, parallel=True)
+def filter_alignments_batch(
+    is_duplicate: np.ndarray,
+    is_proper_pair: np.ndarray,
+    is_qcfail: np.ndarray,
+    is_secondary: np.ndarray,
+    is_supplementary: np.ndarray,
+    mapping_quality: np.ndarray,
+    has_indel: np.ndarray,
+    filter_duplicate: bool,
+    filter_improper_pair: bool,
+    filter_qc_failed: bool,
+    filter_non_primary: bool,
+    filter_indel: bool,
+    mapping_quality_threshold: int,
+) -> np.ndarray:
+    """
+    Filter multiple alignments in parallel.
+    Returns:
+        Boolean array where True means keep the alignment
+    """
+    n = len(is_duplicate)
+    keep = np.ones(n, dtype=np.bool_)
+    for i in prange(n):
+        keep[i] = not filter_alignment_numba(
+            is_duplicate[i],
+            is_proper_pair[i],
+            is_qcfail[i],
+            is_secondary[i],
+            is_supplementary[i],
+            mapping_quality[i],
+            has_indel[i],
+            filter_duplicate,
+            filter_improper_pair,
+            filter_qc_failed,
+            filter_non_primary,
+            filter_indel,
+            mapping_quality_threshold,
+        )
+    return keep
+@jit(nopython=True, cache=True)
+def calculate_fragment_counts(
+    fragment_ids: np.ndarray,
+    end_numbers: np.ndarray,
+    has_ref: np.ndarray,
+    has_alt: np.ndarray,
+    fractional_weight: float,
+) -> tuple[int, float, float]:
+    """
+    Calculate fragment-level counts.
+    Args:
+        fragment_ids: Array of fragment identifiers
+        end_numbers: Array of read end numbers (1 or 2)
+        has_ref: Array indicating if fragment has reference
+        has_alt: Array indicating if fragment has alternate
+        fractional_weight: Weight for disagreement (0.5 or 0)
+    Returns:
+        Tuple of (DPF, RDF, ADF)
+    """
+    # Get unique fragments
+    unique_fragments = np.unique(fragment_ids)
+    dpf = len(unique_fragments)
+    rdf = 0.0
+    adf = 0.0
+    for frag_id in unique_fragments:
+        # Find all reads for this fragment
+        frag_mask = fragment_ids == frag_id
+        frag_has_ref = np.any(has_ref[frag_mask])
+        frag_has_alt = np.any(has_alt[frag_mask])
+        # Check for overlapping ends
+        frag_ends = end_numbers[frag_mask]
+        unique_ends, end_counts = np.unique(frag_ends, return_counts=True)
+        if np.any(end_counts > 1):
+            # Skip fragments with overlapping multimapped reads
+            continue
+        # Count based on ref/alt presence
+        if frag_has_ref and frag_has_alt:
+            rdf += fractional_weight
+            adf += fractional_weight
+        elif frag_has_ref:
+            rdf += 1.0
+        elif frag_has_alt:
+            adf += 1.0
+    return dpf, rdf, adf
+@jit(nopython=True, cache=True)
+def find_cigar_position(
+    cigar_ops: np.ndarray,
+    cigar_lens: np.ndarray,
+    alignment_start: int,
+    target_pos: int,
+) -> tuple[int, bool]:
+    """
+    Find read position corresponding to reference position using CIGAR.
+    Args:
+        cigar_ops: Array of CIGAR operations
+        cigar_lens: Array of CIGAR lengths
+        alignment_start: Alignment start position
+        target_pos: Target reference position
+    Returns:
+        Tuple of (read_position, is_covered)
+    """
+    ref_pos = alignment_start
+    read_pos = 0
+    for i in range(len(cigar_ops)):
+        op = cigar_ops[i]
+        length = cigar_lens[i]
+        if op == 0:  # Match/mismatch (M)
+            if ref_pos <= target_pos < ref_pos + length:
+                return read_pos + (target_pos - ref_pos), True
+            ref_pos += length
+            read_pos += length
+        elif op == 1:  # Insertion (I)
+            read_pos += length
+        elif op == 2:  # Deletion (D)
+            if ref_pos <= target_pos < ref_pos + length:
+                return -1, False  # Position is in deletion
+            ref_pos += length
+        elif op == 3:  # Skipped region (N)
+            ref_pos += length
+        elif op == 4:  # Soft clip (S)
+            read_pos += length
+        # Hard clip (H) and padding (P) don't affect positions
+    return -1, False
+@jit(nopython=True, cache=True)
+def compute_base_quality_stats(
+    qualities: np.ndarray,
+    min_quality: int,
+) -> tuple[float, float, int]:
+    """
+    Compute base quality statistics.
+    Args:
+        qualities: Array of base qualities
+        min_quality: Minimum quality threshold
+    Returns:
+        Tuple of (mean_quality, median_quality, n_passing)
+    """
+    n = len(qualities)
+    if n == 0:
+        return 0.0, 0.0, 0
+    mean_qual = np.mean(qualities)
+    median_qual = np.median(qualities)
+    n_passing = np.sum(qualities >= min_quality)
+    return float(mean_qual), float(median_qual), int(n_passing)
+@jit(nopython=True, cache=True, parallel=True)
+def vectorized_quality_filter(
+    qualities: np.ndarray,
+    threshold: int,
+) -> np.ndarray:
+    """
+    Vectorized quality filtering.
+    Args:
+        qualities: 2D array of qualities (n_reads, read_length)
+        threshold: Quality threshold
+    Returns:
+        Boolean array of passing reads
+    """
+    n_reads = qualities.shape[0]
+    passing = np.zeros(n_reads, dtype=np.bool_)
+    for i in prange(n_reads):
+        passing[i] = np.all(qualities[i] >= threshold)
+    return passing