PyPI - py-gbcms - Versions diffs - 2.2.0__cp311-cp311-macosx_10_12_x86_64.whl - Mend

py-gbcms 2.2.0__cp311-cp311-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

gbcms/__init__.py +23 -0
gbcms/_rs.cpython-311-darwin.so +0 -0
gbcms/_rs.pyi +49 -0
gbcms/cli.py +204 -0
gbcms/core/__init__.py +9 -0
gbcms/core/kernel.py +128 -0
gbcms/io/__init__.py +18 -0
gbcms/io/input.py +227 -0
gbcms/io/output.py +354 -0
gbcms/models/__init__.py +27 -0
gbcms/models/core.py +172 -0
gbcms/pipeline.py +257 -0
gbcms/py.typed +0 -0
gbcms/utils/__init__.py +14 -0
gbcms/utils/logging.py +123 -0
py_gbcms-2.2.0.dist-info/METADATA +217 -0
py_gbcms-2.2.0.dist-info/RECORD +20 -0
py_gbcms-2.2.0.dist-info/WHEEL +4 -0
py_gbcms-2.2.0.dist-info/entry_points.txt +2 -0
py_gbcms-2.2.0.dist-info/licenses/LICENSE +664 -0

gbcms/io/output.py ADDED Viewed

@@ -0,0 +1,354 @@
+"""
+Output Writers: Formatting results for VCF and MAF.
+This module provides classes to write processed variants and their counts
+to output files, handling format-specific columns and headers.
+"""
+import csv
+from pathlib import Path
+from typing import Any
+from ..models.core import Variant
+__all__ = ["OutputWriter", "MafWriter", "VcfWriter"]
+class OutputWriter:
+    """Abstract base class for output writers."""
+    def write(self, variant: Variant, counts: Any):
+        raise NotImplementedError
+    def close(self):
+        pass
+class MafWriter(OutputWriter):
+    """Writes results to a MAF-like file (Fillout format)."""
+    def __init__(self, path: Path):
+        self.path = path
+        self.file = open(path, "w")
+        self.writer: csv.DictWriter | None = None
+        self._headers_written = False
+    def _init_writer(self):
+        # Standard GDC MAF columns (plus our custom ones)
+        # Based on GDC MAF Format v1.0.0
+        self.fieldnames = [
+            "Hugo_Symbol",
+            "Entrez_Gene_Id",
+            "Center",
+            "NCBI_Build",
+            "Chromosome",
+            "Start_Position",
+            "End_Position",
+            "Strand",
+            "Variant_Classification",
+            "Variant_Type",
+            "Reference_Allele",
+            "Tumor_Seq_Allele1",
+            "Tumor_Seq_Allele2",
+            "dbSNP_RS",
+            "dbSNP_Val_Status",
+            "Tumor_Sample_Barcode",
+            "Matched_Norm_Sample_Barcode",
+            "Match_Norm_Seq_Allele1",
+            "Match_Norm_Seq_Allele2",
+            "Tumor_Validation_Allele1",
+            "Tumor_Validation_Allele2",
+            "Match_Norm_Validation_Allele1",
+            "Match_Norm_Validation_Allele2",
+            "Verification_Status",
+            "Validation_Status",
+            "Mutation_Status",
+            "Sequencing_Phase",
+            "Sequence_Source",
+            "Validation_Method",
+            "Score",
+            "BAM_File",
+            "Sequencer",
+            "Tumor_Sample_UUID",
+            "Matched_Norm_Sample_UUID",
+            "HGVSc",
+            "HGVSp",
+            "HGVSp_Short",
+            "Transcript_ID",
+            "Exon_Number",
+            "t_depth",
+            "t_ref_count",
+            "t_alt_count",
+            "n_depth",
+            "n_ref_count",
+            "n_alt_count",
+            "all_effects",
+            "Allele",
+            "Gene",
+            "Feature",
+            "Feature_type",
+            "Consequence",
+            "cDNA_position",
+            "CDS_position",
+            "Protein_position",
+            "Amino_acids",
+            "Codons",
+            "Existing_variation",
+            "DISTANCE",
+            "STRAND",
+            "FLAGS",
+            "SYMBOL",
+            "SYMBOL_SOURCE",
+            "HGNC_ID",
+            "BIOTYPE",
+            "CANONICAL",
+            "CCDS",
+            "ENSP",
+            "SWISSPROT",
+            "TREMBL",
+            "UNIPARC",
+            "RefSeq",
+            "SIFT",
+            "PolyPhen",
+            "EXON",
+            "INTRON",
+            "DOMAINS",
+            "GMAF",
+            "AFR_MAF",
+            "AMR_MAF",
+            "ASN_MAF",
+            "EUR_MAF",
+            "AA_MAF",
+            "EA_MAF",
+            "CLIN_SIG",
+            "SOMATIC",
+            "PUBMED",
+            "MOTIF_NAME",
+            "MOTIF_POS",
+            "HIGH_INF_POS",
+            "MOTIF_SCORE_CHANGE",
+            "IMPACT",
+            "PICK",
+            "VARIANT_CLASS",
+            "TSL",
+            "HGVS_OFFSET",
+            "PHENO",
+            "MINIMISED",
+            "ExAC_AF",
+            "ExAC_AF_AFR",
+            "ExAC_AF_AMR",
+            "ExAC_AF_EAS",
+            "ExAC_AF_FIN",
+            "ExAC_AF_NFE",
+            "ExAC_AF_OTH",
+            "ExAC_AF_SAS",
+            "GENE_PHENO",
+            "FILTER",
+            "flanking_bps",
+            "vcf_id",
+            "vcf_qual",
+            "gnomAD_AF",
+            "gnomAD_AFR_AF",
+            "gnomAD_AMR_AF",
+            "gnomAD_ASJ_AF",
+            "gnomAD_EAS_AF",
+            "gnomAD_FIN_AF",
+            "gnomAD_NFE_AF",
+            "gnomAD_OTH_AF",
+            "gnomAD_SAS_AF",
+            "vcf_pos",
+            "vcf_region",
+            # Custom columns
+            "t_total_count",
+            "t_vaf",
+            "t_ref_count_fragment",
+            "t_alt_count_fragment",
+            "t_total_count_fragment",
+            "t_vaf_fragment",
+            "strand_bias_p_value",
+            "strand_bias_odds_ratio",
+            "fragment_strand_bias_p_value",
+            "fragment_strand_bias_odds_ratio",
+            # Strand counts
+            "t_ref_count_forward",
+            "t_ref_count_reverse",
+            "t_alt_count_forward",
+            "t_alt_count_reverse",
+            "t_ref_count_fragment_forward",
+            "t_ref_count_fragment_reverse",
+            "t_alt_count_fragment_forward",
+            "t_alt_count_fragment_reverse",
+        ]
+        self.writer = csv.DictWriter(
+            self.file, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore"
+        )
+        self.writer.writeheader()
+        self._headers_written = True
+    def write(self, variant: Variant, counts: Any, sample_name: str = "TUMOR"):
+        if not self._headers_written:
+            self._init_writer()
+        assert self.writer is not None
+        # Calculate VAFs
+        total = counts.rd + counts.ad
+        vaf = counts.ad / total if total > 0 else 0.0
+        total_frag = counts.rdf + counts.adf
+        vaf_frag = counts.adf / total_frag if total_frag > 0 else 0.0
+        # MAF Coordinates (1-based)
+        start_pos = variant.pos + 1
+        end_pos = start_pos
+        if variant.variant_type == "DELETION":
+            end_pos = start_pos + len(variant.ref) - 1
+        elif variant.variant_type == "INSERTION":
+            # MAF for insertion: Start and End are the same (anchor), or Start=Anchor, End=Anchor+1?
+            # GDC: Start_Position is the last base of the reference allele (anchor).
+            # End_Position is Start_Position + 1.
+            # Let's follow GDC convention if possible, or stick to VCF-like anchor.
+            # For now, let's keep it simple: Start=End=Anchor for Ins?
+            # Actually, standard MAF usually has Start=End for insertions (between bases).
+            end_pos = start_pos + 1  # To indicate range?
+        # Populate row with defaults for missing fields, starting with metadata
+        row = dict.fromkeys(self.fieldnames, "")
+        if variant.metadata:
+            row.update(variant.metadata)
+        # Fill known fields
+        row.update(
+            {
+                "Chromosome": variant.chrom,
+                "Start_Position": str(start_pos),
+                "End_Position": str(end_pos),
+                "Reference_Allele": variant.ref,
+                "Tumor_Seq_Allele2": variant.alt,
+                "Tumor_Sample_Barcode": sample_name,
+                "Variant_Type": variant.variant_type,
+                "t_ref_count": str(counts.rd),
+                "t_alt_count": str(counts.ad),
+                "t_total_count": str(counts.dp),
+                "t_vaf": f"{vaf:.4f}",
+                "t_ref_count_fragment": str(counts.rdf),
+                "t_alt_count_fragment": str(counts.adf),
+                "t_total_count_fragment": str(counts.dpf),
+                "t_vaf_fragment": f"{vaf_frag:.4f}",
+                "strand_bias_p_value": f"{counts.sb_pval:.4e}",
+                "strand_bias_odds_ratio": f"{counts.sb_or:.4f}",
+                "fragment_strand_bias_p_value": f"{counts.fsb_pval:.4e}",
+                "fragment_strand_bias_odds_ratio": f"{counts.fsb_or:.4f}",
+                "vcf_region": f"{variant.chrom}:{start_pos}-{end_pos}",  # Simple region string
+                "vcf_pos": str(start_pos),
+                # Strand counts
+                "t_ref_count_forward": str(counts.rd_fwd),
+                "t_ref_count_reverse": str(counts.rd_rev),
+                "t_alt_count_forward": str(counts.ad_fwd),
+                "t_alt_count_reverse": str(counts.ad_rev),
+                "t_ref_count_fragment_forward": str(counts.rdf_fwd),
+                "t_ref_count_fragment_reverse": str(counts.rdf_rev),
+                "t_alt_count_fragment_forward": str(counts.adf_fwd),
+                "t_alt_count_fragment_reverse": str(counts.adf_rev),
+            }
+        )
+        if variant.original_id:
+            row["vcf_id"] = variant.original_id
+        self.writer.writerow(row)
+    def close(self):
+        self.file.close()
+class VcfWriter(OutputWriter):
+    """Writes results to a VCF file."""
+    def __init__(self, path: Path, sample_name: str = "SAMPLE"):
+        self.path = path
+        self.sample_name = sample_name
+        self.file = open(path, "w")
+        self._headers_written = False
+    def _write_header(self):
+        # Minimal VCF header
+        headers = [
+            "##fileformat=VCFv4.2",
+            "##source=gbcms_v2",
+            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
+            '##INFO=<ID=SB_PVAL,Number=1,Type=Float,Description="Fisher strand bias p-value">',
+            '##INFO=<ID=SB_OR,Number=1,Type=Float,Description="Fisher strand bias odds ratio">',
+            '##INFO=<ID=FSB_PVAL,Number=1,Type=Float,Description="Fisher fragment strand bias p-value">',
+            '##INFO=<ID=FSB_OR,Number=1,Type=Float,Description="Fisher fragment strand bias odds ratio">',
+            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
+            '##FORMAT=<ID=AD,Number=2,Type=Integer,Description="Allelic depths for the ref and alt alleles (fwd,rev)">',
+            '##FORMAT=<ID=DP,Number=2,Type=Integer,Description="Approximate read depth (ref_total,alt_total)">',
+            '##FORMAT=<ID=RD,Number=2,Type=Integer,Description="Reference read depth (fwd,rev)">',
+            '##FORMAT=<ID=RDF,Number=2,Type=Integer,Description="Ref Fragment Count (fwd,rev)">',
+            '##FORMAT=<ID=ADF,Number=2,Type=Integer,Description="Alt Fragment Count (fwd,rev)">',
+            '##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Fraction (read level)">',
+            '##FORMAT=<ID=FAF,Number=1,Type=Float,Description="Variant Allele Fraction (fragment level)">',
+            f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{self.sample_name}",
+        ]
+        self.file.write("\n".join(headers) + "\n")
+        self._headers_written = True
+    def write(self, variant: Variant, counts: Any, sample_name: str = "SAMPLE"):
+        if not self._headers_written:
+            self._write_header()
+        # VCF POS is 1-based
+        pos = variant.pos + 1
+        # INFO fields
+        info = f"DP={counts.dp};SB_PVAL={counts.sb_pval:.4e};SB_OR={counts.sb_or:.4f};FSB_PVAL={counts.fsb_pval:.4e};FSB_OR={counts.fsb_or:.4f}"
+        # FORMAT fields
+        # GT: Simple 0/1 if alt > 0? Or ./1?
+        # Let's assume 0/1 if we have alt counts, else 0/0
+        gt = "0/1" if counts.ad > 0 else "0/0"
+        # DP: ref_total,alt_total
+        dp = f"{counts.rd},{counts.ad}"
+        # RD: ref_fwd,ref_rev
+        rd = f"{counts.rd_fwd},{counts.rd_rev}"
+        # AD: alt_fwd,alt_rev
+        ad = f"{counts.ad_fwd},{counts.ad_rev}"
+        # RDF: ref_frag_fwd,ref_frag_rev
+        rdf = f"{counts.rdf_fwd},{counts.rdf_rev}"
+        # ADF: alt_frag_fwd,alt_frag_rev
+        adf = f"{counts.adf_fwd},{counts.adf_rev}"
+        # VAF calculations
+        total_reads = counts.rd + counts.ad
+        vaf = counts.ad / total_reads if total_reads > 0 else 0.0
+        total_frags = counts.rdf + counts.adf
+        faf = counts.adf / total_frags if total_frags > 0 else 0.0
+        format_str = "GT:DP:RD:AD:RDF:ADF:VAF:FAF"
+        sample_data = f"{gt}:{dp}:{rd}:{ad}:{rdf}:{adf}:{vaf:.4f}:{faf:.4f}"
+        row = [
+            variant.chrom,
+            str(pos),
+            variant.original_id or ".",
+            variant.ref,
+            variant.alt,
+            ".",  # QUAL
+            ".",  # FILTER
+            info,
+            format_str,
+            sample_data,
+        ]
+        self.file.write("\t".join(row) + "\n")
+    def close(self):
+        self.file.close()

gbcms/models/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""
+Data models for gbcms.
+Provides Pydantic models for variants, configuration, and core data structures.
+"""
+from .core import (
+    GbcmsConfig,
+    GenomicInterval,
+    OutputConfig,
+    OutputFormat,
+    QualityThresholds,
+    ReadFilters,
+    Variant,
+    VariantType,
+)
+__all__ = [
+    "GbcmsConfig",
+    "GenomicInterval",
+    "OutputConfig",
+    "OutputFormat",
+    "QualityThresholds",
+    "ReadFilters",
+    "Variant",
+    "VariantType",
+]

gbcms/models/core.py ADDED Viewed

@@ -0,0 +1,172 @@
+"""
+Core data models for gbcms v2.
+This module defines the data models for variants, configuration, and nested
+config groups (filters, quality thresholds, output settings).
+"""
+import sys
+from pathlib import Path
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
+    from enum import Enum
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10."""
+        pass
+from pydantic import BaseModel, Field, field_validator, model_validator
+__all__ = [
+    "VariantType",
+    "GenomicInterval",
+    "Variant",
+    "OutputFormat",
+    "ReadFilters",
+    "QualityThresholds",
+    "OutputConfig",
+    "GbcmsConfig",
+]
+class VariantType(StrEnum):
+    """Type of genomic variant."""
+    # Each member's value is the same upper-case string as its name, so members
+    # compare equal to those plain strings (e.g. VariantType.SNP == "SNP").
+    SNP = "SNP"
+    INSERTION = "INSERTION"
+    DELETION = "DELETION"
+    COMPLEX = "COMPLEX"
+class GenomicInterval(BaseModel):
+    """
+    Represents a 0-based, half-open genomic interval [start, end).
+    This is the canonical internal representation for all coordinates.
+    """
+    chrom: str
+    start: int = Field(ge=0, description="0-based start position (inclusive)")
+    end: int = Field(ge=0, description="0-based end position (exclusive)")
+    @model_validator(mode="after")
+    def validate_interval(self) -> "GenomicInterval":
+        if self.end < self.start:
+            raise ValueError(f"End position ({self.end}) must be >= start position ({self.start})")
+        return self
+class Variant(BaseModel):
+    """Normalized representation of a genomic variant."""
+    chrom: str
+    pos: int = Field(ge=0, description="0-based position of the variant")
+    ref: str
+    alt: str
+    variant_type: VariantType
+    # Original input metadata (optional)
+    original_id: str | None = None
+    metadata: dict[str, str] = Field(
+        default_factory=dict, description="Original input metadata/columns"
+    )
+    @property
+    def interval(self) -> GenomicInterval:
+        """Get the genomic interval covered by this variant."""
+        return GenomicInterval(chrom=self.chrom, start=self.pos, end=self.pos + len(self.ref))
+class OutputFormat(StrEnum):
+    """Supported output formats for gbcms."""
+    # Member values are the lower-case format names, so members compare equal
+    # to the plain strings "vcf" and "maf".
+    VCF = "vcf"
+    MAF = "maf"
+# =============================================================================
+# Nested Configuration Models
+# =============================================================================
+class ReadFilters(BaseModel):
+    """
+    Filters for read selection during BAM processing.
+    These flags control which reads are excluded from counting.
+    When True, reads with the corresponding flag are filtered out.
+    """
+    # Only duplicate filtering is enabled by default; every other filter
+    # defaults to False and must be switched on explicitly.
+    duplicates: bool = Field(default=True, description="Filter duplicate reads")
+    secondary: bool = Field(default=False, description="Filter secondary alignments")
+    supplementary: bool = Field(default=False, description="Filter supplementary alignments")
+    qc_failed: bool = Field(default=False, description="Filter reads failing QC")
+    improper_pair: bool = Field(default=False, description="Filter improperly paired reads")
+    indel: bool = Field(default=False, description="Filter reads containing indels")
+class QualityThresholds(BaseModel):
+    """Quality score thresholds for filtering reads and bases."""
+    # Both thresholds are validated as non-negative (ge=0). Defaults are 20 for
+    # mapping quality and 0 for base quality; how the thresholds are applied
+    # is up to whatever consumes this model.
+    min_mapping_quality: int = Field(default=20, ge=0, description="Minimum mapping quality (MAPQ)")
+    min_base_quality: int = Field(default=0, ge=0, description="Minimum base quality (BQ)")
+class OutputConfig(BaseModel):
+    """Output configuration settings."""
+    directory: Path = Field(description="Directory to write output files")
+    format: OutputFormat = Field(default=OutputFormat.VCF, description="Output format (vcf or maf)")
+    suffix: str = Field(default="", description="Suffix to append to output filename")
+    @field_validator("directory")
+    @classmethod
+    def validate_output_dir(cls, v: Path) -> Path:
+        """Ensure output path is not a file."""
+        if v.exists() and v.is_file():
+            raise ValueError(f"Output path must be a directory, not a file: {v}")
+        return v
+class GbcmsConfig(BaseModel):
+    """
+    Global configuration for gbcms execution.
+    Groups related settings into nested models for cleaner organization.
+    """
+    # Input files
+    variant_file: Path
+    bam_files: dict[str, Path]  # sample_name -> bam_path
+    reference_fasta: Path
+    # Nested configuration groups
+    output: OutputConfig
+    filters: ReadFilters = Field(default_factory=ReadFilters)
+    quality: QualityThresholds = Field(default_factory=QualityThresholds)
+    # Performance
+    threads: int = Field(default=1, ge=1, description="Number of threads")
+    # Advanced
+    fragment_counting: bool = Field(default=False, description="Enable fragment-based counting")
+    @field_validator("variant_file", "reference_fasta")
+    @classmethod
+    def validate_file_exists(cls, v: Path) -> Path:
+        """Validate that input files exist."""
+        if not v.exists():
+            raise ValueError(f"File not found: {v}")
+        return v
+    @model_validator(mode="after")
+    def validate_bams(self) -> "GbcmsConfig":
+        """Validate that all BAM files exist."""
+        for name, path in self.bam_files.items():
+            if not path.exists():
+                raise ValueError(f"BAM file for sample '{name}' not found: {path}")
+        return self