py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/parallel.py DELETED
@@ -1,129 +0,0 @@
- """Parallel processing with joblib backend only."""
-
- import logging
- import os
- from collections.abc import Callable
- from typing import Any
-
- from joblib import Parallel, delayed
- from rich.progress import BarColumn, Progress, TaskProgressColumn, TextColumn
-
- logger = logging.getLogger(__name__)
-
-
- class ParallelProcessor:
-     """Unified interface for parallel processing with joblib."""
-
-     def __init__(
-         self,
-         n_jobs: int = -1,
-         backend: str = "joblib",
-         verbose: int = 0,
-     ):
-         """
-         Initialize parallel processor.
-
-         Args:
-             n_jobs: Number of parallel jobs (-1 for all CPUs)
-             backend: Backend to use ('joblib', 'threading', 'multiprocessing')
-             verbose: Verbosity level
-         """
-         self.n_jobs = n_jobs if n_jobs > 0 else os.cpu_count()
-         self.backend = backend
-         self.verbose = verbose
-
-         # Map user-friendly backend names to joblib backends
-         backend_map = {
-             "joblib": "loky",  # Robust joblib backend
-             "threading": "threading",  # Pure threading
-             "multiprocessing": "multiprocessing",  # Process-based
-             "loky": "loky",  # Explicit loky
-         }
-         self.joblib_backend = backend_map.get(backend, "loky")
-
-         logger.debug(
-             f"Initialized parallel processor with {self.n_jobs} jobs using {backend} -> {self.joblib_backend} backend"
-         )
-
-     def map(
-         self,
-         func: Callable,
-         items: list,
-         description: str = "Processing",
-         show_progress: bool = True,
-     ) -> list[Any]:
-         """
-         Apply function to each item in parallel.
-
-         Args:
-             func: Function to apply
-             items: List of items to process
-             description: Progress description
-             show_progress: Whether to show progress bar
-
-         Returns:
-             List of results
-         """
-         return self._map_joblib(func, items, description, show_progress)
-
-     def _map_joblib(
-         self,
-         func: Callable,
-         items: list,
-         description: str = "Processing",
-         show_progress: bool = True,
-     ) -> list[Any]:
-         """Map using joblib."""
-         if show_progress and len(items) > 10:
-             # Use progress bar for larger workloads
-             progress_columns = [
-                 TextColumn("[bold blue]{task.description}"),
-                 BarColumn(),
-                 TaskProgressColumn(),
-                 TextColumn("({task.completed}/{task.total})"),
-             ]
-
-             with Progress(*progress_columns, refresh_per_second=10) as progress:
-                 task = progress.add_task(description, total=len(items))
-
-                 def progress_wrapper(item):
-                     result = func(item)
-                     progress.update(task, advance=1)
-                     return result
-
-                 with Parallel(n_jobs=self.n_jobs, backend=self.joblib_backend) as parallel:
-                     return list(parallel(delayed(progress_wrapper)(item) for item in items))
-         else:
-             # Simple parallel execution
-             with Parallel(n_jobs=self.n_jobs, backend=self.joblib_backend) as parallel:
-                 return list(parallel(delayed(func)(item) for item in items))
-
-     def starmap(
-         self,
-         func: Callable,
-         items: list,
-         description: str = "Processing",
-         show_progress: bool = True,
-     ) -> list[Any]:
-         """
-         Apply function with arguments to each item in parallel.
-
-         Args:
-             func: Function to apply
-             items: List of argument tuples
-             description: Progress description
-             show_progress: Whether to show progress bar
-
-         Returns:
-             List of results
-         """
-
-         def wrapper(args):
-             return func(*args)
-
-         return self._map_joblib(wrapper, items, description, show_progress)
-
-     def shutdown(self):
-         """Shutdown parallel processing resources."""
-         # joblib handles cleanup automatically
-         pass
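For context, here is a minimal sketch of how the removed `ParallelProcessor` could be exercised under the 2.0.0 wheel; the `square` function and argument values are illustrative, and the class no longer exists in 2.1.1.

```python
# Illustrative only: drives the ParallelProcessor API deleted in 2.1.1.
# Assumes the 2.0.0 wheel (which still ships gbcms.parallel) is installed.
from gbcms.parallel import ParallelProcessor

def square(x: int) -> int:
    return x * x

proc = ParallelProcessor(n_jobs=4, backend="threading")
squares = proc.map(square, list(range(100)), description="Squaring")  # [0, 1, 4, ...]
powers = proc.starmap(pow, [(2, 3), (3, 2)])  # wrapper unpacks tuples -> [8, 9]
proc.shutdown()  # no-op; joblib cleans up automatically
```

The example pins `backend="threading"` deliberately: `_map_joblib`'s `progress_wrapper` closure updates a live `rich` progress bar, and with a process-based backend such as the default loky those updates would happen in worker processes and never reach the parent's display.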
gbcms/processor.py DELETED
@@ -1,293 +0,0 @@
- """Main processing logic for GetBaseCounts."""
-
- import logging
- from concurrent.futures import ThreadPoolExecutor, as_completed
-
- import pysam
- from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
-
- from .config import Config
- from .counter import BaseCounter
- from .output import OutputFormatter
- from .reference import ReferenceSequence
- from .variant import VariantEntry, VariantLoader
-
- logger = logging.getLogger(__name__)
-
-
- class VariantProcessor:
-     """Main processor for counting bases in variants."""
-
-     def __init__(self, config: Config):
-         """
-         Initialize variant processor.
-
-         Args:
-             config: Configuration object
-         """
-         self.config = config
-         self.reference = ReferenceSequence(config.fasta_file)
-         self.counter = BaseCounter(config)
-         self.sample_order = list(config.bam_files.keys())
-
-     def process(self) -> None:
-         """Main processing pipeline."""
-         # Load variants
-         loader = VariantLoader(reference_getter=self.reference.get_base)
-         variants = self._load_all_variants(loader)
-
-         if not variants:
-             logger.warning("No variants to process")
-             return
-
-         # Sort and index variants
-         variants = self._sort_and_index_variants(variants)
-
-         # Initialize counts for all samples
-         for variant in variants:
-             variant.initialize_counts(self.sample_order)
-
-         # Create variant blocks for parallel processing
-         variant_blocks = self._create_variant_blocks(variants)
-
-         logger.info(f"Created {len(variant_blocks)} variant blocks for processing")
-
-         # Process each BAM file
-         for sample_name, bam_path in self.config.bam_files.items():
-             self._process_bam_file(sample_name, bam_path, variants, variant_blocks)
-
-         # Write output
-         self._write_output(variants)
-
-         # Cleanup
-         self.reference.close()
-         logger.info("Finished processing")
-
-     def _load_all_variants(self, loader: VariantLoader) -> list[VariantEntry]:
-         """Load all variants from input files."""
-         all_variants = []
-
-         for variant_file in self.config.variant_files:
-             if self.config.input_is_maf:
-                 variants = loader.load_maf(variant_file)
-             else:
-                 variants = loader.load_vcf(variant_file)
-             all_variants.extend(variants)
-
-         logger.info(f"Total variants loaded: {len(all_variants)}")
-         return all_variants
-
-     def _sort_and_index_variants(self, variants: list[VariantEntry]) -> list[VariantEntry]:
-         """Sort variants and identify duplicates."""
-         logger.info("Sorting variants")
-         variants.sort()
-
-         logger.info("Indexing variants")
-         duplicate_map: dict[tuple, VariantEntry] = {}
-
-         for variant in variants:
-             key = variant.get_variant_key()
-             if key not in duplicate_map:
-                 duplicate_map[key] = variant
-             else:
-                 # Mark as duplicate
-                 variant.duplicate_variant_ptr = duplicate_map[key]
-
-         return variants
-
-     def _create_variant_blocks(self, variants: list[VariantEntry]) -> list[tuple[int, int]]:
-         """
-         Create blocks of variants for parallel processing.
-
-         Returns:
-             List of (start_index, end_index) tuples
-         """
-         if not variants:
-             return []
-
-         blocks = []
-         start_idx = 0
-         current_count = 0
-
-         for i in range(len(variants)):
-             current_count += 1
-
-             # Check if we should create a new block
-             should_break = False
-
-             if current_count >= self.config.max_block_size:
-                 should_break = True
-             elif i > start_idx:
-                 # Check chromosome change or distance
-                 if variants[i].chrom != variants[start_idx].chrom:
-                     should_break = True
-                 elif variants[i].pos - variants[start_idx].pos > self.config.max_block_dist:
-                     should_break = True
-
-             if should_break:
-                 blocks.append((start_idx, i - 1))
-                 start_idx = i
-                 current_count = 1
-
-         # Add final block
-         blocks.append((start_idx, len(variants) - 1))
-
-         return blocks
-
-     def _process_bam_file(
-         self,
-         sample_name: str,
-         bam_path: str,
-         variants: list[VariantEntry],
-         variant_blocks: list[tuple[int, int]],
-     ) -> None:
-         """
-         Process a single BAM file.
-
-         Args:
-             sample_name: Sample name
-             bam_path: Path to BAM file
-             variants: List of all variants
-             variant_blocks: List of variant block ranges
-         """
-         logger.info(f"Processing BAM file: {bam_path}")
-
-         if self.config.num_threads > 1:
-             self._process_bam_parallel(sample_name, bam_path, variants, variant_blocks)
-         else:
-             self._process_bam_sequential(sample_name, bam_path, variants, variant_blocks)
-
-     def _process_bam_sequential(
-         self,
-         sample_name: str,
-         bam_path: str,
-         variants: list[VariantEntry],
-         variant_blocks: list[tuple[int, int]],
-     ) -> None:
-         """Process BAM file sequentially."""
-         with pysam.AlignmentFile(bam_path, "rb") as bam:
-             with Progress(
-                 SpinnerColumn(),
-                 TextColumn("[progress.description]{task.description}"),
-                 BarColumn(),
-                 TaskProgressColumn(),
-             ) as progress:
-                 task = progress.add_task(
-                     f"[cyan]Processing {sample_name}...", total=len(variant_blocks)
-                 )
-
-                 for start_idx, end_idx in variant_blocks:
-                     self._process_variant_block(bam, sample_name, variants, start_idx, end_idx)
-                     progress.update(task, advance=1)
-
-     def _process_bam_parallel(
-         self,
-         sample_name: str,
-         bam_path: str,
-         variants: list[VariantEntry],
-         variant_blocks: list[tuple[int, int]],
-     ) -> None:
-         """Process BAM file in parallel."""
-         with Progress(
-             SpinnerColumn(),
-             TextColumn("[progress.description]{task.description}"),
-             BarColumn(),
-             TaskProgressColumn(),
-         ) as progress:
-             task = progress.add_task(
-                 f"[cyan]Processing {sample_name}...", total=len(variant_blocks)
-             )
-
-             with ThreadPoolExecutor(max_workers=self.config.num_threads) as executor:
-                 futures = []
-
-                 for start_idx, end_idx in variant_blocks:
-                     future = executor.submit(
-                         self._process_variant_block_thread_safe,
-                         bam_path,
-                         sample_name,
-                         variants,
-                         start_idx,
-                         end_idx,
-                     )
-                     futures.append(future)
-
-                 for future in as_completed(futures):
-                     future.result()  # Raise any exceptions
-                     progress.update(task, advance=1)
-
-     def _process_variant_block_thread_safe(
-         self,
-         bam_path: str,
-         sample_name: str,
-         variants: list[VariantEntry],
-         start_idx: int,
-         end_idx: int,
-     ) -> None:
-         """Process a variant block in a thread-safe manner."""
-         # Each thread opens its own BAM file handle
-         with pysam.AlignmentFile(bam_path, "rb") as bam:
-             self._process_variant_block(bam, sample_name, variants, start_idx, end_idx)
-
-     def _process_variant_block(
-         self,
-         bam: pysam.AlignmentFile,
-         sample_name: str,
-         variants: list[VariantEntry],
-         start_idx: int,
-         end_idx: int,
-     ) -> None:
-         """
-         Process a block of variants.
-
-         Args:
-             bam: Open BAM file handle
-             sample_name: Sample name
-             variants: List of all variants
-             start_idx: Start index in variants list
-             end_idx: End index in variants list
-         """
-         start_variant = variants[start_idx]
-         end_variant = variants[end_idx]
-
-         # Fetch alignments for the region
-         try:
-             alignments = list(
-                 bam.fetch(
-                     start_variant.chrom,
-                     start_variant.pos,
-                     end_variant.pos + 2,  # Buffer for indels
-                 )
-             )
-         except Exception as e:
-             logger.error(
-                 f"Error fetching alignments for region "
-                 f"{start_variant.chrom}:{start_variant.pos}-{end_variant.pos}: {e}"
-             )
-             return
-
-         # Filter alignments
-         filtered_alignments = [aln for aln in alignments if not self.counter.filter_alignment(aln)]
-
-         # Process each variant in the block
-         for i in range(start_idx, end_idx + 1):
-             variant = variants[i]
-
-             # Skip if this is a duplicate variant
-             if variant.duplicate_variant_ptr is not None:
-                 continue
-
-             # Count bases for this variant
-             self.counter.count_variant(variant, filtered_alignments, sample_name)
-
-     def _write_output(self, variants: list[VariantEntry]) -> None:
-         """Write output file."""
-         formatter = OutputFormatter(self.config, self.sample_order)
-
-         if self.config.input_is_maf:
-             if self.config.output_maf:
-                 formatter.write_maf_output(variants)
-             else:
-                 formatter.write_fillout_output(variants)
-         else:
-             formatter.write_vcf_output(variants)
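The most algorithmic piece of the removed module is `_create_variant_blocks`: sorted variants are grouped into fetch windows, and a block closes once it reaches `max_block_size` variants, crosses a chromosome boundary, or spans more than `max_block_dist` bases. The following is a standalone paraphrase of that rule; plain `(chrom, pos)` tuples stand in for `VariantEntry`, and the default limits are illustrative, not the package's actual config defaults.

```python
# Standalone sketch of the block-splitting heuristic from the removed
# VariantProcessor._create_variant_blocks; (chrom, pos) tuples stand in
# for VariantEntry objects, and the default limits are assumptions.
def create_blocks(
    variants: list[tuple[str, int]],
    max_block_size: int = 1000,
    max_block_dist: int = 10_000,
) -> list[tuple[int, int]]:
    if not variants:
        return []
    blocks: list[tuple[int, int]] = []
    start_idx, count = 0, 0
    for i, (chrom, pos) in enumerate(variants):
        count += 1
        # Close the current block on size overflow, a chromosome change,
        # or when the window grows wider than max_block_dist bases.
        if count >= max_block_size or (
            i > start_idx
            and (
                chrom != variants[start_idx][0]
                or pos - variants[start_idx][1] > max_block_dist
            )
        ):
            blocks.append((start_idx, i - 1))
            start_idx, count = i, 1
    blocks.append((start_idx, len(variants) - 1))  # final open block
    return blocks

# e.g. create_blocks([("chr1", 100), ("chr1", 200), ("chr2", 50)], max_block_size=2)
# -> [(0, 0), (1, 1), (2, 2)]
```

Note also the threading design in `_process_variant_block_thread_safe`: pysam file handles are not safe to share across threads, so each worker reopens the BAM rather than reusing a single `AlignmentFile`.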
gbcms/reference.py DELETED
@@ -1,86 +0,0 @@
- """Reference sequence handling."""
-
- import logging
-
- import pysam
-
- logger = logging.getLogger(__name__)
-
-
- class ReferenceSequence:
-     """Handles reference sequence loading and access."""
-
-     def __init__(self, fasta_file: str):
-         """
-         Initialize reference sequence handler.
-
-         Args:
-             fasta_file: Path to reference FASTA file (must be indexed)
-         """
-         self.fasta_file = fasta_file
-         self.fasta: pysam.FastaFile | None = None
-         self._load_reference()
-
-     def _load_reference(self) -> None:
-         """Load reference sequence using pysam."""
-         logger.info(f"Loading reference sequence: {self.fasta_file}")
-         try:
-             self.fasta = pysam.FastaFile(self.fasta_file)
-         except Exception as e:
-             logger.error(f"Failed to open reference FASTA file: {e}")
-             raise
-
-     def get_base(self, chrom: str, pos: int) -> str:
-         """
-         Get base at specific position (0-indexed).
-
-         Args:
-             chrom: Chromosome name
-             pos: 0-indexed position
-
-         Returns:
-             Base at position (uppercase)
-         """
-         if self.fasta is None:
-             raise RuntimeError("Reference FASTA not loaded")
-
-         try:
-             return self.fasta.fetch(chrom, pos, pos + 1).upper()
-         except Exception as e:
-             logger.error(f"Failed to fetch base at {chrom}:{pos}: {e}")
-             raise
-
-     def get_sequence(self, chrom: str, start: int, end: int) -> str:
-         """
-         Get sequence in range (0-indexed, end exclusive).
-
-         Args:
-             chrom: Chromosome name
-             start: Start position (0-indexed, inclusive)
-             end: End position (0-indexed, exclusive)
-
-         Returns:
-             Sequence in range (uppercase)
-         """
-         if self.fasta is None:
-             raise RuntimeError("Reference FASTA not loaded")
-
-         try:
-             return self.fasta.fetch(chrom, start, end).upper()
-         except Exception as e:
-             logger.error(f"Failed to fetch sequence at {chrom}:{start}-{end}: {e}")
-             raise
-
-     def close(self) -> None:
-         """Close the FASTA file."""
-         if self.fasta:
-             self.fasta.close()
-             self.fasta = None
-
-     def __enter__(self) -> "ReferenceSequence":
-         """Context manager entry."""
-         return self
-
-     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-         """Context manager exit."""
-         self.close()
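The removed `ReferenceSequence` is a thin wrapper over `pysam.FastaFile` that also acts as a context manager. A hedged usage sketch follows; "ref.fa" is a placeholder path for any faidx-indexed FASTA.

```python
# Illustrative only: gbcms.reference was removed in 2.1.1.
# "ref.fa" is a placeholder; pysam expects a matching "ref.fa.fai" index.
from gbcms.reference import ReferenceSequence

with ReferenceSequence("ref.fa") as ref:
    first_base = ref.get_base("chr1", 0)       # single uppercase base, 0-indexed
    seq = ref.get_sequence("chr1", 0, 10)      # half-open interval [0, 10)
# close() runs automatically when the with-block exits
```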