PyPI - strvcf-annotator - Versions diffs - 0.1.0__py3-none-any.whl - Mend

strvcf-annotator 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

strvcf_annotator/__init__.py +43 -0
strvcf_annotator/api.py +281 -0
strvcf_annotator/cli.py +191 -0
strvcf_annotator/core/__init__.py +25 -0
strvcf_annotator/core/annotation.py +244 -0
strvcf_annotator/core/repeat_utils.py +162 -0
strvcf_annotator/core/str_reference.py +110 -0
strvcf_annotator/core/vcf_processor.py +277 -0
strvcf_annotator/parsers/__init__.py +6 -0
strvcf_annotator/parsers/base.py +84 -0
strvcf_annotator/parsers/generic.py +172 -0
strvcf_annotator/utils/__init__.py +11 -0
strvcf_annotator/utils/validation.py +215 -0
strvcf_annotator/utils/vcf_utils.py +135 -0
strvcf_annotator-0.1.0.dist-info/METADATA +304 -0
strvcf_annotator-0.1.0.dist-info/RECORD +20 -0
strvcf_annotator-0.1.0.dist-info/WHEEL +5 -0
strvcf_annotator-0.1.0.dist-info/entry_points.txt +2 -0
strvcf_annotator-0.1.0.dist-info/licenses/LICENSE +21 -0
strvcf_annotator-0.1.0.dist-info/top_level.txt +1 -0

strvcf_annotator/__init__.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""Top-level package for strvcf_annotator.
+STR (Short Tandem Repeat) annotation tool for VCF files.
+Provides both library and CLI interfaces for annotating variants
+that overlap with STR regions.
+The names listed in ``__all__`` below are re-exported from the ``api``,
+``parsers``, ``core`` and ``utils`` subpackages so callers can import them
+directly from ``strvcf_annotator``.
+"""
+# Package metadata
+__author__ = """Olesia Kondrateva"""
+__email__ = 'xkdnoa@gmail.com'
+__version__ = '0.1.0'
+# Public API exports
+from .api import STRAnnotator, annotate_vcf
+from .parsers.base import BaseVCFParser
+from .parsers.generic import GenericParser
+from .core.str_reference import load_str_reference
+from .core.repeat_utils import (
+    extract_repeat_sequence,
+    count_repeat_units,
+    apply_variant_to_repeat,
+    is_perfect_repeat
+)
+from .utils.validation import ValidationError
+# Names exposed by ``from strvcf_annotator import *``; keep in sync with the
+# imports above.
+__all__ = [
+    # Main API
+    'STRAnnotator',
+    'annotate_vcf',
+    # Parsers
+    'BaseVCFParser',
+    'GenericParser',
+    # Core functions
+    'load_str_reference',
+    'extract_repeat_sequence',
+    'count_repeat_units',
+    'apply_variant_to_repeat',
+    'is_perfect_repeat',
+    # Exceptions
+    'ValidationError',
+]

strvcf_annotator/api.py ADDED Viewed

@@ -0,0 +1,281 @@
+"""Library API for programmatic access to STR annotation functionality."""
+import pysam
+from typing import Optional, Iterator
+import logging
+from .parsers.base import BaseVCFParser
+from .parsers.generic import GenericParser
+from .core.str_reference import load_str_reference
+from .core.vcf_processor import (
+    generate_annotated_records,
+    annotate_vcf_to_file,
+    process_directory
+)
+from .utils.validation import (
+    validate_directory_path,
+    validate_vcf_file,
+    validate_str_bed_file
+)
+logger = logging.getLogger(__name__)
+class STRAnnotator:
+    """Main class for STR annotation functionality.
+    Provides a high-level interface for annotating VCF files with STR
+    (Short Tandem Repeat) information. Supports both single file and
+    batch directory processing.
+    Parameters
+    ----------
+    str_bed_path : str
+        Path to BED file containing STR regions
+    parser : BaseVCFParser, optional
+        Custom parser for genotype extraction. Uses GenericParser if None.
+    somatic_mode : bool, optional
+        Enable somatic filtering mode. When True, skips variants where both
+        samples (tumor/normal) have identical genotypes. Default is False.
+    Attributes
+    ----------
+    str_bed_path : str
+        Path to STR BED file
+    str_df : pd.DataFrame
+        Loaded STR reference data
+    parser : BaseVCFParser
+        Parser for genotype extraction
+    somatic_mode : bool
+        Whether somatic filtering is enabled
+    Examples
+    --------
+    >>> annotator = STRAnnotator('str_regions.bed')
+    >>> annotator.annotate_vcf_file('input.vcf', 'output.vcf')
+    >>> # Batch process directory
+    >>> annotator.process_directory('input_dir/', 'output_dir/')
+    >>> # Stream processing
+    >>> vcf_in = pysam.VariantFile('input.vcf')
+    >>> for record in annotator.annotate_vcf_stream(vcf_in):
+    ...     print(record)
+    """
+    def __init__(
+        self,
+        str_bed_path: str,
+        parser: Optional[BaseVCFParser] = None,
+        somatic_mode: bool = False
+    ):
+        """Initialize STR annotator with reference and parser.
+        Parameters
+        ----------
+        str_bed_path : str
+            Path to BED file with STR regions
+        parser : BaseVCFParser, optional
+            Custom parser for genotype extraction
+        somatic_mode : bool, optional
+            Enable somatic filtering (skip variants where tumor==normal genotypes).
+            Default is False.
+        Raises
+        ------
+        ValidationError
+            If STR BED file is invalid
+        """
+        # Validate and load STR reference
+        validate_str_bed_file(str_bed_path)
+        self.str_bed_path = str_bed_path
+        self.str_df = load_str_reference(str_bed_path)
+        # Set parser
+        self.parser = parser if parser is not None else GenericParser()
+        # Set somatic mode
+        self.somatic_mode = somatic_mode
+        logger.info(f"Loaded {len(self.str_df)} STR regions from {str_bed_path}")
+    def annotate_vcf_file(self, input_path: str, output_path: str) -> None:
+        """Annotate single VCF file.
+        Reads a VCF file, annotates variants overlapping with STR regions,
+        and writes the annotated records to an output file.
+        Parameters
+        ----------
+        input_path : str
+            Path to input VCF file
+        output_path : str
+            Path to output VCF file
+        Raises
+        ------
+        ValidationError
+            If input VCF file is invalid
+        Examples
+        --------
+        >>> annotator = STRAnnotator('str_regions.bed')
+        >>> annotator.annotate_vcf_file('input.vcf', 'output.vcf')
+        """
+        # Validate input
+        validate_vcf_file(input_path)
+        # Annotate
+        logger.info(f"Annotating {input_path}...")
+        annotate_vcf_to_file(
+            input_path,
+            self.str_df,
+            output_path,
+            self.parser,
+            somatic_mode=self.somatic_mode
+        )
+        logger.info(f"Wrote annotated VCF to {output_path}")
+    def annotate_vcf_stream(self, vcf_in: pysam.VariantFile) -> Iterator[pysam.VariantRecord]:
+        """Annotate VCF records from stream.
+        Generator that yields annotated VCF records from an open VCF file.
+        Useful for streaming processing or custom workflows.
+        Parameters
+        ----------
+        vcf_in : pysam.VariantFile
+            Open VCF file object
+        Yields
+        ------
+        pysam.VariantRecord
+            Annotated VCF records
+        Examples
+        --------
+        >>> annotator = STRAnnotator('str_regions.bed')
+        >>> vcf_in = pysam.VariantFile('input.vcf')
+        >>> for record in annotator.annotate_vcf_stream(vcf_in):
+        ...     # Process record
+        ...     print(record.info['RU'])
+        """
+        yield from generate_annotated_records(
+            vcf_in,
+            self.str_df,
+            self.parser,
+            somatic_mode=self.somatic_mode
+        )
+    def process_directory(self, input_dir: str, output_dir: str) -> None:
+        """Batch process directory of VCF files.
+        Processes all VCF files in a directory and writes annotated versions
+        to the output directory. Skips files that have already been processed.
+        Parameters
+        ----------
+        input_dir : str
+            Directory containing input VCF files
+        output_dir : str
+            Directory for output VCF files (created if doesn't exist)
+        Raises
+        ------
+        ValidationError
+            If input directory is invalid
+        Examples
+        --------
+        >>> annotator = STRAnnotator('str_regions.bed')
+        >>> annotator.process_directory('vcf_files/', 'annotated_vcfs/')
+        """
+        # Validate directories
+        validate_directory_path(input_dir, must_exist=True)
+        validate_directory_path(output_dir, must_exist=False, create=True)
+        # Process directory
+        logger.info(f"Processing VCF files in {input_dir}...")
+        process_directory(
+            input_dir,
+            self.str_bed_path,
+            output_dir,
+            self.parser,
+            somatic_mode=self.somatic_mode
+        )
+        logger.info(f"Batch processing complete. Output in {output_dir}")
+    def get_str_at_position(self, chrom: str, pos: int) -> Optional[dict]:
+        """Get STR region at specific genomic position.
+        Parameters
+        ----------
+        chrom : str
+            Chromosome name
+        pos : int
+            Genomic position (1-based)
+        Returns
+        -------
+        Optional[dict]
+            STR region data if position is within an STR, None otherwise
+        Examples
+        --------
+        >>> annotator = STRAnnotator('str_regions.bed')
+        >>> str_region = annotator.get_str_at_position('chr1', 1000000)
+        >>> if str_region:
+        ...     print(f"Repeat unit: {str_region['RU']}")
+        """
+        from .core.str_reference import get_str_at_position
+        return get_str_at_position(self.str_df, chrom, pos)
+    def get_statistics(self) -> dict:
+        """Get statistics about loaded STR regions.
+        Returns
+        -------
+        dict
+            Statistics including total regions, chromosomes, repeat units
+        Examples
+        --------
+        >>> annotator = STRAnnotator('str_regions.bed')
+        >>> stats = annotator.get_statistics()
+        >>> print(f"Total STR regions: {stats['total_regions']}")
+        """
+        stats = {
+            'total_regions': len(self.str_df),
+            'chromosomes': self.str_df['CHROM'].nunique(),
+            'unique_repeat_units': self.str_df['RU'].nunique(),
+            'period_distribution': self.str_df['PERIOD'].value_counts().to_dict(),
+            'mean_repeat_count': self.str_df['COUNT'].mean(),
+            'median_repeat_count': self.str_df['COUNT'].median()
+        }
+        return stats
+def annotate_vcf(input_vcf: str, str_bed: str, output_vcf: str,
+                parser: Optional[BaseVCFParser] = None) -> None:
+    """Convenience function for single VCF annotation.
+    Simple function interface for annotating a single VCF file.
+    Parameters
+    ----------
+    input_vcf : str
+        Path to input VCF file
+    str_bed : str
+        Path to STR BED file
+    output_vcf : str
+        Path to output VCF file
+    parser : BaseVCFParser, optional
+        Custom parser for genotype extraction
+    Examples
+    --------
+    >>> from strvcf_annotator import annotate_vcf
+    >>> annotate_vcf('input.vcf', 'str_regions.bed', 'output.vcf')
+    """
+    annotator = STRAnnotator(str_bed, parser)
+    annotator.annotate_vcf_file(input_vcf, output_vcf)

strvcf_annotator/cli.py ADDED Viewed

@@ -0,0 +1,191 @@
+"""Console script for strvcf_annotator."""
+import argparse
+import logging
+import sys
+from . import __version__
+from .api import STRAnnotator
+from .utils.validation import ValidationError
+def setup_logging(verbose: bool = False):
+    """Configure logging for CLI.
+    Parameters
+    ----------
+    verbose : bool
+        If True, set logging level to DEBUG
+    """
+    level = logging.DEBUG if verbose else logging.INFO
+    logging.basicConfig(
+        level=level,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+def create_parser() -> argparse.ArgumentParser:
+    """Create CLI argument parser.
+    Returns
+    -------
+    argparse.ArgumentParser
+        Configured argument parser
+    """
+    parser = argparse.ArgumentParser(
+        description="Annotate STR regions in VCF files using a BED file",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Annotate single VCF file
+  strvcf-annotator --input input.vcf --str-bed repeats.bed --output output.vcf
+  # Batch process directory
+  strvcf-annotator --input-dir vcf_files/ --str-bed repeats.bed --output-dir annotated/
+  # Enable verbose logging
+  strvcf-annotator --input input.vcf --str-bed repeats.bed --output output.vcf --verbose
+  # Somatic mode (filter variants where tumor==normal genotypes)
+  strvcf-annotator --input somatic.vcf --str-bed repeats.bed --output output.vcf --somatic-mode
+        """
+    )
+    # Input options (mutually exclusive)
+    input_group = parser.add_mutually_exclusive_group(required=True)
+    input_group.add_argument(
+        '--input',
+        type=str,
+        help='Path to input VCF file'
+    )
+    input_group.add_argument(
+        '--input-dir',
+        type=str,
+        help='Directory containing input VCF files'
+    )
+    # Required arguments
+    parser.add_argument(
+        '--str-bed',
+        required=True,
+        type=str,
+        help='Path to BED file with STR regions (CHROM, START, END, PERIOD, RU)'
+    )
+    # Output options
+    output_group = parser.add_mutually_exclusive_group(required=True)
+    output_group.add_argument(
+        '--output',
+        type=str,
+        help='Path to output VCF file (for single file mode)'
+    )
+    output_group.add_argument(
+        '--output-dir',
+        type=str,
+        help='Directory for output VCF files (for batch mode)'
+    )
+    # Optional arguments
+    parser.add_argument(
+        '--verbose',
+        '-v',
+        action='store_true',
+        help='Enable verbose logging'
+    )
+    parser.add_argument(
+        '--somatic-mode',
+        action='store_true',
+        help='Enable somatic filtering: skip variants where both samples have identical genotypes'
+    )
+    parser.add_argument(
+        '--version',
+        action='version',
+        version=f'%(prog)s {__version__}'
+    )
+    return parser
+def validate_args(args: argparse.Namespace) -> None:
+    """Validate CLI arguments.
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Parsed command-line arguments
+    Raises
+    ------
+    ValidationError
+        If arguments are invalid or inconsistent
+    """
+    # Validate input/output consistency
+    if args.input and not args.output:
+        raise ValidationError("--input requires --output")
+    if args.input_dir and not args.output_dir:
+        raise ValidationError("--input-dir requires --output-dir")
+    if args.output and not args.input:
+        raise ValidationError("--output requires --input")
+    if args.output_dir and not args.input_dir:
+        raise ValidationError("--output-dir requires --input-dir")
+def main():
+    """CLI entry point with argument parsing and validation."""
+    parser = create_parser()
+    args = parser.parse_args()
+    # Setup logging
+    setup_logging(args.verbose)
+    logger = logging.getLogger(__name__)
+    try:
+        # Validate arguments
+        validate_args(args)
+        # Create annotator
+        logger.info("Initializing STR annotator...")
+        somatic_mode = getattr(args, 'somatic_mode', False)
+        annotator = STRAnnotator(args.str_bed, somatic_mode=somatic_mode)
+        # Display statistics
+        stats = annotator.get_statistics()
+        logger.info(f"Loaded {stats['total_regions']} STR regions from {stats['chromosomes']} chromosomes")
+        # Process based on mode
+        if args.input:
+            # Single file mode
+            logger.info(f"Processing single file: {args.input}")
+            annotator.annotate_vcf_file(args.input, args.output)
+            logger.info(f"Successfully wrote annotated VCF to {args.output}")
+        elif args.input_dir:
+            # Batch directory mode
+            logger.info(f"Processing directory: {args.input_dir}")
+            annotator.process_directory(args.input_dir, args.output_dir)
+            logger.info(f"Successfully processed all VCF files to {args.output_dir}")
+        logger.info("Annotation complete!")
+        return 0
+    except ValidationError as e:
+        logger.error(f"Validation error: {e}")
+        return 1
+    except FileNotFoundError as e:
+        logger.error(f"File not found: {e}")
+        return 1
+    except Exception as e:
+        logger.error(f"Unexpected error: {e}", exc_info=args.verbose)
+        return 1
+if __name__ == "__main__":
+    sys.exit(main())

strvcf_annotator/core/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+"""Core modules for STR annotation functionality.
+Re-exports selected helpers from ``str_reference``, ``repeat_utils``,
+``annotation`` and ``vcf_processor`` so they can be imported from
+``strvcf_annotator.core`` directly.
+"""
+from .str_reference import load_str_reference
+from .repeat_utils import extract_repeat_sequence, count_repeat_units, apply_variant_to_repeat
+from .annotation import make_modified_header, build_new_record, should_skip_genotype
+from .vcf_processor import (
+    check_vcf_sorted,
+    reset_and_sort_vcf,
+    generate_annotated_records,
+    annotate_vcf_to_file
+)
+# Names exposed by ``from strvcf_annotator.core import *``; listed in the
+# same order as the imports above.
+__all__ = [
+    'load_str_reference',
+    'extract_repeat_sequence',
+    'count_repeat_units',
+    'apply_variant_to_repeat',
+    'make_modified_header',
+    'build_new_record',
+    'should_skip_genotype',
+    'check_vcf_sorted',
+    'reset_and_sort_vcf',
+    'generate_annotated_records',
+    'annotate_vcf_to_file'
+]