PyPI - supremo-lite - Versions diffs - 0.5.4__py3-none-any.whl - Mend

supremo-lite 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

supremo_lite/__init__.py +59 -0
supremo_lite/chromosome_utils.py +322 -0
supremo_lite/core.py +41 -0
supremo_lite/mock_models/__init__.py +110 -0
supremo_lite/mock_models/testmodel_1d.py +184 -0
supremo_lite/mock_models/testmodel_2d.py +203 -0
supremo_lite/mutagenesis.py +414 -0
supremo_lite/personalize.py +3098 -0
supremo_lite/prediction_alignment.py +1014 -0
supremo_lite/sequence_utils.py +137 -0
supremo_lite/variant_utils.py +1645 -0
supremo_lite-0.5.4.dist-info/METADATA +216 -0
supremo_lite-0.5.4.dist-info/RECORD +15 -0
supremo_lite-0.5.4.dist-info/WHEEL +4 -0
supremo_lite-0.5.4.dist-info/licenses/LICENSE +22 -0

supremo_lite/__init__.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""
+supremo_lite: A module for generating personalized genome sequences from a reference
+fasta and a variants file, or sequences for in-silico mutagenesis.
+This package provides functionality for:
+- Sequence encoding and transformation
+- Variant reading and application
+- Chromosome name matching between VCF and reference
+- In-silico mutagenesis
+- Alignment of predictions by coordinate
+The names imported below are re-exported at package level.
+"""
+# Import core components
+from .core import TORCH_AVAILABLE, BRISKET_AVAILABLE, nt_to_1h, nts
+# Import sequence transformation utilities
+from .sequence_utils import encode_seq, decode_seq, rc, rc_str
+# Import variant reading utilities
+from .variant_utils import (
+    read_vcf,
+    read_vcf_chunked,
+    get_vcf_chromosomes,
+    read_vcf_chromosome,
+    classify_variant_type,
+    parse_vcf_info,
+)
+# Import chromosome matching utilities
+from .chromosome_utils import (
+    normalize_chromosome_name,
+    create_chromosome_mapping,
+    match_chromosomes_with_report,
+    ChromosomeMismatchError,
+)
+# Import personalize functions
+from .personalize import (
+    get_personal_genome,
+    get_alt_sequences,
+    get_ref_sequences,
+    get_pam_disrupting_alt_sequences,
+    get_alt_ref_sequences,
+)
+# Import mutagenesis functions
+from .mutagenesis import get_sm_sequences, get_sm_subsequences
+# Import prediction alignment functions
+from .prediction_alignment import align_predictions_by_coordinate
+# Mock models are available in a separate submodule and are deliberately NOT
+# imported here.
+# Import with: from supremo_lite.mock_models import TestModel, TestModel2D
+# This allows users who don't have PyTorch to still use the main package
+# Version (kept in sync with the distribution version by hand)
+__version__ = "0.5.4"
+# Package metadata: one-line human-readable summary of the package
+__description__ = (
+    "A module for generating personalized genome sequences and in-silico mutagenesis"
+)

supremo_lite/chromosome_utils.py ADDED Viewed

@@ -0,0 +1,322 @@
+"""
+Chromosome name matching utilities for supremo_lite.
+This module provides functions for handling mismatches in chromosome naming
+between FASTA references and VCF files using intelligent heuristics.
+"""
+import re
+import warnings
+from typing import Dict, Set, Optional, List, Tuple
+class ChromosomeMismatchError(Exception):
+    """
+    Signal that chromosome names in the VCF and the reference disagree.
+    Raised when the names do not match exactly and the caller has not
+    enabled automatic chromosome mapping.
+    """
+def normalize_chromosome_name(chrom_name: str) -> str:
+    """
+    Reduce a chromosome name to a canonical, comparable form.
+    The value is converted to a string, surrounding whitespace is trimmed, a
+    leading 'chr' (any letter case) is dropped, and the remainder is
+    upper-cased. Mitochondrial aliases ('M', 'MITO', 'MITOCHONDRION') all
+    collapse to 'MT'.
+    Args:
+        chrom_name: Raw chromosome name from VCF or FASTA
+    Returns:
+        Normalized chromosome name (without 'chr' prefix, uppercase)
+    Examples:
+        'chr1' -> '1'
+        'CHR1' -> '1'
+        'chrX' -> 'X'
+        'chrMT' -> 'MT'
+        'M' -> 'MT'  # Mitochondrial normalization
+    """
+    trimmed = str(chrom_name).strip()
+    # Drop the prefix before upper-casing so 'Chr', 'CHR' and 'chr' all match.
+    bare = re.sub(r"^chr", "", trimmed, flags=re.IGNORECASE).upper()
+    return "MT" if bare in ("M", "MITO", "MITOCHONDRION") else bare
+def create_chromosome_mapping(
+    reference_chroms: Set[str], vcf_chroms: Set[str]
+) -> Tuple[Dict[str, str], Set[str]]:
+    """
+    Create a mapping from VCF chromosome names to reference chromosome names.
+    Each VCF name is tried against the reference with these heuristics, in
+    order, stopping at the first success:
+    1. Exact match (case sensitive)
+    2. Exact match (case insensitive)
+    3. 'chr' prefix toggled (removed if present, added if absent), first
+       case sensitive and then case insensitive
+    4. Normalized match via normalize_chromosome_name (handles mitochondrial
+       aliases such as 'chrM' vs 'MT')
+    When more than one reference name satisfies a case-insensitive or
+    normalized comparison, the alphabetically first one is chosen, so the
+    result does not depend on set iteration order.
+    Args:
+        reference_chroms: Set of chromosome names from reference FASTA
+        vcf_chroms: Set of chromosome names from VCF file
+    Returns:
+        Tuple of (mapping dict, unmatched set). The dict maps every matched
+        VCF name to its reference name; the set holds the VCF names for which
+        no heuristic found a reference name.
+    Example:
+        reference_chroms = {'1', '2', 'X', 'Y', 'MT'}
+        vcf_chroms = {'chr1', 'chr2', 'chrX', 'chrY', 'chrM'}
+        Returns: (
+            {'chr1': '1', 'chr2': '2', 'chrX': 'X', 'chrY': 'Y', 'chrM': 'MT'},
+            set(),
+        )
+    """
+    # Case-folded lookup built once instead of rescanning the reference for
+    # every VCF name; sorted() + setdefault() makes tie-breaking deterministic.
+    ref_by_lower: Dict[str, str] = {}
+    for ref_chrom in sorted(reference_chroms):
+        ref_by_lower.setdefault(ref_chrom.lower(), ref_chrom)
+    # The normalized lookup is built lazily: it is only needed when a name
+    # survives steps 1-3 unmatched.
+    ref_by_normalized: Optional[Dict[str, str]] = None
+    def _find(candidate: str) -> Optional[str]:
+        """Return the reference name equal to candidate, preferring exact case."""
+        if candidate in reference_chroms:
+            return candidate
+        return ref_by_lower.get(candidate.lower())
+    mapping: Dict[str, str] = {}
+    unmatched_vcf: Set[str] = set()
+    for vcf_chrom in vcf_chroms:
+        # Steps 1 and 2: the name exactly as given.
+        matched_ref = _find(vcf_chrom)
+        # Step 3: the same name with its 'chr' prefix toggled.
+        if matched_ref is None:
+            if vcf_chrom.lower().startswith("chr"):
+                toggled = vcf_chrom[3:]
+            else:
+                toggled = f"chr{vcf_chrom}"
+            matched_ref = _find(toggled)
+        # Step 4: normalized comparison (handles mitochondrial variants).
+        if matched_ref is None:
+            if ref_by_normalized is None:
+                ref_by_normalized = {}
+                for ref_chrom in sorted(reference_chroms):
+                    ref_by_normalized.setdefault(
+                        normalize_chromosome_name(ref_chrom), ref_chrom
+                    )
+            matched_ref = ref_by_normalized.get(normalize_chromosome_name(vcf_chrom))
+        # Record result
+        if matched_ref is not None:
+            mapping[vcf_chrom] = matched_ref
+        else:
+            unmatched_vcf.add(vcf_chrom)
+    return mapping, unmatched_vcf
+def apply_chromosome_mapping(variants_df, mapping: Dict[str, str]):
+    """
+    Return a copy of a variants table with chromosome names translated.
+    Names that have no entry in the mapping are kept unchanged. The input
+    DataFrame itself is not modified.
+    Args:
+        variants_df: Pandas DataFrame with 'chrom' column
+        mapping: Dictionary mapping original to new chromosome names
+    Returns:
+        New DataFrame whose 'chrom' column holds the mapped names
+    """
+    remapped = variants_df.copy()
+    def _translate(name):
+        # Fall back to the original name when the mapping has no entry for it.
+        return mapping.get(name, name)
+    remapped["chrom"] = remapped["chrom"].map(_translate)
+    return remapped
+def get_chromosome_match_report(
+    reference_chroms: Set[str],
+    vcf_chroms: Set[str],
+    mapping: Dict[str, str],
+    unmatched: Set[str],
+) -> str:
+    """
+    Build a plain-text summary of how VCF chromosomes were matched.
+    The report lists both name sets, every successful match (marking the ones
+    that required renaming), every unmatched VCF name, and finally the share
+    of VCF names that were matched. An empty VCF set is reported as 100%.
+    Args:
+        reference_chroms: Set of reference chromosome names
+        vcf_chroms: Set of VCF chromosome names
+        mapping: Successful mappings
+        unmatched: Unmatched VCF chromosomes
+    Returns:
+        Formatted report string
+    """
+    n_vcf = len(vcf_chroms)
+    n_mapped = len(mapping)
+    lines = [
+        "Chromosome Matching Report",
+        "=" * 40,
+        f"Reference chromosomes ({len(reference_chroms)}): {sorted(reference_chroms)}",
+        f"VCF chromosomes ({n_vcf}): {sorted(vcf_chroms)}",
+        "",
+    ]
+    if mapping:
+        lines.append(f"Successfully matched ({n_mapped}):")
+        lines.extend(
+            f"  '{vcf_name}' (exact match)"
+            if vcf_name == ref_name
+            else f"  '{vcf_name}' -> '{ref_name}'"
+            for vcf_name, ref_name in sorted(mapping.items())
+        )
+    if unmatched:
+        lines.append("")
+        lines.append(f"Unmatched VCF chromosomes ({len(unmatched)}):")
+        lines.extend(
+            f"  '{name}' (no suitable reference match found)"
+            for name in sorted(unmatched)
+        )
+    lines.append("")
+    # Guard the division: with no VCF chromosomes the coverage is 100 by definition.
+    percent = n_mapped / n_vcf * 100 if vcf_chroms else 100
+    lines.append(f"Matching coverage: {percent:.1f}% ({n_mapped}/{n_vcf})")
+    return "\n".join(lines)
+def match_chromosomes_with_report(
+    reference_chroms: Set[str],
+    vcf_chroms: Set[str],
+    verbose: bool = True,
+    auto_map_chromosomes: bool = False,
+) -> Tuple[Dict[str, str], Set[str]]:
+    """
+    Resolve VCF chromosome names against the reference, reporting on request.
+    If every VCF name is already present in the reference, an identity mapping
+    is returned immediately. Otherwise the outcome depends on
+    auto_map_chromosomes: when False an error is raised, when True the names
+    are matched heuristically with create_chromosome_mapping.
+    Args:
+        reference_chroms: Set of reference chromosome names
+        vcf_chroms: Set of VCF chromosome names
+        verbose: Whether to print the matching report. The report is only
+                 printed when some name was renamed or left unmatched.
+        auto_map_chromosomes: Whether to automatically map chromosome names when they don't
+                             match exactly (default: False). When False, raises
+                             ChromosomeMismatchError if names don't match.
+    Returns:
+        Tuple of (mapping dict, unmatched set)
+    Raises:
+        ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names don't
+                                match exactly between VCF and reference
+    """
+    # Fast path: nothing to do when every VCF name exists verbatim in the reference.
+    shared = reference_chroms & vcf_chroms
+    pending = vcf_chroms - shared
+    if not pending:
+        return {name: name for name in vcf_chroms}, set()
+    # Mapping is required; refuse unless the caller opted in.
+    if not auto_map_chromosomes:
+        raise ChromosomeMismatchError(
+            "Chromosome names in VCF and reference do not match.\n\n"
+            f"VCF chromosomes: {', '.join(sorted(vcf_chroms))}\n"
+            f"Reference chromosomes: {', '.join(sorted(reference_chroms))}\n\n"
+            "To enable automatic chromosome name mapping, add auto_map_chromosomes=True:\n"
+            "  \n"
+            "  get_personal_genome(..., auto_map_chromosomes=True)\n\n"
+            "Alternatively, rename chromosomes in your VCF to match the reference."
+        )
+    # Automatic mapping is enabled - use heuristics
+    mapping, unmatched = create_chromosome_mapping(reference_chroms, vcf_chroms)
+    if verbose:
+        has_renames = any(source != target for source, target in mapping.items())
+        if len(mapping) < len(vcf_chroms) or has_renames:
+            print(
+                get_chromosome_match_report(
+                    reference_chroms, vcf_chroms, mapping, unmatched
+                )
+            )
+    if unmatched:
+        skipped = ", ".join(sorted(unmatched))
+        warnings.warn(
+            f"Skipped {len(unmatched)} chromosome(s) not in reference: {skipped}"
+        )
+    return mapping, unmatched
+def validate_chromosomes_early(reference, variants_fn):
+    """
+    Collect reference and VCF chromosome names before loading all variant data.
+    The variant chromosomes are obtained according to the input type:
+    - For VCF file paths (str or os.PathLike such as pathlib.Path): delegated
+      to get_vcf_chromosomes, which receives the path as a plain string
+    - For DataFrames and other objects exposing a 'chrom' column: the unique
+      values of that column, without reloading any data
+    Args:
+        reference: Reference genome (dict-like object with .keys())
+        variants_fn: VCF file path (str or path-like) or DataFrame with variants
+    Returns:
+        Tuple of (ref_chroms, vcf_chroms). ref_chroms is always a set;
+        vcf_chroms is a set for column-based input and whatever
+        get_vcf_chromosomes returns for path input (presumably a set).
+    Note:
+        This function does NOT raise errors or perform mapping - it only extracts
+        chromosome names. Use match_chromosomes_with_report() for
+        actual validation and mapping.
+    """
+    import os
+    # Get reference chromosomes
+    ref_chroms = set(reference.keys())
+    if isinstance(variants_fn, (str, os.PathLike)):
+        # Imported here rather than at module or function top so that
+        # in-memory inputs never need the VCF reader.
+        from .variant_utils import get_vcf_chromosomes
+        # os.fspath leaves str untouched and converts path objects to str.
+        vcf_chroms = get_vcf_chromosomes(os.fspath(variants_fn))
+    else:
+        # DataFrame or any other object exposing a 'chrom' column with .unique()
+        vcf_chroms = set(variants_fn["chrom"].unique())
+    return ref_chroms, vcf_chroms

supremo_lite/core.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""
+Core utilities, constants and common functions for supremo_lite.
+This module provides the basic constants and utility functions used throughout
+the package.
+"""
+import numpy as np
+from collections import defaultdict
+import warnings
+# Check for PyTorch availability
+# TORCH_AVAILABLE records whether `import torch` succeeded. The probe runs once,
+# when this module is first imported, and warns instead of failing so the
+# package stays importable without PyTorch.
+try:
+    import torch
+    TORCH_AVAILABLE = True
+except ImportError:
+    TORCH_AVAILABLE = False
+    warnings.warn("PyTorch not found. Will return numpy arrays instead of tensors.")
+# Check for brisket availability
+# BRISKET_AVAILABLE records whether the optional `brisket` package imported;
+# a missing package only produces a warning.
+try:
+    import brisket
+    BRISKET_AVAILABLE = True
+except ImportError:
+    BRISKET_AVAILABLE = False
+    warnings.warn("Brisket not found. Using slower sequence encoding implementation.")
+# Nucleotide to one-hot encoding mapping
+# A defaultdict is used so that any symbol other than A/C/G/T (either case)
+# resolves to an all-zero vector instead of raising KeyError.
+nt_to_1h = defaultdict(lambda: np.array([0, 0, 0, 0]))
+# Generate the eight entries (upper and lower case per base) from the base
+# order "ACGT"; each entry gets its own independent array.
+nt_to_1h.update(
+    (symbol, np.array([int(slot == index) for slot in range(4)]))
+    for index, base in enumerate("ACGT")
+    for symbol in (base, base.lower())
+)
+# Bases in the same order as the one-hot positions above.
+nts = np.array(list("ACGT"))

supremo_lite/mock_models/__init__.py ADDED Viewed

@@ -0,0 +1,110 @@
+"""
+Mock models for testing and demonstration purposes.
+This module provides simple PyTorch models that mimic realistic genomic deep learning
+architectures without requiring actual training. These models are intended for:
+1. **Testing**: Verifying that prediction alignment functions work correctly with
+   realistic model outputs (binned predictions, edge cropping, diagonal masking)
+2. **Documentation**: Providing immediately runnable examples for users who want to
+   understand the package workflow without training their own models
+**Important**: These models return constant values and should NOT be used for actual
+genomic predictions or biological interpretation.
+Available Models
+----------------
+TestModel : nn.Module
+    Mock 1D genomic prediction model
+    - Output shape: (batch_size, n_targets, n_final_bins)
+    - Features: binning, edge cropping
+TestModel2D : nn.Module
+    Mock 2D contact map prediction model
+    - Output shape: (batch_size, n_targets, n_flattened_ut_bins)
+    - Features: binning, edge cropping, diagonal masking, flattened output
+Examples
+--------
+Using TestModel for 1D predictions:
+>>> from supremo_lite.mock_models import TestModel, TORCH_AVAILABLE
+>>> if TORCH_AVAILABLE:
+...     import torch
+...     model = TestModel(seq_length=1024, bin_length=32, crop_length=128)
+...     x = torch.randn(4, 4, 1024)
+...     predictions = model(x)
+...     print(predictions.shape)
+torch.Size([4, 1, 24])
+Using TestModel2D for contact maps:
+>>> from supremo_lite.mock_models import TestModel2D
+>>> if TORCH_AVAILABLE:
+...     import torch
+...     model = TestModel2D(seq_length=2048, bin_length=64, crop_length=256)
+...     x = torch.randn(4, 4, 2048)
+...     predictions = model(x)
+...     print(predictions.shape)
+torch.Size([4, 1, 276])
+Checking PyTorch Availability
+------------------------------
+>>> from supremo_lite.mock_models import TORCH_AVAILABLE
+>>> if not TORCH_AVAILABLE:
+...     print("Please install PyTorch to use mock models")
+Notes
+-----
+- Requires PyTorch to be installed
+- If PyTorch is not available, attempting to instantiate models will raise ImportError
+- Check TORCH_AVAILABLE before using models
+- See individual model documentation for architecture details
+"""
+# Import the real mock models when possible; otherwise expose placeholders
+# that fail with an installation hint as soon as they are instantiated.
+try:
+    from .testmodel_1d import TestModel, TORCH_AVAILABLE as TORCH_AVAILABLE_1D
+    from .testmodel_2d import TestModel2D, TORCH_AVAILABLE as TORCH_AVAILABLE_2D
+except ImportError as import_error:
+    # This should rarely happen since the modules handle their own imports,
+    # but a graceful fallback is provided anyway.
+    import warnings
+    TORCH_AVAILABLE = False
+    warnings.warn(
+        f"Could not import mock models: {import_error}\n"
+        "Mock models require PyTorch. Install with: pip install torch",
+        ImportWarning,
+    )
+    # Placeholder classes: importable by name, unusable without PyTorch.
+    class TestModel:
+        """TestModel requires PyTorch. Please install with: pip install torch"""
+        def __init__(self, *args, **kwargs):
+            raise ImportError(
+                "TestModel requires PyTorch. Install with: pip install torch\n"
+                "See https://pytorch.org/get-started/locally/"
+            )
+    class TestModel2D:
+        """TestModel2D requires PyTorch. Please install with: pip install torch"""
+        def __init__(self, *args, **kwargs):
+            raise ImportError(
+                "TestModel2D requires PyTorch. Install with: pip install torch\n"
+                "See https://pytorch.org/get-started/locally/"
+            )
+else:
+    # Both flags should have the same value; combine them for consistency.
+    TORCH_AVAILABLE = TORCH_AVAILABLE_1D and TORCH_AVAILABLE_2D
+__all__ = [
+    "TestModel",
+    "TestModel2D",
+    "TORCH_AVAILABLE",
+]