pylocuszoom 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- pylocuszoom/__init__.py +34 -7
- pylocuszoom/backends/__init__.py +116 -17
- pylocuszoom/backends/base.py +363 -60
- pylocuszoom/backends/bokeh_backend.py +77 -15
- pylocuszoom/backends/hover.py +198 -0
- pylocuszoom/backends/matplotlib_backend.py +263 -3
- pylocuszoom/backends/plotly_backend.py +73 -16
- pylocuszoom/config.py +365 -0
- pylocuszoom/ensembl.py +476 -0
- pylocuszoom/eqtl.py +17 -25
- pylocuszoom/exceptions.py +33 -0
- pylocuszoom/finemapping.py +18 -32
- pylocuszoom/forest.py +10 -11
- pylocuszoom/gene_track.py +169 -142
- pylocuszoom/loaders.py +3 -1
- pylocuszoom/phewas.py +10 -11
- pylocuszoom/plotter.py +311 -277
- pylocuszoom/recombination.py +19 -3
- pylocuszoom/schemas.py +1 -6
- pylocuszoom/utils.py +54 -4
- pylocuszoom/validation.py +223 -0
- {pylocuszoom-0.6.0.dist-info → pylocuszoom-1.0.0.dist-info}/METADATA +82 -37
- pylocuszoom-1.0.0.dist-info/RECORD +31 -0
- pylocuszoom-0.6.0.dist-info/RECORD +0 -26
- {pylocuszoom-0.6.0.dist-info → pylocuszoom-1.0.0.dist-info}/WHEEL +0 -0
- {pylocuszoom-0.6.0.dist-info → pylocuszoom-1.0.0.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/ensembl.py
ADDED
@@ -0,0 +1,476 @@
+# src/pylocuszoom/ensembl.py
+"""Ensembl REST API integration for reference data fetching.
+
+Provides functions to fetch gene and exon annotations from the Ensembl REST API
+(https://rest.ensembl.org) for any species.
+
+Note: Recombination rates are NOT available from Ensembl for most species.
+Use species-specific recombination maps instead (see recombination.py).
+"""
+
+import hashlib
+import os
+import sys
+import time
+from pathlib import Path
+
+import pandas as pd
+import requests
+
+from .logging import logger
+from .utils import ValidationError
+
+# Ensembl API limits regions to 5Mb
+ENSEMBL_MAX_REGION_SIZE = 5_000_000
+
+# Species name aliases -> Ensembl species names
+SPECIES_ALIASES: dict[str, str] = {
+    # Canine
+    "canine": "canis_lupus_familiaris",
+    "dog": "canis_lupus_familiaris",
+    "canis_familiaris": "canis_lupus_familiaris",
+    # Feline
+    "feline": "felis_catus",
+    "cat": "felis_catus",
+    # Human
+    "human": "homo_sapiens",
+    # Mouse
+    "mouse": "mus_musculus",
+    # Rat
+    "rat": "rattus_norvegicus",
+}
+
+
+ENSEMBL_REST_URL = "https://rest.ensembl.org"
+ENSEMBL_REQUEST_TIMEOUT = 30  # seconds
+ENSEMBL_MAX_RETRIES = 3
+ENSEMBL_RETRY_DELAY = 1.0  # seconds, doubles on each retry
+
+
+def _normalize_chrom(chrom: str | int) -> str:
+    """Normalize chromosome name by removing 'chr' prefix."""
+    return str(chrom).replace("chr", "")
+
+
+def _validate_region_size(start: int, end: int, context: str) -> None:
+    """Validate region size is within Ensembl API limits.
+
+    Args:
+        start: Region start position.
+        end: Region end position.
+        context: Context for error message (e.g., "genes_df", "exons_df").
+
+    Raises:
+        ValidationError: If region exceeds 5Mb limit.
+    """
+    region_size = end - start
+    if region_size > ENSEMBL_MAX_REGION_SIZE:
+        raise ValidationError(
+            f"Region size {region_size:,} bp exceeds Ensembl API limit of 5Mb. "
+            f"Please use a smaller region or provide {context} directly."
+        )
+
+
+def get_ensembl_species_name(species: str) -> str:
+    """Convert species alias to Ensembl species name.
+
+    Args:
+        species: Species name or alias (e.g., "canine", "dog", "human").
+
+    Returns:
+        Ensembl-compatible species name (e.g., "canis_lupus_familiaris").
+    """
+    return SPECIES_ALIASES.get(species.lower(), species.lower())
+
+
+def get_ensembl_cache_dir() -> Path:
+    """Get the cache directory for Ensembl data.
+
+    Uses same base location as recombination maps: ~/.cache/snp-scope-plot/ensembl
+
+    Returns:
+        Path to cache directory (created if doesn't exist).
+    """
+    if sys.platform == "darwin":
+        base = Path.home() / ".cache"
+    elif sys.platform == "win32":
+        base = Path(os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local"))
+    else:
+        base = Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache"))
+
+    cache_dir = base / "snp-scope-plot" / "ensembl"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir
+
+
+def _cache_key(species: str, chrom: str, start: int, end: int) -> str:
+    """Generate cache key for a region."""
+    key_str = f"{species}_{chrom}_{start}_{end}"
+    return hashlib.md5(key_str.encode()).hexdigest()[:16]
+
+
+def get_cached_genes(
+    cache_dir: Path,
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+) -> pd.DataFrame | None:
+    """Load cached genes if available.
+
+    Args:
+        cache_dir: Cache directory path.
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position.
+        end: Region end position.
+
+    Returns:
+        DataFrame if cache hit, None if cache miss.
+    """
+    ensembl_species = get_ensembl_species_name(species)
+    chrom_str = _normalize_chrom(chrom)
+    cache_key = _cache_key(ensembl_species, chrom_str, start, end)
+
+    species_dir = cache_dir / ensembl_species
+    cache_file = species_dir / f"genes_{cache_key}.csv"
+
+    if not cache_file.exists():
+        return None
+
+    logger.debug(f"Cache hit: {cache_file}")
+    return pd.read_csv(cache_file)
+
+
+def save_cached_genes(
+    df: pd.DataFrame,
+    cache_dir: Path,
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+) -> None:
+    """Save genes to cache as CSV.
+
+    Args:
+        df: DataFrame with gene annotations to cache.
+        cache_dir: Cache directory path.
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position.
+        end: Region end position.
+    """
+    ensembl_species = get_ensembl_species_name(species)
+    chrom_str = _normalize_chrom(chrom)
+    cache_key = _cache_key(ensembl_species, chrom_str, start, end)
+
+    species_dir = cache_dir / ensembl_species
+    species_dir.mkdir(parents=True, exist_ok=True)
+
+    cache_file = species_dir / f"genes_{cache_key}.csv"
+    df.to_csv(cache_file, index=False)
+    logger.debug(f"Cached genes to: {cache_file}")
+
+
+def _make_ensembl_request(
+    url: str,
+    params: dict,
+    max_retries: int = ENSEMBL_MAX_RETRIES,
+    raise_on_error: bool = False,
+) -> list | None:
+    """Make request to Ensembl API with retry logic.
+
+    Args:
+        url: API endpoint URL.
+        params: Query parameters.
+        max_retries: Maximum retry attempts for retryable errors.
+        raise_on_error: If True, raise exception on error instead of returning None.
+
+    Returns:
+        JSON response as list, or None on non-retryable error.
+
+    Raises:
+        ValidationError: If raise_on_error=True and request fails.
+    """
+    delay = ENSEMBL_RETRY_DELAY
+
+    for attempt in range(max_retries):
+        try:
+            response = requests.get(
+                url,
+                params=params,
+                headers={"Content-Type": "application/json"},
+                timeout=ENSEMBL_REQUEST_TIMEOUT,
+            )
+        except requests.RequestException as e:
+            logger.warning(f"Ensembl API request failed (attempt {attempt + 1}): {e}")
+            if attempt < max_retries - 1:
+                time.sleep(delay)
+                delay *= 2
+                continue
+            if raise_on_error:
+                raise ValidationError(
+                    f"Ensembl API request failed after {max_retries} attempts: {e}"
+                )
+            return None
+
+        # Success
+        if response.ok:
+            return response.json()
+
+        # Retryable errors (429 rate limit, 503 service unavailable)
+        if response.status_code in (429, 503) and attempt < max_retries - 1:
+            logger.warning(
+                f"Ensembl API returned {response.status_code} "
+                f"(attempt {attempt + 1}), retrying..."
+            )
+            time.sleep(delay)
+            delay *= 2
+            continue
+
+        # Non-retryable error
+        error_msg = f"Ensembl API error {response.status_code}: {response.text[:200]}"
+        logger.warning(error_msg)
+        if raise_on_error:
+            raise ValidationError(error_msg)
+        return None
+
+    return None
+
+
+def fetch_genes_from_ensembl(
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+    biotype: str = "protein_coding",
+    raise_on_error: bool = False,
+) -> pd.DataFrame:
+    """Fetch gene annotations from Ensembl REST API.
+
+    Args:
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position (1-based).
+        end: Region end position (1-based).
+        biotype: Gene biotype filter (default: protein_coding).
+        raise_on_error: If True, raise ValidationError on API errors.
+
+    Returns:
+        DataFrame with columns: chr, start, end, gene_name, strand, gene_id, biotype.
+        Returns empty DataFrame on API error (unless raise_on_error=True).
+
+    Raises:
+        ValidationError: If region > 5Mb or if raise_on_error=True and API fails.
+    """
+    _validate_region_size(start, end, "genes_df")
+
+    ensembl_species = get_ensembl_species_name(species)
+    chrom_str = _normalize_chrom(chrom)
+
+    # Build region string
+    region = f"{chrom_str}:{start}-{end}"
+
+    # Build API URL
+    url = f"{ENSEMBL_REST_URL}/overlap/region/{ensembl_species}/{region}"
+    params = {"feature": "gene", "biotype": biotype}
+
+    logger.debug(f"Fetching genes from Ensembl: {url}")
+
+    data = _make_ensembl_request(url, params, raise_on_error=raise_on_error)
+
+    if data is None:
+        return pd.DataFrame()
+
+    if not data:
+        logger.debug(f"No genes found in region {region}")
+        return pd.DataFrame()
+
+    # Convert to DataFrame
+    records = []
+    for gene in data:
+        if gene.get("feature_type") != "gene":
+            continue
+        records.append(
+            {
+                "chr": str(gene.get("seq_region_name", chrom_str)),
+                "start": gene.get("start"),
+                "end": gene.get("end"),
+                "gene_name": gene.get("external_name", gene.get("id", "")),
+                "strand": "+" if gene.get("strand", 1) == 1 else "-",
+                "gene_id": gene.get("id", ""),
+                "biotype": gene.get("biotype", ""),
+            }
+        )
+
+    df = pd.DataFrame(records)
+    logger.debug(f"Fetched {len(df)} genes from Ensembl")
+    return df
+
+
+def fetch_exons_from_ensembl(
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+    raise_on_error: bool = False,
+) -> pd.DataFrame:
+    """Fetch exon annotations from Ensembl REST API.
+
+    Args:
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position (1-based).
+        end: Region end position (1-based).
+        raise_on_error: If True, raise ValidationError on API errors.
+
+    Returns:
+        DataFrame with columns: chr, start, end, gene_name, exon_id, transcript_id.
+        Returns empty DataFrame on API error (unless raise_on_error=True).
+
+    Raises:
+        ValidationError: If region > 5Mb or if raise_on_error=True and API fails.
+    """
+    _validate_region_size(start, end, "exons_df")
+
+    ensembl_species = get_ensembl_species_name(species)
+    chrom_str = _normalize_chrom(chrom)
+    region = f"{chrom_str}:{start}-{end}"
+
+    url = f"{ENSEMBL_REST_URL}/overlap/region/{ensembl_species}/{region}"
+    params = {"feature": "exon"}
+
+    logger.debug(f"Fetching exons from Ensembl: {url}")
+
+    data = _make_ensembl_request(url, params, raise_on_error=raise_on_error)
+
+    if data is None:
+        return pd.DataFrame()
+
+    if not data:
+        return pd.DataFrame()
+
+    records = []
+    for exon in data:
+        if exon.get("feature_type") != "exon":
+            continue
+        records.append(
+            {
+                "chr": str(exon.get("seq_region_name", chrom_str)),
+                "start": exon.get("start"),
+                "end": exon.get("end"),
+                "gene_name": "",  # Exon endpoint doesn't include gene name
+                "exon_id": exon.get("id", ""),
+                "transcript_id": exon.get("Parent", ""),
+            }
+        )
+
+    df = pd.DataFrame(records)
+    logger.debug(f"Fetched {len(df)} exons from Ensembl")
+    return df
+
+
+def get_genes_for_region(
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+    cache_dir: Path | None = None,
+    use_cache: bool = True,
+    include_exons: bool = False,
+    raise_on_error: bool = False,
+) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
+    """Get gene annotations for a genomic region.
+
+    Checks cache first, fetches from Ensembl API if not cached.
+
+    Args:
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position (1-based).
+        end: Region end position (1-based).
+        cache_dir: Cache directory (uses default if None).
+        use_cache: Whether to use disk cache.
+        include_exons: If True, also fetch exons and return tuple (genes_df, exons_df).
+        raise_on_error: If True, raise ValidationError on API errors.
+
+    Returns:
+        If include_exons=False: DataFrame with gene annotations.
+        If include_exons=True: Tuple of (genes_df, exons_df).
+
+    Raises:
+        ValidationError: If region > 5Mb or if raise_on_error=True and API fails.
+
+    Note:
+        Gene annotations are cached to disk. Exons are fetched from the API
+        on each call when include_exons=True (not cached separately).
+    """
+    if cache_dir is None:
+        cache_dir = get_ensembl_cache_dir()
+
+    chrom_str = _normalize_chrom(chrom)
+
+    # Check cache first
+    if use_cache:
+        cached = get_cached_genes(cache_dir, species, chrom_str, start, end)
+        if cached is not None:
+            if include_exons:
+                # Exons not cached separately (yet)
+                exons_df = fetch_exons_from_ensembl(
+                    species, chrom_str, start, end, raise_on_error=raise_on_error
+                )
+                return cached, exons_df
+            return cached
+
+    # Fetch from Ensembl API
+    genes_df = fetch_genes_from_ensembl(
+        species, chrom_str, start, end, raise_on_error=raise_on_error
+    )
+
+    # Cache the result (even if empty, to avoid repeated API calls for gene-sparse regions)
+    if use_cache:
+        save_cached_genes(genes_df, cache_dir, species, chrom_str, start, end)
+
+    if include_exons:
+        exons_df = fetch_exons_from_ensembl(
+            species, chrom_str, start, end, raise_on_error=raise_on_error
+        )
+        return genes_df, exons_df
+
+    return genes_df
+
+
+def clear_ensembl_cache(
+    cache_dir: Path | None = None,
+    species: str | None = None,
+) -> int:
+    """Clear cached Ensembl data.
+
+    Args:
+        cache_dir: Cache directory (uses default if None).
+        species: If provided, only clear cache for this species.
+
+    Returns:
+        Number of files deleted.
+    """
+    if cache_dir is None:
+        cache_dir = get_ensembl_cache_dir()
+
+    deleted = 0
+
+    if species:
+        # Clear only specific species
+        ensembl_species = get_ensembl_species_name(species)
+        species_dir = cache_dir / ensembl_species
+        if species_dir.exists():
+            for cache_file in species_dir.glob("*.csv"):
+                cache_file.unlink()
+                deleted += 1
+    else:
+        # Clear all species
+        for cache_file in cache_dir.glob("**/*.csv"):
+            cache_file.unlink()
+            deleted += 1
+
+    logger.info(f"Cleared {deleted} cached Ensembl files from {cache_dir}")
+    return deleted
pylocuszoom/eqtl.py
CHANGED
@@ -9,18 +9,15 @@ from typing import List, Optional
 import numpy as np
 import pandas as pd
 
+from .exceptions import EQTLValidationError, ValidationError
 from .logging import logger
+from .utils import filter_by_region
+from .validation import DataFrameValidator
 
 REQUIRED_EQTL_COLS = ["pos", "p_value"]
 OPTIONAL_EQTL_COLS = ["gene", "effect_size", "rs", "se"]
 
 
-class EQTLValidationError(ValueError):
-    """Raised when eQTL DataFrame validation fails."""
-
-    pass
-
-
 def validate_eqtl_df(
     df: pd.DataFrame,
     pos_col: str = "pos",
@@ -36,17 +33,15 @@ def validate_eqtl_df(
     Raises:
         EQTLValidationError: If required columns are missing.
     """
-
-
-
-
-
-
-    if missing:
-        raise EQTLValidationError(
-            f"eQTL DataFrame missing required columns: {missing}. "
-            f"Required: {pos_col} (position), {p_col} (p-value)"
+    try:
+        (
+            DataFrameValidator(df, "eQTL DataFrame")
+            .require_columns([pos_col, p_col])
+            .require_numeric([p_col])
+            .validate()
         )
+    except ValidationError as e:
+        raise EQTLValidationError(str(e)) from e
 
 
 def filter_eqtl_by_gene(
@@ -99,15 +94,12 @@ def filter_eqtl_by_region(
     Returns:
         Filtered DataFrame containing only eQTLs in the region.
     """
-
-
-
-
-
-
-    mask = mask & (df_chrom == chrom_str)
-
-    filtered = df[mask].copy()
+    filtered = filter_by_region(
+        df,
+        region=(chrom, start, end),
+        chrom_col=chrom_col or "",
+        pos_col=pos_col,
+    )
     logger.debug(
         f"Filtered eQTL data to {len(filtered)} variants in region chr{chrom}:{start}-{end}"
    )
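
The net effect of this refactor: the hand-rolled column check moves into the shared DataFrameValidator, and its message is re-raised under the existing domain-specific type. A small illustrative sketch (column names are passed explicitly to match REQUIRED_EQTL_COLS):

import pandas as pd

from pylocuszoom.eqtl import validate_eqtl_df
from pylocuszoom.exceptions import EQTLValidationError

df = pd.DataFrame({"pos": [1_000_000, 1_000_500]})  # no "p_value" column

try:
    validate_eqtl_df(df, pos_col="pos", p_col="p_value")
except EQTLValidationError as e:
    # The validator's message survives the re-raise, and the exception
    # type is unchanged, so pre-1.0 except blocks keep working.
    print(f"Invalid eQTL input: {e}")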
pylocuszoom/exceptions.py
ADDED
@@ -0,0 +1,33 @@
+"""Exception hierarchy for pyLocusZoom.
+
+All pyLocusZoom exceptions inherit from PyLocusZoomError, enabling users to
+catch all library errors with `except PyLocusZoomError`.
+"""
+
+
+class PyLocusZoomError(Exception):
+    """Base exception for all pyLocusZoom errors."""
+
+
+class ValidationError(PyLocusZoomError, ValueError):
+    """Raised when input validation fails. Inherits ValueError for backward compat."""
+
+
+class EQTLValidationError(ValidationError):
+    """Raised when eQTL DataFrame validation fails."""
+
+
+class FinemappingValidationError(ValidationError):
+    """Raised when fine-mapping DataFrame validation fails."""
+
+
+class LoaderValidationError(ValidationError):
+    """Raised when loaded data fails validation."""
+
+
+class BackendError(PyLocusZoomError):
+    """Raised when backend operations fail."""
+
+
+class DataDownloadError(PyLocusZoomError, RuntimeError):
+    """Raised when data download operations fail."""
pylocuszoom/finemapping.py
CHANGED
@@ -8,19 +8,16 @@ from typing import List, Optional
 
 import pandas as pd
 
+from .exceptions import FinemappingValidationError, ValidationError
 from .logging import logger
+from .utils import filter_by_region
+from .validation import DataFrameValidator
 
 # Required columns for fine-mapping data
 REQUIRED_FINEMAPPING_COLS = ["pos", "pip"]
 OPTIONAL_FINEMAPPING_COLS = ["rs", "cs", "cs_id", "effect", "se"]
 
 
-class FinemappingValidationError(ValueError):
-    """Raised when fine-mapping DataFrame validation fails."""
-
-    pass
-
-
 def validate_finemapping_df(
     df: pd.DataFrame,
     pos_col: str = "pos",
@@ -36,24 +33,16 @@ def validate_finemapping_df(
     Raises:
         FinemappingValidationError: If required columns are missing.
     """
-
-
-
-
-
-
-
-        raise FinemappingValidationError(
-            f"Fine-mapping DataFrame missing required columns: {missing}. "
-            f"Required: {pos_col} (position), {pip_col} (posterior inclusion probability)"
-        )
-
-    # Validate PIP values are in [0, 1]
-    if not df[pip_col].between(0, 1).all():
-        invalid_count = (~df[pip_col].between(0, 1)).sum()
-        raise FinemappingValidationError(
-            f"PIP values must be between 0 and 1. Found {invalid_count} invalid values."
+    try:
+        (
+            DataFrameValidator(df, "Fine-mapping DataFrame")
+            .require_columns([pos_col, pip_col])
+            .require_numeric([pip_col])
+            .require_range(pip_col, min_val=0, max_val=1)
+            .validate()
         )
+    except ValidationError as e:
+        raise FinemappingValidationError(str(e)) from e
 
 
 def filter_finemapping_by_region(
@@ -77,15 +66,12 @@ def filter_finemapping_by_region(
     Returns:
         Filtered DataFrame containing only variants in the region.
     """
-
-
-
-
-
-
-    mask = mask & (df_chrom == chrom_str)
-
-    filtered = df[mask].copy()
+    filtered = filter_by_region(
+        df,
+        region=(chrom, start, end),
+        chrom_col=chrom_col or "",
+        pos_col=pos_col,
+    )
     logger.debug(
         f"Filtered fine-mapping data to {len(filtered)} variants in region "
         f"chr{chrom}:{start}-{end}"
pylocuszoom/forest.py
CHANGED
@@ -5,7 +5,7 @@ Validates and prepares meta-analysis/forest plot data for visualization.
 
 import pandas as pd
 
-from .
+from .validation import DataFrameValidator
 
 
 def validate_forest_df(
@@ -15,7 +15,7 @@ def validate_forest_df(
     ci_lower_col: str = "ci_lower",
     ci_upper_col: str = "ci_upper",
 ) -> None:
-    """Validate forest plot DataFrame has required columns.
+    """Validate forest plot DataFrame has required columns and types.
 
     Args:
         df: Forest plot data DataFrame.
@@ -25,13 +25,12 @@ def validate_forest_df(
         ci_upper_col: Column name for upper confidence interval.
 
     Raises:
-        ValidationError: If required columns are missing.
+        ValidationError: If required columns are missing or have invalid types.
     """
-
-
-
-
-
-
-
-    )
+    (
+        DataFrameValidator(df, "Forest plot DataFrame")
+        .require_columns([study_col, effect_col, ci_lower_col, ci_upper_col])
+        .require_numeric([effect_col, ci_lower_col, ci_upper_col])
+        .require_ci_ordering(ci_lower_col, effect_col, ci_upper_col)
+        .validate()
+    )
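
For completeness, a sketch of the strengthened forest-plot validation. The require_ci_ordering step is new in 1.0.0; given its argument order it presumably enforces ci_lower <= effect <= ci_upper per row, and the data below is constructed to satisfy that. Column names are passed explicitly, since only the ci_* defaults are visible in this hunk.

import pandas as pd

from pylocuszoom.forest import validate_forest_df

df = pd.DataFrame(
    {
        "study": ["Cohort A", "Cohort B"],
        "effect": [0.12, -0.05],
        "ci_lower": [0.02, -0.15],
        "ci_upper": [0.22, 0.05],
    }
)

# All columns present and numeric, and each CI brackets its effect:
# validation passes silently (it raises ValidationError otherwise).
validate_forest_df(
    df,
    study_col="study",
    effect_col="effect",
    ci_lower_col="ci_lower",
    ci_upper_col="ci_upper",
)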