pylocuszoom 0.5.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/colors.py CHANGED
@@ -239,3 +239,44 @@ def get_credible_set_color_palette(n_sets: int = 10) -> dict[int, str]:
     return {
         i + 1: CREDIBLE_SET_COLORS[i % len(CREDIBLE_SET_COLORS)] for i in range(n_sets)
     }
+
+
+# PheWAS category colors - distinct colors for phenotype categories
+PHEWAS_CATEGORY_COLORS: List[str] = [
+    "#E41A1C",  # red
+    "#377EB8",  # blue
+    "#4DAF4A",  # green
+    "#984EA3",  # purple
+    "#FF7F00",  # orange
+    "#FFFF33",  # yellow
+    "#A65628",  # brown
+    "#F781BF",  # pink
+    "#999999",  # grey
+    "#66C2A5",  # teal
+    "#FC8D62",  # salmon
+    "#8DA0CB",  # periwinkle
+]
+
+
+def get_phewas_category_color(category_idx: int) -> str:
+    """Get color for a PheWAS category by index.
+
+    Args:
+        category_idx: Zero-indexed category number.
+
+    Returns:
+        Hex color code string.
+    """
+    return PHEWAS_CATEGORY_COLORS[category_idx % len(PHEWAS_CATEGORY_COLORS)]
+
+
+def get_phewas_category_palette(categories: List[str]) -> dict[str, str]:
+    """Get color palette mapping category names to colors.
+
+    Args:
+        categories: List of unique category names.
+
+    Returns:
+        Dictionary mapping category names to hex colors.
+    """
+    return {cat: get_phewas_category_color(i) for i, cat in enumerate(categories)}
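For context, a minimal usage sketch of the two helpers added above (assuming they are imported from pylocuszoom.colors as defined; not part of the diff itself):

    from pylocuszoom.colors import get_phewas_category_palette

    # Indices wrap modulo the 12-color list, so a 13th category reuses "#E41A1C".
    categories = ["cardiovascular", "metabolic", "neurological"]
    palette = get_phewas_category_palette(categories)
    # {'cardiovascular': '#E41A1C', 'metabolic': '#377EB8', 'neurological': '#4DAF4A'}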
pylocuszoom/ensembl.py ADDED
@@ -0,0 +1,476 @@
+# src/pylocuszoom/ensembl.py
+"""Ensembl REST API integration for reference data fetching.
+
+Provides functions to fetch gene and exon annotations from the Ensembl REST API
+(https://rest.ensembl.org) for any species.
+
+Note: Recombination rates are NOT available from Ensembl for most species.
+Use species-specific recombination maps instead (see recombination.py).
+"""
+
+import hashlib
+import os
+import sys
+import time
+from pathlib import Path
+
+import pandas as pd
+import requests
+
+from .logging import logger
+from .utils import ValidationError
+
+# Ensembl API limits regions to 5Mb
+ENSEMBL_MAX_REGION_SIZE = 5_000_000
+
+# Species name aliases -> Ensembl species names
+SPECIES_ALIASES: dict[str, str] = {
+    # Canine
+    "canine": "canis_lupus_familiaris",
+    "dog": "canis_lupus_familiaris",
+    "canis_familiaris": "canis_lupus_familiaris",
+    # Feline
+    "feline": "felis_catus",
+    "cat": "felis_catus",
+    # Human
+    "human": "homo_sapiens",
+    # Mouse
+    "mouse": "mus_musculus",
+    # Rat
+    "rat": "rattus_norvegicus",
+}
+
+
+ENSEMBL_REST_URL = "https://rest.ensembl.org"
+ENSEMBL_REQUEST_TIMEOUT = 30  # seconds
+ENSEMBL_MAX_RETRIES = 3
+ENSEMBL_RETRY_DELAY = 1.0  # seconds, doubles on each retry
+
+
+def _normalize_chrom(chrom: str | int) -> str:
+    """Normalize chromosome name by removing 'chr' prefix."""
+    return str(chrom).replace("chr", "")
+
+
+def _validate_region_size(start: int, end: int, context: str) -> None:
+    """Validate region size is within Ensembl API limits.
+
+    Args:
+        start: Region start position.
+        end: Region end position.
+        context: Context for error message (e.g., "genes_df", "exons_df").
+
+    Raises:
+        ValidationError: If region exceeds 5Mb limit.
+    """
+    region_size = end - start
+    if region_size > ENSEMBL_MAX_REGION_SIZE:
+        raise ValidationError(
+            f"Region size {region_size:,} bp exceeds Ensembl API limit of 5Mb. "
+            f"Please use a smaller region or provide {context} directly."
+        )
+
+
+def get_ensembl_species_name(species: str) -> str:
+    """Convert species alias to Ensembl species name.
+
+    Args:
+        species: Species name or alias (e.g., "canine", "dog", "human").
+
+    Returns:
+        Ensembl-compatible species name (e.g., "canis_lupus_familiaris").
+    """
+    return SPECIES_ALIASES.get(species.lower(), species.lower())
+
+
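To illustrate the alias table and its fallback behavior (a sketch; the example species are arbitrary):

    from pylocuszoom.ensembl import get_ensembl_species_name

    get_ensembl_species_name("dog")    # -> "canis_lupus_familiaris"
    get_ensembl_species_name("Human")  # -> "homo_sapiens" (input is lowercased first)
    get_ensembl_species_name("sus_scrofa")  # unknown names pass through lowercased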
+def get_ensembl_cache_dir() -> Path:
+    """Get the cache directory for Ensembl data.
+
+    Uses same base location as recombination maps: ~/.cache/snp-scope-plot/ensembl
+
+    Returns:
+        Path to cache directory (created if doesn't exist).
+    """
+    if sys.platform == "darwin":
+        base = Path.home() / ".cache"
+    elif sys.platform == "win32":
+        base = Path(os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local"))
+    else:
+        base = Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache"))
+
+    cache_dir = base / "snp-scope-plot" / "ensembl"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir
+
+
+def _cache_key(species: str, chrom: str, start: int, end: int) -> str:
+    """Generate cache key for a region."""
+    key_str = f"{species}_{chrom}_{start}_{end}"
+    return hashlib.md5(key_str.encode()).hexdigest()[:16]
+
+
+def get_cached_genes(
+    cache_dir: Path,
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+) -> pd.DataFrame | None:
+    """Load cached genes if available.
+
+    Args:
+        cache_dir: Cache directory path.
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position.
+        end: Region end position.
+
+    Returns:
+        DataFrame if cache hit, None if cache miss.
+    """
+    ensembl_species = get_ensembl_species_name(species)
+    chrom_str = _normalize_chrom(chrom)
+    cache_key = _cache_key(ensembl_species, chrom_str, start, end)
+
+    species_dir = cache_dir / ensembl_species
+    cache_file = species_dir / f"genes_{cache_key}.csv"
+
+    if not cache_file.exists():
+        return None
+
+    logger.debug(f"Cache hit: {cache_file}")
+    return pd.read_csv(cache_file)
+
+
+def save_cached_genes(
+    df: pd.DataFrame,
+    cache_dir: Path,
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+) -> None:
+    """Save genes to cache as CSV.
+
+    Args:
+        df: DataFrame with gene annotations to cache.
+        cache_dir: Cache directory path.
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position.
+        end: Region end position.
+    """
+    ensembl_species = get_ensembl_species_name(species)
+    chrom_str = _normalize_chrom(chrom)
+    cache_key = _cache_key(ensembl_species, chrom_str, start, end)
+
+    species_dir = cache_dir / ensembl_species
+    species_dir.mkdir(parents=True, exist_ok=True)
+
+    cache_file = species_dir / f"genes_{cache_key}.csv"
+    df.to_csv(cache_file, index=False)
+    logger.debug(f"Cached genes to: {cache_file}")
+
+
+def _make_ensembl_request(
+    url: str,
+    params: dict,
+    max_retries: int = ENSEMBL_MAX_RETRIES,
+    raise_on_error: bool = False,
+) -> list | None:
+    """Make request to Ensembl API with retry logic.
+
+    Args:
+        url: API endpoint URL.
+        params: Query parameters.
+        max_retries: Maximum retry attempts for retryable errors.
+        raise_on_error: If True, raise exception on error instead of returning None.
+
+    Returns:
+        JSON response as list, or None on non-retryable error.
+
+    Raises:
+        ValidationError: If raise_on_error=True and request fails.
+    """
+    delay = ENSEMBL_RETRY_DELAY
+
+    for attempt in range(max_retries):
+        try:
+            response = requests.get(
+                url,
+                params=params,
+                headers={"Content-Type": "application/json"},
+                timeout=ENSEMBL_REQUEST_TIMEOUT,
+            )
+        except requests.RequestException as e:
+            logger.warning(f"Ensembl API request failed (attempt {attempt + 1}): {e}")
+            if attempt < max_retries - 1:
+                time.sleep(delay)
+                delay *= 2
+                continue
+            if raise_on_error:
+                raise ValidationError(
+                    f"Ensembl API request failed after {max_retries} attempts: {e}"
+                )
+            return None
+
+        # Success
+        if response.ok:
+            return response.json()
+
+        # Retryable errors (429 rate limit, 503 service unavailable)
+        if response.status_code in (429, 503) and attempt < max_retries - 1:
+            logger.warning(
+                f"Ensembl API returned {response.status_code} "
+                f"(attempt {attempt + 1}), retrying..."
+            )
+            time.sleep(delay)
+            delay *= 2
+            continue
+
+        # Non-retryable error
+        error_msg = f"Ensembl API error {response.status_code}: {response.text[:200]}"
+        logger.warning(error_msg)
+        if raise_on_error:
+            raise ValidationError(error_msg)
+        return None
+
+    return None
+
+
+def fetch_genes_from_ensembl(
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+    biotype: str = "protein_coding",
+    raise_on_error: bool = False,
+) -> pd.DataFrame:
+    """Fetch gene annotations from Ensembl REST API.
+
+    Args:
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position (1-based).
+        end: Region end position (1-based).
+        biotype: Gene biotype filter (default: protein_coding).
+        raise_on_error: If True, raise ValidationError on API errors.
+
+    Returns:
+        DataFrame with columns: chr, start, end, gene_name, strand, gene_id, biotype.
+        Returns empty DataFrame on API error (unless raise_on_error=True).
+
+    Raises:
+        ValidationError: If region > 5Mb or if raise_on_error=True and API fails.
+    """
+    _validate_region_size(start, end, "genes_df")
+
+    ensembl_species = get_ensembl_species_name(species)
+    chrom_str = _normalize_chrom(chrom)
+
+    # Build region string
+    region = f"{chrom_str}:{start}-{end}"
+
+    # Build API URL
+    url = f"{ENSEMBL_REST_URL}/overlap/region/{ensembl_species}/{region}"
+    params = {"feature": "gene", "biotype": biotype}
+
+    logger.debug(f"Fetching genes from Ensembl: {url}")
+
+    data = _make_ensembl_request(url, params, raise_on_error=raise_on_error)
+
+    if data is None:
+        return pd.DataFrame()
+
+    if not data:
+        logger.debug(f"No genes found in region {region}")
+        return pd.DataFrame()
+
+    # Convert to DataFrame
+    records = []
+    for gene in data:
+        if gene.get("feature_type") != "gene":
+            continue
+        records.append(
+            {
+                "chr": str(gene.get("seq_region_name", chrom_str)),
+                "start": gene.get("start"),
+                "end": gene.get("end"),
+                "gene_name": gene.get("external_name", gene.get("id", "")),
+                "strand": "+" if gene.get("strand", 1) == 1 else "-",
+                "gene_id": gene.get("id", ""),
+                "biotype": gene.get("biotype", ""),
+            }
+        )
+
+    df = pd.DataFrame(records)
+    logger.debug(f"Fetched {len(df)} genes from Ensembl")
+    return df
+
+
+def fetch_exons_from_ensembl(
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+    raise_on_error: bool = False,
+) -> pd.DataFrame:
+    """Fetch exon annotations from Ensembl REST API.
+
+    Args:
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position (1-based).
+        end: Region end position (1-based).
+        raise_on_error: If True, raise ValidationError on API errors.
+
+    Returns:
+        DataFrame with columns: chr, start, end, gene_name, exon_id, transcript_id.
+        Returns empty DataFrame on API error (unless raise_on_error=True).
+
+    Raises:
+        ValidationError: If region > 5Mb or if raise_on_error=True and API fails.
+    """
+    _validate_region_size(start, end, "exons_df")
+
+    ensembl_species = get_ensembl_species_name(species)
+    chrom_str = _normalize_chrom(chrom)
+    region = f"{chrom_str}:{start}-{end}"
+
+    url = f"{ENSEMBL_REST_URL}/overlap/region/{ensembl_species}/{region}"
+    params = {"feature": "exon"}
+
+    logger.debug(f"Fetching exons from Ensembl: {url}")
+
+    data = _make_ensembl_request(url, params, raise_on_error=raise_on_error)
+
+    if data is None:
+        return pd.DataFrame()
+
+    if not data:
+        return pd.DataFrame()
+
+    records = []
+    for exon in data:
+        if exon.get("feature_type") != "exon":
+            continue
+        records.append(
+            {
+                "chr": str(exon.get("seq_region_name", chrom_str)),
+                "start": exon.get("start"),
+                "end": exon.get("end"),
+                "gene_name": "",  # Exon endpoint doesn't include gene name
+                "exon_id": exon.get("id", ""),
+                "transcript_id": exon.get("Parent", ""),
+            }
+        )
+
+    df = pd.DataFrame(records)
+    logger.debug(f"Fetched {len(df)} exons from Ensembl")
+    return df
+
+
+def get_genes_for_region(
+    species: str,
+    chrom: str | int,
+    start: int,
+    end: int,
+    cache_dir: Path | None = None,
+    use_cache: bool = True,
+    include_exons: bool = False,
+    raise_on_error: bool = False,
+) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
+    """Get gene annotations for a genomic region.
+
+    Checks cache first, fetches from Ensembl API if not cached.
+
+    Args:
+        species: Species name or alias.
+        chrom: Chromosome name or number.
+        start: Region start position (1-based).
+        end: Region end position (1-based).
+        cache_dir: Cache directory (uses default if None).
+        use_cache: Whether to use disk cache.
+        include_exons: If True, also fetch exons and return tuple (genes_df, exons_df).
+        raise_on_error: If True, raise ValidationError on API errors.
+
+    Returns:
+        If include_exons=False: DataFrame with gene annotations.
+        If include_exons=True: Tuple of (genes_df, exons_df).
+
+    Raises:
+        ValidationError: If region > 5Mb or if raise_on_error=True and API fails.
+
+    Note:
+        Gene annotations are cached to disk. Exons are fetched from the API
+        on each call when include_exons=True (not cached separately).
+    """
+    if cache_dir is None:
+        cache_dir = get_ensembl_cache_dir()
+
+    chrom_str = _normalize_chrom(chrom)
+
+    # Check cache first
+    if use_cache:
+        cached = get_cached_genes(cache_dir, species, chrom_str, start, end)
+        if cached is not None:
+            if include_exons:
+                # Exons not cached separately (yet)
+                exons_df = fetch_exons_from_ensembl(
+                    species, chrom_str, start, end, raise_on_error=raise_on_error
+                )
+                return cached, exons_df
+            return cached
+
+    # Fetch from Ensembl API
+    genes_df = fetch_genes_from_ensembl(
+        species, chrom_str, start, end, raise_on_error=raise_on_error
+    )
+
+    # Cache the result (even if empty, to avoid repeated API calls for gene-sparse regions)
+    if use_cache:
+        save_cached_genes(genes_df, cache_dir, species, chrom_str, start, end)
+
+    if include_exons:
+        exons_df = fetch_exons_from_ensembl(
+            species, chrom_str, start, end, raise_on_error=raise_on_error
+        )
+        return genes_df, exons_df
+
+    return genes_df
+
+
+def clear_ensembl_cache(
+    cache_dir: Path | None = None,
+    species: str | None = None,
+) -> int:
+    """Clear cached Ensembl data.
+
+    Args:
+        cache_dir: Cache directory (uses default if None).
+        species: If provided, only clear cache for this species.
+
+    Returns:
+        Number of files deleted.
+    """
+    if cache_dir is None:
+        cache_dir = get_ensembl_cache_dir()
+
+    deleted = 0
+
+    if species:
+        # Clear only specific species
+        ensembl_species = get_ensembl_species_name(species)
+        species_dir = cache_dir / ensembl_species
+        if species_dir.exists():
+            for cache_file in species_dir.glob("*.csv"):
+                cache_file.unlink()
+                deleted += 1
+    else:
+        # Clear all species
+        for cache_file in cache_dir.glob("**/*.csv"):
+            cache_file.unlink()
+            deleted += 1
+
+    logger.info(f"Cleared {deleted} cached Ensembl files from {cache_dir}")
+    return deleted
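Putting the new module together, a minimal usage sketch of its public entry points (the region coordinates are illustrative; regions must span at most 5Mb):

    from pylocuszoom.ensembl import clear_ensembl_cache, get_genes_for_region

    # First call hits the Ensembl REST API and writes a CSV to the cache;
    # later calls for the same (species, chrom, start, end) read from disk.
    genes_df, exons_df = get_genes_for_region(
        "dog", chrom=18, start=20_000_000, end=21_000_000, include_exons=True
    )

    # Remove cached files for one species (or pass species=None to clear all).
    deleted = clear_ensembl_cache(species="dog")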
pylocuszoom/eqtl.py CHANGED
@@ -10,6 +10,8 @@ import numpy as np
 import pandas as pd

 from .logging import logger
+from .utils import ValidationError, filter_by_region
+from .validation import DataFrameValidator

 REQUIRED_EQTL_COLS = ["pos", "p_value"]
 OPTIONAL_EQTL_COLS = ["gene", "effect_size", "rs", "se"]
@@ -36,17 +38,14 @@ def validate_eqtl_df(
     Raises:
         EQTLValidationError: If required columns are missing.
     """
-    missing = []
-    if pos_col not in df.columns:
-        missing.append(pos_col)
-    if p_col not in df.columns:
-        missing.append(p_col)
-
-    if missing:
-        raise EQTLValidationError(
-            f"eQTL DataFrame missing required columns: {missing}. "
-            f"Required: {pos_col} (position), {p_col} (p-value)"
+    try:
+        (
+            DataFrameValidator(df, "eQTL DataFrame")
+            .require_columns([pos_col, p_col])
+            .validate()
         )
+    except ValidationError as e:
+        raise EQTLValidationError(str(e)) from e


 def filter_eqtl_by_gene(
@@ -99,15 +98,12 @@ def filter_eqtl_by_region(
     Returns:
         Filtered DataFrame containing only eQTLs in the region.
     """
-    mask = (df[pos_col] >= start) & (df[pos_col] <= end)
-
-    # Filter by chromosome if column exists
-    if chrom_col and chrom_col in df.columns:
-        chrom_str = str(chrom).replace("chr", "")
-        df_chrom = df[chrom_col].astype(str).str.replace("chr", "", regex=False)
-        mask = mask & (df_chrom == chrom_str)
-
-    filtered = df[mask].copy()
+    filtered = filter_by_region(
+        df,
+        region=(chrom, start, end),
+        chrom_col=chrom_col or "",
+        pos_col=pos_col,
+    )
     logger.debug(
         f"Filtered eQTL data to {len(filtered)} variants in region chr{chrom}:{start}-{end}"
     )
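The net effect of the eqtl.py changes: column validation and region filtering are delegated to the shared DataFrameValidator and filter_by_region utilities, while failures still surface as EQTLValidationError. A sketch of the preserved behavior (assuming the default column names "pos" and "p_value" from REQUIRED_EQTL_COLS):

    import pandas as pd
    from pylocuszoom.eqtl import EQTLValidationError, validate_eqtl_df

    df = pd.DataFrame({"pos": [101, 202]})  # missing the "p_value" column
    try:
        validate_eqtl_df(df)
    except EQTLValidationError as e:
        print(e)  # message now produced by DataFrameValidator.require_columns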
pylocuszoom/finemapping.py CHANGED
@@ -9,6 +9,8 @@ from typing import List, Optional
 import pandas as pd

 from .logging import logger
+from .utils import ValidationError, filter_by_region
+from .validation import DataFrameValidator

 # Required columns for fine-mapping data
 REQUIRED_FINEMAPPING_COLS = ["pos", "pip"]
@@ -36,24 +38,16 @@ def validate_finemapping_df(
     Raises:
         FinemappingValidationError: If required columns are missing.
     """
-    missing = []
-    if pos_col not in df.columns:
-        missing.append(pos_col)
-    if pip_col not in df.columns:
-        missing.append(pip_col)
-
-    if missing:
-        raise FinemappingValidationError(
-            f"Fine-mapping DataFrame missing required columns: {missing}. "
-            f"Required: {pos_col} (position), {pip_col} (posterior inclusion probability)"
-        )
-
-    # Validate PIP values are in [0, 1]
-    if not df[pip_col].between(0, 1).all():
-        invalid_count = (~df[pip_col].between(0, 1)).sum()
-        raise FinemappingValidationError(
-            f"PIP values must be between 0 and 1. Found {invalid_count} invalid values."
+    try:
+        (
+            DataFrameValidator(df, "Fine-mapping DataFrame")
+            .require_columns([pos_col, pip_col])
+            .require_numeric([pip_col])
+            .require_range(pip_col, min_val=0, max_val=1)
+            .validate()
         )
+    except ValidationError as e:
+        raise FinemappingValidationError(str(e)) from e


 def filter_finemapping_by_region(
@@ -77,15 +71,12 @@ def filter_finemapping_by_region(
     Returns:
         Filtered DataFrame containing only variants in the region.
     """
-    mask = (df[pos_col] >= start) & (df[pos_col] <= end)
-
-    # Filter by chromosome if column exists
-    if chrom_col and chrom_col in df.columns:
-        chrom_str = str(chrom).replace("chr", "")
-        df_chrom = df[chrom_col].astype(str).str.replace("chr", "", regex=False)
-        mask = mask & (df_chrom == chrom_str)
-
-    filtered = df[mask].copy()
+    filtered = filter_by_region(
+        df,
+        region=(chrom, start, end),
+        chrom_col=chrom_col or "",
+        pos_col=pos_col,
+    )
     logger.debug(
         f"Filtered fine-mapping data to {len(filtered)} variants in region "
         f"chr{chrom}:{start}-{end}"
pylocuszoom/forest.py ADDED
@@ -0,0 +1,35 @@
+"""Forest plot data validation and preparation.
+
+Validates and prepares meta-analysis/forest plot data for visualization.
+"""
+
+import pandas as pd
+
+from .validation import DataFrameValidator
+
+
+def validate_forest_df(
+    df: pd.DataFrame,
+    study_col: str = "study",
+    effect_col: str = "effect",
+    ci_lower_col: str = "ci_lower",
+    ci_upper_col: str = "ci_upper",
+) -> None:
+    """Validate forest plot DataFrame has required columns and types.
+
+    Args:
+        df: Forest plot data DataFrame.
+        study_col: Column name for study/phenotype names.
+        effect_col: Column name for effect sizes (beta, OR, HR).
+        ci_lower_col: Column name for lower confidence interval.
+        ci_upper_col: Column name for upper confidence interval.
+
+    Raises:
+        ValidationError: If required columns are missing or have invalid types.
+    """
+    (
+        DataFrameValidator(df, "Forest plot DataFrame")
+        .require_columns([study_col, effect_col, ci_lower_col, ci_upper_col])
+        .require_numeric([effect_col, ci_lower_col, ci_upper_col])
+        .validate()
+    )
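A minimal usage sketch of the new validator (the DataFrame below follows the documented default column names; the data values are illustrative):

    import pandas as pd
    from pylocuszoom.forest import validate_forest_df

    df = pd.DataFrame(
        {
            "study": ["Cohort A", "Cohort B"],
            "effect": [0.42, 0.31],
            "ci_lower": [0.18, 0.05],
            "ci_upper": [0.66, 0.57],
        }
    )
    validate_forest_df(df)  # raises ValidationError on missing or non-numeric columns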