pylocuszoom 0.5.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,14 +9,16 @@ Provides:
 import os
 import tarfile
 import tempfile
-import urllib.request
 from pathlib import Path
 from typing import Optional
 
 import pandas as pd
+import requests
 from matplotlib.axes import Axes
+from tqdm import tqdm
 
 from .logging import logger
+from .utils import filter_by_region
 
 # Recombination overlay color
 RECOMB_COLOR = "#7FCDFF"  # Light blue
@@ -54,6 +56,38 @@ def get_chain_file_path() -> Path:
     return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"
 
 
+def _download_with_progress(
+    url: str, dest_path: Path, desc: str = "Downloading"
+) -> None:
+    """Download a file with a progress bar.
+
+    Args:
+        url: URL to download from.
+        dest_path: Destination file path.
+        desc: Description for the progress bar.
+    """
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+
+    total_size = int(response.headers.get("content-length", 0))
+
+    with (
+        open(dest_path, "wb") as f,
+        tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            desc=desc,
+            disable=total_size == 0,  # Disable if size unknown
+        ) as pbar,
+    ):
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+                pbar.update(len(chunk))
+
+
 def download_liftover_chain(force: bool = False) -> Path:
     """Download the CanFam3 to CanFam4 liftover chain file.
 
@@ -73,20 +107,11 @@ def download_liftover_chain(force: bool = False) -> Path:
     logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
     logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")
 
-    try:
-        urllib.request.urlretrieve(CANFAM3_TO_CANFAM4_CHAIN_URL, chain_path)
-    except Exception as e:
-        logger.debug(f"urllib download failed: {e}")
-        try:
-            import requests
-
-            response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
-            response.raise_for_status()
-            chain_path.write_bytes(response.content)
-        except ImportError:
-            raise RuntimeError(
-                "Failed to download. Install requests: pip install requests"
-            )
+    _download_with_progress(
+        CANFAM3_TO_CANFAM4_CHAIN_URL,
+        chain_path,
+        desc="Liftover chain",
+    )
 
     logger.info(f"Chain file saved to: {chain_path}")
     return chain_path
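
The new helper streams the body in 8 KiB chunks instead of buffering it with `response.content`, so memory use stays flat for arbitrarily large files. A minimal sketch of the two patterns (URL and filename are illustrative, not from the package):

```python
import requests

url = "https://example.com/big.tar.gz"  # illustrative URL

# Buffered: the entire body is held in memory before writing.
data = requests.get(url, timeout=60).content

# Streamed: fixed-size chunks are written as they arrive.
with requests.get(url, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    with open("big.tar.gz", "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
```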
@@ -217,31 +242,31 @@ def download_canine_recombination_maps(
     logger.debug(f"Source: {CANINE_RECOMB_URL}")
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        # Download tar.gz file
+        # Download tar.gz file with progress bar
         tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"
 
-        try:
-            urllib.request.urlretrieve(CANINE_RECOMB_URL, tar_path)
-        except Exception as e:
-            logger.debug(f"urllib download failed: {e}")
-            logger.debug("Trying alternative method with requests...")
-            try:
-                import requests
-
-                response = requests.get(CANINE_RECOMB_URL, timeout=60)
-                response.raise_for_status()
-                tar_path.write_bytes(response.content)
-            except ImportError:
-                raise RuntimeError(
-                    "Failed to download. Install requests: pip install requests"
-                )
+        _download_with_progress(
+            CANINE_RECOMB_URL,
+            tar_path,
+            desc="Recombination maps",
+        )
 
         logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")
 
-        # Extract tar.gz
+        # Extract tar.gz with path traversal protection
         logger.debug("Extracting genetic maps...")
         with tarfile.open(tar_path, "r:gz") as tar:
-            tar.extractall(tmpdir)
+            # Filter to prevent path traversal attacks
+            safe_members = []
+            for member in tar.getmembers():
+                # Resolve the path and ensure it stays within tmpdir
+                member_path = Path(tmpdir) / member.name
+                try:
+                    member_path.resolve().relative_to(Path(tmpdir).resolve())
+                    safe_members.append(member)
+                except ValueError:
+                    logger.warning(f"Skipping unsafe path in archive: {member.name}")
+            tar.extractall(tmpdir, members=safe_members)
 
         # Find and process the extracted files
         extracted_dir = Path(tmpdir)
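
The manual member check mirrors what newer interpreters provide natively: PEP 706 extraction filters. Where the `filter` argument is available (Python 3.12, plus security backports to earlier series), the same protection is one line; a sketch assuming that argument exists on the running interpreter:

```python
import tarfile

# The "data" filter rejects absolute paths, parent-directory escapes, and
# other unsafe members before extraction (PEP 706).
with tarfile.open("dog_genetic_maps.tar.gz", "r:gz") as tar:
    tar.extractall("dest", filter="data")
```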
@@ -360,7 +385,12 @@ def get_recombination_rate_for_region(
     )
 
     # Filter to region
-    region_df = df[(df["pos"] >= start) & (df["pos"] <= end)].copy()
+    region_df = filter_by_region(
+        df,
+        region=(chrom, start, end),
+        chrom_col="",  # Recomb maps don't have chromosome column
+        pos_col="pos",
+    )
 
     return region_df[["pos", "rate"]]
 
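Passing `chrom_col=""` works because an empty string is never a real column name, so `filter_by_region` skips the chromosome mask and filters on position alone. A small self-contained check of that behavior (toy data, inclusive bounds):

```python
import pandas as pd

from pylocuszoom.utils import filter_by_region

# Per-chromosome recombination maps carry no chromosome column, so the
# empty chrom_col forces position-only filtering.
recomb = pd.DataFrame({"pos": [10, 150, 1000], "rate": [0.1, 0.5, 0.2]})
subset = filter_by_region(recomb, region=("1", 100, 1000), chrom_col="", pos_col="pos")
assert list(subset["pos"]) == [150, 1000]  # end bound is inclusive
```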
pylocuszoom/schemas.py CHANGED
@@ -84,30 +84,36 @@ def validate_gwas_dataframe(
             "GWAS validation failed:\n - " + "\n - ".join(errors)
         )
 
-    # Check data types
-    if not pd.api.types.is_numeric_dtype(df[pos_col]):
+    # Check data types (must be numeric for range checks)
+    pos_is_numeric = pd.api.types.is_numeric_dtype(df[pos_col])
+    p_is_numeric = pd.api.types.is_numeric_dtype(df[p_col])
+
+    if not pos_is_numeric:
         errors.append(f"Column '{pos_col}' must be numeric, got {df[pos_col].dtype}")
 
-    if not pd.api.types.is_numeric_dtype(df[p_col]):
+    if not p_is_numeric:
         errors.append(f"Column '{p_col}' must be numeric, got {df[p_col].dtype}")
 
-    # Check value ranges
-    if (df[pos_col] <= 0).any():
-        n_invalid = (df[pos_col] <= 0).sum()
-        errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
+    # Only check value ranges if columns are numeric (avoid confusing errors)
+    if pos_is_numeric:
+        if (df[pos_col] <= 0).any():
+            n_invalid = (df[pos_col] <= 0).sum()
+            errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
 
-    if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
-        n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
-        errors.append(f"Column '{p_col}' has {n_invalid} values outside range (0, 1]")
+        if df[pos_col].isna().any():
+            n_na = df[pos_col].isna().sum()
+            errors.append(f"Column '{pos_col}' has {n_na} missing values")
 
-    # Check for NaN in required columns
-    if df[pos_col].isna().any():
-        n_na = df[pos_col].isna().sum()
-        errors.append(f"Column '{pos_col}' has {n_na} missing values")
+    if p_is_numeric:
+        if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
+            n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
+            errors.append(
+                f"Column '{p_col}' has {n_invalid} values outside range (0, 1]"
+            )
 
-    if df[p_col].isna().any():
-        n_na = df[p_col].isna().sum()
-        errors.append(f"Column '{p_col}' has {n_na} missing values")
+        if df[p_col].isna().any():
+            n_na = df[p_col].isna().sum()
+            errors.append(f"Column '{p_col}' has {n_na} missing values")
 
     if errors:
         raise LoaderValidationError(
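
Gating the range and NaN checks behind the dtype checks matters because comparing an object-dtype column against a number raises rather than returning a mask, which would abort validation with an unrelated traceback instead of the accumulated error list. A quick demonstration of the failure mode being avoided (toy data):

```python
import pandas as pd

# p-values read from a malformed file can arrive as strings.
df = pd.DataFrame({"pos": [100, 200], "p": ["0.01", "0.5"]})

try:
    (df["p"] <= 0).any()  # object dtype vs. int comparison
except TypeError as exc:
    print(f"unguarded range check raises: {exc}")
```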
@@ -344,20 +350,25 @@ def validate_genes_dataframe(
     )
 
     # Check data types
-    if not pd.api.types.is_numeric_dtype(df["start"]):
+    start_is_numeric = pd.api.types.is_numeric_dtype(df["start"])
+    end_is_numeric = pd.api.types.is_numeric_dtype(df["end"])
+
+    if not start_is_numeric:
         errors.append(f"Column 'start' must be numeric, got {df['start'].dtype}")
 
-    if not pd.api.types.is_numeric_dtype(df["end"]):
+    if not end_is_numeric:
         errors.append(f"Column 'end' must be numeric, got {df['end'].dtype}")
 
-    # Check ranges
-    if (df["start"] < 0).any():
-        n_invalid = (df["start"] < 0).sum()
-        errors.append(f"Column 'start' has {n_invalid} negative values")
+    # Only check ranges if columns are numeric (avoid confusing errors)
+    if start_is_numeric:
+        if (df["start"] < 0).any():
+            n_invalid = (df["start"] < 0).sum()
+            errors.append(f"Column 'start' has {n_invalid} negative values")
 
-    if (df["end"] < df["start"]).any():
-        n_invalid = (df["end"] < df["start"]).sum()
-        errors.append(f"Found {n_invalid} genes where end < start")
+    if start_is_numeric and end_is_numeric:
+        if (df["end"] < df["start"]).any():
+            n_invalid = (df["end"] < df["start"]).sum()
+            errors.append(f"Found {n_invalid} genes where end < start")
 
     if errors:
         raise LoaderValidationError(
pylocuszoom/utils.py CHANGED
@@ -106,6 +106,58 @@ def normalize_chrom(chrom: Union[int, str]) -> str:
     return str(chrom).replace("chr", "")
 
 
+def filter_by_region(
+    df: pd.DataFrame,
+    region: tuple,
+    chrom_col: str = "chrom",
+    pos_col: str = "pos",
+) -> pd.DataFrame:
+    """Filter DataFrame to a genomic region with inclusive bounds.
+
+    Filters rows where position is within [start, end] (inclusive).
+    If chrom_col exists in the DataFrame, also filters by chromosome.
+    Chromosome comparison normalizes types (int/str, "chr" prefix).
+
+    Args:
+        df: DataFrame to filter.
+        region: Tuple of (chrom, start, end) defining the region.
+        chrom_col: Column name for chromosome (default: "chrom").
+            If the column doesn't exist, filters by position only.
+        pos_col: Column name for position (default: "pos").
+
+    Returns:
+        Filtered DataFrame (copy, not view).
+
+    Raises:
+        KeyError: If pos_col is not found in the DataFrame.
+
+    Example:
+        >>> filtered = filter_by_region(df, region=(1, 1000000, 2000000))
+        >>> filtered = filter_by_region(df, region=("chr1", 1e6, 2e6), pos_col="position")
+    """
+    chrom, start, end = region
+
+    # Validate position column exists
+    if pos_col not in df.columns:
+        raise KeyError(
+            f"Position column '{pos_col}' not found in DataFrame. "
+            f"Available columns: {list(df.columns)}"
+        )
+
+    # Position filtering (inclusive bounds)
+    mask = (df[pos_col] >= start) & (df[pos_col] <= end)
+
+    # Chromosome filtering (if column exists)
+    if chrom_col in df.columns:
+        chrom_normalized = normalize_chrom(chrom)
+        df_chrom_normalized = (
+            df[chrom_col].astype(str).str.replace("chr", "", regex=False)
+        )
+        mask = mask & (df_chrom_normalized == chrom_normalized)
+
+    return df[mask].copy()
+
+
 def validate_dataframe(
     df: pd.DataFrame,
     required_cols: List[str],
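
The chromosome normalization means callers don't need to match the frame's representation: integers, strings, and `chr`-prefixed names all compare equal after the prefix is stripped. A short usage example with toy data:

```python
import pandas as pd

from pylocuszoom.utils import filter_by_region

# "chr1" in the frame matches the integer 1 in the region tuple.
df = pd.DataFrame({"chrom": ["chr1", "chr1", "chr2"], "pos": [5, 50, 50]})
hits = filter_by_region(df, region=(1, 10, 100))
assert hits["pos"].tolist() == [50] and hits["chrom"].tolist() == ["chr1"]
```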
@@ -0,0 +1,172 @@
+"""DataFrame validation builder for pyLocusZoom.
+
+Provides a fluent API for validating pandas DataFrames with composable
+validation rules. Accumulates all validation errors before raising.
+"""
+
+from typing import List, Optional
+
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+
+from .utils import ValidationError
+
+
+class DataFrameValidator:
+    """Builder for composable DataFrame validation.
+
+    Validates DataFrames with method chaining and accumulates all errors
+    before raising. This enables clear, readable validation code with
+    comprehensive error messages.
+
+    Example:
+        >>> (DataFrameValidator(df, name="gwas_df")
+        ...     .require_columns(["chr", "pos", "p"])
+        ...     .require_numeric(["pos", "p"])
+        ...     .require_range("p", min_val=0, max_val=1)
+        ...     .validate())
+    """
+
+    def __init__(self, df: pd.DataFrame, name: str = "DataFrame"):
+        """Initialize validator.
+
+        Args:
+            df: DataFrame to validate.
+            name: Name for error messages (e.g., "gwas_df", "genes_df").
+        """
+        self._df = df
+        self._name = name
+        self._errors: List[str] = []
+
+    def require_columns(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that required columns exist in the DataFrame.
+
+        Args:
+            columns: List of required column names.
+
+        Returns:
+            Self for method chaining.
+        """
+        if not columns:
+            return self
+
+        missing = [col for col in columns if col not in self._df.columns]
+        if missing:
+            available = list(self._df.columns)
+            self._errors.append(f"Missing columns: {missing}. Available: {available}")
+
+        return self
+
+    def require_numeric(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that columns have numeric dtype.
+
+        Skips columns that don't exist (checked separately by require_columns).
+
+        Args:
+            columns: List of column names that should be numeric.
+
+        Returns:
+            Self for method chaining.
+        """
+        for col in columns:
+            # Skip missing columns - let require_columns handle that
+            if col not in self._df.columns:
+                continue
+
+            if not is_numeric_dtype(self._df[col]):
+                actual_dtype = self._df[col].dtype
+                self._errors.append(
+                    f"Column '{col}' must be numeric, got {actual_dtype}"
+                )
+
+        return self
+
+    def require_range(
+        self,
+        column: str,
+        min_val: Optional[float] = None,
+        max_val: Optional[float] = None,
+        exclusive_min: bool = False,
+        exclusive_max: bool = False,
+    ) -> "DataFrameValidator":
+        """Check that column values are within the specified range.
+
+        Args:
+            column: Column name to check.
+            min_val: Minimum allowed value (inclusive by default).
+            max_val: Maximum allowed value (inclusive by default).
+            exclusive_min: If True, minimum is exclusive (values must be > min_val).
+            exclusive_max: If True, maximum is exclusive (values must be < max_val).
+
+        Returns:
+            Self for method chaining.
+        """
+        # Skip missing columns
+        if column not in self._df.columns:
+            return self
+
+        col_data = self._df[column]
+
+        # Check minimum bound
+        if min_val is not None:
+            if exclusive_min:
+                invalid_count = (col_data <= min_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values <= {min_val}"
+                    )
+            else:
+                invalid_count = (col_data < min_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values < {min_val}"
+                    )
+
+        # Check maximum bound
+        if max_val is not None:
+            if exclusive_max:
+                invalid_count = (col_data >= max_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values >= {max_val}"
+                    )
+            else:
+                invalid_count = (col_data > max_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values > {max_val}"
+                    )
+
+        return self
+
+    def require_not_null(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that columns have no null (NaN or None) values.
+
+        Args:
+            columns: List of column names to check for nulls.
+
+        Returns:
+            Self for method chaining.
+        """
+        for col in columns:
+            # Skip missing columns
+            if col not in self._df.columns:
+                continue
+
+            null_count = self._df[col].isna().sum()
+            if null_count > 0:
+                self._errors.append(f"Column '{col}' has {null_count} null values")
+
+        return self
+
+    def validate(self) -> None:
+        """Raise ValidationError if any validation rules failed.
+
+        Raises:
+            ValidationError: If any validation errors were accumulated.
+                The error message includes all accumulated errors.
+        """
+        if self._errors:
+            error_msg = f"{self._name} validation failed:\n"
+            error_msg += "\n".join(f" - {error}" for error in self._errors)
+            raise ValidationError(error_msg)
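
The builder lets per-schema validators be expressed declaratively. A sketch of how the GWAS checks might look on top of it (the new module's import path isn't shown in this diff, so the import below is hypothetical):

```python
import pandas as pd

from pylocuszoom.validators import DataFrameValidator  # hypothetical path

df = pd.DataFrame({"chr": [1, 1], "pos": [100, 200], "p": [0.01, 0.5]})

(
    DataFrameValidator(df, name="gwas_df")
    .require_columns(["chr", "pos", "p"])
    .require_numeric(["pos", "p"])
    .require_range("pos", min_val=0, exclusive_min=True)  # positions must be > 0
    .require_range("p", min_val=0, max_val=1, exclusive_min=True)  # p in (0, 1]
    .require_not_null(["pos", "p"])
    .validate()  # raises ValidationError listing every accumulated failure
)
```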