pylocuszoom 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

pylocuszoom/loaders.py CHANGED
@@ -260,10 +260,14 @@ def load_saige(
         "POS": pos_col,
         "MarkerID": rs_col,
         "CHR": "chr",
-        "p.value": p_col,
-        "p.value.NA": p_col,  # SPA-adjusted
     }
 
+    # Prefer SPA-adjusted p-value (p.value.NA) over raw p.value when both present
+    if "p.value.NA" in df.columns:
+        col_map["p.value.NA"] = p_col
+    elif "p.value" in df.columns:
+        col_map["p.value"] = p_col
+
     df = df.rename(columns=col_map)
     logger.debug(f"Loaded SAIGE file with {len(df)} variants")
     validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
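In 0.5.0 the static mapping renamed both `p.value` and `p.value.NA` to the same target, producing duplicate column names whenever a SAIGE file contained both. A minimal sketch of the 0.6.0 selection logic on toy data (made-up values; the target name `p_value` is assumed for `p_col`):

```python
import pandas as pd

# Toy SAIGE-style output with both raw and SPA-adjusted p-values (made-up values)
df = pd.DataFrame({
    "POS": [101, 202],
    "p.value": [0.04, 0.20],
    "p.value.NA": [0.05, 0.22],  # SPA-adjusted
})

col_map = {"POS": "pos"}
# Mirrors the new logic: prefer the SPA-adjusted column when present
if "p.value.NA" in df.columns:
    col_map["p.value.NA"] = "p_value"
elif "p.value" in df.columns:
    col_map["p.value"] = "p_value"

df = df.rename(columns=col_map)
print(df.columns.tolist())  # ['pos', 'p.value', 'p_value'] - exactly one p_value column
```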
@@ -318,7 +322,7 @@ def load_gtex_eqtl(
         gene: Optional gene to filter to (ENSG ID or gene symbol).
 
     Returns:
-        DataFrame with columns: pos, p_value, gene, effect.
+        DataFrame with columns: pos, p_value, gene, effect_size.
 
     Example:
         >>> eqtl_df = load_gtex_eqtl("GTEx_Analysis.signif_pairs.txt.gz", gene="BRCA1")
@@ -351,10 +355,10 @@ def load_gtex_eqtl(
             col_map[col] = "gene"
             break
 
-    # Effect size (slope)
+    # Effect size (slope) - standardize to effect_size for plotting compatibility
    for col in ["slope", "beta", "effect_size"]:
         if col in df.columns:
-            col_map[col] = "effect"
+            col_map[col] = "effect_size"
             break
 
     df = df.rename(columns=col_map)
@@ -385,7 +389,7 @@ def load_eqtl_catalogue(
         gene: Optional gene to filter to.
 
     Returns:
-        DataFrame with columns: pos, p_value, gene, effect.
+        DataFrame with columns: pos, p_value, gene, effect_size.
     """
     df = pd.read_csv(filepath, sep="\t")
 
@@ -393,7 +397,7 @@
         "position": "pos",
         "pvalue": "p_value",
         "gene_id": "gene",
-        "beta": "effect",
+        "beta": "effect_size",  # Standardize to effect_size for plotter
         "chromosome": "chr",
     }
 
@@ -422,7 +426,7 @@ def load_matrixeqtl(
         gene: Optional gene to filter to.
 
     Returns:
-        DataFrame with columns: pos, p_value, gene, effect.
+        DataFrame with columns: pos, p_value, gene, effect_size.
 
     Note:
         MatrixEQTL output doesn't include position by default.
@@ -435,7 +439,7 @@ def load_matrixeqtl(
         "gene": "gene",
         "p-value": "p_value",
         "pvalue": "p_value",
-        "beta": "effect",
+        "beta": "effect_size",  # Standardize to effect_size for plotter
         "t-stat": "t_stat",
     }
 
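All three eQTL loaders (`load_gtex_eqtl`, `load_eqtl_catalogue`, `load_matrixeqtl`) now emit `effect_size` instead of `effect`, so downstream code reading the old `effect` column needs updating. A toy sketch of the rename for an eQTL Catalogue-style frame (made-up values):

```python
import pandas as pd

raw = pd.DataFrame({
    "position": [5_000_100],
    "pvalue": [3e-9],
    "gene_id": ["ENSG00000139618"],
    "beta": [0.42],
    "chromosome": ["13"],
})
col_map = {
    "position": "pos",
    "pvalue": "p_value",
    "gene_id": "gene",
    "beta": "effect_size",  # was "effect" in 0.5.0
    "chromosome": "chr",
}
print(raw.rename(columns=col_map).columns.tolist())
# ['pos', 'p_value', 'gene', 'effect_size', 'chr']
```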
@@ -725,14 +729,28 @@ def load_bed(
     # Assign column names if no header
     if not has_header:
         n_cols = len(df.columns)
-        col_names = ["chr", "start", "end"]
-        if n_cols >= 4:
-            col_names.append("gene_name")
-        if n_cols >= 5:
-            col_names.append("score")
-        if n_cols >= 6:
-            col_names.append("strand")
-        df.columns = col_names[:n_cols]
+        # Standard BED column names (up to BED12)
+        bed_col_names = [
+            "chr",
+            "start",
+            "end",
+            "gene_name",
+            "score",
+            "strand",
+            "thickStart",
+            "thickEnd",
+            "itemRgb",
+            "blockCount",
+            "blockSizes",
+            "blockStarts",
+        ]
+        # Use standard names for known columns, generic for extras
+        if n_cols <= len(bed_col_names):
+            df.columns = bed_col_names[:n_cols]
+        else:
+            # More columns than BED12 - use known names + generic
+            extra_cols = [f"col{i}" for i in range(len(bed_col_names), n_cols)]
+            df.columns = bed_col_names + extra_cols
 
     # Standardize column names if header was present
     col_map = {
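The new naming covers the full BED12 spec and degrades gracefully past it. A toy check of both branches (hypothetical 13-column frame, i.e. BED12 plus one extra column):

```python
import pandas as pd

bed_col_names = [
    "chr", "start", "end", "gene_name", "score", "strand",
    "thickStart", "thickEnd", "itemRgb", "blockCount",
    "blockSizes", "blockStarts",
]

df = pd.DataFrame([[None] * 13])  # 13 columns: one past BED12
n_cols = len(df.columns)
if n_cols <= len(bed_col_names):
    df.columns = bed_col_names[:n_cols]
else:
    # Columns beyond BED12 get generic names: col12, col13, ...
    extra_cols = [f"col{i}" for i in range(len(bed_col_names), n_cols)]
    df.columns = bed_col_names + extra_cols
print(df.columns.tolist()[-2:])  # ['blockStarts', 'col12']
```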
pylocuszoom/phewas.py ADDED
@@ -0,0 +1,35 @@
+"""PheWAS data validation and preparation.
+
+Validates and prepares phenome-wide association study data for plotting.
+"""
+
+import pandas as pd
+
+from .utils import ValidationError
+
+
+def validate_phewas_df(
+    df: pd.DataFrame,
+    phenotype_col: str = "phenotype",
+    p_col: str = "p_value",
+    category_col: str = "category",
+) -> None:
+    """Validate PheWAS DataFrame has required columns.
+
+    Args:
+        df: PheWAS results DataFrame.
+        phenotype_col: Column name for phenotype names.
+        p_col: Column name for p-values.
+        category_col: Column name for phenotype categories (optional).
+
+    Raises:
+        ValidationError: If required columns are missing.
+    """
+    required = [phenotype_col, p_col]
+    missing = [col for col in required if col not in df.columns]
+
+    if missing:
+        raise ValidationError(
+            f"PheWAS DataFrame missing required columns: {missing}. "
+            f"Required: {required}. Found: {list(df.columns)}"
+        )
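A quick sketch of the validator on toy frames; the import path follows the new module location, and `ValidationError` comes from `pylocuszoom.utils` per the import above:

```python
import pandas as pd
from pylocuszoom.phewas import validate_phewas_df

ok = pd.DataFrame({
    "phenotype": ["Type 2 diabetes"],
    "p_value": [1e-6],
    "category": ["metabolic"],
})
validate_phewas_df(ok)  # passes silently

bad = ok.drop(columns=["p_value"])
try:
    validate_phewas_df(bad)
except Exception as e:  # ValidationError
    print(e)  # names the missing, required, and found columns
```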
pylocuszoom/plotter.py CHANGED
@@ -31,12 +31,14 @@ from .colors import (
     get_eqtl_color,
     get_ld_bin,
     get_ld_color_palette,
+    get_phewas_category_palette,
 )
 from .eqtl import validate_eqtl_df
 from .finemapping import (
     get_credible_sets,
     prepare_finemapping_for_plotting,
 )
+from .forest import validate_forest_df
 from .gene_track import (
     assign_gene_positions,
     plot_gene_track,
@@ -45,6 +47,7 @@ from .gene_track import (
 from .labels import add_snp_labels
 from .ld import calculate_ld, find_plink
 from .logging import enable_logging, logger
+from .phewas import validate_phewas_df
 from .recombination import (
     RECOMB_COLOR,
     add_recombination_overlay,
@@ -1030,11 +1033,17 @@ class LocusZoomPlotter:
         if eqtl_gene and "gene" in eqtl_data.columns:
             eqtl_data = eqtl_data[eqtl_data["gene"] == eqtl_gene]
 
-        # Filter by region
+        # Filter by region (position and chromosome)
         if "pos" in eqtl_data.columns:
-            eqtl_data = eqtl_data[
-                (eqtl_data["pos"] >= start) & (eqtl_data["pos"] <= end)
-            ]
+            mask = (eqtl_data["pos"] >= start) & (eqtl_data["pos"] <= end)
+            # Also filter by chromosome if column exists
+            if "chr" in eqtl_data.columns:
+                chrom_str = str(chrom).replace("chr", "")
+                eqtl_chrom = (
+                    eqtl_data["chr"].astype(str).str.replace("chr", "", regex=False)
+                )
+                mask = mask & (eqtl_chrom == chrom_str)
+            eqtl_data = eqtl_data[mask]
 
         if not eqtl_data.empty:
             eqtl_data["neglog10p"] = -np.log10(
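The chromosome filter strips any `chr` prefix from both sides before comparing, so `chr13`, `13`, and an integer `chrom` argument all match. A toy sketch of the mask construction (made-up rows):

```python
import pandas as pd

eqtl_data = pd.DataFrame({"pos": [100, 200, 300], "chr": ["chr13", "13", "chr1"]})
chrom, start, end = 13, 50, 250

mask = (eqtl_data["pos"] >= start) & (eqtl_data["pos"] <= end)
# Normalize "chr13" and "13" to the same token before comparing
chrom_str = str(chrom).replace("chr", "")
eqtl_chrom = eqtl_data["chr"].astype(str).str.replace("chr", "", regex=False)
mask &= eqtl_chrom == chrom_str
print(eqtl_data[mask])  # keeps the two chromosome-13 rows inside [50, 250]
```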
@@ -1155,3 +1164,248 @@
         self._backend.finalize_layout(fig, hspace=0.1)
 
         return fig
+
+    def plot_phewas(
+        self,
+        phewas_df: pd.DataFrame,
+        variant_id: str,
+        phenotype_col: str = "phenotype",
+        p_col: str = "p_value",
+        category_col: str = "category",
+        effect_col: Optional[str] = None,
+        significance_threshold: float = 5e-8,
+        figsize: Tuple[float, float] = (10, 8),
+    ) -> Any:
+        """Create a PheWAS (Phenome-Wide Association Study) plot.
+
+        Shows associations of a single variant across multiple phenotypes,
+        with phenotypes grouped by category and colored accordingly.
+
+        Args:
+            phewas_df: DataFrame with phenotype associations.
+            variant_id: Variant identifier (e.g., "rs12345") for plot title.
+            phenotype_col: Column name for phenotype names.
+            p_col: Column name for p-values.
+            category_col: Column name for phenotype categories.
+            effect_col: Optional column name for effect direction (beta/OR).
+            significance_threshold: P-value threshold for significance line.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_phewas(
+            ...     phewas_df,
+            ...     variant_id="rs12345",
+            ...     category_col="category",
+            ... )
+        """
+        validate_phewas_df(phewas_df, phenotype_col, p_col, category_col)
+
+        df = phewas_df.copy()
+        df["neglog10p"] = -np.log10(df[p_col].clip(lower=1e-300))
+
+        # Sort by category then by p-value for consistent ordering
+        if category_col in df.columns:
+            df = df.sort_values([category_col, p_col])
+            categories = df[category_col].unique().tolist()
+            palette = get_phewas_category_palette(categories)
+        else:
+            df = df.sort_values(p_col)
+            categories = []
+            palette = {}
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (one per phenotype)
+        df["y_pos"] = range(len(df))
+
+        # Plot points by category
+        if categories:
+            for cat in categories:
+                cat_data = df[df[category_col] == cat]
+                # Triangles encode effect direction (up/down); circles if no effect column
+                if effect_col and effect_col in cat_data.columns:
+                    for _, row in cat_data.iterrows():
+                        marker = "^" if row[effect_col] >= 0 else "v"
+                        self._backend.scatter(
+                            ax,
+                            pd.Series([row["neglog10p"]]),
+                            pd.Series([row["y_pos"]]),
+                            colors=palette[cat],
+                            sizes=60,
+                            marker=marker,
+                            edgecolor="black",
+                            linewidth=0.5,
+                            zorder=2,
+                        )
+                else:
+                    self._backend.scatter(
+                        ax,
+                        cat_data["neglog10p"],
+                        cat_data["y_pos"],
+                        colors=palette[cat],
+                        sizes=60,
+                        marker="o",
+                        edgecolor="black",
+                        linewidth=0.5,
+                        zorder=2,
+                    )
+        else:
+            self._backend.scatter(
+                ax,
+                df["neglog10p"],
+                df["y_pos"],
+                colors="#4169E1",
+                sizes=60,
+                edgecolor="black",
+                linewidth=0.5,
+                zorder=2,
+            )
+
+        # Add significance threshold line
+        sig_line = -np.log10(significance_threshold)
+        self._backend.axvline(
+            ax, x=sig_line, color="red", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, r"$-\log_{10}$ P")
+        self._backend.set_ylabel(ax, "Phenotype")
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Set y-tick labels to phenotype names (matplotlib only)
+        if self.backend_name == "matplotlib":
+            ax.set_yticks(df["y_pos"])
+            ax.set_yticklabels(df[phenotype_col], fontsize=8)
+
+        self._backend.set_title(ax, f"PheWAS: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
+
+    def plot_forest(
+        self,
+        forest_df: pd.DataFrame,
+        variant_id: str,
+        study_col: str = "study",
+        effect_col: str = "effect",
+        ci_lower_col: str = "ci_lower",
+        ci_upper_col: str = "ci_upper",
+        weight_col: Optional[str] = None,
+        null_value: float = 0.0,
+        effect_label: str = "Effect Size",
+        figsize: Tuple[float, float] = (8, 6),
+    ) -> Any:
+        """Create a forest plot showing effect sizes with confidence intervals.
+
+        Args:
+            forest_df: DataFrame with effect sizes and confidence intervals.
+            variant_id: Variant identifier for plot title.
+            study_col: Column name for study/phenotype names.
+            effect_col: Column name for effect sizes.
+            ci_lower_col: Column name for lower confidence interval.
+            ci_upper_col: Column name for upper confidence interval.
+            weight_col: Optional column for study weights (affects marker size).
+            null_value: Reference value for null effect (0 for beta, 1 for OR).
+            effect_label: X-axis label.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_forest(
+            ...     forest_df,
+            ...     variant_id="rs12345",
+            ...     effect_label="Odds Ratio",
+            ...     null_value=1.0,
+            ... )
+        """
+        validate_forest_df(forest_df, study_col, effect_col, ci_lower_col, ci_upper_col)
+
+        df = forest_df.copy()
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (reverse so first study is at top)
+        df["y_pos"] = range(len(df) - 1, -1, -1)
+
+        # Calculate marker sizes from weights
+        if weight_col and weight_col in df.columns:
+            # Scale weights to marker sizes (min 40, max 200)
+            weights = df[weight_col]
+            min_size, max_size = 40, 200
+            weight_range = weights.max() - weights.min()
+            if weight_range > 0:
+                sizes = min_size + (weights - weights.min()) / weight_range * (
+                    max_size - min_size
+                )
+            else:
+                sizes = (min_size + max_size) / 2
+        else:
+            sizes = 80
+
+        # Calculate error bar extents
+        xerr_lower = df[effect_col] - df[ci_lower_col]
+        xerr_upper = df[ci_upper_col] - df[effect_col]
+
+        # Plot error bars (confidence intervals)
+        self._backend.errorbar_h(
+            ax,
+            x=df[effect_col],
+            y=df["y_pos"],
+            xerr_lower=xerr_lower,
+            xerr_upper=xerr_upper,
+            color="black",
+            linewidth=1.5,
+            capsize=3,
+            zorder=2,
+        )
+
+        # Plot effect size markers
+        self._backend.scatter(
+            ax,
+            df[effect_col],
+            df["y_pos"],
+            colors="#4169E1",
+            sizes=sizes,
+            marker="s",  # square markers typical for forest plots
+            edgecolor="black",
+            linewidth=0.5,
+            zorder=3,
+        )
+
+        # Add null effect line
+        self._backend.axvline(
+            ax, x=null_value, color="grey", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, effect_label)
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Set y-tick labels to study names (matplotlib only)
+        if self.backend_name == "matplotlib":
+            ax.set_yticks(df["y_pos"])
+            ax.set_yticklabels(df[study_col], fontsize=10)
+
+        self._backend.set_title(ax, f"Forest Plot: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
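An end-to-end sketch of the two new entry points on toy data. The bare `LocusZoomPlotter()` construction and the top-level import are assumptions, not shown in this diff:

```python
import pandas as pd
from pylocuszoom import LocusZoomPlotter  # top-level export assumed

plotter = LocusZoomPlotter()

phewas_df = pd.DataFrame({
    "phenotype": ["Type 2 diabetes", "BMI", "Asthma"],
    "p_value": [2e-10, 4e-6, 0.3],
    "category": ["metabolic", "metabolic", "respiratory"],
    "beta": [0.12, -0.05, 0.01],
})
fig = plotter.plot_phewas(phewas_df, variant_id="rs12345", effect_col="beta")

forest_df = pd.DataFrame({
    "study": ["Cohort A", "Cohort B", "Meta-analysis"],
    "effect": [1.25, 1.10, 1.18],
    "ci_lower": [1.05, 0.95, 1.08],
    "ci_upper": [1.49, 1.27, 1.29],
})
fig = plotter.plot_forest(
    forest_df, variant_id="rs12345", effect_label="Odds Ratio", null_value=1.0
)
```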
@@ -9,12 +9,13 @@ Provides:
 import os
 import tarfile
 import tempfile
-import urllib.request
 from pathlib import Path
 from typing import Optional
 
 import pandas as pd
+import requests
 from matplotlib.axes import Axes
+from tqdm import tqdm
 
 from .logging import logger
 
@@ -54,6 +55,38 @@ def get_chain_file_path() -> Path:
     return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"
 
 
+def _download_with_progress(
+    url: str, dest_path: Path, desc: str = "Downloading"
+) -> None:
+    """Download a file with a progress bar.
+
+    Args:
+        url: URL to download from.
+        dest_path: Destination file path.
+        desc: Description for the progress bar.
+    """
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+
+    total_size = int(response.headers.get("content-length", 0))
+
+    with (
+        open(dest_path, "wb") as f,
+        tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            desc=desc,
+            disable=total_size == 0,  # Disable if size unknown
+        ) as pbar,
+    ):
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+                pbar.update(len(chunk))
+
+
 def download_liftover_chain(force: bool = False) -> Path:
     """Download the CanFam3 to CanFam4 liftover chain file.
 
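The helper is the standard requests streaming pattern wrapped in a tqdm bar; note that `requests` and `tqdm` are now imported unconditionally at module level, so the 0.5.0 urllib fallback and its "pip install requests" error path are gone. A self-contained sketch against a hypothetical URL (not one of the package's real data sources):

```python
import requests
from pathlib import Path
from tqdm import tqdm

url = "https://example.org/data.tar.gz"  # hypothetical URL
dest_path = Path("/tmp/data.tar.gz")

response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
total_size = int(response.headers.get("content-length", 0))  # 0 if unknown

with open(dest_path, "wb") as f, tqdm(
    total=total_size, unit="B", unit_scale=True, unit_divisor=1024,
    desc="data.tar.gz", disable=total_size == 0,  # no bar if size unknown
) as pbar:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:  # skip keep-alive chunks
            f.write(chunk)
            pbar.update(len(chunk))
```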
@@ -73,20 +106,11 @@ def download_liftover_chain(force: bool = False) -> Path:
     logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
     logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")
 
-    try:
-        urllib.request.urlretrieve(CANFAM3_TO_CANFAM4_CHAIN_URL, chain_path)
-    except Exception as e:
-        logger.debug(f"urllib download failed: {e}")
-        try:
-            import requests
-
-            response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
-            response.raise_for_status()
-            chain_path.write_bytes(response.content)
-        except ImportError:
-            raise RuntimeError(
-                "Failed to download. Install requests: pip install requests"
-            )
+    _download_with_progress(
+        CANFAM3_TO_CANFAM4_CHAIN_URL,
+        chain_path,
+        desc="Liftover chain",
+    )
 
     logger.info(f"Chain file saved to: {chain_path}")
     return chain_path
@@ -217,24 +241,14 @@ def download_canine_recombination_maps(
     logger.debug(f"Source: {CANINE_RECOMB_URL}")
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        # Download tar.gz file
+        # Download tar.gz file with progress bar
         tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"
 
-        try:
-            urllib.request.urlretrieve(CANINE_RECOMB_URL, tar_path)
-        except Exception as e:
-            logger.debug(f"urllib download failed: {e}")
-            logger.debug("Trying alternative method with requests...")
-            try:
-                import requests
-
-                response = requests.get(CANINE_RECOMB_URL, timeout=60)
-                response.raise_for_status()
-                tar_path.write_bytes(response.content)
-            except ImportError:
-                raise RuntimeError(
-                    "Failed to download. Install requests: pip install requests"
-                )
+        _download_with_progress(
+            CANINE_RECOMB_URL,
+            tar_path,
+            desc="Recombination maps",
+        )
 
         logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")
 
pylocuszoom/schemas.py CHANGED
@@ -84,30 +84,36 @@ def validate_gwas_dataframe(
             "GWAS validation failed:\n - " + "\n - ".join(errors)
         )
 
-    # Check data types
-    if not pd.api.types.is_numeric_dtype(df[pos_col]):
+    # Check data types (must be numeric for range checks)
+    pos_is_numeric = pd.api.types.is_numeric_dtype(df[pos_col])
+    p_is_numeric = pd.api.types.is_numeric_dtype(df[p_col])
+
+    if not pos_is_numeric:
         errors.append(f"Column '{pos_col}' must be numeric, got {df[pos_col].dtype}")
 
-    if not pd.api.types.is_numeric_dtype(df[p_col]):
+    if not p_is_numeric:
         errors.append(f"Column '{p_col}' must be numeric, got {df[p_col].dtype}")
 
-    # Check value ranges
-    if (df[pos_col] <= 0).any():
-        n_invalid = (df[pos_col] <= 0).sum()
-        errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
+    # Only check value ranges if columns are numeric (avoid confusing errors)
+    if pos_is_numeric:
+        if (df[pos_col] <= 0).any():
+            n_invalid = (df[pos_col] <= 0).sum()
+            errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
 
-    if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
-        n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
-        errors.append(f"Column '{p_col}' has {n_invalid} values outside range (0, 1]")
+        if df[pos_col].isna().any():
+            n_na = df[pos_col].isna().sum()
+            errors.append(f"Column '{pos_col}' has {n_na} missing values")
 
-    # Check for NaN in required columns
-    if df[pos_col].isna().any():
-        n_na = df[pos_col].isna().sum()
-        errors.append(f"Column '{pos_col}' has {n_na} missing values")
+    if p_is_numeric:
+        if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
+            n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
+            errors.append(
+                f"Column '{p_col}' has {n_invalid} values outside range (0, 1]"
+            )
 
-    if df[p_col].isna().any():
-        n_na = df[p_col].isna().sum()
-        errors.append(f"Column '{p_col}' has {n_na} missing values")
+        if df[p_col].isna().any():
+            n_na = df[p_col].isna().sum()
+            errors.append(f"Column '{p_col}' has {n_na} missing values")
 
     if errors:
         raise LoaderValidationError(
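The practical effect of the gating: a non-numeric column now produces a single, accurate dtype error instead of a TypeError (or misleading range errors) from comparing strings with numbers. A toy sketch, assuming `validate_gwas_dataframe` is importable from `pylocuszoom.schemas` and takes the keyword arguments used in `load_saige` above:

```python
import pandas as pd
from pylocuszoom.schemas import validate_gwas_dataframe  # import path assumed

df = pd.DataFrame({
    "pos": ["1000000", "2000000"],  # strings, not integers
    "p_value": [0.5, 0.9],
    "rsid": ["rs1", "rs2"],
})
try:
    validate_gwas_dataframe(df, pos_col="pos", p_col="p_value", rs_col="rsid")
except Exception as e:
    print(e)  # only: "Column 'pos' must be numeric, got object"
```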
@@ -344,20 +350,25 @@
         )
 
     # Check data types
-    if not pd.api.types.is_numeric_dtype(df["start"]):
+    start_is_numeric = pd.api.types.is_numeric_dtype(df["start"])
+    end_is_numeric = pd.api.types.is_numeric_dtype(df["end"])
+
+    if not start_is_numeric:
         errors.append(f"Column 'start' must be numeric, got {df['start'].dtype}")
 
-    if not pd.api.types.is_numeric_dtype(df["end"]):
+    if not end_is_numeric:
         errors.append(f"Column 'end' must be numeric, got {df['end'].dtype}")
 
-    # Check ranges
-    if (df["start"] < 0).any():
-        n_invalid = (df["start"] < 0).sum()
-        errors.append(f"Column 'start' has {n_invalid} negative values")
+    # Only check ranges if columns are numeric (avoid confusing errors)
+    if start_is_numeric:
+        if (df["start"] < 0).any():
+            n_invalid = (df["start"] < 0).sum()
+            errors.append(f"Column 'start' has {n_invalid} negative values")
 
-    if (df["end"] < df["start"]).any():
-        n_invalid = (df["end"] < df["start"]).sum()
-        errors.append(f"Found {n_invalid} genes where end < start")
+    if start_is_numeric and end_is_numeric:
+        if (df["end"] < df["start"]).any():
+            n_invalid = (df["end"] < df["start"]).sum()
+            errors.append(f"Found {n_invalid} genes where end < start")
 
     if errors:
         raise LoaderValidationError(