pylocuszoom 0.8.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/qq.py ADDED
@@ -0,0 +1,123 @@
+"""QQ plot data preparation and statistics."""
+
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+
+def calculate_lambda_gc(p_values: np.ndarray) -> float:
+    """Calculate genomic inflation factor (lambda GC).
+
+    Lambda is the ratio of the median observed chi-squared statistic
+    to the expected median under the null hypothesis.
+
+    Args:
+        p_values: Array of p-values.
+
+    Returns:
+        Genomic inflation factor (lambda). Returns NaN if no valid p-values.
+    """
+    # Remove NaN and zero/negative values
+    p_clean = p_values[~np.isnan(p_values) & (p_values > 0)]
+    if len(p_clean) == 0:
+        return np.nan
+
+    # Convert to chi-squared statistics (1 df)
+    chi2 = stats.chi2.ppf(1 - p_clean, df=1)
+
+    # Expected median for chi-squared with 1 df
+    expected_median = stats.chi2.ppf(0.5, df=1)
+
+    # Lambda = observed median / expected median
+    return np.median(chi2) / expected_median
+
+
+def calculate_confidence_band(
+    n_points: int, confidence: float = 0.95
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Calculate confidence band for QQ plot.
+
+    Uses order statistics to compute expected distribution of p-values
+    under the null hypothesis.
+
+    Args:
+        n_points: Number of p-values.
+        confidence: Confidence level (default 0.95 for 95% CI).
+
+    Returns:
+        Tuple of (expected, lower_bound, upper_bound) arrays in -log10 scale.
+    """
+    # Expected quantiles
+    expected = -np.log10(np.arange(1, n_points + 1) / (n_points + 1))
+
+    # Confidence interval using beta distribution
+    alpha = 1 - confidence
+    ranks = np.arange(1, n_points + 1)
+    n_minus_rank = n_points - ranks + 1
+
+    lower_p = stats.beta.ppf(alpha / 2, ranks, n_minus_rank)
+    upper_p = stats.beta.ppf(1 - alpha / 2, ranks, n_minus_rank)
+
+    # Convert to -log10 scale (swap because -log10 reverses order)
+    lower_bound = -np.log10(upper_p)
+    upper_bound = -np.log10(lower_p)
+
+    return expected, lower_bound, upper_bound
+
+
+def prepare_qq_data(
+    df: pd.DataFrame,
+    p_col: str = "p",
+) -> pd.DataFrame:
+    """Prepare DataFrame for QQ plot rendering.
+
+    Args:
+        df: DataFrame with p-values.
+        p_col: Column name for p-value.
+
+    Returns:
+        DataFrame with columns for QQ plotting:
+        - _expected: Expected -log10(p) under null
+        - _observed: Observed -log10(p)
+        - _ci_lower: Lower confidence bound
+        - _ci_upper: Upper confidence bound
+
+        Attributes stored in DataFrame.attrs:
+        - lambda_gc: Genomic inflation factor
+        - n_variants: Number of valid p-values
+    """
+    if p_col not in df.columns:
+        raise ValueError(f"Column '{p_col}' not found in DataFrame")
+
+    # Get p-values and filter invalid
+    p_values = df[p_col].values
+    valid_mask = ~np.isnan(p_values) & (p_values > 0) & (p_values <= 1)
+    p_valid = p_values[valid_mask]
+
+    if len(p_valid) == 0:
+        raise ValueError("No valid p-values found (must be > 0 and <= 1)")
+
+    # Sort p-values ascending (smallest p first -> largest -log10(p) first)
+    p_sorted = np.sort(p_valid)
+
+    # Calculate observed -log10(p)
+    observed = -np.log10(p_sorted)
+
+    # Calculate expected and confidence bands
+    expected, ci_lower, ci_upper = calculate_confidence_band(len(p_sorted))
+
+    # Create result DataFrame
+    result = pd.DataFrame(
+        {
+            "_expected": expected,
+            "_observed": observed,
+            "_ci_lower": ci_lower,
+            "_ci_upper": ci_upper,
+        }
+    )
+
+    # Store statistics in attrs
+    result.attrs["lambda_gc"] = calculate_lambda_gc(p_valid)
+    result.attrs["n_variants"] = len(p_valid)
+
+    return result
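The new `pylocuszoom/qq.py` module computes the genomic inflation factor lambda_GC = median(observed chi-squared) / median(chi-squared with 1 df), where the expected median is roughly 0.4549, and derives the confidence band from order statistics: under the null, the i-th smallest of n uniform p-values follows Beta(i, n - i + 1). A minimal usage sketch; the simulated data and printed values are illustrative, not taken from the package's docs:

```python
import numpy as np
import pandas as pd

from pylocuszoom.qq import calculate_lambda_gc, prepare_qq_data

# Simulated null GWAS: uniform p-values, so lambda_GC should be close to 1.0
rng = np.random.default_rng(42)
gwas = pd.DataFrame({"p": rng.uniform(size=10_000)})

qq = prepare_qq_data(gwas, p_col="p")
print(qq.columns.tolist())    # ['_expected', '_observed', '_ci_lower', '_ci_upper']
print(qq.attrs["lambda_gc"])  # ~1.0 for well-calibrated test statistics
print(qq.attrs["n_variants"])  # 10000
```

Values of lambda_GC noticeably above 1 (conventionally > ~1.05) suggest inflation from confounding such as population stratification.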
@@ -432,8 +432,8 @@ def add_recombination_overlay(
         region_recomb["pos"],
         region_recomb["rate"],
         color=RECOMB_COLOR,
-        linewidth=1.5,
-        alpha=0.7,
+        linewidth=2.5,
+        alpha=0.8,
         zorder=0,  # Behind scatter points
     )

@@ -447,14 +447,14 @@ def add_recombination_overlay(
         zorder=0,
     )

-    # Format secondary axis
-    recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color=RECOMB_COLOR, fontsize=9)
-    recomb_ax.tick_params(axis="y", labelcolor=RECOMB_COLOR, labelsize=8)
+    # Format secondary axis - use black for label text (more readable)
+    recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color="black", fontsize=9)
+    recomb_ax.tick_params(axis="y", labelcolor="black", labelsize=8)
     recomb_ax.set_ylim(bottom=0)

-    # Don't let recomb rate overwhelm the plot
+    # Scale to fit data with headroom
     max_rate = region_recomb["rate"].max()
-    recomb_ax.set_ylim(0, max(max_rate * 1.2, 20))
+    recomb_ax.set_ylim(0, max(max_rate * 1.3, 10))

     # Remove top spine for cleaner look
     recomb_ax.spines["top"].set_visible(False)
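These hunks make the recombination overlay line heavier (linewidth 2.5, alpha 0.8), switch the secondary-axis label and tick color from RECOMB_COLOR to black, and change the y-limit from `max(max_rate * 1.2, 20)` to `max(max_rate * 1.3, 10)`: more headroom above the data, and a lower floor so regions with little recombination are not flattened against a fixed 20 cM/Mb scale. A standalone sketch of the twin-axis pattern the function appears to use (matplotlib backend; the positions, rates, and RECOMB_COLOR value here are made up):

```python
import matplotlib.pyplot as plt
import numpy as np

RECOMB_COLOR = "#5CB8E6"  # assumed; the actual constant is not shown in this diff

fig, ax = plt.subplots()
pos = np.linspace(55_000_000, 56_000_000, 200)
rate = np.abs(np.sin(pos / 2e5)) * 40  # hypothetical recombination rates (cM/Mb)

recomb_ax = ax.twinx()  # secondary y-axis sharing the genomic x-axis
recomb_ax.plot(pos, rate, color=RECOMB_COLOR, linewidth=2.5, alpha=0.8, zorder=0)
recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color="black", fontsize=9)
recomb_ax.tick_params(axis="y", labelcolor="black", labelsize=8)

# New scaling: 30% headroom, with a 10 cM/Mb floor so flat regions stay readable
max_rate = rate.max()
recomb_ax.set_ylim(0, max(max_rate * 1.3, 10))
recomb_ax.spines["top"].set_visible(False)
```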
pylocuszoom/schemas.py CHANGED
@@ -10,12 +10,7 @@ from typing import Optional, Union
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, field_validator, model_validator

-
-class LoaderValidationError(Exception):
-    """Raised when loaded data fails validation."""
-
-    pass
-
+from .exceptions import LoaderValidationError

 # =============================================================================
 # GWAS Validation
@@ -0,0 +1,319 @@
+"""Statistical visualization plotter for PheWAS and forest plots.
+
+Provides variant-centric visualizations:
+- PheWAS plots showing associations across phenotypes
+- Forest plots showing effect sizes with confidence intervals
+"""
+
+from typing import Any, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+from ._plotter_utils import DEFAULT_GENOMEWIDE_THRESHOLD, transform_pvalues
+from .backends import BackendType, get_backend
+from .colors import get_phewas_category_palette
+from .forest import validate_forest_df
+from .phewas import validate_phewas_df
+
+
+class StatsPlotter:
+    """Statistical visualization plotter for PheWAS and forest plots.
+
+    Creates variant-centric visualizations for phenome-wide associations
+    and meta-analysis forest plots.
+
+    Args:
+        backend: Plotting backend ('matplotlib', 'plotly', or 'bokeh').
+        genomewide_threshold: P-value threshold for significance line.
+
+    Example:
+        >>> plotter = StatsPlotter()
+        >>> fig = plotter.plot_phewas(phewas_df, variant_id="rs12345")
+        >>> fig.savefig("phewas.png", dpi=150)
+    """
+
+    def __init__(
+        self,
+        backend: BackendType = "matplotlib",
+        genomewide_threshold: float = DEFAULT_GENOMEWIDE_THRESHOLD,
+    ):
+        """Initialize the stats plotter."""
+        self._backend = get_backend(backend)
+        self.genomewide_threshold = genomewide_threshold
+
+    def plot_phewas(
+        self,
+        phewas_df: pd.DataFrame,
+        variant_id: str,
+        phenotype_col: str = "phenotype",
+        p_col: str = "p_value",
+        category_col: str = "category",
+        effect_col: Optional[str] = None,
+        significance_threshold: float = DEFAULT_GENOMEWIDE_THRESHOLD,
+        figsize: Tuple[float, float] = (10, 8),
+    ) -> Any:
+        """Create a PheWAS (Phenome-Wide Association Study) plot.
+
+        Shows associations of a single variant across multiple phenotypes,
+        with phenotypes grouped by category and colored accordingly.
+
+        Args:
+            phewas_df: DataFrame with phenotype associations.
+            variant_id: Variant identifier (e.g., "rs12345") for plot title.
+            phenotype_col: Column name for phenotype names.
+            p_col: Column name for p-values.
+            category_col: Column name for phenotype categories.
+            effect_col: Optional column name for effect direction (beta/OR).
+            significance_threshold: P-value threshold for significance line.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_phewas(
+            ...     phewas_df,
+            ...     variant_id="rs12345",
+            ...     category_col="category",
+            ... )
+        """
+        validate_phewas_df(phewas_df, phenotype_col, p_col, category_col)
+
+        df = phewas_df.copy()
+        df = transform_pvalues(df, p_col)
+
+        # Sort by category then by p-value for consistent ordering
+        if category_col in df.columns:
+            df = df.sort_values([category_col, p_col])
+            categories = df[category_col].unique().tolist()
+            palette = get_phewas_category_palette(categories)
+        else:
+            df = df.sort_values(p_col)
+            categories = []
+            palette = {}
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (one per phenotype)
+        df["y_pos"] = range(len(df))
+
+        # Plot points by category
+        if categories:
+            for cat in categories:
+                # Handle NaN category: NaN == NaN is False in pandas
+                if pd.isna(cat):
+                    cat_data = df[df[category_col].isna()]
+                else:
+                    cat_data = df[df[category_col] == cat]
+                # Triangles encode effect direction when available, circles otherwise
+                if effect_col and effect_col in cat_data.columns:
+                    # Vectorized: split by effect sign, 2 scatter calls per category
+                    pos_data = cat_data[cat_data[effect_col] >= 0]
+                    neg_data = cat_data[cat_data[effect_col] < 0]
+
+                    if not pos_data.empty:
+                        self._backend.scatter(
+                            ax,
+                            pos_data["neglog10p"],
+                            pos_data["y_pos"],
+                            colors=palette[cat],
+                            sizes=60,
+                            marker="^",
+                            edgecolor="black",
+                            linewidth=0.5,
+                            zorder=2,
+                        )
+                    if not neg_data.empty:
+                        self._backend.scatter(
+                            ax,
+                            neg_data["neglog10p"],
+                            neg_data["y_pos"],
+                            colors=palette[cat],
+                            sizes=60,
+                            marker="v",
+                            edgecolor="black",
+                            linewidth=0.5,
+                            zorder=2,
+                        )
+                else:
+                    self._backend.scatter(
+                        ax,
+                        cat_data["neglog10p"],
+                        cat_data["y_pos"],
+                        colors=palette[cat],
+                        sizes=60,
+                        marker="o",
+                        edgecolor="black",
+                        linewidth=0.5,
+                        zorder=2,
+                    )
+        else:
+            self._backend.scatter(
+                ax,
+                df["neglog10p"],
+                df["y_pos"],
+                colors="#4169E1",
+                sizes=60,
+                edgecolor="black",
+                linewidth=0.5,
+                zorder=2,
+            )
+
+        # Add significance threshold line
+        sig_line = -np.log10(significance_threshold)
+        self._backend.axvline(
+            ax, x=sig_line, color="red", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, r"$-\log_{10}$ P")
+        self._backend.set_ylabel(ax, "Phenotype")
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Set y-tick labels to phenotype names
+        self._backend.set_yticks(
+            ax,
+            positions=df["y_pos"].tolist(),
+            labels=df[phenotype_col].tolist(),
+            fontsize=8,
+        )
+
+        self._backend.set_title(ax, f"PheWAS: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
+
+    def plot_forest(
+        self,
+        forest_df: pd.DataFrame,
+        variant_id: str,
+        study_col: str = "study",
+        effect_col: str = "effect",
+        ci_lower_col: str = "ci_lower",
+        ci_upper_col: str = "ci_upper",
+        weight_col: Optional[str] = None,
+        null_value: float = 0.0,
+        effect_label: str = "Effect Size",
+        figsize: Tuple[float, float] = (8, 6),
+    ) -> Any:
+        """Create a forest plot showing effect sizes with confidence intervals.
+
+        Args:
+            forest_df: DataFrame with effect sizes and confidence intervals.
+            variant_id: Variant identifier for plot title.
+            study_col: Column name for study/phenotype names.
+            effect_col: Column name for effect sizes.
+            ci_lower_col: Column name for lower confidence interval.
+            ci_upper_col: Column name for upper confidence interval.
+            weight_col: Optional column for study weights (affects marker size).
+            null_value: Reference value for null effect (0 for beta, 1 for OR).
+            effect_label: X-axis label.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_forest(
+            ...     forest_df,
+            ...     variant_id="rs12345",
+            ...     effect_label="Odds Ratio",
+            ...     null_value=1.0,
+            ... )
+        """
+        validate_forest_df(forest_df, study_col, effect_col, ci_lower_col, ci_upper_col)
+
+        df = forest_df.copy()
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (reverse so first study is at top)
+        df["y_pos"] = range(len(df) - 1, -1, -1)
+
+        # Calculate marker sizes from weights
+        if weight_col and weight_col in df.columns:
+            # Scale weights to marker sizes (min 40, max 200)
+            weights = df[weight_col]
+            min_size, max_size = 40, 200
+            weight_range = weights.max() - weights.min()
+            if weight_range > 0:
+                sizes = min_size + (weights - weights.min()) / weight_range * (
+                    max_size - min_size
+                )
+            else:
+                sizes = (min_size + max_size) / 2
+        else:
+            sizes = 80
+
+        # Calculate error bar extents
+        xerr_lower = df[effect_col] - df[ci_lower_col]
+        xerr_upper = df[ci_upper_col] - df[effect_col]
+
+        # Plot error bars (confidence intervals)
+        self._backend.errorbar_h(
+            ax,
+            x=df[effect_col],
+            y=df["y_pos"],
+            xerr_lower=xerr_lower,
+            xerr_upper=xerr_upper,
+            color="black",
+            linewidth=1.5,
+            capsize=3,
+            zorder=2,
+        )
+
+        # Plot effect size markers
+        self._backend.scatter(
+            ax,
+            df[effect_col],
+            df["y_pos"],
+            colors="#4169E1",
+            sizes=sizes,
+            marker="s",  # square markers typical for forest plots
+            edgecolor="black",
+            linewidth=0.5,
+            zorder=3,
+        )
+
+        # Add null effect line
+        self._backend.axvline(
+            ax, x=null_value, color="grey", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, effect_label)
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Ensure x-axis includes the null value with some padding
+        x_min = min(df[ci_lower_col].min(), null_value)
+        x_max = max(df[ci_upper_col].max(), null_value)
+        x_padding = (x_max - x_min) * 0.1
+        self._backend.set_xlim(ax, x_min - x_padding, x_max + x_padding)
+
+        # Set y-tick labels to study names
+        self._backend.set_yticks(
+            ax,
+            positions=df["y_pos"].tolist(),
+            labels=df[study_col].tolist(),
+            fontsize=10,
+        )
+
+        self._backend.set_title(ax, f"Forest Plot: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
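A usage sketch for the new plotter. The column names match the documented defaults; the DataFrames and the import path are illustrative assumptions, since this diff shows neither the new file's path nor what the package exports:

```python
import pandas as pd

# Import path is an assumption; the diff does not show where StatsPlotter lives.
from pylocuszoom import StatsPlotter

phewas_df = pd.DataFrame({
    "phenotype": ["Height", "BMI", "LDL", "HDL"],
    "p_value": [1e-12, 3e-4, 2e-9, 0.2],
    "category": ["anthropometric", "anthropometric", "lipids", "lipids"],
    "beta": [0.1, -0.05, 0.2, -0.01],  # sign selects ^ vs v markers
})

forest_df = pd.DataFrame({
    "study": ["Cohort A", "Cohort B", "Meta"],
    "effect": [1.25, 1.10, 1.18],
    "ci_lower": [1.05, 0.95, 1.08],
    "ci_upper": [1.48, 1.27, 1.29],
    "weight": [40.0, 35.0, 100.0],  # scales marker size between 40 and 200
})

plotter = StatsPlotter(backend="matplotlib")
fig1 = plotter.plot_phewas(phewas_df, variant_id="rs12345", effect_col="beta")
fig2 = plotter.plot_forest(
    forest_df,
    variant_id="rs12345",
    effect_label="Odds Ratio",
    null_value=1.0,  # OR scale, so the null line sits at 1
    weight_col="weight",
)
fig1.savefig("phewas.png", dpi=150)
fig2.savefig("forest.png", dpi=150)
```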
pylocuszoom/utils.py CHANGED
@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import pandas as pd
 
+from .exceptions import ValidationError
+
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame as SparkDataFrame
 
@@ -15,10 +17,6 @@ if TYPE_CHECKING:
 DataFrameLike = Union[pd.DataFrame, "SparkDataFrame", Any]
 
 
-class ValidationError(ValueError):
-    """Raised when input validation fails."""
-
-
 def is_spark_dataframe(df: Any) -> bool:
     """Check if object is a PySpark DataFrame.
 
pylocuszoom/validation.py CHANGED
@@ -159,6 +159,57 @@ class DataFrameValidator:
 
         return self
 
+    def require_ci_ordering(
+        self,
+        ci_lower_col: str,
+        effect_col: str,
+        ci_upper_col: str,
+    ) -> "DataFrameValidator":
+        """Check that confidence intervals are properly ordered.
+
+        Validates that ci_lower <= effect <= ci_upper for all rows.
+        Invalid ordering would produce negative error bar lengths.
+
+        Args:
+            ci_lower_col: Column name for lower CI bound.
+            effect_col: Column name for effect size (point estimate).
+            ci_upper_col: Column name for upper CI bound.
+
+        Returns:
+            Self for method chaining.
+        """
+        # Skip if any column is missing
+        for col in [ci_lower_col, effect_col, ci_upper_col]:
+            if col not in self._df.columns:
+                return self
+
+        lower = self._df[ci_lower_col]
+        effect = self._df[effect_col]
+        upper = self._df[ci_upper_col]
+
+        # Check ci_lower <= effect
+        lower_gt_effect = (lower > effect).sum()
+        if lower_gt_effect > 0:
+            self._errors.append(
+                f"{lower_gt_effect} rows have {ci_lower_col} > {effect_col}"
+            )
+
+        # Check effect <= ci_upper
+        effect_gt_upper = (effect > upper).sum()
+        if effect_gt_upper > 0:
+            self._errors.append(
+                f"{effect_gt_upper} rows have {effect_col} > {ci_upper_col}"
+            )
+
+        # Check ci_lower <= ci_upper (implicit from above, but explicit is clearer)
+        lower_gt_upper = (lower > upper).sum()
+        if lower_gt_upper > 0:
+            self._errors.append(
+                f"{lower_gt_upper} rows have {ci_lower_col} > {ci_upper_col}"
+            )
+
+        return self
+
     def validate(self) -> None:
         """Raise ValidationError if any validation rules failed.
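A sketch of how `require_ci_ordering` slots into the fluent validator API. The `DataFrameValidator(df)` constructor is assumed from the method's use of `self._df` and `self._errors`; only `require_ci_ordering` and the start of `validate()` appear in this diff:

```python
import pandas as pd

# Constructor signature is an assumption based on self._df / self._errors usage.
from pylocuszoom.validation import DataFrameValidator

df = pd.DataFrame({
    "study": ["A", "B"],
    "effect": [0.5, -0.2],
    "ci_lower": [0.3, -0.4],
    "ci_upper": [0.2, 0.1],  # row 0 is mis-ordered: ci_upper < effect
})

validator = DataFrameValidator(df).require_ci_ordering(
    "ci_lower", "effect", "ci_upper"
)
validator.validate()  # raises ValidationError listing the mis-ordered rows
```

Accumulating row counts into `self._errors` and deferring the raise to `validate()` lets one exception report every CI-ordering problem at once instead of failing on the first.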