PyPI - pylocuszoom - Versions diffs - 0.8.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

pylocuszoom 0.8.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

pylocuszoom/__init__.py +27 -7
pylocuszoom/_plotter_utils.py +66 -0
pylocuszoom/backends/base.py +56 -0
pylocuszoom/backends/bokeh_backend.py +141 -29
pylocuszoom/backends/matplotlib_backend.py +60 -0
pylocuszoom/backends/plotly_backend.py +297 -88
pylocuszoom/config.py +365 -0
pylocuszoom/ensembl.py +6 -11
pylocuszoom/eqtl.py +3 -7
pylocuszoom/exceptions.py +33 -0
pylocuszoom/finemapping.py +2 -7
pylocuszoom/forest.py +1 -0
pylocuszoom/gene_track.py +10 -31
pylocuszoom/labels.py +6 -2
pylocuszoom/manhattan.py +246 -0
pylocuszoom/manhattan_plotter.py +760 -0
pylocuszoom/plotter.py +401 -327
pylocuszoom/qq.py +123 -0
pylocuszoom/recombination.py +7 -7
pylocuszoom/schemas.py +1 -6
pylocuszoom/stats_plotter.py +319 -0
pylocuszoom/utils.py +2 -4
pylocuszoom/validation.py +51 -0
{pylocuszoom-0.8.0.dist-info → pylocuszoom-1.1.0.dist-info}/METADATA +159 -25
pylocuszoom-1.1.0.dist-info/RECORD +36 -0
pylocuszoom-0.8.0.dist-info/RECORD +0 -29
{pylocuszoom-0.8.0.dist-info → pylocuszoom-1.1.0.dist-info}/WHEEL +0 -0
{pylocuszoom-0.8.0.dist-info → pylocuszoom-1.1.0.dist-info}/licenses/LICENSE.md +0 -0

pylocuszoom/config.py ADDED Viewed

@@ -0,0 +1,365 @@
+"""Pydantic configuration classes for pyLocusZoom plot methods.
+This module provides typed, validated configuration objects that replace
+the parameter explosion in plot methods. Each config is immutable (frozen)
+to prevent accidental modification.
+Example:
+    >>> from pylocuszoom.config import RegionConfig, DisplayConfig, PlotConfig
+    >>> region = RegionConfig(chrom=1, start=1000000, end=2000000)
+    >>> display = DisplayConfig(snp_labels=False, label_top_n=3)
+    >>>
+    >>> # Using composite PlotConfig with factory method
+    >>> config = PlotConfig.from_kwargs(chrom=1, start=1000000, end=2000000)
+"""
+from typing import List, Optional, Tuple
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+class RegionConfig(BaseModel):
+    """Genomic region specification.
+    Attributes:
+        chrom: Chromosome number (must be >= 1).
+        start: Start position in base pairs (must be >= 0).
+        end: End position in base pairs (must be > start).
+    """
+    model_config = ConfigDict(frozen=True)
+    chrom: int = Field(..., ge=1, description="Chromosome number")
+    start: int = Field(..., ge=0, description="Start position (bp)")
+    end: int = Field(..., gt=0, description="End position (bp)")
+    @model_validator(mode="after")
+    def validate_region(self) -> "RegionConfig":
+        """Validate that start < end."""
+        if self.start >= self.end:
+            raise ValueError(f"start ({self.start}) must be < end ({self.end})")
+        return self
+class ColumnConfig(BaseModel):
+    """DataFrame column name mappings for GWAS data.
+    Attributes:
+        pos_col: Column name for genomic position.
+        p_col: Column name for p-value.
+        rs_col: Column name for SNP identifier.
+    """
+    model_config = ConfigDict(frozen=True)
+    pos_col: str = Field(default="ps", description="Position column name")
+    p_col: str = Field(default="p_wald", description="P-value column name")
+    rs_col: str = Field(default="rs", description="SNP ID column name")
+class DisplayConfig(BaseModel):
+    """Display and visual options for plots.
+    Attributes:
+        snp_labels: Whether to show SNP labels on plot.
+        label_top_n: Number of top SNPs to label.
+        show_recombination: Whether to show recombination rate overlay.
+        figsize: Figure size as (width, height) in inches.
+    """
+    model_config = ConfigDict(frozen=True)
+    snp_labels: bool = Field(default=True, description="Show SNP labels")
+    label_top_n: int = Field(default=5, ge=0, description="Number of top SNPs to label")
+    show_recombination: bool = Field(
+        default=True, description="Show recombination overlay"
+    )
+    figsize: Tuple[float, float] = Field(
+        default=(12.0, 8.0), description="Figure size (width, height)"
+    )
+class LDConfig(BaseModel):
+    """Linkage disequilibrium configuration.
+    Supports three modes:
+    1. No LD coloring: All fields None (default)
+    2. Pre-computed LD: Provide ld_col for column with R^2 values
+    3. Calculate LD: Provide lead_pos and ld_reference_file
+    Attributes:
+        lead_pos: Position of lead/index SNP to highlight.
+        ld_reference_file: Path to PLINK binary fileset for LD calculation.
+        ld_col: Column name for pre-computed LD (R^2) values.
+    """
+    model_config = ConfigDict(frozen=True)
+    lead_pos: Optional[int] = Field(default=None, ge=1, description="Lead SNP position")
+    ld_reference_file: Optional[str] = Field(
+        default=None, description="PLINK binary fileset path"
+    )
+    ld_col: Optional[str] = Field(
+        default=None, description="Pre-computed LD column name"
+    )
+    @model_validator(mode="after")
+    def validate_ld_config(self) -> "LDConfig":
+        """Validate LD configuration consistency.
+        When ld_reference_file is provided, lead_pos is required to identify
+        the index SNP for LD calculation.
+        Note: For StackedPlotConfig, ld_reference_file may be provided without
+        lead_pos when lead_positions list is used (broadcast mode). This is
+        validated at the StackedPlotConfig level, not here.
+        """
+        # Validation moved to StackedPlotConfig.validate_broadcast_ld
+        # to allow broadcast mode where lead_positions list is used instead
+        return self
+class PlotConfig(BaseModel):
+    """Composite configuration for plot() method.
+    Composes all sub-configs into a single validated configuration object.
+    Use either direct construction with nested configs, or the from_kwargs()
+    factory method for backward compatibility with existing code.
+    Attributes:
+        region: Genomic region specification (required).
+        columns: DataFrame column name mappings.
+        display: Display and visual options.
+        ld: Linkage disequilibrium configuration.
+    Example:
+        >>> # Direct construction
+        >>> config = PlotConfig(
+        ...     region=RegionConfig(chrom=1, start=1000000, end=2000000),
+        ...     display=DisplayConfig(snp_labels=False),
+        ... )
+        >>>
+        >>> # Factory method (backward compatible with plot() signature)
+        >>> config = PlotConfig.from_kwargs(
+        ...     chrom=1, start=1000000, end=2000000,
+        ...     snp_labels=False, lead_pos=1500000,
+        ... )
+    """
+    model_config = ConfigDict(frozen=True)
+    region: RegionConfig
+    columns: ColumnConfig = Field(default_factory=ColumnConfig)
+    display: DisplayConfig = Field(default_factory=DisplayConfig)
+    ld: LDConfig = Field(default_factory=LDConfig)
+    @model_validator(mode="after")
+    def validate_ld_requires_lead_pos(self) -> "PlotConfig":
+        """Validate that LD reference file has lead_pos for single plots."""
+        if self.ld.ld_reference_file is not None and self.ld.lead_pos is None:
+            raise ValueError("lead_pos is required when ld_reference_file is provided")
+        return self
+    @classmethod
+    def from_kwargs(
+        cls,
+        *,
+        # Region params (required)
+        chrom: int,
+        start: int,
+        end: int,
+        # Column params
+        pos_col: str = "ps",
+        p_col: str = "p_wald",
+        rs_col: str = "rs",
+        # Display params
+        snp_labels: bool = True,
+        label_top_n: int = 5,
+        show_recombination: bool = True,
+        figsize: Tuple[float, float] = (12.0, 8.0),
+        # LD params
+        lead_pos: Optional[int] = None,
+        ld_reference_file: Optional[str] = None,
+        ld_col: Optional[str] = None,
+    ) -> "PlotConfig":
+        """Create PlotConfig from flat keyword arguments.
+        Factory method that accepts parameters matching the plot() method
+        signature, enabling backward compatibility with existing code.
+        Args:
+            chrom: Chromosome number.
+            start: Start position (bp).
+            end: End position (bp).
+            pos_col: Column name for position.
+            p_col: Column name for p-value.
+            rs_col: Column name for SNP ID.
+            snp_labels: Whether to show SNP labels.
+            label_top_n: Number of top SNPs to label.
+            show_recombination: Whether to show recombination overlay.
+            figsize: Figure size (width, height).
+            lead_pos: Position of lead SNP.
+            ld_reference_file: PLINK binary fileset path.
+            ld_col: Pre-computed LD column name.
+        Returns:
+            PlotConfig with nested config objects.
+        Raises:
+            ValidationError: If parameters are invalid.
+        """
+        return cls(
+            region=RegionConfig(chrom=chrom, start=start, end=end),
+            columns=ColumnConfig(pos_col=pos_col, p_col=p_col, rs_col=rs_col),
+            display=DisplayConfig(
+                snp_labels=snp_labels,
+                label_top_n=label_top_n,
+                show_recombination=show_recombination,
+                figsize=figsize,
+            ),
+            ld=LDConfig(
+                lead_pos=lead_pos,
+                ld_reference_file=ld_reference_file,
+                ld_col=ld_col,
+            ),
+        )
+class StackedPlotConfig(BaseModel):
+    """Composite configuration for plot_stacked() method.
+    Extends PlotConfig pattern with list-based parameters for stacked plots.
+    Supports multiple lead positions, panel labels, and LD reference files.
+    Attributes:
+        region: Genomic region specification (required).
+        columns: DataFrame column name mappings.
+        display: Display and visual options.
+        ld: Linkage disequilibrium configuration (single file for broadcast).
+        lead_positions: List of lead SNP positions (one per panel).
+        panel_labels: List of panel labels (one per panel).
+        ld_reference_files: List of PLINK filesets (one per panel).
+    Example:
+        >>> config = StackedPlotConfig.from_kwargs(
+        ...     chrom=1, start=1000000, end=2000000,
+        ...     lead_positions=[1500000, 1600000],
+        ...     panel_labels=["Study A", "Study B"],
+        ... )
+    """
+    model_config = ConfigDict(frozen=True)
+    region: RegionConfig
+    columns: ColumnConfig = Field(default_factory=ColumnConfig)
+    display: DisplayConfig = Field(default_factory=DisplayConfig)
+    ld: LDConfig = Field(default_factory=LDConfig)
+    # Stacked-specific list parameters
+    lead_positions: Optional[List[int]] = Field(
+        default=None, description="Lead SNP positions (one per panel)"
+    )
+    panel_labels: Optional[List[str]] = Field(
+        default=None, description="Panel labels (one per panel)"
+    )
+    ld_reference_files: Optional[List[str]] = Field(
+        default=None, description="PLINK filesets (one per panel)"
+    )
+    @model_validator(mode="after")
+    def validate_broadcast_ld(self) -> "StackedPlotConfig":
+        """Validate broadcast LD configuration for stacked plots.
+        When ld_reference_file is provided for broadcast, lead_positions must
+        be provided to specify the reference SNP for each panel.
+        """
+        if self.ld.ld_reference_file is not None and self.ld.lead_pos is None:
+            # Broadcast mode: ld_reference_file without lead_pos in LDConfig
+            # Requires lead_positions list instead
+            if self.lead_positions is None:
+                raise ValueError(
+                    "lead_positions is required when ld_reference_file is provided "
+                    "for broadcast (one lead position per panel)"
+                )
+        return self
+    @classmethod
+    def from_kwargs(
+        cls,
+        *,
+        # Region params (required)
+        chrom: int,
+        start: int,
+        end: int,
+        # Column params
+        pos_col: str = "ps",
+        p_col: str = "p_wald",
+        rs_col: str = "rs",
+        # Display params
+        snp_labels: bool = True,
+        label_top_n: int = 3,  # Default for stacked is 3 (less crowded)
+        show_recombination: bool = True,
+        figsize: Tuple[float, float] = (12.0, 8.0),
+        # LD params (single for broadcast)
+        ld_reference_file: Optional[str] = None,
+        ld_col: Optional[str] = None,
+        # Stacked-specific list params
+        lead_positions: Optional[List[int]] = None,
+        panel_labels: Optional[List[str]] = None,
+        ld_reference_files: Optional[List[str]] = None,
+    ) -> "StackedPlotConfig":
+        """Create StackedPlotConfig from flat keyword arguments.
+        Factory method that accepts parameters matching the plot_stacked()
+        method signature, enabling backward compatibility.
+        Args:
+            chrom: Chromosome number.
+            start: Start position (bp).
+            end: End position (bp).
+            pos_col: Column name for position.
+            p_col: Column name for p-value.
+            rs_col: Column name for SNP ID.
+            snp_labels: Whether to show SNP labels.
+            label_top_n: Number of top SNPs to label (default 3 for stacked).
+            show_recombination: Whether to show recombination overlay.
+            figsize: Figure size (width, height).
+            ld_reference_file: Single PLINK fileset (broadcast to all panels).
+            ld_col: Pre-computed LD column name.
+            lead_positions: List of lead SNP positions.
+            panel_labels: List of panel labels.
+            ld_reference_files: List of PLINK filesets.
+        Returns:
+            StackedPlotConfig with nested config objects.
+        Raises:
+            ValidationError: If parameters are invalid.
+        """
+        return cls(
+            region=RegionConfig(chrom=chrom, start=start, end=end),
+            columns=ColumnConfig(pos_col=pos_col, p_col=p_col, rs_col=rs_col),
+            display=DisplayConfig(
+                snp_labels=snp_labels,
+                label_top_n=label_top_n,
+                show_recombination=show_recombination,
+                figsize=figsize,
+            ),
+            ld=LDConfig(
+                ld_reference_file=ld_reference_file,
+                ld_col=ld_col,
+            ),
+            lead_positions=lead_positions,
+            panel_labels=panel_labels,
+            ld_reference_files=ld_reference_files,
+        )
+__all__ = [
+    "RegionConfig",
+    "ColumnConfig",
+    "DisplayConfig",
+    "LDConfig",
+    "PlotConfig",
+    "StackedPlotConfig",
+]

pylocuszoom/ensembl.py CHANGED Viewed

@@ -18,7 +18,7 @@ import pandas as pd
 import requests
 from .logging import logger
-from .utils import ValidationError
+from .utils import ValidationError, normalize_chrom
 # Ensembl API limits regions to 5Mb
 ENSEMBL_MAX_REGION_SIZE = 5_000_000
@@ -47,11 +47,6 @@ ENSEMBL_MAX_RETRIES = 3
 ENSEMBL_RETRY_DELAY = 1.0  # seconds, doubles on each retry
-def _normalize_chrom(chrom: str | int) -> str:
-    """Normalize chromosome name by removing 'chr' prefix."""
-    return str(chrom).replace("chr", "")
 def _validate_region_size(start: int, end: int, context: str) -> None:
     """Validate region size is within Ensembl API limits.
@@ -129,7 +124,7 @@ def get_cached_genes(
         DataFrame if cache hit, None if cache miss.
     """
     ensembl_species = get_ensembl_species_name(species)
-    chrom_str = _normalize_chrom(chrom)
+    chrom_str = normalize_chrom(chrom)
     cache_key = _cache_key(ensembl_species, chrom_str, start, end)
     species_dir = cache_dir / ensembl_species
@@ -161,7 +156,7 @@ def save_cached_genes(
         end: Region end position.
     """
     ensembl_species = get_ensembl_species_name(species)
-    chrom_str = _normalize_chrom(chrom)
+    chrom_str = normalize_chrom(chrom)
     cache_key = _cache_key(ensembl_species, chrom_str, start, end)
     species_dir = cache_dir / ensembl_species
@@ -266,7 +261,7 @@ def fetch_genes_from_ensembl(
     _validate_region_size(start, end, "genes_df")
     ensembl_species = get_ensembl_species_name(species)
-    chrom_str = _normalize_chrom(chrom)
+    chrom_str = normalize_chrom(chrom)
     # Build region string
     region = f"{chrom_str}:{start}-{end}"
@@ -334,7 +329,7 @@ def fetch_exons_from_ensembl(
     _validate_region_size(start, end, "exons_df")
     ensembl_species = get_ensembl_species_name(species)
-    chrom_str = _normalize_chrom(chrom)
+    chrom_str = normalize_chrom(chrom)
     region = f"{chrom_str}:{start}-{end}"
     url = f"{ENSEMBL_REST_URL}/overlap/region/{ensembl_species}/{region}"
@@ -408,7 +403,7 @@ def get_genes_for_region(
     if cache_dir is None:
         cache_dir = get_ensembl_cache_dir()
-    chrom_str = _normalize_chrom(chrom)
+    chrom_str = normalize_chrom(chrom)
     # Check cache first
     if use_cache:

pylocuszoom/eqtl.py CHANGED Viewed

@@ -9,20 +9,15 @@ from typing import List, Optional
 import numpy as np
 import pandas as pd
+from .exceptions import EQTLValidationError, ValidationError
 from .logging import logger
-from .utils import ValidationError, filter_by_region
+from .utils import filter_by_region
 from .validation import DataFrameValidator
 REQUIRED_EQTL_COLS = ["pos", "p_value"]
 OPTIONAL_EQTL_COLS = ["gene", "effect_size", "rs", "se"]
-class EQTLValidationError(ValueError):
-    """Raised when eQTL DataFrame validation fails."""
-    pass
 def validate_eqtl_df(
     df: pd.DataFrame,
     pos_col: str = "pos",
@@ -42,6 +37,7 @@ def validate_eqtl_df(
         (
             DataFrameValidator(df, "eQTL DataFrame")
             .require_columns([pos_col, p_col])
+            .require_numeric([p_col])
             .validate()
         )
     except ValidationError as e:

pylocuszoom/exceptions.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""Exception hierarchy for pyLocusZoom.
+All pyLocusZoom exceptions inherit from PyLocusZoomError, enabling users to
+catch all library errors with `except PyLocusZoomError`.
+"""
+class PyLocusZoomError(Exception):
+    """Base exception for all pyLocusZoom errors."""
+class ValidationError(PyLocusZoomError, ValueError):
+    """Raised when input validation fails. Inherits ValueError for backward compat."""
+class EQTLValidationError(ValidationError):
+    """Raised when eQTL DataFrame validation fails."""
+class FinemappingValidationError(ValidationError):
+    """Raised when fine-mapping DataFrame validation fails."""
+class LoaderValidationError(ValidationError):
+    """Raised when loaded data fails validation."""
+class BackendError(PyLocusZoomError):
+    """Raised when backend operations fail."""
+class DataDownloadError(PyLocusZoomError, RuntimeError):
+    """Raised when data download operations fail."""

pylocuszoom/finemapping.py CHANGED Viewed

@@ -8,8 +8,9 @@ from typing import List, Optional
 import pandas as pd
+from .exceptions import FinemappingValidationError, ValidationError
 from .logging import logger
-from .utils import ValidationError, filter_by_region
+from .utils import filter_by_region
 from .validation import DataFrameValidator
 # Required columns for fine-mapping data
@@ -17,12 +18,6 @@ REQUIRED_FINEMAPPING_COLS = ["pos", "pip"]
 OPTIONAL_FINEMAPPING_COLS = ["rs", "cs", "cs_id", "effect", "se"]
-class FinemappingValidationError(ValueError):
-    """Raised when fine-mapping DataFrame validation fails."""
-    pass
 def validate_finemapping_df(
     df: pd.DataFrame,
     pos_col: str = "pos",

pylocuszoom/forest.py CHANGED Viewed

@@ -31,5 +31,6 @@ def validate_forest_df(
         DataFrameValidator(df, "Forest plot DataFrame")
         .require_columns([study_col, effect_col, ci_lower_col, ci_upper_col])
         .require_numeric([effect_col, ci_lower_col, ci_upper_col])
+        .require_ci_ordering(ci_lower_col, effect_col, ci_upper_col)
         .validate()
     )

pylocuszoom/gene_track.py CHANGED Viewed

@@ -48,22 +48,23 @@ def assign_gene_positions(genes_df: pd.DataFrame, start: int, end: int) -> List[
         List of integer row indices (0, 1, 2, ...) for each gene.
     """
     positions = []
-    occupied = []  # List of (end_pos, row)
+    # Track the rightmost end position for each row (including label buffer)
+    row_ends: dict[int, int] = {}  # row -> rightmost end position
     region_width = end - start
+    label_buffer = region_width * 0.08  # Extra space for labels
     for _, gene in genes_df.iterrows():
         gene_start = max(gene["start"], start)
         gene_end = min(gene["end"], end)
-        # Find first available row with buffer for label spacing
+        # Find first available row where gene doesn't overlap
         row = 0
-        label_buffer = region_width * 0.08  # Extra space for labels
-        for occ_end, occ_row in occupied:
-            if occ_row == row and occ_end > gene_start - label_buffer:
-                row = occ_row + 1
+        while row in row_ends and row_ends[row] > gene_start - label_buffer:
+            row += 1
         positions.append(row)
-        occupied.append((gene_end, row))
+        # Update the row's end position (including buffer for next gene check)
+        row_ends[row] = gene_end
     return positions
@@ -174,17 +175,6 @@ def _draw_strand_arrows_matplotlib(
         gene_start, gene_end, region_width, strand
     )
-    # Draw connecting line between arrow centers
-    if len(arrow_tip_positions) > 1:
-        ax.plot(
-            [arrow_tip_positions[0], arrow_tip_positions[-1]],
-            [y_gene, y_gene],
-            color=arrow_color,
-            linewidth=1.0,
-            zorder=4,
-            solid_capstyle="butt",
-        )
     for tip_x in arrow_tip_positions:
         if strand == "+":
             base_x = tip_x - tri_width
@@ -223,17 +213,6 @@ def _draw_strand_arrows_generic(
         gene_start, gene_end, region_width, strand
     )
-    # Draw connecting line between arrow centers
-    if len(arrow_tip_positions) > 1:
-        backend.line(
-            ax,
-            x=pd.Series([arrow_tip_positions[0], arrow_tip_positions[-1]]),
-            y=pd.Series([y_gene, y_gene]),
-            color=arrow_color,
-            linewidth=1.0,
-            zorder=4,
-        )
     for tip_x in arrow_tip_positions:
         if strand == "+":
             base_x = tip_x - tri_width
@@ -405,7 +384,7 @@ def plot_gene_track(
                 gene_name,
                 ha="center",
                 va="bottom",
-                fontsize=7,
+                fontsize=9,
                 color="#000000",
                 fontweight="medium",
                 style="italic",
@@ -552,7 +531,7 @@ def plot_gene_track_generic(
                 label_pos,
                 y_label,
                 gene_name,
-                fontsize=7,
+                fontsize=9,
                 ha="center",
                 va="bottom",
                 color="#000000",

pylocuszoom/labels.py CHANGED Viewed

@@ -11,6 +11,8 @@ import pandas as pd
 from matplotlib.axes import Axes
 from matplotlib.text import Annotation
+from pylocuszoom.logging import logger
 def add_snp_labels(
     ax: Axes,
@@ -111,7 +113,9 @@ def add_snp_labels(
                 expand_points=(1.5, 1.5),
             )
         except ImportError:
-            # adjustText not installed, labels may overlap
-            pass
+            logger.warning(
+                "adjustText not installed - SNP labels may overlap. "
+                "Install with: pip install adjustText"
+            )
     return texts

pylocuszoom 0.8.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

pylocuszoom 0.8.0__py3-none-any.whl → 1.1.0__py3-none-any.whl