PyPI - pycmplot - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pycmplot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

pycmplot/__init__.py +43 -0
pycmplot/_core.py +419 -0
pycmplot/annotation.py +368 -0
pycmplot/cli.py +229 -0
pycmplot/constants.py +66 -0
pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
pycmplot/data/hg19ToHg38.over.chain +56506 -0
pycmplot/io.py +342 -0
pycmplot/liftover.py +111 -0
pycmplot/plotting/circular.py +261 -0
pycmplot/plotting/linear.py +375 -0
pycmplot/resources.py +116 -0
pycmplot/stats.py +106 -0
pycmplot-0.1.0.dist-info/METADATA +182 -0
pycmplot-0.1.0.dist-info/RECORD +20 -0
pycmplot-0.1.0.dist-info/WHEEL +5 -0
pycmplot-0.1.0.dist-info/entry_points.txt +2 -0
pycmplot-0.1.0.dist-info/licenses/LICENSE +21 -0
pycmplot-0.1.0.dist-info/top_level.txt +1 -0

pycmplot/plotting/linear.py ADDED Viewed

@@ -0,0 +1,375 @@
+"""
+pycmplot.plotting.linear
+========================
+Multi-track linear Manhattan plot.
+"""
+from __future__ import annotations
+import logging
+from typing import Optional
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.patches import FancyArrowPatch
+from natsort import natsort_keygen
+from pycmplot.constants import CHROM_ORDER
+from pycmplot.stats import get_highlight_snps
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Annotation helpers (cluster-aware label spreading)
+# ---------------------------------------------------------------------------
+def _cluster_annotations_by_chr(
+    annot_df,
+    chr_col: str = "CHR",
+    x_col: str = "x",
+    window_size: float = 50e6,
+) -> list[list]:
+    """Cluster annotations within each chromosome by genomic proximity."""
+    clusters: list[list] = []
+    for _chr_name, df_chr in annot_df.groupby(chr_col):
+        df_chr = df_chr.sort_values(x_col)
+        current_cluster = [df_chr.index[0]]
+        last_x = df_chr.iloc[0][x_col]
+        for idx, row in df_chr.iloc[1:].iterrows():
+            x = row[x_col]
+            if x - last_x <= window_size:
+                current_cluster.append(idx)
+            else:
+                clusters.append(current_cluster)
+                current_cluster = [idx]
+            last_x = x
+        clusters.append(current_cluster)
+    return clusters
+def _draw_annotation_arrows(
+    ax,
+    annot_df,
+    chr_col: str,
+    label_col: str,
+    offsets: dict,
+    chr_max: dict,
+    spread_width: float = 60e6,
+    y_tip: float = 0.0,
+    y_text: float = 0.55,
+) -> None:
+    """Draw angled FancyArrowPatch arrows from text labels to signal positions."""
+    annot_df = annot_df.sort_values(by=[chr_col, "x"], key=natsort_keygen())
+    last_xtext = 0 - spread_width
+    for chr_name, df_chr in annot_df.groupby(chr_col, sort=False):
+        df_chr = df_chr.sort_values("x")
+        chr_start = offsets[chr_name]
+        chr_end = offsets[chr_name] + chr_max[chr_name]
+        x_signals = df_chr["x"].values
+        labels = df_chr[label_col].values
+        n = len(df_chr)
+        # Adaptive spread
+        chr_range = chr_end - chr_start
+        sw = spread_width
+        pad = sw / int(str(sw)[:2]) / 2
+        while sw > chr_range:
+            sw -= pad
+        sig_start = df_chr["x"].iloc[0]
+        xmin = sig_start - sw
+        xmax = xmin + n * sw
+        x_texts = np.arange(xmin, xmax, sw)
+        first_xtext = x_texts[0]
+        while first_xtext <= last_xtext:
+            x_texts = [xv + sw for xv in x_texts]
+            first_xtext = x_texts[0]
+        for x_sig, x_txt, label in zip(x_signals, x_texts, labels):
+            dx = x_txt - x_sig
+            rad = 0.15 * np.sign(dx)
+            arrow = FancyArrowPatch(
+                (x_txt, y_text),
+                (x_sig, y_tip - 0.05),
+                arrowstyle="-|>",
+                mutation_scale=12,
+                lw=0.6,
+                color="grey",
+                alpha=0.5,
+                connectionstyle=f"arc3,rad={rad}",
+            )
+            ax.add_patch(arrow)
+            ax.text(
+                x_txt,
+                y_text + 0.02,
+                str(label),
+                rotation=45,
+                ha="left",
+                va="bottom",
+                fontsize=10,
+                clip_on=False,
+                color="black",
+                fontstyle="italic",
+                fontweight="regular",
+            )
+        last_xtext = x_texts[-1]
+# ---------------------------------------------------------------------------
+# Public function
+# ---------------------------------------------------------------------------
+def plot_linear(
+    tracks: list,
+    track_labels: Optional[list[str]] = None,
+    annot_df=None,
+    highlight: bool = False,
+    highlight_thresh: float = 1e-7,
+    chr_col: str = "CHR",
+    pos_col: str = "BP",
+    p_col: str = "P",
+    trim_pval: Optional[float] = None,
+    logp: bool = True,
+    label_col: str = "label",
+    chr_order: Optional[list[str]] = None,
+    chr_spacing: float = 9e6,
+    track_heights: Optional[list[float]] = None,
+    track_spacing: float = 0.10,
+    point_size: float = 5,
+    colors: Optional[list[str]] = None,
+    sig_lines: Optional[list[dict]] = None,
+    plot_title: Optional[str] = None,
+    fig_format: Optional[str] = None,
+    dpi: int = 300,
+    figsize: tuple = (15, 9),
+):
+    """Generate a multi-track linear Manhattan plot.
+    Parameters
+    ----------
+    tracks:
+        List of DataFrames, one per GWAS trait.  Each must have columns
+        *chr_col*, *pos_col*, and *p_col*.
+    track_labels:
+        Y-axis labels for each track.
+    annot_df:
+        Optional DataFrame of lead SNPs to annotate (must contain *chr_col*,
+        *pos_col*, *label_col*).
+    label_col:
+        Column to use in the annot_df e.g. column containing gene names.
+    highlight:
+        Highlight loci within ``500 kb`` of a lead SNP.
+    chr_spacing:
+        Gap (bp) inserted between chromosomes on the x-axis.
+    sig_lines:
+        List of ``{"genome": float, "suggestive": float}`` dicts, one per track.
+    plot_title:
+        Output file path (extension determines format when *fig_format* is ``None``).
+    fig_format:
+        Override output format (e.g. ``'png'``, ``'pdf'``).
+    Returns
+    -------
+    (fig, axes)
+    """
+    if chr_order is None:
+        chr_order = CHROM_ORDER
+    chr_to_idx = {c: i for i, c in enumerate(chr_order)}
+    # ------------------------------------------------------------------
+    # Prep DataFrames
+    # ------------------------------------------------------------------
+    def _prep(df):
+        df = df.copy()
+        if trim_pval:
+            df = df[df[p_col] < trim_pval]
+        if logp:
+            df["logP"] = -np.log10(df[p_col])
+        df[chr_col] = (
+            df[chr_col]
+            .astype(str)
+            .str.replace("chr", "", regex=False)
+            .str.upper()
+            .replace({"23": "X", "24": "Y", "M": "MT", "MTDNA": "MT"})
+        )
+        if highlight:
+            df, _ = get_highlight_snps(
+                df=df,
+                window=500_000,
+                highlight_thresh=highlight_thresh,
+                logp=logp,
+            )
+        df = df[df[chr_col].isin(chr_order)]
+        df["chr_idx"] = df[chr_col].map(chr_to_idx)
+        return df.sort_values(["chr_idx", pos_col])
+    tracks = [_prep(df) for df in tracks]
+    if annot_df is not None:
+        annot_df = _prep(annot_df)
+    # ------------------------------------------------------------------
+    # Cumulative x-axis positions
+    # ------------------------------------------------------------------
+    chr_max: dict[str, float] = {}
+    offsets: dict[str, float] = {}
+    offset = 0.0
+    for c in chr_order:
+        max_pos = max(
+            [df[df[chr_col] == c][pos_col].max() for df in tracks if c in df[chr_col].values]
+            + [0]
+        )
+        chr_max[c] = max_pos
+        offsets[c] = offset
+        offset += max_pos + chr_spacing
+    def _add_cum(df):
+        df = df.copy()
+        df["x"] = df.apply(lambda r: r[pos_col] + offsets[r[chr_col]], axis=1)
+        return df
+    tracks = [_add_cum(df) for df in tracks]
+    if annot_df is not None:
+        annot_df = _add_cum(annot_df)
+    # ------------------------------------------------------------------
+    # Figure layout
+    # ------------------------------------------------------------------
+    n_tracks = len(tracks)
+    if track_heights is None:
+        track_heights = [1] + [3] * n_tracks
+    fig = plt.figure(figsize=figsize)
+    gs = fig.add_gridspec(
+        n_tracks + 1, 1,
+        height_ratios=track_heights,
+        hspace=track_spacing,
+    )
+    ax_annot = fig.add_subplot(gs[0, 0])
+    axes = [ax_annot]
+    for i in range(n_tracks):
+        axes.append(fig.add_subplot(gs[i + 1, 0], sharex=ax_annot))
+    if colors is None:
+        colors = ["gray", "steelblue"]
+    # Per-track highlight colours from tab20 colormap
+    cmap = plt.get_cmap("tab20")
+    hex_colors = [mcolors.to_hex(cmap(i / n_tracks)) for i in range(n_tracks)]
+    # ------------------------------------------------------------------
+    # Plot data tracks
+    # ------------------------------------------------------------------
+    t_labels = track_labels or [f"Track {i+1}" for i in range(n_tracks)]
+    for i, (ax, df, t_label, h_color) in enumerate(
+        zip(axes[1:], tracks, t_labels, hex_colors)
+    ):
+        color_cycle = [colors[j % len(colors)] for j in df["chr_idx"]]
+        df = df[df[p_col] >= 0]
+        y_vals = df["logP"] if logp else df[p_col]
+        ax.scatter(df["x"], y_vals, c=color_cycle, s=point_size)
+        if highlight:
+            sig = df[df["in_locus"]]
+            if not sig.empty:
+                sig_y = sig["logP"] if logp else sig[p_col]
+                ax.scatter(sig["x"].to_numpy(), sig_y.to_numpy(), s=point_size,
+                           marker="o", color="brown")
+        ax.set_ylabel(t_label, color="black")
+        if sig_lines is not None and i < len(sig_lines):
+            sl = sig_lines[i]
+            if "genome" in sl:
+                ax.axhline(y=sl["genome"], color="red", linestyle="--", linewidth=0.6)
+            if "suggestive" in sl:
+                ax.axhline(y=sl["suggestive"], color="grey", linestyle="--", linewidth=0.5)
+        ax.spines["top"].set_visible(False)
+        ax.spines["right"].set_visible(False)
+        left_pad = chr_spacing * 0.2
+        xmax = max(offsets[c] + chr_max[c] for c in chr_order)
+        ax.set_xlim(-left_pad, xmax)
+    # ------------------------------------------------------------------
+    # Annotation track
+    # ------------------------------------------------------------------
+    if annot_df is not None:
+        # Vertical lines across all data tracks
+        for x in annot_df["x"].values:
+            for ax in axes[1:]:
+                ax.axvline(x, color="grey", alpha=0.45, linewidth=0.7,
+                           linestyle="--", zorder=0)
+        _draw_annotation_arrows(
+            ax_annot,
+            annot_df,
+            chr_col=chr_col,
+            label_col=label_col,
+            offsets=offsets,
+            chr_max=chr_max,
+            spread_width=60e6,
+        )
+    ax_annot.set_ylim(0, 1)
+    ax_annot.axis("off")
+    # ------------------------------------------------------------------
+    # Chromosome labels on x-axis
+    # ------------------------------------------------------------------
+    xticks, xlabels = [], []
+    for c in chr_order:
+        if chr_max[c] == 0:
+            continue
+        start = offsets[c]
+        end = offsets[c] + chr_max[c]
+        mid = (start + end) / 2
+        xticks.append(mid)
+        xlabels.append(c)
+        for ax in axes:
+            ax.axvline(end, color="lightgray", linewidth=0.1, alpha=0.05)
+    axes[-1].set_xticks(xticks)
+    axes[-1].set_xticklabels(xlabels)
+    axes[-1].set_xlabel("Chromosome", fontsize=12)
+    for ax in axes[:-1]:
+        ax.tick_params(axis="x", which="both", bottom=False, labelbottom=False)
+        ax.spines["bottom"].set_visible(False)
+    plt.subplots_adjust(hspace=track_spacing, left=0.08)
+    plt.tight_layout()
+    fig.text(
+        0.03, 0.5,
+        "-log\u2081\u2080(P)" if logp else p_col,
+        va="center",
+        rotation="vertical",
+        fontsize=12,
+    )
+    if plot_title:
+        fmt = fig_format or Path(plot_title).suffix.lstrip(".") or "png"
+        plt.savefig(plot_title, format=fmt, dpi=dpi)
+        logger.info("Saved linear Manhattan plot: %s", plot_title)
+    return fig, axes

pycmplot/resources.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""
+pycmplot.resources
+==================
+Centralised configuration for external resource files that cannot be bundled
+with the package (large reference files, chain files, etc.).
+Users can supply paths in three ways, in order of priority:
+1. Pass a :class:`ResourceConfig` instance directly to functions that need it.
+2. Set environment variables before running:
+   .. code-block:: bash
+       export PYCMPLOT_CHAIN_HG19_HG38=/path/to/hg19ToHg38.over.chain
+       export PYCMPLOT_GENEINFO_HG38=/path/to/Homo_sapiens.GRCh38.geneinfo.tsv.gz
+       export PYCMPLOT_GENEINFO_HG19=/path/to/Homo_sapiens.GRCh37.geneinfo.tsv.gz
+       export PYCMPLOT_FEATURESINFO=/path/to/Homo_sapiens.GRCh38.features.tsv.gz  # reserved: the featuresinfo field is currently commented out, so this variable is not read
+3. Edit the defaults in this module for a site-wide installation.
+"""
+from __future__ import annotations
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from importlib.resources import files
+# define _env
+def _env(var: str, default: str | None = None) -> str | None:
+    return os.environ.get(var, default)
+# define packaged data helper
+def _pkg_data(filename: str) -> str:
+    return str(files("pycmplot.data") / filename)
+@dataclass
+class ResourceConfig:
+    """Paths to external reference files used by pycmplot.
+    Attributes
+    ----------
+    chain_hg19_hg38 :
+        LiftOver chain file for hg19 → hg38 conversion.
+    geneinfo_hg38 :
+        Tab-delimited gene info file for GRCh38 (used for nearest-gene annotation).
+    geneinfo_hg19 :
+        Tab-delimited gene info file for GRCh37 (fallback when data is hg19).
+    featuresinfo :
+        Extended features info file (all biotypes) for GRCh38.
+    """
+    chain_hg19_hg38: str | None = field(
+        default_factory=lambda: _env(
+            "PYCMPLOT_CHAIN_HG19_HG38",
+            _pkg_data("hg19ToHg38.over.chain"),
+        )
+    )
+    geneinfo_hg38: str | None = field(
+        default_factory=lambda: _env(
+            "PYCMPLOT_GENEINFO_HG38",
+            _pkg_data("Homo_sapiens.GRCh38.geneinfo.tsv.gz"),
+        )
+    )
+    geneinfo_hg19: str | None = field(
+        default_factory=lambda: _env(
+            "PYCMPLOT_GENEINFO_HG19",
+            _pkg_data("Homo_sapiens.GRCh37.geneinfo.tsv.gz"),
+        )
+    )
+    #featuresinfo: str | None = field(
+    #    default_factory=lambda: _env(
+    #        "PYCMPLOT_FEATURESINFO",
+    #        _pkg_data("featuresinfo.tsv.gz"),
+    #    )
+    #)
+    def require(self, attr: str) -> str:
+        """Return the path for *attr*, raising a clear error if it is unset."""
+        val = getattr(self, attr)
+        if val is None:
+            env_var = {
+                "chain_hg19_hg38": "PYCMPLOT_CHAIN_HG19_HG38",
+                "geneinfo_hg38":   "PYCMPLOT_GENEINFO_HG38",
+                "geneinfo_hg19":   "PYCMPLOT_GENEINFO_HG19",
+                #"featuresinfo":    "PYCMPLOT_FEATURESINFO",
+            }.get(attr, attr.upper())
+            raise FileNotFoundError(
+                f"Resource '{attr}' is not configured.\n"
+                f"Set the environment variable {env_var} or pass a "
+                f"ResourceConfig('{attr}'='/path/to/file') to the function."
+            )
+        path = Path(val)
+        if path.exists():
+            return str(path)
+        # fallback: try importlib resource resolution
+        try:
+            resource = files("pycmplot.data") / Path(val).name
+            with as_file(resource) as real_path:
+                return str(real_path)
+        except Exception:
+            pass
+        raise FileNotFoundError(
+                f"Resource file not found: {val}\n"
+                f"Check the path set for '{attr}'."
+            )
+        return str(path)
+# Module-level default instance — picks up environment variables automatically.
+# The field defaults are evaluated once, when this line runs at import time,
+# so environment variables changed afterwards do not affect this instance.
+default_resources = ResourceConfig()

pycmplot/stats.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""
+pycmplot.stats
+==============
+Statistical helper functions for identifying lead SNPs and loci to highlight.
+"""
+from __future__ import annotations
+import numpy as np
+import pandas as pd
+def get_lead_snps(
+    df: pd.DataFrame,
+    highlight_thresh: float = 5e-8,
+    logp: bool = False,
+    window: int = 500_000,
+) -> pd.DataFrame:
+    """Identify independent lead SNPs by greedy distance clumping.
+    Starting from the most significant SNP, each subsequent SNP is kept only
+    if it is > *window* bp away from all previously kept leads on the same
+    chromosome.
+    Parameters
+    ----------
+    df:
+        Summary statistics DataFrame containing columns ``CHR``, ``POS``,
+        ``P`` (and ``logP`` when *logp* is ``True``).
+    highlight_thresh:
+        P-value (or −log₁₀(p) when *logp* is ``True``) significance cutoff.
+    logp:
+        If ``True``, filter and rank by the ``logP`` column instead of ``P``.
+    window:
+        Clumping window in base-pairs (default 500 kb).
+    Returns
+    -------
+    pd.DataFrame
+        Subset of *df* containing only the lead SNPs.
+    """
+    if logp:
+        thresh = -np.log10(float(highlight_thresh))
+        sig = df[df["logP"] >= thresh].copy()
+        p_col = "logP"
+        ascending = False
+    else:
+        sig = df[df["P"] <= highlight_thresh].copy()
+        p_col = "P"
+        ascending = True
+    sig = sig.sort_values(p_col, ascending=ascending)
+    leads: list[pd.Series] = []
+    while not sig.empty:
+        top = sig.iloc[0]
+        leads.append(top)
+        sig = sig[
+            ~(
+                (sig["CHR"] == top["CHR"])
+                & (abs(sig["POS"] - top["POS"]) <= window)
+            )
+        ]
+    return pd.DataFrame(leads)
+def get_highlight_snps(
+    df: pd.DataFrame,
+    highlight_thresh: float = 5e-8,
+    logp: bool = False,
+    window: int = 500_000,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Mark all SNPs within *window* bp of a lead SNP.
+    Adds an ``in_locus`` boolean column to *df* and returns the annotated
+    DataFrame together with the lead SNP DataFrame.
+    Parameters
+    ----------
+    df, highlight_thresh, logp, window:
+        See :func:`get_lead_snps`.
+    Returns
+    -------
+    (df_annotated, leads_df)
+    """
+    df = df.copy()
+    df["in_locus"] = False
+    leads_df = get_lead_snps(
+        df=df,
+        highlight_thresh=highlight_thresh,
+        logp=False,
+        window=window,
+    )
+    for _, row in leads_df.iterrows():
+        min_pos = row["POS"] - window
+        max_pos = row["POS"] + window
+        chrom = row["CHR"]
+        mask = (df["CHR"] == chrom) & (df["POS"] >= min_pos) & (df["POS"] <= max_pos)
+        df.loc[mask, "in_locus"] = True
+    return df, leads_df