PyPI - pycmplot - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pycmplot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

pycmplot/__init__.py +43 -0
pycmplot/_core.py +419 -0
pycmplot/annotation.py +368 -0
pycmplot/cli.py +229 -0
pycmplot/constants.py +66 -0
pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
pycmplot/data/hg19ToHg38.over.chain +56506 -0
pycmplot/io.py +342 -0
pycmplot/liftover.py +111 -0
pycmplot/plotting/circular.py +261 -0
pycmplot/plotting/linear.py +375 -0
pycmplot/resources.py +116 -0
pycmplot/stats.py +106 -0
pycmplot-0.1.0.dist-info/METADATA +182 -0
pycmplot-0.1.0.dist-info/RECORD +20 -0
pycmplot-0.1.0.dist-info/WHEEL +5 -0
pycmplot-0.1.0.dist-info/entry_points.txt +2 -0
pycmplot-0.1.0.dist-info/licenses/LICENSE +21 -0
pycmplot-0.1.0.dist-info/top_level.txt +1 -0

pycmplot/io.py ADDED Viewed

@@ -0,0 +1,342 @@
+"""
+pycmplot.io
+===========
+Summary statistics loading, delimiter detection, and sector-size computation.
+"""
+from __future__ import annotations
+import csv
+import gzip
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import Optional
+import natsort
+import numpy as np
+import pandas as pd
+from pycmplot.stats import get_lead_snps, get_highlight_snps
+from pycmplot.annotation import get_hits_summary_table
+from pycmplot.resources import ResourceConfig, default_resources
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# File utilities
+# ---------------------------------------------------------------------------
+def smart_open(file_path: str):
+    """Open a regular or gzip-compressed file transparently."""
+    path = Path(file_path)
+    if path.suffix == ".gz":
+        return gzip.open(file_path, "rt")
+    return open(file_path, "r")
+def resolve_delimiter(delim: str) -> str:
+    """Map a human-readable delimiter name to the actual separator character."""
+    if not isinstance(delim, str):
+        raise TypeError("Delimiter must be a string.")
+    mapping = {
+        "space":      " ",
+        "tab":        "\t",
+        "comma":      ",",
+        "colon":      ":",
+        "semi-colon": ";",
+        "semicolon":  ";",
+    }
+    key = delim.strip().lower()
+    if key in mapping:
+        return mapping[key]
+    if len(key) == 1:
+        return key  # allow bare characters like '\t'
+    raise ValueError(
+        f"Invalid delimiter '{delim}'. "
+        "Choose from: space, tab, comma, colon, semi-colon."
+    )
+def detect_delimiter(file_path: str, sample_size: int = 5_000):
+    """Automatically detect the delimiter using :mod:`csv.Sniffer`.
+    Returns
+    -------
+    (delimiter_str, dialect_or_None)
+    """
+    with smart_open(file_path) as f:
+        sample = f.read(sample_size)
+    try:
+        dialect = csv.Sniffer().sniff(sample)
+        return dialect.delimiter, dialect
+    except csv.Error:
+        return _fallback_delimiter(sample), None
+def _fallback_delimiter(sample: str) -> str:
+    candidates = [",", "\t", " ", ";", "|"]
+    counts = {d: sample.count(d) for d in candidates}
+    best = max(counts, key=counts.get)
+    if counts[best] == 0:
+        raise ValueError("Unable to detect delimiter automatically.")
+    return best
+def get_file_header(
+    file_path: str,
+    delim: Optional[str] = None,
+    dialect=None,
+) -> list[str]:
+    """Return the column names from the first line of *file_path*."""
+    with smart_open(file_path) as f:
+        try:
+            if delim:
+                reader = csv.DictReader(f)
+                hdr = f"{delim}".join(reader.fieldnames or []).split(delim)
+            elif dialect:
+                reader = csv.DictReader(f, dialect=dialect)
+                hdr = reader.fieldnames or []
+            else:
+                reader = csv.DictReader(f)
+                hdr = reader.fieldnames or []
+        except csv.Error:
+            logger.warning("Header could not be determined for %s", file_path)
+            hdr = []
+    return list(hdr)
+# ---------------------------------------------------------------------------
+# Sector-size helpers
+# ---------------------------------------------------------------------------
+def _merge_min_max_lists(dicts: list[dict]) -> dict:
+    """Merge per-chromosome [min, max] lists across multiple sumstats."""
+    temp: dict = defaultdict(list)
+    for d in dicts:
+        for key, values in d.items():
+            temp[key].extend(values)
+    return {k: [min(v), max(v)] for k, v in temp.items()}
+# ---------------------------------------------------------------------------
+# Main loader
+# ---------------------------------------------------------------------------
+def get_sumstats_and_merged_sector_list(
+    sum_stats: list[str],
+    labels: list[str],
+    logp: bool = False,
+    trim_pval: Optional[float] = None,
+    file_info: Optional[dict] = None,
+    sort_tracks: Optional[str] = "chrom_len",
+    table_out: Optional[str] = None,
+    highlight: bool = False,
+    highlight_thresh: float = 5e-8,
+    signif_threshold: Optional[float] = None,
+    signif_line: Optional[float] = None,
+    suggest_threshold: Optional[float] = None,
+    resources: Optional[ResourceConfig] = None,
+):
+    """Load summary statistics and compute merged Circos sector sizes.
+    Parameters
+    ----------
+    sum_stats:
+        List of file paths to GWAS summary statistics (possibly gzip-compressed).
+    labels:
+        Track labels in the same order as *sum_stats*.
+    file_info:
+        Dict keyed by label; each value is a list
+        ``[col_names, col_dtypes, rename_map, sep]``.
+    sort_tracks:
+        ``'label'`` — sort tracks alphabetically by label.
+        ``'chrom_len'`` — sort by number of chromosomes (default).
+        ``None`` — preserve input order.
+    highlight:
+        Whether to flag loci for highlighting.
+    resources:
+        :class:`~pycmplot.resources.ResourceConfig` instance.
+    Returns
+    -------
+    (merged_sector_sizes, sumstats_loaded, hits_table, signif_lines)
+    """
+    if resources is None:
+        resources = default_resources
+    from pycmplot.liftover import liftover_position
+    # Build a label → file path mapping
+    sumstats: dict[str, list] = {
+        name: [path] for name, path in zip(labels, sum_stats)
+    }
+    sumstats_loaded: dict[str, list] = {}
+    all_lead_snps: list[pd.DataFrame] = []
+    for label in sumstats.keys() & (file_info or {}).keys():
+        sumstat_cols   = file_info[label][0]
+        sumstat_dtypes = file_info[label][1]
+        sumstat_newcols= file_info[label][2]
+        sep            = file_info[label][3]
+        logger.info("Loading %s from %s …", label, sumstats[label][0])
+        df = pd.read_csv(
+            sumstats[label][0],
+            sep=sep,
+            header=0,
+            usecols=sumstat_cols,
+            dtype=sumstat_dtypes,
+        ).rename(columns=sumstat_newcols)
+        # Trim insignificant variants for faster plotting
+        if trim_pval:
+            logger.info("Excluding variants with p-value less than %s ...", trim_pval)
+            df = df[df["P"].astype(float) <= float(trim_pval)]
+        else:
+            df = df[df["P"].astype(float) <= 1]
+        if logp:
+            logger.info("Adding a 'logP' column ...")
+            df["logP"] = -np.log10(df["P"])
+        df["LABEL"] = label
+        # Normalise chromosome names
+        logger.info('Normalizing chromosome names {"23": "X", "24": "Y", "M": "MT", "MTDNA": "MT"} ...')
+        df["CHR"] = (
+            df["CHR"]
+            .str.replace("chr", "", regex=False)
+            .dropna()
+            .str.upper()
+            .replace({"23": "X", "24": "Y", "M": "MT", "MTDNA": "MT"})
+        )
+        # Number of distinct chromosomes (for track sorting)
+        n_chroms = len(df["CHR"].unique()) - 1
+        sumstats_loaded[label] = [df, n_chroms]
+        # Liftover hg19 data if needed
+        if "BUILD" in df.columns and "hg19" in df["BUILD"].unique():
+            logger.info("Converting hg19 coordinates to hg38 ...")
+            sumstats_loaded[label][0] = liftover_position(df, resources=resources)
+        # Lead SNPs / highlight SNPs
+        if highlight:
+            logger.info("Extracting variants to highlight ...")
+            sumstats_loaded[label][0], leads = get_highlight_snps(
+                df=sumstats_loaded[label][0],
+                window=2_000_000,
+                highlight_thresh=highlight_thresh,
+                logp=True,
+            )
+        else:
+            leads = get_lead_snps(
+                df=sumstats_loaded[label][0],
+                highlight_thresh=signif_threshold or 5e-8,
+                logp=True,
+            )
+        all_lead_snps.append(leads)
+    # Combine lead SNPs and filter to significance threshold
+    all_lead_snps_df = (
+        pd.concat(all_lead_snps, ignore_index=True).drop_duplicates()
+        if all_lead_snps
+        else pd.DataFrame()
+    )
+    if not all_lead_snps_df.empty and signif_threshold:
+        all_lead_snps_df = all_lead_snps_df[
+            all_lead_snps_df["P"] <= signif_threshold
+        ]
+    hits_table = (
+        get_hits_summary_table(
+            leads_df=all_lead_snps_df,
+            table_out=table_out,
+            window_kb=2_000,
+            resources=resources,
+        )
+        if not all_lead_snps_df.empty
+        else pd.DataFrame()
+    )
+    # Derive significance/suggestive thresholds
+    if not signif_threshold:
+        if trim_pval:
+            signif_threshold = 5e-8
+        elif sumstats_loaded:
+            last_label = list(sumstats_loaded)[-1]
+            n = len(sumstats_loaded[last_label][0]["P"])
+            signif_threshold = max(0.05 / n, 5e-8)
+        else:
+            signif_threshold = 5e-8
+    if not suggest_threshold:
+        suggest_threshold = 1e-5
+    suggest_line = suggest_threshold
+    if logp:
+        suggest_line = -np.log10(suggest_threshold)
+    if signif_line is None:
+        signif_line = signif_threshold
+        if logp:
+            signif_line = -np.log10(signif_threshold)
+    else:
+        if logp and signif_line < 1:
+            signif_line = -np.log10(signif_line)
+    signif_lines = [
+        {"genome": signif_line, "suggestive": suggest_line}
+        for _ in sumstats
+    ]
+    # Optionally sort tracks
+    if sort_tracks is not None:
+        if sort_tracks.lower() == "label":
+            sumstats_loaded = dict(sorted(sumstats_loaded.items()))
+        else:  # chrom_len
+            sumstats_loaded = dict(
+                sorted(
+                    sumstats_loaded.items(),
+                    key=lambda item: (item[0], natsort.natsort_keygen()(item[1][1])),
+                )
+            )
+    # Compute per-sumstat sector sizes (chrom → [min_pos, max_pos])
+    assoc_sector_sizes_list: list[dict] = []
+    min_dic_val = None
+    for df, _n in sumstats_loaded.values():
+        assoc = df[~(df["CHR"].str.len() > 2)].copy()
+        assoc["POS"] = assoc["POS"].fillna(0).astype(int)
+        assoc_dic: dict[str, list] = {}
+        for chrom in assoc["CHR"].unique():
+            sub = assoc[assoc["CHR"] == chrom]
+            lo_val = max(sub["POS"].min() - 1_000_000, 0)
+            hi_val = sub["POS"].max() + 1_000_000
+            assoc_dic[str(chrom)] = [lo_val, hi_val]
+        min_dic_val = min(assoc_dic.values())
+        assoc_sector_sizes_list.append(assoc_dic)
+    merged = _merge_min_max_lists(assoc_sector_sizes_list)
+    merged = dict(natsort.natsorted(merged.items(), key=lambda item: item[0]))
+    if "23" in merged:
+        merged["X"] = merged.pop("23")
+    # Add spacer sector for y-axis labelling
+    if min_dic_val is not None:
+        if len(labels) <= 5:
+            merged["Spacer1"] = [x + x / 2 for x in min_dic_val]
+        else:
+            merged["Spacer1"] = [x * 2 for x in min_dic_val]
+    return merged, sumstats_loaded, hits_table, signif_lines

pycmplot/liftover.py ADDED Viewed

@@ -0,0 +1,111 @@
+"""
+pycmplot.liftover
+=================
+Genome coordinate liftover utilities (hg19 → hg38).
+The :class:`pyliftover.LiftOver` object is initialised **lazily** — it is
+created only when ``liftover_position`` is first called, so importing this
+module never raises a :class:`FileNotFoundError` even if the chain file has
+not been configured yet.
+"""
+from __future__ import annotations
+import logging
+from typing import Optional
+import numpy as np
+import pandas as pd
+from pycmplot.resources import ResourceConfig, default_resources
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Lazy singleton — one LiftOver object per chain file path
+# ---------------------------------------------------------------------------
+_lo_cache: dict[str, object] = {}
+def _get_liftover(chain_path: str):
+    """Return a cached :class:`~pyliftover.LiftOver` for *chain_path*."""
+    if chain_path not in _lo_cache:
+        from pyliftover import LiftOver  # deferred import
+        logger.info("Loading LiftOver chain file: %s", chain_path)
+        _lo_cache[chain_path] = LiftOver(chain_path)
+    return _lo_cache[chain_path]
+# ---------------------------------------------------------------------------
+# Public helpers
+# ---------------------------------------------------------------------------
+def liftover_hg19_to_hg38(
+    chrom: str,
+    pos: int,
+    resources: Optional[ResourceConfig] = None,
+) -> Optional[int]:
+    """Convert a single hg19 coordinate to hg38.
+    Parameters
+    ----------
+    chrom:
+        Chromosome name **without** the ``chr`` prefix (e.g. ``"1"``, ``"X"``).
+    pos:
+        0-based position (as expected by pyliftover).
+    resources:
+        :class:`~pycmplot.resources.ResourceConfig` instance.  Falls back to
+        the module-level :data:`~pycmplot.resources.default_resources`.
+    Returns
+    -------
+    int or None
+        New hg38 position, or ``None`` if liftover failed for that coordinate.
+    """
+    if resources is None:
+        resources = default_resources
+    chain_path = resources.require("chain_hg19_hg38")
+    lo = _get_liftover(chain_path)
+    results = lo.convert_coordinate(f"chr{chrom}", pos)
+    if not results:
+        return None
+    # pyliftover returns sorted by chain score; take the best hit
+    _new_chrom, new_pos, _strand, _score = results[0]
+    return new_pos
+def liftover_position(
+    df: pd.DataFrame,
+    resources: Optional[ResourceConfig] = None,
+) -> pd.DataFrame:
+    """Liftover all hg19 rows in *df* to hg38, in place.
+    Expects columns ``CHR``, ``POS``, and ``BUILD``.  Rows whose ``BUILD``
+    is ``'hg19'`` are lifted; others are left unchanged.  Rows that fail
+    liftover (new position == 0 or ``None``) are dropped.
+    Returns the modified DataFrame with two additional columns:
+    ``OLD_POS`` and ``OLD_BUILD``.
+    """
+    if resources is None:
+        resources = default_resources
+    df = df.copy()
+    df["POS"] = df["POS"].astype(int)
+    new_positions: list[Optional[int]] = []
+    for chrom, pos, build in zip(df["CHR"], df["POS"], df["BUILD"]):
+        if build == "hg19":
+            new_positions.append(liftover_hg19_to_hg38(chrom, pos, resources))
+        else:
+            new_positions.append(pos)
+    df["OLD_POS"] = df["POS"]
+    df["OLD_BUILD"] = df["BUILD"]
+    df["BUILD"] = "hg38"
+    df["POS"] = new_positions
+    df["POS"] = df["POS"].fillna(0).astype(int)
+    return df[df["POS"] != 0]

pycmplot/plotting/circular.py ADDED Viewed

@@ -0,0 +1,261 @@
+"""
+pycmplot.plotting.circular
+==========================
+Per-chromosome circular (Circos-style) Manhattan track plotter and
+track-radius calculator.
+"""
+from __future__ import annotations
+import logging
+import math
+from typing import Optional
+import numpy as np
+import pandas as pd
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Track radius calculator
+# ---------------------------------------------------------------------------
+def compute_track_radii_dict(
+    n_tracks: int,
+    r_min: float = 0,
+    r_max: float = 100,
+    pad: float = 1,
+    annotate: bool = False,
+) -> dict[str, tuple[float, float]]:
+    """Compute (r_start, r_end) tuples for *n_tracks* evenly-spaced tracks.
+    Parameters
+    ----------
+    n_tracks:
+        Number of data tracks.
+    r_min, r_max:
+        Inner and outer radius of the full plotting area.
+    pad:
+        Spacing between consecutive tracks.
+    annotate:
+        If ``True``, add one extra track slot for an annotation ring.
+    Returns
+    -------
+    dict
+        ``{"track_1": (start, end), "track_2": (start, end), …}``
+    """
+    if annotate:
+        n_tracks += 1
+    total_space = r_max - r_min
+    usable_space = total_space - pad * (n_tracks - 1)
+    if usable_space <= 0:
+        raise ValueError(
+            f"Padding ({pad}) is too large for {n_tracks} tracks in "
+            f"radius range [{r_min}, {r_max}]."
+        )
+    track_height = usable_space / n_tracks
+    radii: dict[str, tuple[float, float]] = {}
+    current = float(r_min)
+    for i in range(n_tracks):
+        radii[f"track_{i + 1}"] = (current, current + track_height)
+        current += track_height + pad
+    return radii
+# ---------------------------------------------------------------------------
+# Per-chromosome circular Manhattan track
+# ---------------------------------------------------------------------------
+def plot_circular(
+    sector=None,
+    sector_radius=None,
+    annotation_r=None,
+    assoc: Optional[pd.DataFrame] = None,
+    sector_sizes: Optional[dict] = None,
+    chrom_label_loc=None,
+    chrom_label_size: float = 6,
+    track_label_size: float = 6,
+    track_label_orientation: str = "vertical",
+    track_index: int = 0,
+    assoc_label: Optional[str] = None,
+    logp: bool = True,
+    signif_line: Optional[float] = None,
+    signif_threshold: Optional[float] = None,
+    suggest_line: Optional[float] = None,
+    suggest_threshold: Optional[float] = None,
+    highlight: bool = False,
+    highlight_thresh: Optional[float] = None,
+    colors: Optional[list[str]] = None,
+) -> None:
+    """Plot a single chromosome's data onto a pycirclize sector track.
+    This function is called once per (sector, sumstat) pair in the main
+    circular Manhattan loop.  It mutates *sector* in-place and returns
+    ``None``.  Lead-SNP collection is handled in the calling code.
+    Parameters
+    ----------
+    sector:
+        A :class:`pycirclize.Sector` object.
+    sector_radius:
+        ``(r_start, r_end)`` tuple for this track on the sector.
+    assoc:
+        Summary statistics DataFrame for **all** chromosomes (filtered to the
+        current sector chromosome inside the function).  Must have columns
+        ``CHR``, ``POS``, ``P`` (and ``logP`` if *logp* is ``True``).
+    sector_sizes:
+        Ordered dict of ``{chrom: [min_pos, max_pos]}`` for all sectors,
+        used to place labels on the first/last sector.
+    track_index:
+        0-based index of the current sumstat track (used for chromosome labels).
+    colors:
+        Two alternating colours for even/odd chromosomes.
+    """
+    if colors is None:
+        colors = ["steelblue", "silver"]
+    logger.info("Processing sector: %s", sector.name)
+    assoc = assoc.copy()
+    assoc["POS"] = assoc["POS"].fillna(0).astype(int)
+    genome_wide_sig = signif_threshold
+    suggestive = suggest_threshold
+    assoc_uniq_chroms = list(assoc["CHR"].unique())
+    v_min = float(math.floor(min(assoc["logP"]))) if logp else float(math.floor(min(assoc["P"])))
+    v_max = float(math.ceil(max(assoc["logP"]))) if logp else float(math.ceil(max(assoc["P"])))
+    if logp:
+        v_max += 2
+    if pd.isna(v_max):
+        v_max = 0.0
+    sector_keys = list(sector_sizes.keys())
+    # ------------------------------------------------------------------
+    # Track label on the last (spacer) sector
+    # ------------------------------------------------------------------
+    if sector.name == sector_keys[-1]:
+        lbl_track = sector.add_track(sector_radius)
+        lbl_track.axis(fc="white", alpha=0)
+        lbl_track.text(
+            assoc_label,
+            x=(sector.end - sector.start) / 6,
+            adjust_rotation=True,
+            orientation=track_label_orientation,
+            size=float(track_label_size),
+            color="black",
+            fontstyle="normal",
+            fontweight="regular",
+            multialignment="left",
+        )
+    if sector.name not in assoc_uniq_chroms:
+        return
+    # ------------------------------------------------------------------
+    # Chromosome label (first track only, or chrX)
+    # ------------------------------------------------------------------
+    if track_index == 0 or sector.name == "X":
+        sector.text(
+            sector.name.replace("23", "X"),
+            r=chrom_label_loc,
+            size=chrom_label_size,
+        )
+    sector.axis(fc="none", lw=0, ec="none", alpha=0.5)
+    # ------------------------------------------------------------------
+    # Y-axis ticks on the first chromosome
+    # ------------------------------------------------------------------
+    if sector.name == sector_keys[0]:
+        yax_track = sector.add_track(sector_radius)
+        yax_track.axis(fc="white", alpha=0.08)
+        if logp:
+            tick_step = 1
+            yticks = []
+            while len(yticks) < 2 or len(yticks) > 5:
+                yticks = np.arange(v_min, v_max, tick_step)
+                tick_step += 1
+        else:
+            yticks = np.arange(v_min, v_max)
+        yax_track.yticks(
+            yticks,
+            labels=[str(int(t)) for t in yticks],
+            side="left",
+            vmin=v_min,
+            vmax=v_max,
+            label_size=5,
+        )
+    # ------------------------------------------------------------------
+    # Data track
+    # ------------------------------------------------------------------
+    assoc_chr = assoc.loc[assoc["CHR"] == sector.name]
+    track = sector.add_track(sector_radius, r_pad_ratio=0.05)
+    track.axis(fc="lightgrey", alpha=0.08)
+    chrom_num = sector.name.replace("X", "23").replace("Y", "24")
+    color = colors[0] if int(chrom_num) % 2 == 0 else colors[1]
+    y_col = "logP" if logp else "P"
+    if highlight:
+        sig = assoc_chr[assoc_chr["in_locus"]]
+        bg = assoc_chr[~assoc_chr["in_locus"]]
+        track.scatter(
+            data=bg,
+            x=list(bg["POS"].astype(float)),
+            y=list(bg[y_col].astype(float)),
+            vmin=v_min, vmax=v_max,
+            marker="o", s=6, color=color, alpha=1,
+        )
+        if not sig.empty:
+            track.scatter(
+                sig["POS"].to_numpy(),
+                sig[y_col].to_numpy(),
+                vmin=v_min, vmax=v_max,
+                s=6, marker="o", color="brown",
+            )
+    else:
+        track.scatter(
+            data=assoc_chr,
+            x=list(assoc_chr["POS"].astype(float)),
+            y=list(assoc_chr[y_col].astype(float)),
+            vmin=v_min, vmax=v_max,
+            marker="o", s=6, color=color, alpha=1,
+        )
+    # ------------------------------------------------------------------
+    # Significance lines
+    # ------------------------------------------------------------------
+    if signif_line:
+        track.line(
+            x=[sector.start, sector.end],
+            y=[genome_wide_sig, genome_wide_sig],
+            vmin=v_min, vmax=v_max,
+            color="orangered", linestyle="--",
+        )
+    if suggest_line:
+        track.line(
+            x=[sector.start, sector.end],
+            y=[suggestive, suggestive],
+            vmin=v_min, vmax=v_max,
+            color="lightblue", linestyle="--",
+        )