PyPI - pycmplot - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pycmplot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

pycmplot/__init__.py +43 -0
pycmplot/_core.py +419 -0
pycmplot/annotation.py +368 -0
pycmplot/cli.py +229 -0
pycmplot/constants.py +66 -0
pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz +0 -0
pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz +0 -0
pycmplot/data/hg19ToHg38.over.chain +56506 -0
pycmplot/io.py +342 -0
pycmplot/liftover.py +111 -0
pycmplot/plotting/circular.py +261 -0
pycmplot/plotting/linear.py +375 -0
pycmplot/resources.py +116 -0
pycmplot/stats.py +106 -0
pycmplot-0.1.0.dist-info/METADATA +182 -0
pycmplot-0.1.0.dist-info/RECORD +20 -0
pycmplot-0.1.0.dist-info/WHEEL +5 -0
pycmplot-0.1.0.dist-info/entry_points.txt +2 -0
pycmplot-0.1.0.dist-info/licenses/LICENSE +21 -0
pycmplot-0.1.0.dist-info/top_level.txt +1 -0

pycmplot/annotation.py ADDED Viewed

@@ -0,0 +1,368 @@
+"""
+pycmplot.annotation
+===================
+Nearest-gene annotation and locus summary table generation.
+"""
+from __future__ import annotations
+import bisect
+import logging
+from typing import Optional
+import natsort
+import numpy as np
+import pandas as pd
+from pycmplot.constants import BIOTYPE_WEIGHTS
+from pycmplot.resources import ResourceConfig, default_resources
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Internal: gene dictionary builder
+# ---------------------------------------------------------------------------
+def _build_genes_dict(genes_df: pd.DataFrame) -> dict:
+    """Build a chromosome-keyed interval dict with sorted start positions.
+    Parameters
+    ----------
+    genes_df:
+        DataFrame with columns ``CHR``, ``START``, ``END``, ``STRAND``, ``GENE``.
+    Returns
+    -------
+    dict keyed by chromosome string; each value is
+    ``{"intervals": [...], "starts": [...]}``.
+    """
+    genes_df = genes_df.sort_values(["CHR", "START"])
+    genes_dict: dict = {}
+    for chrom, group in genes_df.groupby("CHR"):
+        intervals = list(
+            zip(
+                group["START"].astype(int),
+                group["END"].astype(int),
+                group["STRAND"],
+                group["GENE"],
+            )
+        )
+        starts = [g[0] for g in intervals]
+        genes_dict[str(chrom)] = {"intervals": intervals, "starts": starts}
+    return genes_dict
+# ---------------------------------------------------------------------------
+# Internal: strand-aware variant annotation
+# ---------------------------------------------------------------------------
+def _annotate_variant(
+    chrom: str,
+    pos: int,
+    genes_dict: dict,
+    window: int = 500_000,
+    promoter_window: int = 2_000,
+) -> dict:
+    """Return strand-aware nearest-gene annotation for a single variant.
+    Returns a dict with keys:
+    ``genic``, ``nearest_upstream_gene``, ``upstream_distance``,
+    ``nearest_downstream_gene``, ``downstream_distance``,
+    ``promoter_upstream_flag``, ``gene_density``.
+    """
+    _empty = {
+        "genic": False,
+        "nearest_upstream_gene": None,
+        "upstream_distance": None,
+        "nearest_downstream_gene": None,
+        "downstream_distance": None,
+        "promoter_upstream_flag": False,
+        "bidirectional_promoter_flag": False,
+        "gene_density": 0,
+    }
+    if chrom not in genes_dict:
+        return _empty
+    chrom_data = genes_dict[chrom]
+    genes = chrom_data["intervals"]
+    starts = chrom_data["starts"]
+    left_bound = pos - window
+    right_bound = pos + window
+    i = bisect.bisect_left(starts, left_bound)
+    gene_density = 0
+    nearest_upstream: Optional[str] = None
+    nearest_downstream: Optional[str] = None
+    min_up_dist = float("inf")
+    min_down_dist = float("inf")
+    promoter_upstream_flag = False
+    while i < len(genes):
+        start, end, strand, gene = genes[i]
+        if start > right_bound:
+            break
+        if end >= left_bound:
+            gene_density += 1
+            if start <= pos <= end:
+                return {
+                    "genic": True,
+                    "nearest_upstream_gene": gene,
+                    "upstream_distance": 0,
+                    "nearest_downstream_gene": None,
+                    "downstream_distance": None,
+                    "promoter_upstream_flag": False,
+                    "gene_density": gene_density,
+                }
+            tss = start if strand == "+" else end
+            distance = abs(pos - tss)
+            if distance <= window:
+                if strand == "+":
+                    is_upstream = pos < tss
+                    in_promoter = (tss - promoter_window) <= pos < tss
+                else:
+                    is_upstream = pos > tss
+                    in_promoter = tss < pos <= (tss + promoter_window)
+                if is_upstream:
+                    if distance < min_up_dist:
+                        min_up_dist = distance
+                        nearest_upstream = gene
+                    if in_promoter:
+                        promoter_upstream_flag = True
+                else:
+                    if distance < min_down_dist:
+                        min_down_dist = distance
+                        nearest_downstream = gene
+        i += 1
+    return {
+        "genic": False,
+        "nearest_upstream_gene": nearest_upstream,
+        "upstream_distance": min_up_dist if nearest_upstream else None,
+        "nearest_downstream_gene": nearest_downstream,
+        "downstream_distance": min_down_dist if nearest_downstream else None,
+        "promoter_upstream_flag": promoter_upstream_flag,
+        "gene_density": gene_density,
+    }
+# ---------------------------------------------------------------------------
+# Internal: prioritisation scorer
+# ---------------------------------------------------------------------------
+def _annotate_and_prioritize_variant(
+    chrom: str,
+    pos: int,
+    genes_df: pd.DataFrame,
+    lead_snps_df: pd.DataFrame,
+    window: int = 500_000,
+    promoter_window: int = 2_000,
+    biotype_weights: Optional[dict] = None,
+) -> Optional[dict]:
+    if biotype_weights is None:
+        biotype_weights = BIOTYPE_WEIGHTS
+    genes_df = genes_df.copy()
+    genes_df["TSS"] = np.where(
+        genes_df["STRAND"] == "+",
+        genes_df["START"],
+        genes_df["END"],
+    )
+    chr_genes = genes_df[genes_df["CHR"] == chrom]
+    if chr_genes.empty:
+        return None
+    candidates = chr_genes[
+        (chr_genes["START"] <= pos + window) & (chr_genes["END"] >= pos - window)
+    ].copy()
+    if candidates.empty:
+        return None
+    gene_density = len(candidates)
+    candidates["distance"] = np.where(
+        (pos >= candidates["START"]) & (pos <= candidates["END"]),
+        0,
+        np.minimum(
+            abs(pos - candidates["START"]),
+            abs(pos - candidates["END"]),
+        ),
+    )
+    candidates["genic"] = (pos >= candidates["START"]) & (pos <= candidates["END"])
+    candidates["promoter_flag"] = (
+        (candidates["STRAND"] == "+")
+        & (pos >= candidates["TSS"] - promoter_window)
+        & (pos <= candidates["TSS"])
+    ) | (
+        (candidates["STRAND"] == "-")
+        & (pos <= candidates["TSS"] + promoter_window)
+        & (pos >= candidates["TSS"])
+    )
+    candidates["distance_score"] = 1 / np.log10(candidates["distance"] + 10)
+    candidates["biotype_weight"] = candidates["BIOTYPE"].map(
+        lambda x: biotype_weights.get(x, 0)
+    )
+    candidates["promoter_bonus"] = candidates["promoter_flag"].astype(int) * 0.5
+    candidates["priority_score"] = (
+        candidates["genic"].astype(int) * 2
+        + candidates["promoter_flag"].astype(int) * 1
+        + candidates["biotype_weight"] * 2 * candidates["distance_score"]
+    )
+    candidates = candidates.sort_values("priority_score", ascending=False)
+    if candidates.empty:
+        return {
+            "top_gene": None, "biotype": None, "priority_score": None,
+            "distance": None, "promoter_flag": None, "distance_score": None,
+            "biotype_weight": None, "promoter_bonus": None, "gene_density": None,
+        }
+    if candidates["genic"].any():
+        top = candidates.iloc[0]
+        return {
+            "top_gene": top["GENE"],
+            "biotype": top["BIOTYPE"],
+            "priority_score": top["priority_score"],
+            "distance": top["distance"],
+            "promoter_flag": top["promoter_flag"],
+            "distance_score": top["distance_score"],
+            "biotype_weight": top["biotype_weight"],
+            "promoter_bonus": top["promoter_bonus"],
+            "gene_density": gene_density,
+        }
+    else:
+        top2 = candidates.head(2)
+        return {
+            "top_gene": "-".join(top2["GENE"]),
+            "biotype": "intergenic",
+            "priority_score": None,
+            "distance": "-".join(map(str, top2["distance"])),
+            "promoter_flag": None,
+            "distance_score": None,
+            "biotype_weight": None,
+            "promoter_bonus": None,
+            "gene_density": None,
+        }
+# ---------------------------------------------------------------------------
+# Internal: clumping
+# ---------------------------------------------------------------------------
+def _clump_by_distance(df: pd.DataFrame, window_kb: int = 500) -> pd.DataFrame:
+    window = window_kb * 1000
+    clumped: list[pd.Series] = []
+    for _chrom, group in df.groupby("CHR"):
+        if "logP" in df.columns:
+            group = group.sort_values("logP", ascending=False)
+        else:
+            group = group.sort_values("P", ascending=True)
+        kept_positions: list[int] = []
+        for _, row in group.iterrows():
+            if all(abs(row["POS"] - p) > window for p in kept_positions):
+                clumped.append(row)
+                kept_positions.append(row["POS"])
+    return pd.DataFrame(clumped).sort_values(
+        ["CHR", "POS"], key=natsort.natsort_keygen()
+    )
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def get_hits_summary_table(
+    leads_df: pd.DataFrame,
+    window_kb: int = 500,
+    table_out: Optional[str] = None,
+    resources: Optional[ResourceConfig] = None,
+) -> pd.DataFrame:
+    """Annotate lead SNPs with nearest genes and write a summary table.
+    Parameters
+    ----------
+    leads_df:
+        DataFrame of lead SNPs (output of :func:`~pycmplot.stats.get_lead_snps`).
+        Must contain columns ``CHR``, ``POS``, ``P``, ``BUILD``.
+    window_kb:
+        Window in kb around each lead SNP to search for genes (default 500 kb).
+    table_out:
+        If provided, write the clumped table to this TSV file path.
+    resources:
+        :class:`~pycmplot.resources.ResourceConfig` instance.
+    Returns
+    -------
+    pd.DataFrame
+        Clumped locus summary table with gene annotations.
+    """
+    if resources is None:
+        resources = default_resources
+    # Choose gene info file based on build
+    if "OLD_POS" not in leads_df.columns and list(set(leads_df["BUILD"])) == ["hg19"]:
+        geneinfo_path = resources.require("geneinfo_hg19")
+    else:
+        geneinfo_path = resources.require("geneinfo_hg38")
+    logger.info("Loading gene info from: %s", geneinfo_path)
+    geneinfo = pd.read_csv(geneinfo_path, header=0, sep="\t")
+    genes_dict = _build_genes_dict(geneinfo)
+    window = window_kb * 1_000
+    records: list[dict] = []
+    logger.info("Annotating lead variants and generating hits summary table ...")
+    for _, row in leads_df.iterrows():
+        annotation = _annotate_variant(
+            chrom=row["CHR"],
+            pos=row["POS"],
+            genes_dict=genes_dict,
+            window=window,
+        )
+        prioritized = _annotate_and_prioritize_variant(
+            chrom=row["CHR"],
+            pos=row["POS"],
+            genes_df=geneinfo,
+            lead_snps_df=leads_df,
+            window=window,
+        )
+        record = {
+            **(row.to_dict()),
+            **(annotation if annotation is not None else {}),
+            **(prioritized if prioritized is not None else {}),
+        }
+        records.append(record)
+    locus_table = pd.DataFrame(records).sort_values(
+        ["CHR", "POS"], key=natsort.natsort_keygen()
+    )
+    if table_out is not None:
+        locus_table.to_csv(table_out, index=False, sep="\t", na_rep="None")
+        logger.info("Locus summary written to: %s", table_out)
+    return _clump_by_distance(locus_table, window_kb=window_kb)

pycmplot/cli.py ADDED Viewed

@@ -0,0 +1,229 @@
+"""
+pycmplot.cli
+============
+Command-line argument definitions.
+"""
+from __future__ import annotations
+import argparse
+from pathlib import Path
+DESCMSG = """
+        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
+        |  PACKAGE FOR CIRCULAR AND LINEAR MANHATTAN PLOTTING  |
+        |                    Kevin Esoh, 2026                  |
+        |                    kesohku1@jh.edu                   |
+        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
+"""
+def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
+    """Parse and return command-line arguments."""
+    parser = argparse.ArgumentParser(
+        prog="pycmplot",
+        description=descmsg,
+        formatter_class=argparse.RawTextHelpFormatter,
+        add_help=False,
+    )
+    req = parser.add_argument_group("Required")
+    opt = parser.add_argument_group("Optional")
+    cio = parser.add_argument_group("Circular Only")
+    lio = parser.add_argument_group("Linear Only")
+    # ------------------------------------------------------------------
+    # Required
+    # ------------------------------------------------------------------
+    req.add_argument(
+        "-s", "--sum_stats",
+        help="Comma-separated list of GWAS summary stats files (e.g. file1.txt.gz,file2.tsv).",
+        required=True, type=str, metavar="str",
+    )
+    req.add_argument(
+        "-l", "--labels",
+        help=(
+            "Comma-separated track labels, same order as --sum_stats.\n"
+            "E.g. HbF,MCV,MCH"
+        ),
+        required=True, type=str, metavar="str",
+    )
+    req.add_argument(
+        "-b",   "--build_column",  required=True, type=str, metavar="str",
+        help="Genome build column name (containing hg18/hg19/hg38)."
+    )
+    # ------------------------------------------------------------------
+    # Optional
+    # ------------------------------------------------------------------
+    opt.add_argument(
+        "-m", "--mode",
+        help="Plot mode: lm (linear Manhattan) or cm (circular Manhattan). Default: lm.",
+        choices=["lm", "cm"], default="lm", type=str,
+    )
+    opt.add_argument(
+        "-chr", "--chrom_column",  type=str, metavar="str",
+        help="Chromosome column name in sumstats (e.g. CHR)."
+    )
+    opt.add_argument(
+        "-pos", "--pos_column",    type=str, metavar="str",
+        help="Position column name (e.g. BP)."
+    )
+    opt.add_argument(
+        "-snp", "--snp_column",    type=str, metavar="str",
+        help="SNP ID column name (e.g. ID)."
+    )
+    opt.add_argument(
+        "-p",   "--pval_column",   type=str, metavar="str",
+        help="P-value column name (e.g. P)."
+    )
+    opt.add_argument(
+        "-d",   "--delim",
+        choices=["space", "tab", "comma", "colon", "semi-colon"],
+        type=str, metavar="str",
+        help="File delimiter (autodetected if omitted)."
+    )
+    opt.add_argument(
+        "--logp", action="store_true",
+        help="Plot −log₁₀(p) instead of raw p-values."
+    )
+    opt.add_argument(
+        "-qq", "--qq_plot", action="store_true",
+        help="Also generate a QQ-plot."
+    )
+    opt.add_argument(
+        "-tp", "--trim_pval", type=float, metavar="float",
+        help="Trim variants with p > this value before plotting."
+    )
+    opt.add_argument(
+        "-sig", "--signif_threshold",
+        default=None, const=5e-8, nargs="?", type=float, metavar="float",
+        help="Genome-wide significance threshold (default: 5e-8)."
+    )
+    opt.add_argument(
+        "-sigl", "--signif_line",
+        default=None, const=5e-8, nargs="?", type=float, metavar="float",
+        help="Value for genome-wide significance line if different from `-sig` (default: 5e-8)."
+    )
+    opt.add_argument(
+        "-sug", "--suggest_threshold",
+        default=None, const=1e-5, nargs="?", type=float, metavar="float",
+        help="Suggestive significance threshold (default: 1e-5)."
+    )
+    opt.add_argument(
+        "-a", "--annotate",
+        choices=["SNP", "GENE"], nargs="?",
+        default="SNP", const="SNP", type=str, #metavar="str",
+        help="Annotate significant loci by SNP ID or nearest gene."
+    )
+    opt.add_argument(
+        "-p_size", "--point_size", default=6, type=float, metavar="float",
+        help="Size of each point of scatter plot (default: 6)."
+    )
+    opt.add_argument(
+        "-a_size", "--annotation_size", default=6, type=float, metavar="float",
+        help="Annotation label font size (default: 6)."
+    )
+    opt.add_argument(
+        "-hl",  "--highlight", action="store_true",
+        help="Highlight significant loci."
+    )
+    opt.add_argument(
+        "-ht", "--highlight_thresh", default=5e-8, type=float, metavar="float",
+        help="P-value threshold for highlighting (default: 5e-8)."
+    )
+    opt.add_argument(
+        "-hl_line", "--highlight_line", action="store_true",
+        help="Draw vertical lines through highlighted positions."
+    )
+    opt.add_argument(
+        "--colors", default="steelblue,silver", type=str, metavar="str",
+        help="Two comma-separated alternating chromosome colours (default: steelblue,silver)."
+    )
+    opt.add_argument(
+        "-st", "--sort_track",
+        choices=["chrom_len", "label"], nargs="?",
+        const="chrom_len", default=None, type=str, #metavar="str",
+        help="Sort tracks by chromosome count or label."
+    )
+    opt.add_argument(
+        "-plt", "--plot_title", default="MyCMplot", type=str, metavar="str",
+        help="Plot plot_title / output file stem."
+    )
+    opt.add_argument(
+        "-pts", "--plot_title_size", default=8, type=float, metavar="float",
+        help="Plot plot_title font size (default: 8)."
+    )
+    opt.add_argument(
+        "-od", "--output_dir", default=".", type=Path, metavar="path",
+        help="Output directory (default: current directory)."
+    )
+    opt.add_argument(
+        "-of", "--output_format",
+        choices=["png", "pdf", "svg", "jpg"],
+        default="png", type=str, metavar="str",
+        help="Output image format (default: png)."
+    )
+    opt.add_argument(
+        "--dpi", default=300, type=int, metavar="int",
+        help="Output resolution in DPI (default: 300)."
+    )
+    opt.add_argument(
+        "-f", "--force", action="store_true",
+        help="Overwrite existing output files."
+    )
+    # circular only
+    cio.add_argument(
+        "--pad", default=1, type=int, metavar="int",
+        help="Space between circular tracks (default: 1)."
+    )
+    cio.add_argument(
+        "-cl_size", "--chrom_label_size",  default=6, type=float, metavar="float",
+        help="Chromosome label font size (default: 6)."
+    )
+    cio.add_argument(
+        "-cl_side", "--chrom_label_side", choices=["inside", "outside"],
+        nargs="?", default="inside", const="inside", type=str,
+        help="Chromosome label placement (default: inside)."
+    )
+    cio.add_argument(
+        "-tl_size", "--track_label_size", default=6, type=float, metavar="float",
+        help="Track label font size (default: 6)."
+    )
+    cio.add_argument(
+        "-tl_orient", "--track_label_orientation",
+        choices=["vertical", "horizontal"], nargs="?",
+        default="vertical", const="vertical", type=str,
+        help="Track label orientation (default: vertical)."
+    )
+    cio.add_argument(
+        "--r_min", default=20, type=int, metavar="int",
+        help="Inner radius proportion (circular mode, default: 20)."
+    )
+    cio.add_argument(
+        "--r_max", default=100, type=int, metavar="int",
+        help="Outer radius (circular mode, default: 100)."
+    )
+    # linear only
+    lio.add_argument(
+        "-th", "--track_heights", type=str, metavar="str",
+        help="Comma-separated relative track heights (e.g. 2,2,1.5)."
+    )
+    lio.add_argument(
+        "-cs","--chr_spacing", default=9e6, type=float, metavar="float",
+        help="Spacing between chromosomes. Useful to reduce chromosome overlap (default: 9e6 or 9000000)."
+    )
+    lio.add_argument(
+        "-t_space", "--track_spacing", default=0.10, type=float, metavar="float",
+        help="Space between linear tracks (default: 0.10)."
+    )
+    opt.add_argument(
+        "-h", "--help", action="help",
+        help="Show this help message and exit."
+    )
+    return parser.parse_args()

pycmplot/constants.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""
+pycmplot.constants
+==================
+Genome-level constants shared across modules.
+"""
+# ---------------------------------------------------------------------------
+# hg38 chromosome lengths (GRCh38)
+# ---------------------------------------------------------------------------
+# Keys carry the "chr" prefix (chr1..chr22, chrX, chrY); values are lengths
+# in base pairs.  There is no mitochondrial entry.
+# NOTE(review): several entries (e.g. chr1, chr2, chr22) do not appear to
+# match the GRCh38 primary-assembly lengths published in UCSC's
+# hg38.chrom.sizes (chr1 is 248,956,422 there) — confirm the source of these
+# values and whether the difference is intentional.
+hg38_chr_lengths: dict[str, int] = {
+    "chr1":  249698942,
+    "chr2":  242508799,
+    "chr3":  198450956,
+    "chr4":  190424264,
+    "chr5":  181630948,
+    "chr6":  170805979,
+    "chr7":  159345973,
+    "chr8":  145138636,
+    "chr9":  138688728,
+    "chr10": 133797422,
+    "chr11": 135186938,
+    "chr12": 133275309,
+    "chr13": 114364328,
+    "chr14": 108136338,
+    "chr15": 102439437,
+    "chr16":  92211104,
+    "chr17":  83836422,
+    "chr18":  80373285,
+    "chr19":  58617616,
+    "chr20":  64444167,
+    "chr21":  46709983,
+    "chr22":  51857516,
+    "chrX":  156040895,
+    "chrY":   57264655,
+}
+# ---------------------------------------------------------------------------
+# Gene biotype weights used for nearest-gene prioritisation
+# ---------------------------------------------------------------------------
+# Maps a gene biotype label to a weight between 0.20 and 1.00; higher weights
+# favour that biotype.  Biotypes not listed here have no entry, so the lookup
+# site decides the fallback.
+# NOTE(review): entries are listed in descending weight except "antisense"
+# (0.30), which sits between 0.60 and 0.55 — confirm the value is intended.
+BIOTYPE_WEIGHTS: dict[str, float] = {
+    "gene":                                   1.00,
+    "protein_coding":                         1.00,
+    "miRNA":                                  0.75,
+    "lncRNA":                                 0.70,
+    "ncRNA":                                  0.70,
+    "lincRNA":                                0.70,
+    "ribozyme":                               0.70,
+    "snRNA":                                  0.65,
+    "snoRNA":                                 0.65,
+    "scaRNA":                                 0.65,
+    "vault_RNA":                              0.60,
+    "antisense":                              0.30,
+    "rRNA":                                   0.55,
+    "processed_transcript":                   0.50,
+    "transcribed_processed_pseudogene":       0.45,
+    "transcribed_unitary_pseudogene":         0.40,
+    "transcribed_unprocessed_pseudogene":     0.35,
+    "processed_pseudogene":                   0.30,
+    "pseudogene":                             0.20,
+    "unprocessed_pseudogene":                 0.20,
+}
+# ---------------------------------------------------------------------------
+# Standard chromosome order (autosomes + sex + MT)
+# ---------------------------------------------------------------------------
+# Bare labels "1".."22", then "X", "Y", "MT" — note there is no "chr" prefix
+# here, unlike the keys of hg38_chr_lengths.
+CHROM_ORDER: list[str] = [str(i) for i in range(1, 23)] + ["X", "Y", "MT"]

pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz ADDED Viewed

Binary file

pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz ADDED Viewed

Binary file