PyPI - pathview-plus - Versions diffs - 2.0.0__py3-none-any.whl - Mend

pathview-plus 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

pathview/__init__.py +124 -0
pathview/color_mapping.py +153 -0
pathview/constants.py +27 -0
pathview/databases.py +309 -0
pathview/examples.py +342 -0
pathview/highlighting.py +375 -0
pathview/id_mapping.py +170 -0
pathview/kegg_api.py +143 -0
pathview/kgml_parser.py +189 -0
pathview/mol_data.py +168 -0
pathview/node_mapping.py +99 -0
pathview/pathview.py +316 -0
pathview/rendering.py +409 -0
pathview/sbgn_parser.py +353 -0
pathview/splines.py +304 -0
pathview/svg_rendering.py +305 -0
pathview/test_all_features.py +343 -0
pathview/utils.py +80 -0
pathview_plus-2.0.0.data/scripts/pathview-cli.py +252 -0
pathview_plus-2.0.0.dist-info/METADATA +661 -0
pathview_plus-2.0.0.dist-info/RECORD +23 -0
pathview_plus-2.0.0.dist-info/WHEEL +5 -0
pathview_plus-2.0.0.dist-info/top_level.txt +1 -0

pathview/kgml_parser.py ADDED Viewed

@@ -0,0 +1,189 @@
+"""
+kgml_parser.py
+Parse KEGG KGML (XML) pathway files into Python dataclasses and a tidy
+Polars DataFrame suitable for downstream rendering.
+Public API
+----------
+  parse_kgml  : Path → KGMLPathway
+  node_info   : KGMLPathway → pl.DataFrame
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+from xml.etree import ElementTree as ET
+import polars as pl
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+@dataclass
+class KGMLNode:
+    """One <entry> element from a KGML file."""
+    entry_id:  str
+    name:      str
+    node_type: str
+    link:      str
+    reaction:  str
+    x:         Optional[float] = None
+    y:         Optional[float] = None
+    width:     Optional[float] = None
+    height:    Optional[float] = None
+    bgcolor:   str = "#FFFFFF"
+    label:     str = ""
+    shape:     str = "rectangle"
+    component: list[str] = field(default_factory=list)
+@dataclass
+class KGMLEdge:
+    """One <relation> element from a KGML file."""
+    entry1:    str
+    entry2:    str
+    edge_type: str
+    subtypes:  list[tuple[str, str]] = field(default_factory=list)
+@dataclass
+class KGMLReaction:
+    """One <reaction> element from a KGML file."""
+    name:       str
+    rxn_type:   str                     # "reversible" | "irreversible"
+    substrates: list[str] = field(default_factory=list)
+    products:   list[str] = field(default_factory=list)
+@dataclass
+class KGMLPathway:
+    """Container for all parsed elements of a KGML pathway file."""
+    pathway_id:   str
+    pathway_name: str
+    nodes:        dict[str, KGMLNode]  = field(default_factory=dict)
+    edges:        list[KGMLEdge]       = field(default_factory=list)
+    reactions:    list[KGMLReaction]   = field(default_factory=list)
+# ---------------------------------------------------------------------------
+# Element parsers  (private helpers)
+# ---------------------------------------------------------------------------
+def _parse_graphics(elem: ET.Element) -> dict:
+    """Extract display attributes from a <graphics> child element."""
+    a = elem.attrib
+    return {
+        "x":      float(a.get("x", 0)),
+        "y":      float(a.get("y", 0)),
+        "width":  float(a.get("width", 46)),
+        "height": float(a.get("height", 17)),
+        "bgcolor": a.get("bgcolor", "#FFFFFF"),
+        "shape":   a.get("type", "rectangle"),
+        "label":   a.get("name", ""),
+    }
+def _parse_entry(elem: ET.Element) -> KGMLNode:
+    """Parse a single <entry> element."""
+    gfx_elem = elem.find("graphics")
+    gfx = _parse_graphics(gfx_elem) if gfx_elem is not None else {}
+    return KGMLNode(
+        entry_id  = elem.attrib["id"],
+        name      = elem.attrib.get("name", ""),
+        node_type = elem.attrib.get("type", "gene"),
+        link      = elem.attrib.get("link", ""),
+        reaction  = elem.attrib.get("reaction", ""),
+        x       = gfx.get("x"),
+        y       = gfx.get("y"),
+        width   = gfx.get("width"),
+        height  = gfx.get("height"),
+        bgcolor = gfx.get("bgcolor", "#FFFFFF"),
+        label   = gfx.get("label", elem.attrib.get("name", "")),
+        shape   = gfx.get("shape", "rectangle"),
+        component = [c.attrib["id"] for c in elem.findall("component")],
+    )
+def _parse_relation(elem: ET.Element) -> KGMLEdge:
+    """Parse a single <relation> element."""
+    return KGMLEdge(
+        entry1    = elem.attrib["entry1"],
+        entry2    = elem.attrib["entry2"],
+        edge_type = elem.attrib.get("type", ""),
+        subtypes  = [
+            (s.attrib.get("name", ""), s.attrib.get("value", ""))
+            for s in elem.findall("subtype")
+        ],
+    )
+def _parse_reaction(elem: ET.Element) -> KGMLReaction:
+    """Parse a single <reaction> element."""
+    return KGMLReaction(
+        name       = elem.attrib.get("name", ""),
+        rxn_type   = elem.attrib.get("type", "irreversible"),
+        substrates = [s.attrib["id"] for s in elem.findall("substrate")],
+        products   = [p.attrib["id"] for p in elem.findall("product")],
+    )
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def parse_kgml(filepath: str | Path) -> KGMLPathway:
+    """
+    Parse a KEGG KGML file and return a populated KGMLPathway.
+    Parameters
+    ----------
+    filepath: Path to the .xml KGML file.
+    """
+    root = ET.parse(filepath).getroot()
+    pathway = KGMLPathway(
+        pathway_id   = root.attrib.get("number", ""),
+        pathway_name = root.attrib.get("name", ""),
+    )
+    _dispatch = {
+        "entry":    lambda e: pathway.nodes.update({(n := _parse_entry(e)).entry_id: n}),
+        "relation": lambda e: pathway.edges.append(_parse_relation(e)),
+        "reaction": lambda e: pathway.reactions.append(_parse_reaction(e)),
+    }
+    for child in root:
+        if child.tag in _dispatch:
+            _dispatch[child.tag](child)
+    return pathway
+def node_info(pathway: KGMLPathway) -> pl.DataFrame:
+    """
+    Flatten KGMLPathway nodes into a tidy Polars DataFrame.
+    Columns: entry_id, name, type, x, y, width, height, bgcolor,
+             label, shape, reaction, component, size.
+    """
+    records = [
+        {
+            "entry_id":  n.entry_id,
+            "name":      n.name,
+            "type":      n.node_type,
+            "x":         n.x,
+            "y":         n.y,
+            "width":     n.width,
+            "height":    n.height,
+            "bgcolor":   n.bgcolor,
+            "label":     n.label,
+            "shape":     n.shape,
+            "reaction":  n.reaction,
+            "component": ";".join(n.component),
+            "size":      max(1, len(n.component)),
+        }
+        for n in pathway.nodes.values()
+    ]
+    return pl.DataFrame(records)

pathview/mol_data.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""
+mol_data.py
+Molecular data handling:
+  - mol_sum      : aggregate multi-probe data to target IDs (Polars-based)
+  - sim_mol_data : generate simulated expression / abundance data for testing
+"""
+from __future__ import annotations
+import warnings
+from typing import Callable, Optional
+import numpy as np
+import polars as pl
+import requests
+from .constants import KEGG_BASE, SumMethod
+from .utils import max_abs, random_pick
+# ---------------------------------------------------------------------------
+# Aggregation dispatch
+# ---------------------------------------------------------------------------
+def _make_agg_expr(col: str, method: SumMethod):
+    """Return a Polars aggregation expression for a single column."""
+    match method:
+        case "sum":    return pl.col(col).sum()
+        case "mean":   return pl.col(col).mean()
+        case "median": return pl.col(col).median()
+        case "max":    return pl.col(col).max()
+        case "max_abs":
+            return pl.col(col).map_elements(
+                lambda s: max_abs(s.to_numpy()), return_dtype=pl.Float64
+            )
+        case "random":
+            return pl.col(col).map_elements(
+                lambda s: random_pick(s.to_numpy()), return_dtype=pl.Float64
+            )
+        case _:
+            raise ValueError(
+                f"Unknown sum_method '{method}'. "
+                "Choose from: sum, mean, median, max, max_abs, random."
+            )
+# ---------------------------------------------------------------------------
+# mol_sum
+# ---------------------------------------------------------------------------
+def mol_sum(
+    mol_data: pl.DataFrame,
+    id_map: pl.DataFrame,
+    sum_method: SumMethod = "sum",
+) -> pl.DataFrame:
+    """
+    Aggregate *mol_data* from source IDs to target IDs defined by *id_map*.
+    Parameters
+    ----------
+    mol_data:   DataFrame whose **first column** contains source IDs; all
+                remaining columns are treated as numeric expression values.
+    id_map:     Two-column DataFrame [source_id, target_id].
+    sum_method: How to combine multiple source rows mapping to one target.
+    Returns a DataFrame keyed by target IDs with the same numeric columns
+    as *mol_data*.  Raises ValueError when no IDs can be mapped.
+    """
+    id_col     = mol_data.columns[0]
+    src_col, tgt_col = id_map.columns[:2]
+    #TODO: Temporary fix, check earlier steps to prevent the need to do this
+    mol_data = mol_data.cast({id_col: pl.String})
+    # Rename id_map columns to neutral names for the join
+    mapping = id_map.rename({src_col: id_col, tgt_col: "__target"})
+    merged = mol_data.join(mapping, on=id_col, how="inner")
+    if merged.is_empty():
+        raise ValueError(
+            f"No IDs from '{id_col}' could be mapped using the provided id_map."
+        )
+    n_unmapped = mol_data.height - merged.height
+    if n_unmapped > 0:
+        print(f"Note: {n_unmapped} of {mol_data.height} input IDs unmapped.")
+    numeric_cols = [c for c in merged.columns if c not in (id_col, "__target")]
+    aggregated = (
+        merged
+        .drop(id_col)
+        .group_by("__target")
+        .agg([_make_agg_expr(c, sum_method).alias(c) for c in numeric_cols])
+        .rename({"__target": id_col})
+    )
+    return aggregated
+# ---------------------------------------------------------------------------
+# sim_mol_data
+# ---------------------------------------------------------------------------
+def sim_mol_data(
+    mol_type: str = "gene",
+    species: str = "hsa",
+    n_mol: int = 100,
+    n_exp: int = 1,
+    rand_seed: int = 100,
+    discrete: bool = False,
+) -> pl.DataFrame:
+    """
+    Generate simulated molecular abundance data for testing and demos.
+    Parameters
+    ----------
+    mol_type:  "gene" (fetches real KEGG gene IDs) or "cpd" (fake KEGG IDs).
+    species:   KEGG species code used when *mol_type* is "gene".
+    n_mol:     Number of molecules to sample.
+    n_exp:     Number of simulated experiment columns.
+    rand_seed: NumPy RNG seed for reproducibility.
+    discrete:  When True, return only the sampled IDs (no numeric values).
+    Returns a DataFrame with an 'id' column and *n_exp* numeric columns named
+    'exp1', 'exp2', … (or just 'id' when *discrete* is True).
+    """
+    rng = np.random.default_rng(rand_seed)
+    if mol_type == "gene":
+        ids = _fetch_kegg_gene_ids(species)
+    elif mol_type == "cpd":
+        ids = [f"C{i:05d}" for i in range(1, 5001)]
+    else:
+        raise ValueError(f"mol_type must be 'gene' or 'cpd', got '{mol_type}'.")
+    n_available = len(ids)
+    if n_mol > n_available:
+        warnings.warn(
+            f"Requested {n_mol} molecules but only {n_available} available; "
+            "using all available IDs."
+        )
+        n_mol = n_available
+    sampled = list(rng.choice(ids, size=n_mol, replace=False))
+    if discrete:
+        return pl.DataFrame({"id": sampled})
+    data: dict[str, list] = {"id": sampled}
+    for i in range(1, n_exp + 1):
+        data[f"exp{i}"] = rng.standard_normal(n_mol).tolist()
+    return pl.DataFrame(data)
+def _fetch_kegg_gene_ids(species: str) -> list[str]:
+    """Fetch all gene IDs for *species* from KEGG; fall back to dummy IDs."""
+    url = f"{KEGG_BASE}/list/{species}"
+    try:
+        resp = requests.get(url, timeout=30)
+        resp.raise_for_status()
+        return [
+            line.split("\t")[0].split(":")[1]
+            for line in resp.text.strip().splitlines()
+            if "\t" in line
+        ]
+    except Exception as exc:
+        warnings.warn(f"Failed to fetch KEGG gene list for '{species}': {exc}. Using dummy IDs.")
+        return [f"gene{i}" for i in range(1, 1001)]

pathview/node_mapping.py ADDED Viewed

@@ -0,0 +1,99 @@
+"""
+node_mapping.py
+Map molecular expression / abundance data onto KEGG pathway nodes:
+  - node_map : join mol_data to node_data via KEGG gene/compound IDs
+"""
+from __future__ import annotations
+from typing import Optional
+import polars as pl
+from .constants import SumMethod
+from .mol_data import mol_sum
+from .utils import wordwrap
+# ---------------------------------------------------------------------------
+# Node mapping
+# ---------------------------------------------------------------------------
+def node_map(
+    mol_data: Optional[pl.DataFrame],
+    node_data: pl.DataFrame,
+    node_types: str | list[str] = "gene",
+    node_sum: SumMethod = "sum",
+    entrez_gnodes: bool = True,
+) -> Optional[pl.DataFrame]:
+    """
+    Map *mol_data* onto pathway nodes of the specified *node_types*.
+    Parameters
+    ----------
+    mol_data:      DataFrame whose first column contains molecule IDs and
+                   remaining columns contain numeric values.  Pass None to
+                   produce a position-only result with NaN values.
+    node_data:     Tidy node DataFrame produced by kgml_parser.node_info().
+    node_types:    Node type string(s) to include (e.g. "gene", "compound").
+    node_sum:      Aggregation method when multiple probes map to one node.
+    entrez_gnodes: True when gene nodes use Entrez IDs (vs KEGG gene IDs).
+    Returns a merged DataFrame of node positions and molecular values, or
+    None when no nodes of the requested type exist.
+    """
+    if isinstance(node_types, str):
+        node_types = [node_types]
+    target_nodes = node_data.filter(pl.col("type").is_in(node_types))
+    if target_nodes.is_empty():
+        return None
+    # Expand the space-separated "name" field into individual KEGG IDs
+    exploded = (
+        target_nodes
+        .with_columns(pl.col("name").str.split(" ").alias("kegg_names"))
+        .explode("kegg_names")
+        .with_columns(
+            # Strip species prefix (e.g. "hsa:1234" → "1234")
+            pl.col("kegg_names").str.replace(r"^[a-z]+:", "", literal=False)
+        )
+    )
+    if mol_data is None:
+        # Return node layout only, with a placeholder NaN value column
+        return (
+            exploded
+            .group_by("entry_id")
+            .agg([
+                pl.col("kegg_names").first(),
+                pl.col("x").first(),
+                pl.col("y").first(),
+                pl.col("width").first(),
+                pl.col("height").first(),
+                pl.col("label").first(),
+                pl.col("type").first(),
+                pl.col("size").first(),
+            ])
+            .with_columns(pl.lit(float("nan")).alias("mol_val"))
+        )
+    id_col = mol_data.columns[0]
+    id_map = (
+        exploded
+        .select(["kegg_names", "entry_id"])
+        .rename({"kegg_names": id_col, "entry_id": "__target"})
+    )
+    try:
+        summed = mol_sum(mol_data, id_map.rename({"__target": "target_id"}).rename({"target_id": "__target"}), sum_method=node_sum)
+    except ValueError:
+        return None
+    # Re-join aggregated values back to the full node layout
+    plot_data = target_nodes.join(
+        summed.rename({id_col: "entry_id"}),
+        on="entry_id",
+        how="left",
+    )
+    return plot_data