pathview-plus 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,189 @@
1
+ """
2
+ kgml_parser.py
3
+ Parse KEGG KGML (XML) pathway files into Python dataclasses and a tidy
4
+ Polars DataFrame suitable for downstream rendering.
5
+
6
+ Public API
7
+ ----------
8
+ parse_kgml : Path → KGMLPathway
9
+ node_info : KGMLPathway → pl.DataFrame
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+ from typing import Optional
17
+ from xml.etree import ElementTree as ET
18
+
19
+ import polars as pl
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Dataclasses
24
+ # ---------------------------------------------------------------------------
25
+
26
@dataclass
class KGMLNode:
    """A single ``<entry>`` element of a KGML document.

    The first five fields mirror attributes of the entry itself; the
    display fields are filled from its ``<graphics>`` child and keep the
    defaults below when no value is supplied.
    """

    # --- attributes of the <entry> element -------------------------------
    entry_id: str
    name: str
    node_type: str
    link: str
    reaction: str

    # --- display attributes (from the <graphics> child) ------------------
    x: Optional[float] = None
    y: Optional[float] = None
    width: Optional[float] = None
    height: Optional[float] = None
    bgcolor: str = "#FFFFFF"
    label: str = ""
    shape: str = "rectangle"

    # --- ids of <component> children; a fresh list per instance ----------
    component: list[str] = field(default_factory=list)
42
+
43
+
44
@dataclass
class KGMLEdge:
    """A single ``<relation>`` element of a KGML document."""

    # Values of the relation's "entry1" and "entry2" attributes.
    entry1: str
    entry2: str
    # Value of the relation's "type" attribute.
    edge_type: str
    # One (name, value) pair per <subtype> child; a fresh list per instance.
    subtypes: list[tuple[str, str]] = field(default_factory=list)
51
+
52
+
53
@dataclass
class KGMLReaction:
    """A single ``<reaction>`` element of a KGML document."""

    # Value of the reaction's "name" attribute.
    name: str
    # Value of the reaction's "type" attribute:
    # "reversible" | "irreversible".
    rxn_type: str
    # "id" attributes of the <substrate> / <product> children; each
    # instance gets its own fresh lists.
    substrates: list[str] = field(default_factory=list)
    products: list[str] = field(default_factory=list)
60
+
61
+
62
@dataclass
class KGMLPathway:
    """Everything parsed out of one KGML pathway file."""

    # Taken from the root element's "number" and "name" attributes.
    pathway_id: str
    pathway_name: str
    # Parsed <entry> elements, keyed by their entry id.
    nodes: dict[str, KGMLNode] = field(default_factory=dict)
    # Parsed <relation> elements, in document order.
    edges: list[KGMLEdge] = field(default_factory=list)
    # Parsed <reaction> elements, in document order.
    reactions: list[KGMLReaction] = field(default_factory=list)
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Element parsers (private helpers)
74
+ # ---------------------------------------------------------------------------
75
+
76
def _parse_graphics(elem: ET.Element) -> dict:
    """Read the display attributes of a ``<graphics>`` element.

    Returns a dict with the keys ``x``, ``y``, ``width``, ``height``
    (floats) and ``bgcolor``, ``shape``, ``label`` (strings).  Missing
    attributes fall back to 0, 0, 46, 17, "#FFFFFF", "rectangle" and ""
    respectively.  A present but non-numeric geometry attribute makes
    ``float()`` raise ``ValueError``.
    """
    attrs = elem.attrib
    parsed: dict = {}

    # Geometry attributes are converted to float; the fallback is used only
    # when the attribute is absent.
    for key, fallback in (("x", 0), ("y", 0), ("width", 46), ("height", 17)):
        parsed[key] = float(attrs.get(key, fallback))

    # The remaining attributes are kept as strings; note that two of them
    # are stored under a different key than their XML attribute name.
    parsed["bgcolor"] = attrs.get("bgcolor", "#FFFFFF")
    parsed["shape"] = attrs.get("type", "rectangle")
    parsed["label"] = attrs.get("name", "")
    return parsed
88
+
89
+
90
def _parse_entry(elem: ET.Element) -> KGMLNode:
    """Build a KGMLNode from one ``<entry>`` element.

    The ``id`` attribute is required (``KeyError`` when absent), as is the
    ``id`` of every ``<component>`` child.  Display values come from the
    first ``<graphics>`` child; without one, the coordinates and size are
    ``None`` and the label falls back to the entry's ``name`` attribute.
    """
    attrs = elem.attrib
    graphics = elem.find("graphics")
    display = {} if graphics is None else _parse_graphics(graphics)

    entry_id = attrs["id"]
    entry_name = attrs.get("name", "")
    component_ids = [child.attrib["id"] for child in elem.findall("component")]

    return KGMLNode(
        entry_id=entry_id,
        name=entry_name,
        node_type=attrs.get("type", "gene"),
        link=attrs.get("link", ""),
        reaction=attrs.get("reaction", ""),
        x=display.get("x"),
        y=display.get("y"),
        width=display.get("width"),
        height=display.get("height"),
        bgcolor=display.get("bgcolor", "#FFFFFF"),
        # The entry name is used only when there is no <graphics> child;
        # a <graphics> child always supplies a "label" key.
        label=display.get("label", entry_name),
        shape=display.get("shape", "rectangle"),
        component=component_ids,
    )
110
+
111
+
112
def _parse_relation(elem: ET.Element) -> KGMLEdge:
    """Build a KGMLEdge from one ``<relation>`` element.

    ``entry1`` and ``entry2`` are required attributes (``KeyError`` when
    absent); ``type`` defaults to "".  Every ``<subtype>`` child becomes a
    ``(name, value)`` tuple, with "" standing in for a missing attribute.
    """
    attrs = elem.attrib
    subtype_pairs = [
        (sub.attrib.get("name", ""), sub.attrib.get("value", ""))
        for sub in elem.findall("subtype")
    ]
    return KGMLEdge(
        entry1=attrs["entry1"],
        entry2=attrs["entry2"],
        edge_type=attrs.get("type", ""),
        subtypes=subtype_pairs,
    )
123
+
124
+
125
def _parse_reaction(elem: ET.Element) -> KGMLReaction:
    """Build a KGMLReaction from one ``<reaction>`` element.

    ``name`` defaults to "" and ``type`` to "irreversible".  The ``id``
    attribute of every ``<substrate>`` / ``<product>`` child is required
    (``KeyError`` when absent).
    """
    attrs = elem.attrib
    reaction_name = attrs.get("name", "")
    reaction_type = attrs.get("type", "irreversible")
    substrate_ids = [node.attrib["id"] for node in elem.findall("substrate")]
    product_ids = [node.attrib["id"] for node in elem.findall("product")]
    return KGMLReaction(
        name=reaction_name,
        rxn_type=reaction_type,
        substrates=substrate_ids,
        products=product_ids,
    )
133
+
134
+
135
+ # ---------------------------------------------------------------------------
136
+ # Public API
137
+ # ---------------------------------------------------------------------------
138
+
139
def parse_kgml(filepath: str | Path) -> KGMLPathway:
    """
    Read a KEGG KGML file into a KGMLPathway.

    Parameters
    ----------
    filepath: Path to the .xml KGML file.

    Returns a KGMLPathway whose id and name come from the root element's
    ``number`` and ``name`` attributes (each "" when absent).  Only direct
    children of the root tagged ``entry``, ``relation`` or ``reaction`` are
    collected; any other child element is ignored.
    """
    root = ET.parse(filepath).getroot()
    root_attrs = root.attrib
    pathway = KGMLPathway(
        pathway_id=root_attrs.get("number", ""),
        pathway_name=root_attrs.get("name", ""),
    )

    for child in root:
        tag = child.tag
        if tag == "entry":
            node = _parse_entry(child)
            # A later entry with the same id replaces the earlier one.
            pathway.nodes[node.entry_id] = node
        elif tag == "relation":
            pathway.edges.append(_parse_relation(child))
        elif tag == "reaction":
            pathway.reactions.append(_parse_reaction(child))

    return pathway
162
+
163
+
164
def node_info(pathway: KGMLPathway) -> pl.DataFrame:
    """
    Flatten KGMLPathway nodes into a tidy Polars DataFrame.

    Columns: entry_id, name, type, x, y, width, height, bgcolor,
    label, shape, reaction, component, size.

    ``component`` is the node's component ids joined with ";" and ``size``
    is the number of components, with a minimum of 1.

    The frame is built with an explicit schema so that the columns (and
    their dtypes) are always present: a pathway without nodes yields an
    empty frame that still has all thirteen columns, and coordinate columns
    that contain only ``None`` are Float64 rather than an inferred Null
    dtype.
    """
    # Fixed column order and dtypes; relying on inference would produce a
    # column-less frame for an empty pathway.
    schema = {
        "entry_id": pl.String,
        "name": pl.String,
        "type": pl.String,
        "x": pl.Float64,
        "y": pl.Float64,
        "width": pl.Float64,
        "height": pl.Float64,
        "bgcolor": pl.String,
        "label": pl.String,
        "shape": pl.String,
        "reaction": pl.String,
        "component": pl.String,
        "size": pl.Int64,
    }
    records = [
        {
            "entry_id": n.entry_id,
            "name": n.name,
            "type": n.node_type,
            "x": n.x,
            "y": n.y,
            "width": n.width,
            "height": n.height,
            "bgcolor": n.bgcolor,
            "label": n.label,
            "shape": n.shape,
            "reaction": n.reaction,
            "component": ";".join(n.component),
            "size": max(1, len(n.component)),
        }
        for n in pathway.nodes.values()
    ]
    return pl.DataFrame(records, schema=schema)
pathview/mol_data.py ADDED
@@ -0,0 +1,168 @@
1
+ """
2
+ mol_data.py
3
+ Molecular data handling:
4
+ - mol_sum : aggregate multi-probe data to target IDs (Polars-based)
5
+ - sim_mol_data : generate simulated expression / abundance data for testing
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import warnings
11
+ from typing import Callable, Optional
12
+
13
+ import numpy as np
14
+ import polars as pl
15
+ import requests
16
+
17
+ from .constants import KEGG_BASE, SumMethod
18
+ from .utils import max_abs, random_pick
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Aggregation dispatch
23
+ # ---------------------------------------------------------------------------
24
+
25
def _make_agg_expr(col: str, method: SumMethod):
    """Build the Polars aggregation expression for column *col*.

    *method* selects the aggregation: "sum", "mean", "median" and "max"
    use the built-in Polars reductions, while "max_abs" and "random" hand
    the values to the ``max_abs`` / ``random_pick`` helpers (as a NumPy
    array) through ``map_elements``.  Any other value raises ValueError.
    """
    if method == "sum":
        return pl.col(col).sum()
    if method == "mean":
        return pl.col(col).mean()
    if method == "median":
        return pl.col(col).median()
    if method == "max":
        return pl.col(col).max()
    if method == "max_abs":
        return pl.col(col).map_elements(
            lambda s: max_abs(s.to_numpy()), return_dtype=pl.Float64
        )
    if method == "random":
        return pl.col(col).map_elements(
            lambda s: random_pick(s.to_numpy()), return_dtype=pl.Float64
        )
    raise ValueError(
        f"Unknown sum_method '{method}'. "
        "Choose from: sum, mean, median, max, max_abs, random."
    )
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # mol_sum
49
+ # ---------------------------------------------------------------------------
50
+
51
def mol_sum(
    mol_data: pl.DataFrame,
    id_map: pl.DataFrame,
    sum_method: SumMethod = "sum",
) -> pl.DataFrame:
    """
    Aggregate *mol_data* from source IDs to target IDs defined by *id_map*.

    Parameters
    ----------
    mol_data: DataFrame whose **first column** contains source IDs; all
        remaining columns are treated as numeric expression values.
    id_map: DataFrame whose first two columns are [source_id, target_id].
        Any further columns are ignored, and repeated
        (source, target) pairs are counted once.
    sum_method: How to combine multiple source rows mapping to one target.

    Returns a DataFrame keyed by target IDs (stored under the name of
    *mol_data*'s first column) with the same numeric columns as *mol_data*.
    Raises ValueError when no IDs can be mapped.
    """
    id_col = mol_data.columns[0]
    src_col, tgt_col = id_map.columns[:2]

    # TODO: Temporary fix, check earlier steps to prevent the need to do this
    mol_data = mol_data.cast({id_col: pl.String})

    # Reduce the map to exactly the two columns needed for the join:
    #   * extra id_map columns must not end up among the aggregated values;
    #   * the key is cast to String to match the cast applied to mol_data;
    #   * duplicate pairs would otherwise duplicate rows and inflate
    #     aggregates such as "sum".
    mapping = (
        id_map
        .select(
            pl.col(src_col).cast(pl.String).alias(id_col),
            pl.col(tgt_col).alias("__target"),
        )
        .unique()
    )

    merged = mol_data.join(mapping, on=id_col, how="inner")
    if merged.is_empty():
        raise ValueError(
            f"No IDs from '{id_col}' could be mapped using the provided id_map."
        )

    # Count unmapped input rows with an anti-join.  Comparing row counts of
    # the inner join is wrong as soon as one source ID maps to several
    # targets, because the join then yields more rows than the input.
    n_unmapped = mol_data.join(mapping, on=id_col, how="anti").height
    if n_unmapped > 0:
        print(f"Note: {n_unmapped} of {mol_data.height} input IDs unmapped.")

    numeric_cols = [c for c in merged.columns if c not in (id_col, "__target")]
    aggregated = (
        merged
        .drop(id_col)
        .group_by("__target")
        .agg([_make_agg_expr(c, sum_method).alias(c) for c in numeric_cols])
        # The target IDs take over the name of the original ID column.
        .rename({"__target": id_col})
    )
    return aggregated
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # sim_mol_data
101
+ # ---------------------------------------------------------------------------
102
+
103
def sim_mol_data(
    mol_type: str = "gene",
    species: str = "hsa",
    n_mol: int = 100,
    n_exp: int = 1,
    rand_seed: int = 100,
    discrete: bool = False,
) -> pl.DataFrame:
    """
    Generate simulated molecular abundance data for testing and demos.

    Parameters
    ----------
    mol_type: "gene" (IDs obtained via ``_fetch_kegg_gene_ids``) or "cpd"
        (generated IDs "C00001" … "C05000").
    species: KEGG species code used when *mol_type* is "gene".
    n_mol: Number of molecules to sample (without replacement).  When more
        are requested than are available, a warning is issued and all
        available IDs are used.
    n_exp: Number of simulated experiment columns.
    rand_seed: NumPy RNG seed for reproducibility.
    discrete: When True, return only the sampled IDs (no numeric values).

    Returns a DataFrame with an 'id' column and *n_exp* numeric columns named
    'exp1', 'exp2', … (or just 'id' when *discrete* is True).  The numeric
    values are draws from a standard normal distribution.

    Raises ValueError when *mol_type* is neither "gene" nor "cpd".
    """
    rng = np.random.default_rng(rand_seed)

    if mol_type == "gene":
        ids = _fetch_kegg_gene_ids(species)
    elif mol_type == "cpd":
        ids = [f"C{i:05d}" for i in range(1, 5001)]
    else:
        raise ValueError(f"mol_type must be 'gene' or 'cpd', got '{mol_type}'.")

    n_available = len(ids)
    if n_mol > n_available:
        warnings.warn(
            f"Requested {n_mol} molecules but only {n_available} available; "
            "using all available IDs.",
            stacklevel=2,  # attribute the warning to the caller
        )
        n_mol = n_available

    # ``.tolist()`` converts the sampled NumPy array into native Python
    # ``str`` objects, so the 'id' column does not depend on how Polars
    # treats NumPy scalar strings.  The sampled values are unchanged.
    sampled = rng.choice(ids, size=n_mol, replace=False).tolist()

    if discrete:
        return pl.DataFrame({"id": sampled})

    data: dict[str, list] = {"id": sampled}
    for i in range(1, n_exp + 1):
        data[f"exp{i}"] = rng.standard_normal(n_mol).tolist()

    return pl.DataFrame(data)
153
+
154
+
155
def _fetch_kegg_gene_ids(species: str) -> list[str]:
    """Fetch all gene IDs for *species* from KEGG; fall back to dummy IDs.

    Requests ``{KEGG_BASE}/list/{species}`` (30 s timeout) and, for every
    tab-separated line, takes the part of the first field that follows the
    ":" (e.g. "hsa:1234" → "1234").  Lines without a tab or without a ":"
    in the first field are skipped individually instead of discarding the
    whole listing.

    When the request fails, or when no ID can be extracted from the
    response, a warning is issued and the dummy IDs "gene1" … "gene1000"
    are returned.
    """
    url = f"{KEGG_BASE}/list/{species}"
    dummy_ids = [f"gene{i}" for i in range(1, 1001)]

    # Only the network call is guarded; parsing below cannot raise.
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as exc:
        warnings.warn(
            f"Failed to fetch KEGG gene list for '{species}': {exc}. Using dummy IDs.",
            stacklevel=2,
        )
        return dummy_ids

    gene_ids: list[str] = []
    for line in resp.text.strip().splitlines():
        if "\t" not in line:
            continue
        parts = line.split("\t")[0].split(":")
        if len(parts) < 2:
            # No "<prefix>:<id>" structure in the first field; skip the line.
            continue
        gene_ids.append(parts[1])

    if not gene_ids:
        warnings.warn(
            f"Failed to fetch KEGG gene list for '{species}': "
            "no gene IDs found in response. Using dummy IDs.",
            stacklevel=2,
        )
        return dummy_ids
    return gene_ids
@@ -0,0 +1,99 @@
1
+ """
2
+ node_mapping.py
3
+ Map molecular expression / abundance data onto KEGG pathway nodes:
4
+ - node_map : join mol_data to node_data via KEGG gene/compound IDs
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Optional
10
+
11
+ import polars as pl
12
+
13
+ from .constants import SumMethod
14
+ from .mol_data import mol_sum
15
+ from .utils import wordwrap
16
+
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Node mapping
20
+ # ---------------------------------------------------------------------------
21
+
22
def node_map(
    mol_data: Optional[pl.DataFrame],
    node_data: pl.DataFrame,
    node_types: str | list[str] = "gene",
    node_sum: SumMethod = "sum",
    entrez_gnodes: bool = True,
) -> Optional[pl.DataFrame]:
    """
    Map *mol_data* onto pathway nodes of the specified *node_types*.

    Parameters
    ----------
    mol_data: DataFrame whose first column contains molecule IDs and
        remaining columns contain numeric values. Pass None to
        produce a position-only result with NaN values.
    node_data: Tidy node DataFrame produced by kgml_parser.node_info().
    node_types: Node type string(s) to include (e.g. "gene", "compound").
    node_sum: Aggregation method when multiple probes map to one node.
    entrez_gnodes: True when gene nodes use Entrez IDs (vs KEGG gene IDs).
        Currently not consulted by this function.

    Returns
    -------
    * None when no nodes of the requested type exist, or when ``mol_sum``
      raises ValueError (e.g. none of the molecule IDs match a node).
    * When *mol_data* is None: one row per node with the columns
      entry_id, kegg_names, x, y, width, height, label, type, size and a
      NaN ``mol_val`` column, in order of first appearance.
    * Otherwise: the selected rows of *node_data* left-joined with the
      per-node aggregated values, so nodes without data carry nulls.
    """
    if isinstance(node_types, str):
        node_types = [node_types]

    target_nodes = node_data.filter(pl.col("type").is_in(node_types))
    if target_nodes.is_empty():
        return None

    # Expand the space-separated "name" field into individual KEGG IDs
    exploded = (
        target_nodes
        .with_columns(pl.col("name").str.split(" ").alias("kegg_names"))
        .explode("kegg_names")
        .with_columns(
            # Strip species prefix (e.g. "hsa:1234" → "1234")
            pl.col("kegg_names").str.replace(r"^[a-z]+:", "", literal=False)
        )
    )

    if mol_data is None:
        # Return node layout only, with a placeholder NaN value column.
        # maintain_order keeps the row order deterministic across runs.
        return (
            exploded
            .group_by("entry_id", maintain_order=True)
            .agg([
                pl.col("kegg_names").first(),
                pl.col("x").first(),
                pl.col("y").first(),
                pl.col("width").first(),
                pl.col("height").first(),
                pl.col("label").first(),
                pl.col("type").first(),
                pl.col("size").first(),
            ])
            .with_columns(pl.lit(float("nan")).alias("mol_val"))
        )

    id_col = mol_data.columns[0]
    # Two-column [molecule ID, node entry_id] map for mol_sum.  unique()
    # keeps an ID that is listed twice for the same node from being
    # counted twice in the aggregation.
    id_map = (
        exploded
        .select(
            pl.col("kegg_names").alias(id_col),
            pl.col("entry_id").alias("__target"),
        )
        .unique()
    )

    try:
        summed = mol_sum(mol_data, id_map, sum_method=node_sum)
    except ValueError:
        # NOTE(review): besides "no IDs could be mapped", this presumably
        # also hides the ValueError raised for an unknown node_sum value;
        # confirm whether that case should propagate instead.
        return None

    # Re-join aggregated values back to the full node layout
    plot_data = target_nodes.join(
        summed.rename({id_col: "entry_id"}),
        on="entry_id",
        how="left",
    )
    return plot_data