pathview-plus 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pathview/__init__.py +124 -0
- pathview/color_mapping.py +153 -0
- pathview/constants.py +27 -0
- pathview/databases.py +309 -0
- pathview/examples.py +342 -0
- pathview/highlighting.py +375 -0
- pathview/id_mapping.py +170 -0
- pathview/kegg_api.py +143 -0
- pathview/kgml_parser.py +189 -0
- pathview/mol_data.py +168 -0
- pathview/node_mapping.py +99 -0
- pathview/pathview.py +316 -0
- pathview/rendering.py +409 -0
- pathview/sbgn_parser.py +353 -0
- pathview/splines.py +304 -0
- pathview/svg_rendering.py +305 -0
- pathview/test_all_features.py +343 -0
- pathview/utils.py +80 -0
- pathview_plus-2.0.0.data/scripts/pathview-cli.py +252 -0
- pathview_plus-2.0.0.dist-info/METADATA +661 -0
- pathview_plus-2.0.0.dist-info/RECORD +23 -0
- pathview_plus-2.0.0.dist-info/WHEEL +5 -0
- pathview_plus-2.0.0.dist-info/top_level.txt +1 -0
pathview/pathview.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pathview.py (module inside the pathview package)
|
|
3
|
+
Core orchestrator: resolves IDs, downloads KEGG files, maps data to nodes,
|
|
4
|
+
and dispatches to the appropriate renderer.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import warnings
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
import polars as pl
|
|
14
|
+
|
|
15
|
+
from .color_mapping import node_color
|
|
16
|
+
from .constants import SumMethod, VALID_NODE_TYPES, NODE_META_COLS
|
|
17
|
+
from .id_mapping import cpd_id_map, eg2id, id2eg
|
|
18
|
+
from .kegg_api import download_kegg, kegg_species_code
|
|
19
|
+
from .kgml_parser import node_info, parse_kgml
|
|
20
|
+
from .mol_data import mol_sum
|
|
21
|
+
from .node_mapping import node_map
|
|
22
|
+
from .rendering import keggview_graph, keggview_native
|
|
23
|
+
from .svg_rendering import keggview_svg
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
# Defaults factory (avoids mutable default arguments)
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
def _defaults() -> dict:
    """
    Build a fresh set of colour-scale defaults.

    Every call returns newly created dictionaries, so callers may mutate the
    result without affecting later calls.

    Returns
    -------
    dict mapping each colour-scale parameter name to a ``{"gene": …, "cpd": …}``
    dictionary holding the default for gene nodes and compound nodes.
    """
    # parameter name -> (default for gene nodes, default for compound nodes)
    per_type = {
        "limit": (1.0, 1.0),
        "bins": (10, 10),
        "both_dirs": (True, True),
        "discrete": (False, False),
        "low": ("green", "blue"),
        "mid": ("gray", "gray"),
        "high": ("red", "yellow"),
        "trans_fun": (None, None),
    }
    return {
        name: {"gene": gene_default, "cpd": cpd_default}
        for name, (gene_default, cpd_default) in per_type.items()
    }
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
# pathview
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
def pathview(
    pathway_id: str,
    gene_data: Optional[pl.DataFrame] = None,
    cpd_data: Optional[pl.DataFrame] = None,
    species: str = "hsa",
    kegg_dir: str | Path = ".",
    kegg_native: bool = True,
    output_format: str = "png",  # NEW: png, pdf, or svg
    gene_idtype: str = "ENTREZ",
    cpd_idtype: str = "KEGG",
    out_suffix: str = "pathview",
    node_sum: SumMethod = "sum",
    map_symbol: bool = True,
    map_null: bool = True,
    min_nnodes: int = 3,
    new_signature: bool = True,
    plot_col_key: bool = True,
    # Colour-scale parameters (all accept {"gene": …, "cpd": …} dicts)
    limit: dict | None = None,
    bins: dict | None = None,
    both_dirs: dict | None = None,
    discrete: dict | None = None,
    low: dict | None = None,
    mid: dict | None = None,
    high: dict | None = None,
    na_col: str = "transparent",
    trans_fun: dict | None = None,
    **kwargs,
) -> dict:
    """
    Overlay molecular data onto a KEGG pathway diagram.

    Parameters
    ----------
    pathway_id:    KEGG pathway number, e.g. ``"04110"`` or ``"hsa04110"``.
    gene_data:     DataFrame — first column = gene IDs, rest = numeric values.
    cpd_data:      DataFrame — first column = compound IDs, rest = numeric.
    species:       KEGG species code (default ``"hsa"``).
    kegg_dir:      Working directory for downloaded and output files.
    kegg_native:   True → overlay on KEGG PNG; False → NetworkX graph layout.
    output_format: ``"svg"`` dispatches to the SVG renderer; ``"png"``
                   together with ``kegg_native=True`` dispatches to the
                   native KEGG renderer; every other combination dispatches
                   to the graph renderer.
    gene_idtype:   Input gene ID type (``"ENTREZ"``, ``"SYMBOL"``, ``"KEGG"``…).
                   Forced to ``"KEGG"`` when the resolved species code is
                   ``"ko"``.
    cpd_idtype:    Input compound ID type (``"KEGG"``, ``"PUBCHEM"``…).
    out_suffix:    Suffix for output filenames.
    node_sum:      Aggregation method for multi-probe nodes.
    map_symbol:    Replace Entrez IDs with gene symbols in node labels.
    map_null:      Render nodes even when no data is provided.
    min_nnodes:    Skip pathway if fewer than this many mappable nodes exist.
    new_signature: Add a "Rendered by pathview.py" watermark.
    plot_col_key:  Draw the colour-scale legend.
    limit/bins/both_dirs/discrete/low/mid/high/trans_fun:
                   Colour-scale parameters, each a dict with "gene" and/or
                   "cpd" keys.  Keys that are omitted keep their default
                   value.
    na_col:        Colour for unmapped nodes (default ``"transparent"``).
    **kwargs:      Extra keyword arguments, forwarded only to the graph
                   renderer.

    Returns
    -------
    dict with keys ``"plot_data_gene"`` and ``"plot_data_cpd"`` (Polars
    DataFrames), or an empty dict when the pathway could not be processed.

    Raises
    ------
    ValueError: when neither ``gene_data`` nor ``cpd_data`` is given.
    TypeError:  when a colour-scale parameter is not a mapping.

    Examples
    --------
    >>> import polars as pl
    >>> from pathview import pathview
    >>> gene_df = pl.read_csv("gene_expr.tsv", separator="\\t")
    >>> result = pathview("04110", gene_data=gene_df, species="hsa")
    """
    if gene_data is None and cpd_data is None:
        raise ValueError("At least one of gene_data or cpd_data must be provided.")

    color_keys = (
        "limit", "bins", "both_dirs", "discrete",
        "low", "mid", "high", "trans_fun",
    )
    # The renderers receive every colour option except trans_fun.
    render_color_keys = tuple(k for k in color_keys if k != "trans_fun")

    # Validate caller-supplied colour options up front so a bad value fails
    # before any species lookup or download is attempted.
    supplied = dict(
        limit=limit, bins=bins, both_dirs=both_dirs, discrete=discrete,
        low=low, mid=mid, high=high, trans_fun=trans_fun,
    )
    overrides = {}
    for key, val in supplied.items():
        if val is None:
            continue
        try:
            overrides[key] = {**val}
        except TypeError:
            raise TypeError(
                f"{key} must be a dict with 'gene' and/or 'cpd' keys, "
                f"got {type(val).__name__}"
            ) from None

    # Merge per key so that a partial override such as {"gene": 2.0} keeps
    # the default for the molecule type the caller did not mention.
    cfg = _defaults()
    for key, val in overrides.items():
        cfg[key] = {**cfg[key], **val}

    kegg_dir = Path(kegg_dir)

    # ---- Species resolution ------------------------------------------------
    species_info = kegg_species_code(species)
    species_code = species_info.kegg_code
    if species_code == "ko":
        gene_idtype = "KEGG"

    # ---- Normalise pathway ID ----------------------------------------------
    pathway_name = (
        pathway_id if pathway_id.startswith(species_code)
        else f"{species_code}{pathway_id}"
    )
    # pathway_name always starts with species_code here; strip only that
    # prefix rather than every occurrence of the code.
    numeric_id = pathway_name[len(species_code):]

    # ---- Gene ID conversion ------------------------------------------------
    if gene_data is not None:
        gene_data = _maybe_convert_gene_ids(
            gene_data, gene_idtype, species_code, node_sum
        )

    # ---- Compound ID conversion --------------------------------------------
    if cpd_data is not None and "kegg" not in cpd_idtype.lower():
        cpd_data = _maybe_convert_cpd_ids(cpd_data, cpd_idtype, node_sum)

    # ---- Download missing files --------------------------------------------
    needed = ["xml", "png"] if kegg_native else ["xml"]
    missing = [
        t for t in needed
        if not (kegg_dir / f"{pathway_name}.{t}").exists()
    ]

    if missing:
        status = download_kegg(numeric_id, species=species_code,
                               kegg_dir=kegg_dir, file_type=missing)
        if status.get(pathway_name) == "failed":
            warnings.warn(
                f"Failed to download files for {pathway_name}; skipping.",
                stacklevel=2,
            )
            return {}

    # ---- Parse KGML --------------------------------------------------------
    pathway = parse_kgml(kegg_dir / f"{pathway_name}.xml")
    node_data = (
        node_info(pathway)
        .filter(
            pl.col("type").is_in(VALID_NODE_TYPES)
            & pl.col("x").is_not_null()
            & pl.col("y").is_not_null()
        )
    )

    if node_data.height < min_nnodes:
        warnings.warn(
            f"Only {node_data.height} mappable nodes for {pathway_name} "
            f"(minimum {min_nnodes}); skipping.",
            stacklevel=2,
        )
        return {}

    # ---- Map gene data onto nodes ------------------------------------------
    gene_node_type = "ortholog" if species_code == "ko" else "gene"
    plot_data_gene, cols_gene = _map_and_color(
        mol_data=gene_data,
        node_data=node_data,
        node_types=gene_node_type,
        node_sum=node_sum,
        map_null=map_null,
        color_cfg={k: cfg[k]["gene"] for k in color_keys},
        na_col=na_col,
    )

    # Optionally replace Entrez IDs with gene symbols in labels
    if plot_data_gene is not None and map_symbol and gene_data is not None:
        plot_data_gene = _add_symbol_labels(plot_data_gene, species_code)

    # ---- Map compound data onto nodes --------------------------------------
    plot_data_cpd, cols_cpd = _map_and_color(
        mol_data=cpd_data,
        node_data=node_data,
        node_types="compound",
        node_sum=node_sum,
        map_null=map_null,
        color_cfg={k: cfg[k]["cpd"] for k in color_keys},
        na_col=na_col,
    )

    # ---- Render ------------------------------------------------------------
    render_kwargs = dict(
        plot_data_gene=plot_data_gene, cols_gene=cols_gene,
        plot_data_cpd=plot_data_cpd, cols_cpd=cols_cpd,
        node_data=node_data,
        pathway_name=pathway_name,
        kegg_dir=kegg_dir,
        out_suffix=out_suffix,
        new_signature=new_signature,
        plot_col_key=plot_col_key,
        **{k: cfg[k] for k in render_color_keys},
    )

    if output_format == "svg":
        # SVG vector output (works for both KEGG and SBGN)
        keggview_svg(**{k: v for k, v in render_kwargs.items()
                        if k not in ("discrete", "plot_col_key")})
    elif kegg_native and output_format == "png":
        # PNG with KEGG background (only for KEGG pathways)
        keggview_native(**render_kwargs)
    else:
        # PDF graph layout (works for both KEGG and SBGN)
        keggview_graph(**{k: v for k, v in render_kwargs.items()
                          if k not in ("discrete",)}, **kwargs)

    return {"plot_data_gene": plot_data_gene, "plot_data_cpd": plot_data_cpd}
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# ---------------------------------------------------------------------------
|
|
236
|
+
# Private helpers
|
|
237
|
+
# ---------------------------------------------------------------------------
|
|
238
|
+
|
|
239
|
+
def _maybe_convert_gene_ids(
    gene_data: pl.DataFrame,
    gene_idtype: str,
    species_code: str,
    node_sum: SumMethod,
) -> pl.DataFrame:
    """
    Convert gene IDs to Entrez unless they are already usable as-is.

    When ``gene_idtype`` (compared case-insensitively) is ``ENTREZ``,
    ``ENTREZID`` or ``KEGG`` the input frame is returned untouched.
    Otherwise the values of the first column are passed to ``id2eg`` and the
    resulting ID map is applied through ``mol_sum`` using ``node_sum`` as the
    aggregation method.
    """
    passthrough_types = ("ENTREZ", "ENTREZID", "KEGG")
    if gene_idtype.upper() not in passthrough_types:
        first_column = gene_data.columns[0]
        source_ids = gene_data[first_column].to_list()
        entrez_map = id2eg(source_ids, category=gene_idtype, org=species_code)
        return mol_sum(gene_data, entrez_map, sum_method=node_sum)
    return gene_data
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _maybe_convert_cpd_ids(
    cpd_data: pl.DataFrame,
    cpd_idtype: str,
    node_sum: SumMethod,
) -> pl.DataFrame:
    """
    Convert compound IDs of type ``cpd_idtype`` to KEGG compound IDs.

    The values of the first column are passed to ``cpd_id_map`` with
    ``out_type="KEGG"``; the resulting ID map is applied through ``mol_sum``
    using ``node_sum`` as the aggregation method.
    """
    first_column = cpd_data.columns[0]
    source_ids = cpd_data[first_column].to_list()
    kegg_map = cpd_id_map(source_ids, in_type=cpd_idtype, out_type="KEGG")
    return mol_sum(cpd_data, kegg_map, sum_method=node_sum)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _map_and_color(
    mol_data: Optional[pl.DataFrame],
    node_data: pl.DataFrame,
    node_types: str,
    node_sum: SumMethod,
    map_null: bool,
    color_cfg: dict,
    na_col: str,
) -> tuple[Optional[pl.DataFrame], Optional[pl.DataFrame]]:
    """
    Map molecule data onto pathway nodes and derive per-node colours.

    Returns
    -------
    ``(plot_data, cols)`` where ``plot_data`` is the result of ``node_map``
    and ``cols`` is the result of ``node_color``.  Special cases:

    * ``(None, None)`` when there is no data and ``map_null`` is false, or
      when ``node_map`` returns ``None``.
    * ``(plot_data, None)`` when the mapped frame has no columns other than
      those listed in ``NODE_META_COLS``.
    """
    nothing = (None, None)
    if not map_null and mol_data is None:
        return nothing

    mapped = node_map(mol_data, node_data, node_types=node_types, node_sum=node_sum)
    if mapped is None:
        return nothing

    value_columns = [name for name in mapped.columns if name not in NODE_META_COLS]
    if value_columns:
        # node_color is given the node identifier under the name "id".
        color_input = mapped.select(["entry_id"] + value_columns).rename(
            {"entry_id": "id"}
        )
        scale_options = {
            option: color_cfg[option]
            for option in (
                "limit", "bins", "both_dirs", "discrete", "low", "mid", "high",
            )
        }
        colours = node_color(
            color_input,
            na_col=na_col,
            trans_fun=color_cfg["trans_fun"],
            **scale_options,
        )
        return mapped, colours
    return mapped, None
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _add_symbol_labels(
    plot_data: pl.DataFrame,
    species_code: str,
) -> pl.DataFrame:
    """
    Try to attach gene symbols to the mapped gene nodes.

    The non-null ``kegg_names`` values are passed to ``eg2id`` with
    ``category="SYMBOL"`` and the result is left-joined onto ``plot_data``
    (``kegg_names`` against ``ENTREZID``).  This is best-effort: any exception
    is reported as a warning and the input frame is returned unchanged.
    """
    try:
        names = plot_data["kegg_names"]
        symbols = eg2id(
            names.drop_nulls().to_list(), category="SYMBOL", org=species_code
        )
        labelled = plot_data.join(
            symbols, left_on="kegg_names", right_on="ENTREZID", how="left"
        )
    except Exception as err:
        warnings.warn(f"Symbol label mapping failed: {err}")
        labelled = plot_data
    return labelled
|