pathview-plus 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pathview/pathview.py ADDED
@@ -0,0 +1,316 @@
1
+ """
2
+ pathview.py (module inside the pathview package)
3
+ Core orchestrator: resolves IDs, downloads KEGG files, maps data to nodes,
4
+ and dispatches to the appropriate renderer.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import warnings
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ import polars as pl
14
+
15
+ from .color_mapping import node_color
16
+ from .constants import SumMethod, VALID_NODE_TYPES, NODE_META_COLS
17
+ from .id_mapping import cpd_id_map, eg2id, id2eg
18
+ from .kegg_api import download_kegg, kegg_species_code
19
+ from .kgml_parser import node_info, parse_kgml
20
+ from .mol_data import mol_sum
21
+ from .node_mapping import node_map
22
+ from .rendering import keggview_graph, keggview_native
23
+ from .svg_rendering import keggview_svg
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Defaults factory (avoids mutable default arguments)
28
+ # ---------------------------------------------------------------------------
29
+
30
def _defaults() -> dict:
    """
    Build a fresh set of default colour-scale settings.

    Every call constructs new dictionaries, so callers may mutate the result
    without affecting later calls.  Each setting maps the molecule kinds
    ``"gene"`` and ``"cpd"`` to that kind's default value.
    """
    # (setting name, default for "gene", default for "cpd")
    table = (
        ("limit", 1.0, 1.0),
        ("bins", 10, 10),
        ("both_dirs", True, True),
        ("discrete", False, False),
        ("low", "green", "blue"),
        ("mid", "gray", "gray"),
        ("high", "red", "yellow"),
        ("trans_fun", None, None),
    )
    return {
        setting: {"gene": gene_default, "cpd": cpd_default}
        for setting, gene_default, cpd_default in table
    }
41
+
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # pathview
45
+ # ---------------------------------------------------------------------------
46
+
47
def pathview(
    pathway_id: str,
    gene_data: Optional[pl.DataFrame] = None,
    cpd_data: Optional[pl.DataFrame] = None,
    species: str = "hsa",
    kegg_dir: str | Path = ".",
    kegg_native: bool = True,
    output_format: str = "png",  # NEW: png, pdf, or svg
    gene_idtype: str = "ENTREZ",
    cpd_idtype: str = "KEGG",
    out_suffix: str = "pathview",
    node_sum: SumMethod = "sum",
    map_symbol: bool = True,
    map_null: bool = True,
    min_nnodes: int = 3,
    new_signature: bool = True,
    plot_col_key: bool = True,
    # Colour-scale parameters (all accept {"gene": …, "cpd": …} dicts)
    limit: dict | None = None,
    bins: dict | None = None,
    both_dirs: dict | None = None,
    discrete: dict | None = None,
    low: dict | None = None,
    mid: dict | None = None,
    high: dict | None = None,
    na_col: str = "transparent",
    trans_fun: dict | None = None,
    **kwargs,
) -> dict:
    """
    Overlay molecular data onto a KEGG pathway diagram.

    Parameters
    ----------
    pathway_id:    KEGG pathway number, e.g. ``"04110"`` or ``"hsa04110"``.
    gene_data:     DataFrame — first column = gene IDs, rest = numeric values.
    cpd_data:      DataFrame — first column = compound IDs, rest = numeric.
    species:       KEGG species code (default ``"hsa"``).
    kegg_dir:      Working directory for downloaded and output files.
    kegg_native:   True → overlay on KEGG PNG; False → NetworkX graph layout.
                   When True the pathway PNG is fetched alongside the KGML.
    output_format: ``"svg"`` dispatches to the SVG renderer; ``"png"``
                   together with ``kegg_native=True`` dispatches to the
                   native renderer; every other combination dispatches to
                   the graph renderer.
    gene_idtype:   Input gene ID type (``"ENTREZ"``, ``"SYMBOL"``, ``"KEGG"``…).
                   Forced to ``"KEGG"`` when the species resolves to ``"ko"``.
    cpd_idtype:    Input compound ID type (``"KEGG"``, ``"PUBCHEM"``…).
    out_suffix:    Suffix for output filenames.
    node_sum:      Aggregation method for multi-probe nodes.
    map_symbol:    Replace Entrez IDs with gene symbols in node labels.
    map_null:      Render nodes even when no data is provided.
    min_nnodes:    Skip pathway if fewer than this many mappable nodes exist.
    new_signature: Add a "Rendered by pathview.py" watermark.
    plot_col_key:  Draw the colour-scale legend.
    limit/bins/both_dirs/discrete/low/mid/high/trans_fun:
                   Colour-scale parameters, each a dict with "gene" and "cpd"
                   keys.  A dict that supplies only one of the two keys keeps
                   the default for the other.
    na_col:        Colour for unmapped nodes (default ``"transparent"``).
    **kwargs:      Extra keyword arguments, forwarded to the graph renderer
                   only.

    Returns
    -------
    dict with keys ``"plot_data_gene"`` and ``"plot_data_cpd"`` (Polars
    DataFrames), or an empty dict when the pathway could not be processed
    (download failure, or fewer than ``min_nnodes`` mappable nodes).

    Raises
    ------
    ValueError
        If both ``gene_data`` and ``cpd_data`` are ``None``.

    Examples
    --------
    >>> import polars as pl
    >>> from pathview import pathview
    >>> gene_df = pl.read_csv("gene_expr.tsv", separator="\\t")
    >>> result = pathview("04110", gene_data=gene_df, species="hsa")
    """
    if gene_data is None and cpd_data is None:
        raise ValueError("At least one of gene_data or cpd_data must be provided.")

    # Merge caller-supplied dicts over defaults (per "gene"/"cpd" key, so a
    # partial override does not drop the other key's default).
    cfg = _merge_color_cfg(
        _defaults(),
        dict(
            limit=limit, bins=bins, both_dirs=both_dirs, discrete=discrete,
            low=low, mid=mid, high=high, trans_fun=trans_fun,
        ),
    )
    # Settings forwarded to the renderers; the colour mapper also gets trans_fun.
    scale_keys = ("limit", "bins", "both_dirs", "discrete", "low", "mid", "high")
    color_keys = scale_keys + ("trans_fun",)

    kegg_dir = Path(kegg_dir)

    # ---- Species resolution ------------------------------------------------
    species_info = kegg_species_code(species)
    species_code = species_info.kegg_code
    if species_code == "ko":
        gene_idtype = "KEGG"

    # ---- Normalise pathway ID ----------------------------------------------
    pathway_name, numeric_id = _split_pathway_id(pathway_id, species_code)

    # ---- Gene ID conversion ------------------------------------------------
    if gene_data is not None:
        gene_data = _maybe_convert_gene_ids(
            gene_data, gene_idtype, species_code, node_sum
        )

    # ---- Compound ID conversion --------------------------------------------
    if cpd_data is not None and "kegg" not in cpd_idtype.lower():
        cpd_data = _maybe_convert_cpd_ids(cpd_data, cpd_idtype, node_sum)

    # ---- Download missing files --------------------------------------------
    needed = ["xml", "png"] if kegg_native else ["xml"]
    existing = {f.name for f in kegg_dir.iterdir()} if kegg_dir.exists() else set()
    missing = [t for t in needed if f"{pathway_name}.{t}" not in existing]

    if missing:
        status = download_kegg(numeric_id, species=species_code,
                               kegg_dir=kegg_dir, file_type=missing)
        if status.get(pathway_name) == "failed":
            warnings.warn(f"Failed to download files for {pathway_name}; skipping.")
            return {}

    # ---- Parse KGML --------------------------------------------------------
    pathway = parse_kgml(kegg_dir / f"{pathway_name}.xml")
    # Keep only node types that can carry data and that have coordinates.
    node_data = (
        node_info(pathway)
        .filter(
            pl.col("type").is_in(VALID_NODE_TYPES)
            & pl.col("x").is_not_null()
            & pl.col("y").is_not_null()
        )
    )

    if node_data.height < min_nnodes:
        warnings.warn(
            f"Only {node_data.height} mappable nodes for {pathway_name} "
            f"(minimum {min_nnodes}); skipping."
        )
        return {}

    # ---- Map gene data onto nodes ------------------------------------------
    gene_node_type = "ortholog" if species_code == "ko" else "gene"
    plot_data_gene, cols_gene = _map_and_color(
        mol_data=gene_data,
        node_data=node_data,
        node_types=gene_node_type,
        node_sum=node_sum,
        map_null=map_null,
        color_cfg={k: cfg[k]["gene"] for k in color_keys},
        na_col=na_col,
    )

    # Optionally replace Entrez IDs with gene symbols in labels
    if plot_data_gene is not None and map_symbol and gene_data is not None:
        plot_data_gene = _add_symbol_labels(plot_data_gene, species_code)

    # ---- Map compound data onto nodes --------------------------------------
    plot_data_cpd, cols_cpd = _map_and_color(
        mol_data=cpd_data,
        node_data=node_data,
        node_types="compound",
        node_sum=node_sum,
        map_null=map_null,
        color_cfg={k: cfg[k]["cpd"] for k in color_keys},
        na_col=na_col,
    )

    # ---- Render ------------------------------------------------------------
    render_kwargs = dict(
        plot_data_gene=plot_data_gene, cols_gene=cols_gene,
        plot_data_cpd=plot_data_cpd, cols_cpd=cols_cpd,
        node_data=node_data,
        pathway_name=pathway_name,
        kegg_dir=kegg_dir,
        out_suffix=out_suffix,
        new_signature=new_signature,
        plot_col_key=plot_col_key,
        **{k: cfg[k] for k in scale_keys},
    )

    if output_format == "svg":
        # SVG vector output (works for both KEGG and SBGN)
        keggview_svg(**{k: v for k, v in render_kwargs.items()
                        if k not in ("discrete", "plot_col_key")})
    elif kegg_native and output_format == "png":
        # PNG with KEGG background (only for KEGG pathways)
        keggview_native(**render_kwargs)
    else:
        # PDF graph layout (works for both KEGG and SBGN)
        keggview_graph(**{k: v for k, v in render_kwargs.items()
                          if k not in ("discrete",)}, **kwargs)

    return {"plot_data_gene": plot_data_gene, "plot_data_cpd": plot_data_cpd}


def _merge_color_cfg(defaults: dict, overrides: dict) -> dict:
    """Overlay caller-supplied colour-scale settings on *defaults*.

    ``None`` overrides are ignored.  When both the default and the override
    for a setting are dicts they are merged key by key, so an override that
    names only ``"gene"`` keeps the default ``"cpd"`` value.  Any other
    override replaces the default wholesale.  Neither input is mutated.
    """
    merged = dict(defaults)
    for key, val in overrides.items():
        if val is None:
            continue
        base = merged.get(key)
        if isinstance(val, dict) and isinstance(base, dict):
            merged[key] = {**base, **val}
        else:
            merged[key] = val
    return merged


def _split_pathway_id(pathway_id: str, species_code: str) -> tuple[str, str]:
    """Return ``(pathway_name, numeric_id)`` for a pathway identifier.

    ``pathway_name`` is *pathway_id* prefixed with *species_code* unless it
    already starts with it; ``numeric_id`` is ``pathway_name`` with exactly
    that leading prefix removed (later occurrences are left untouched).
    """
    if pathway_id.startswith(species_code):
        pathway_name = pathway_id
    else:
        pathway_name = f"{species_code}{pathway_id}"
    return pathway_name, pathway_name[len(species_code):]
233
+
234
+
235
+ # ---------------------------------------------------------------------------
236
+ # Private helpers
237
+ # ---------------------------------------------------------------------------
238
+
239
def _maybe_convert_gene_ids(
    gene_data: pl.DataFrame,
    gene_idtype: str,
    species_code: str,
    node_sum: SumMethod,
) -> pl.DataFrame:
    """
    Translate gene identifiers to Entrez unless they need no translation.

    ID types ``ENTREZ``, ``ENTREZID`` and ``KEGG`` (matched
    case-insensitively) are returned untouched.  For any other type the
    values of the first column are passed to ``id2eg`` and the resulting
    mapping is applied to the frame through ``mol_sum`` using ``node_sum``.
    """
    passthrough_types = {"ENTREZ", "ENTREZID", "KEGG"}
    if gene_idtype.upper() not in passthrough_types:
        first_column = gene_data.columns[0]
        source_ids = gene_data[first_column].to_list()
        entrez_map = id2eg(source_ids, category=gene_idtype, org=species_code)
        gene_data = mol_sum(gene_data, entrez_map, sum_method=node_sum)
    return gene_data
251
+
252
+
253
def _maybe_convert_cpd_ids(
    cpd_data: pl.DataFrame,
    cpd_idtype: str,
    node_sum: SumMethod,
) -> pl.DataFrame:
    """
    Translate compound identifiers to KEGG before pathway mapping.

    The values of the first column are passed to ``cpd_id_map`` (from
    ``cpd_idtype`` to ``"KEGG"``) and the resulting mapping is applied to the
    frame through ``mol_sum`` using ``node_sum``.
    """
    first_column = cpd_data.columns[0]
    source_ids = cpd_data[first_column].to_list()
    kegg_map = cpd_id_map(source_ids, in_type=cpd_idtype, out_type="KEGG")
    converted = mol_sum(cpd_data, kegg_map, sum_method=node_sum)
    return converted
262
+
263
+
264
def _map_and_color(
    mol_data: Optional[pl.DataFrame],
    node_data: pl.DataFrame,
    node_types: str,
    node_sum: SumMethod,
    map_null: bool,
    color_cfg: dict,
    na_col: str,
) -> tuple[Optional[pl.DataFrame], Optional[pl.DataFrame]]:
    """
    Attach molecule data to pathway nodes and derive their colours.

    Returns a ``(plot_data, cols)`` pair:

    * ``(None, None)`` when there is no data and ``map_null`` is false, or
      when ``node_map`` yields ``None``;
    * ``(plot_data, None)`` when the mapped frame has no columns besides
      those listed in ``NODE_META_COLS``;
    * otherwise ``plot_data`` together with the result of ``node_color``
      (described upstream as a DataFrame of hex colour strings).
    """
    if mol_data is None and not map_null:
        return None, None

    mapped = node_map(mol_data, node_data, node_types=node_types, node_sum=node_sum)
    if mapped is None:
        return None, None

    # Everything that is not node metadata is treated as a value column.
    value_columns = [name for name in mapped.columns if name not in NODE_META_COLS]
    if value_columns:
        colour_input = (
            mapped
            .select(["entry_id"] + value_columns)
            .rename({"entry_id": "id"})
        )
        scale = {
            setting: color_cfg[setting]
            for setting in (
                "limit", "bins", "both_dirs", "discrete", "low", "mid", "high",
            )
        }
        colours = node_color(
            colour_input,
            **scale,
            na_col=na_col,
            trans_fun=color_cfg["trans_fun"],
        )
        return mapped, colours

    return mapped, None
303
+
304
+
305
def _add_symbol_labels(
    plot_data: pl.DataFrame,
    species_code: str,
) -> pl.DataFrame:
    """
    Best-effort enrichment of node data with gene symbols.

    The non-null ``kegg_names`` values are passed to ``eg2id`` with category
    ``"SYMBOL"``, and the result is left-joined onto ``plot_data``
    (``kegg_names`` against ``ENTREZID``).  If any step raises, a warning is
    issued and the input frame is returned unchanged.
    """
    try:
        entrez_ids = plot_data["kegg_names"].drop_nulls().to_list()
        symbols = eg2id(entrez_ids, category="SYMBOL", org=species_code)
        labelled = plot_data.join(
            symbols,
            left_on="kegg_names",
            right_on="ENTREZID",
            how="left",
        )
        return labelled
    except Exception as exc:
        # Deliberately broad: a failed symbol lookup must not abort rendering.
        warnings.warn(f"Symbol label mapping failed: {exc}")
    return plot_data