keggtangled 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keggtangled-0.1.0/LICENSE +21 -0
- keggtangled-0.1.0/PKG-INFO +17 -0
- keggtangled-0.1.0/README.md +2 -0
- keggtangled-0.1.0/pyproject.toml +28 -0
- keggtangled-0.1.0/src/keggtangled/__init__.py +522 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Elisa Márquez Zavala
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: keggtangled
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A KEGG tool for managing compounds, reactions, and pathways
|
|
5
|
+
Author-email: Elisa Márquez Zavala <emarquez@lcg.unam.mx>
|
|
6
|
+
Maintainer-email: Elisa Márquez Zavala <emarquez@lcg.unam.mx>
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: biopython>=1.79
|
|
11
|
+
Project-URL: Homepage, https://github.com/emarquezz/KEGGtangled
|
|
12
|
+
Project-URL: Issues, https://github.com/emarquezz/KEGGtangled/issues
|
|
13
|
+
Project-URL: Repository, https://github.com/emarquezz/KEGGtangled
|
|
14
|
+
|
|
15
|
+
# KEGGtangled
|
|
16
|
+
Tie all your KEGG data together. Untangle it your way.
|
|
17
|
+
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["flit_core>=3.12"]
|
|
3
|
+
build-backend = "flit_core.buildapi"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "keggtangled"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "A KEGG tool for managing compounds, reactions, and pathways"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{ name = "Elisa Márquez Zavala", email = "emarquez@lcg.unam.mx" },
|
|
12
|
+
]
|
|
13
|
+
maintainers = [
|
|
14
|
+
{ name = "Elisa Márquez Zavala", email = "emarquez@lcg.unam.mx" },
|
|
15
|
+
]
|
|
16
|
+
license = { text = "MIT" }
|
|
17
|
+
requires-python = ">=3.8"
|
|
18
|
+
dependencies = [
|
|
19
|
+
"biopython>=1.79",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/emarquezz/KEGGtangled"
|
|
24
|
+
Repository = "https://github.com/emarquezz/KEGGtangled"
|
|
25
|
+
Issues = "https://github.com/emarquezz/KEGGtangled/issues"
|
|
26
|
+
|
|
27
|
+
[tool.flit.module]
|
|
28
|
+
name = "keggtangled"
|
|
@@ -0,0 +1,522 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# coding: utf-8
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import os
|
|
7
|
+
import json
|
|
8
|
+
import io
|
|
9
|
+
import hashlib
|
|
10
|
+
from Bio.KEGG.REST import kegg_link, kegg_get
|
|
11
|
+
from Bio.KEGG.KGML.KGML_parser import read as kgml_read
|
|
12
|
+
|
|
13
|
+
# ----------------------------------------------------------------------
|
|
14
|
+
# Pre‑compiled regular expressions
|
|
15
|
+
# ----------------------------------------------------------------------
|
|
16
|
+
# Captures the KO accession (e.g. "K00163") from a bracketed "[KO:K00163]"
# annotation; only brackets holding exactly one K-number match.
KO_RE = re.compile(r'\[KO:(K\d+)\]')
|
|
17
|
+
|
|
18
|
+
# ----------------------------------------------------------------------
|
|
19
|
+
# Compound class
|
|
20
|
+
# ----------------------------------------------------------------------
|
|
21
|
+
class Compound:
    """
    A single KEGG compound (e.g., C00022) seen in the context of one organism.

    The descriptive fields (``name``, ``formula``, ``mass``) start out as
    ``None`` and ``reactions`` starts out empty; the owning organism object
    fills them in later.
    """

    def __init__(self, compound_id, organism):
        """
        Create an empty compound record.

        Parameters
        ----------
        compound_id : str
            KEGG compound identifier, e.g. ``'C00022'``.
        organism : object
            Owner used for KO / gene look-ups. It must expose a
            ``_reaction_to_kos`` mapping and a ``get_genes_for_ko`` method.
        """
        self.id = compound_id
        self.organism = organism
        # Descriptive data, unknown until somebody fills it in.
        self.name = None
        self.formula = None
        self.mass = None
        # IDs of the reactions this compound takes part in.
        self.reactions = set()

    def __repr__(self):
        return "Compound({}, {})".format(self.id, self.name)

    def get_kos(self):
        """Collect every KO attached to any reaction this compound is part of."""
        return set().union(
            *(self.organism._reaction_to_kos.get(rxn_id, set())
              for rxn_id in self.reactions)
        )

    def get_genes(self):
        """Collect every gene reachable from this compound via its KOs."""
        return set().union(
            *(self.organism.get_genes_for_ko(ko) for ko in self.get_kos())
        )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ----------------------------------------------------------------------
|
|
52
|
+
# Reaction class
|
|
53
|
+
# ----------------------------------------------------------------------
|
|
54
|
+
class Reaction:
    """
    A KEGG reaction together with the organism genes that can catalyse it.

    ``ko_to_genes`` maps each KO linked to the reaction onto the organism's
    genes for that KO; KOs with no genes are left out.
    ``formula_per_pathway`` maps a pathway ID onto a dict with the keys
    type, substrates, products, substrates_read, products_read,
    formula_kegg and formula_read.
    """

    def __init__(self, reaction_id, organism):
        """
        Build the KO -> genes table for ``reaction_id``.

        ``organism`` must expose ``_reaction_to_kos`` (reaction ID -> KOs)
        and ``get_genes_for_ko``.
        """
        self.reaction_id = reaction_id
        self.organism = organism
        self.formula_per_pathway = {}

        linked_kos = organism._reaction_to_kos.get(reaction_id, set())
        ko_gene_pairs = ((ko, organism.get_genes_for_ko(ko)) for ko in linked_kos)
        # Only KOs that actually have genes in this organism are kept.
        self.ko_to_genes = {ko: genes for ko, genes in ko_gene_pairs if genes}

    def __repr__(self):
        ko_count = len(self.ko_to_genes)
        formula_count = len(self.formula_per_pathway)
        return (f"Reaction({self.reaction_id}, {self.organism.org_code}) – "
                f"{ko_count} KOs mapped to genes, "
                f"{formula_count} pathway formulas")

    def get_genes(self):
        """Return the union of the genes of every KO of this reaction."""
        return set().union(*self.ko_to_genes.values())

    def get_kos(self):
        """Return, as a list, the KOs that have at least one gene."""
        return list(self.ko_to_genes)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ----------------------------------------------------------------------
|
|
86
|
+
# Pathway class
|
|
87
|
+
# ----------------------------------------------------------------------
|
|
88
|
+
class Pathway:
    """
    A KEGG pathway reduced to two ID sets: its genes and its reactions.
    """

    def __init__(self, pathway_id, gene_kos=None):
        """
        Parameters
        ----------
        pathway_id : str
            Identifier stored verbatim in ``self.id``.
        gene_kos : mapping, optional
            Gene ID -> KOs. Only the keys are used, as ``gene_ids``;
            omitting it yields an empty gene set.
        """
        self.id = pathway_id
        self.reaction_ids = set()
        self.gene_ids = set() if gene_kos is None else set(gene_kos.keys())

    def add_reactions(self, reaction_ids):
        """Merge an iterable of reaction IDs into ``reaction_ids``."""
        self.reaction_ids |= set(reaction_ids)

    def __repr__(self):
        gene_count = len(self.gene_ids)
        reaction_count = len(self.reaction_ids)
        return f"Pathway({self.id}, {gene_count} genes, {reaction_count} reactions)"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ----------------------------------------------------------------------
|
|
104
|
+
# KGML fetcher (cached)
|
|
105
|
+
# ----------------------------------------------------------------------
|
|
106
|
+
def get_pathway_kgml(pathway_id, cache_dir="kegg_cache"):
    """
    Return a parsed KGML pathway object (Bio.KEGG.KGML.Pathway).

    The raw XML is cached on disk as ``<cache_dir>/<id>.kgml`` and reused on
    later calls.

    Parameters
    ----------
    pathway_id : str
        Pathway identifier, with or without the ``path:`` prefix.
    cache_dir : str
        Directory holding the cached XML; created if it does not exist.

    Notes
    -----
    Whatever ``kegg_get`` or ``kgml_read`` raise is propagated unchanged.
    """
    os.makedirs(cache_dir, exist_ok=True)

    # Normalise the ID once so that "path:eco00010" and "eco00010" share one
    # cache entry and the file name never contains a colon (which is not a
    # valid file-name character on Windows).
    prefix = "path:"
    bare_id = pathway_id[len(prefix):] if pathway_id.startswith(prefix) else pathway_id
    full_id = prefix + bare_id
    cache_file = os.path.join(cache_dir, f"{bare_id}.kgml")

    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as f:
            raw_kgml = f.read()
    else:
        raw_kgml = kegg_get(full_id, "kgml").read()
        # Never cache an empty download: it would make every later call fail
        # on the same unusable file instead of retrying the request.
        if raw_kgml.strip():
            with open(cache_file, 'w', encoding='utf-8') as f:
                f.write(raw_kgml)

    return kgml_read(io.StringIO(raw_kgml))
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# ----------------------------------------------------------------------
|
|
127
|
+
# Organism class (fully integrated with Compound, Reaction, Pathway)
|
|
128
|
+
# ----------------------------------------------------------------------
|
|
129
|
+
class Organism:
    """
    KEGG data for one organism, integrated with Compound, Reaction and Pathway.

    On construction the KO <-> gene and KO <-> reaction tables are loaded
    (from JSON files in ``cache_dir`` when present, otherwise through the
    KEGG REST API) and previously cached compound details are read back.
    Pathways are loaded on demand with :meth:`load_pathway`.
    """

    def __init__(self, org_code, batch_size=10, cache_dir="kegg_cache"):
        """
        Parameters
        ----------
        org_code : str
            KEGG organism code; it is also used as a prefix for cache files.
        batch_size : int
            Number of KOs joined into one ``link/rn`` request.
        cache_dir : str
            Directory for all on-disk caches; created if missing.
        """
        self.org_code = org_code
        self.batch_size = batch_size
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

        self.pathways = {}               # pathway_id -> Pathway
        self.reactions = {}              # reaction_id -> Reaction
        self._pathway_reaction_map = {}  # reaction_id -> set of pathway IDs
        self._gene_pathway_map = {}      # locus_tag -> set of pathway IDs
        self._ko_to_reactions = {}       # KO -> set of reaction IDs
        self._ko_to_genes = {}           # KO -> set of locus tags
        self._reaction_to_kos = {}       # reaction_id -> set of KOs
        self._gene_to_kos = {}           # reverse mapping: locus_tag -> set of KOs

        # Compound objects dictionary
        self._compounds = {}             # "C00022" -> Compound instance
        # Compounds for which a detail download was already attempted, so
        # that get_compound() asks KEGG at most once per compound.
        self._compound_details_fetched = set()
        self._compound_cache_file = os.path.join(cache_dir, f"{org_code}_compounds.json")

        # Bulk pre-fetching
        self._load_all_ko_genes()          # builds _ko_to_genes and _gene_to_kos
        self._prefetch_all_ko_reactions()  # builds _ko_to_reactions and _reaction_to_kos
        self._load_compounds()

    # ------------------------------------------------------------------
    # Small pure parsing helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _iter_link_pairs(raw):
        """Yield the first two tab-separated fields of every usable line of ``raw``."""
        for line in raw.splitlines():
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) < 2:
                continue
            yield parts[0], parts[1]

    @staticmethod
    def _split_kegg_ids(names):
        """Split a space-separated list such as 'rn:R1 rn:R2' into ['R1', 'R2']."""
        return [token.split(':')[-1] for token in names.split()]

    @staticmethod
    def _parse_compound_details(flat_text):
        """
        Extract name, formula and mass from a KEGG compound flat file.

        Returns a dict with the keys ``name``, ``formula`` and ``mass``; a
        value is ``None`` when the corresponding field is absent.

        * ``name`` is the first NAME entry without its trailing ';'
          separator.
        * ``mass`` is taken from MOL_WEIGHT when present, else MASS, else
          EXACT_MASS, and is kept as a string.
        """
        details = {"name": None, "formula": None, "mass": None}
        masses = {}
        for line in flat_text.splitlines():
            # Field tags start in column 0; indented lines are continuations.
            if not line or line[0].isspace():
                continue
            fields = line.split(maxsplit=1)
            if len(fields) < 2:
                continue
            tag, value = fields[0], fields[1].strip()
            if tag == "NAME":
                if details["name"] is None:
                    details["name"] = value.rstrip(";").strip()
            elif tag == "FORMULA":
                details["formula"] = value
            elif tag in ("MOL_WEIGHT", "MASS", "EXACT_MASS"):
                masses.setdefault(tag, value.split()[0])
        for tag in ("MOL_WEIGHT", "MASS", "EXACT_MASS"):
            if tag in masses:
                details["mass"] = masses[tag]
                break
        return details

    # ------------------------------------------------------------------
    # Caching helper
    # ------------------------------------------------------------------
    def _cache_get(self, key, subdir="", fetcher_func=None, *args, **kwargs):
        """
        Return the text cached under ``key``, fetching and storing it if needed.

        The cache file is ``<cache_dir>/<subdir>/<md5 of str(key)>.txt``.
        On a miss ``fetcher_func(*args, **kwargs)`` is called and its (string)
        result is written to that file before being returned.

        Raises
        ------
        TypeError
            If the entry is not cached and no ``fetcher_func`` was given.
        """
        cache_subdir = os.path.join(self.cache_dir, subdir)
        os.makedirs(cache_subdir, exist_ok=True)
        # The MD5 digest is only used to derive a file-system-safe name.
        key_hash = hashlib.md5(str(key).encode()).hexdigest()
        cache_file = os.path.join(cache_subdir, f"{key_hash}.txt")
        if os.path.exists(cache_file):
            with open(cache_file, 'r', encoding='utf-8') as f:
                return f.read()
        if fetcher_func is None:
            raise TypeError(f"no cached entry for {key!r} and no fetcher_func given")
        raw_data = fetcher_func(*args, **kwargs)
        with open(cache_file, 'w', encoding='utf-8') as f:
            f.write(raw_data)
        return raw_data

    # ------------------------------------------------------------------
    # KO–gene mapping (JSON) + reverse gene→KO mapping
    # ------------------------------------------------------------------
    def _load_all_ko_genes(self):
        """
        Fill ``_ko_to_genes`` and ``_gene_to_kos``.

        Uses ``<org_code>_ko_genes.json`` when it exists; otherwise queries
        ``kegg_link("ko", org_code)`` and writes that JSON file.
        """
        json_file = os.path.join(self.cache_dir, f"{self.org_code}_ko_genes.json")
        if os.path.exists(json_file):
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self._ko_to_genes = {ko: set(genes) for ko, genes in data.items()}
        else:
            raw = kegg_link("ko", self.org_code).read().strip()
            for gene_part, ko_part in self._iter_link_pairs(raw):
                # Drop the "<org>:" prefix, keep the locus tag.
                locus = gene_part.split(":", 1)[-1]
                self._ko_to_genes.setdefault(ko_part, set()).add(locus)

            json_data = {ko: list(genes) for ko, genes in self._ko_to_genes.items()}
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(json_data, f, indent=2)

        # Build reverse mapping (gene -> KOs)
        self._gene_to_kos = {}
        for ko, genes in self._ko_to_genes.items():
            for gene in genes:
                self._gene_to_kos.setdefault(gene, set()).add(ko)

    # ------------------------------------------------------------------
    # KO–reaction mapping (JSON)
    # ------------------------------------------------------------------
    def _prefetch_all_ko_reactions(self):
        """
        Fill ``_ko_to_reactions`` and ``_reaction_to_kos`` for every known KO.

        Both tables are read from their JSON files when both exist. Otherwise
        the KOs are queried in batches of ``batch_size`` (each raw answer is
        cached through :meth:`_cache_get`); a batch with an empty answer is
        retried one KO at a time. The resulting tables are written to JSON.
        """
        all_kos = list(self._ko_to_genes.keys())
        if not all_kos:
            return

        ko_reactions_file = os.path.join(self.cache_dir, f"{self.org_code}_ko_reactions.json")
        reaction_kos_file = os.path.join(self.cache_dir, f"{self.org_code}_reaction_kos.json")

        if os.path.exists(ko_reactions_file) and os.path.exists(reaction_kos_file):
            with open(ko_reactions_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self._ko_to_reactions = {ko: set(rxn_list) for ko, rxn_list in data.items()}
            with open(reaction_kos_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self._reaction_to_kos = {rn: set(ko_list) for rn, ko_list in data.items()}
            return

        for i in range(0, len(all_kos), self.batch_size):
            batch = all_kos[i:i + self.batch_size]
            ko_query = "+".join(batch)
            raw = self._cache_get(
                f"link/rn/{ko_query}",
                subdir="ko_reactions_batches",
                # Bind the query as a default so the callable does not depend
                # on the loop variable.
                fetcher_func=lambda query=ko_query: kegg_link("rn", query).read().strip(),
            )
            if not raw:
                for ko in batch:
                    self._get_reactions_for_ko_fallback(ko)
                continue

            for ko_id, rn_part in self._iter_link_pairs(raw):
                rn_id = rn_part.split(":", 1)[-1]
                self._ko_to_reactions.setdefault(ko_id, set()).add(rn_id)

        # Rebuild the reverse table from scratch so both stay consistent.
        self._reaction_to_kos = {}
        for ko, rxn_set in self._ko_to_reactions.items():
            for rn in rxn_set:
                self._reaction_to_kos.setdefault(rn, set()).add(ko)

        with open(ko_reactions_file, 'w', encoding='utf-8') as f:
            json.dump({ko: list(rxn_set) for ko, rxn_set in self._ko_to_reactions.items()}, f, indent=2)
        with open(reaction_kos_file, 'w', encoding='utf-8') as f:
            json.dump({rn: list(ko_set) for rn, ko_set in self._reaction_to_kos.items()}, f, indent=2)

    def _get_reactions_for_ko_fallback(self, ko):
        """Query the reactions of a single KO and record them in both tables."""
        raw = kegg_link("rn", ko).read().strip()
        rxn_set = {rn_part.split(":", 1)[-1] for _, rn_part in self._iter_link_pairs(raw)}
        self._ko_to_reactions[ko] = rxn_set
        for rn in rxn_set:
            self._reaction_to_kos.setdefault(rn, set()).add(ko)

    # ------------------------------------------------------------------
    # Compound management
    # ------------------------------------------------------------------
    def _load_compounds(self):
        """Load cached compound info (name, formula, mass) from JSON."""
        if not os.path.exists(self._compound_cache_file):
            return
        with open(self._compound_cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for cid, info in data.items():
            comp = Compound(cid, self)
            comp.name = info.get("name")
            comp.formula = info.get("formula")
            comp.mass = info.get("mass")
            self._compounds[cid] = comp
            # (reaction sets will be populated when pathways are loaded)

    def _save_compounds(self):
        """Save compound names, formulas, masses to JSON (reaction sets are transient)."""
        data = {
            cid: {"name": comp.name, "formula": comp.formula, "mass": comp.mass}
            for cid, comp in self._compounds.items()
            if comp.name  # only save if at least a name is known
        }
        with open(self._compound_cache_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def get_compound(self, compound_id, fetch_if_missing=True):
        """
        Return the Compound object for ``compound_id``, creating it if needed.

        With ``fetch_if_missing`` true, the details (name, formula, mass) are
        downloaded from KEGG whenever the formula or the mass is still
        unknown. This also covers compounds that were first created without
        details while a pathway was loaded. The download is attempted at most
        once per compound and Organism instance.
        """
        comp = self._compounds.get(compound_id)
        if comp is None:
            comp = Compound(compound_id, self)
            self._compounds[compound_id] = comp

        incomplete = comp.formula is None or comp.mass is None
        if (fetch_if_missing and incomplete
                and compound_id not in self._compound_details_fetched):
            self._compound_details_fetched.add(compound_id)
            self._fetch_compound_details(compound_id)
        return comp

    def _fetch_compound_details(self, compound_id):
        """
        Fetch compound flat file from KEGG and populate name, formula, mass.

        An already known name is kept. This is best effort: any failure is
        reported as a printed warning and the compound is left as it was.
        """
        comp = self._compounds.get(compound_id)
        if not comp:
            return
        try:
            full_id = f"cpd:{compound_id}" if not compound_id.startswith("cpd:") else compound_id
            details = self._parse_compound_details(kegg_get(full_id).read())
            if comp.name is None and details["name"]:
                comp.name = details["name"]
            if details["formula"] is not None:
                comp.formula = details["formula"]
            if details["mass"] is not None:
                comp.mass = details["mass"]
            self._save_compounds()
        except Exception as e:
            print(f"Warning: could not fetch details for compound {compound_id}: {e}")

    def _parse_compound_names_from_text(self, flat_text):
        """
        Extract compound IDs and names from the COMPOUND section of a pathway flat file.
        Updates Compound objects in self._compounds and saves the cache.
        """
        in_compound = False
        for line in flat_text.splitlines():
            if line.startswith("COMPOUND"):
                # The section header line also carries the first compound.
                in_compound = True
                parts = line.split()
                if len(parts) >= 3:
                    comp = self.get_compound(parts[1], fetch_if_missing=False)
                    comp.name = ' '.join(parts[2:])
                continue
            if not in_compound:
                continue
            if line.startswith(" "):  # continuation line
                parts = line.split()
                if len(parts) >= 2:
                    comp = self.get_compound(parts[0], fetch_if_missing=False)
                    comp.name = ' '.join(parts[1:])
            else:
                # Any non-indented line starts the next section.
                in_compound = False
        self._save_compounds()

    # ------------------------------------------------------------------
    # Public lookups
    # ------------------------------------------------------------------
    def get_genes_for_ko(self, ko):
        """Return the set of gene locus tags associated with a KO."""
        return self._ko_to_genes.get(ko, set())

    def get_reactions_for_ko(self, ko):
        """Return the set of reaction IDs associated with a KO."""
        return self._ko_to_reactions.get(ko, set())

    def get_kos_for_gene(self, locus_tag):
        """Return the set of KOs associated with a gene locus tag."""
        return self._gene_to_kos.get(locus_tag, set())

    # ------------------------------------------------------------------
    # Pathway loading (with compound integration)
    # ------------------------------------------------------------------
    def _parse_gene_kos_from_text(self, flat_text):
        """
        Parse the GENE section of a pathway flat file.

        Returns a dict mapping each gene ID that carries at least one
        ``[KO:Kxxxxx]`` annotation onto its set of ``'ko:Kxxxxx'`` strings.
        """
        gene_kos = {}
        in_gene_section = False
        for line in flat_text.splitlines():
            if line.startswith("GENE"):
                # The header line also carries the first gene (second token).
                in_gene_section = True
                tokens = line.split()[1:]
            elif in_gene_section and line.startswith(" "):
                tokens = line.split()
            else:
                if in_gene_section:
                    in_gene_section = False
                continue
            if not tokens:
                continue
            kos = KO_RE.findall(line)
            if kos:
                gene_kos[tokens[0]] = {f"ko:{ko}" for ko in kos}
        return gene_kos

    def _readable_compound_names(self, compound_ids, rxn_ids):
        """Return display names for ``compound_ids`` and link each compound to ``rxn_ids``."""
        names = []
        for cid in compound_ids:
            comp = self.get_compound(cid, fetch_if_missing=False)
            names.append(comp.name if comp.name else cid)
            comp.reactions.update(rxn_ids)
        return names

    def _attach_kgml_reaction(self, pw, kgml_rxn):
        """Record formulas and compound links for one KGML reaction element."""
        # A KGML reaction element may name several reactions at once
        # ("rn:R00014 rn:R03270"); every one of them gets the formula.
        rxn_ids = self._split_kegg_ids(kgml_rxn.name)

        # Substrates / products as 'cpd:C00022'
        substrates_kegg = [s.name for s in kgml_rxn.substrates]
        products_kegg = [p.name for p in kgml_rxn.products]

        # Short IDs for formulas
        subs_short = [s.split(':')[-1] for s in substrates_kegg]
        prod_short = [p.split(':')[-1] for p in products_kegg]
        arrow = ' <=> ' if kgml_rxn.type == 'reversible' else ' --> '

        # KEGG-ID formula
        formula_kegg = (' + '.join(subs_short) + arrow + ' + '.join(prod_short)
                        if (subs_short or prod_short) else '')

        # Readable names (using Compound objects)
        substrates_read = self._readable_compound_names(subs_short, rxn_ids)
        products_read = self._readable_compound_names(prod_short, rxn_ids)
        formula_read = (' + '.join(substrates_read) + arrow + ' + '.join(products_read)
                        if (substrates_read or products_read) else '')

        for rxn_id in rxn_ids:
            rxn_obj = self.get_reaction(rxn_id)
            # Each reaction gets its own list copies so they never alias.
            rxn_obj.formula_per_pathway[pw.id] = {
                'type': kgml_rxn.type,
                'substrates': list(substrates_kegg),
                'products': list(products_kegg),
                'substrates_read': list(substrates_read),
                'products_read': list(products_read),
                'formula_kegg': formula_kegg,
                'formula_read': formula_read
            }
            pw.reaction_ids.add(rxn_id)

    def load_pathway(self, pathway_id):
        """
        Load pathway, extract genes, reactions, and compounds; attach formulas.

        A pathway that was already loaded is returned as is. The pathway flat
        file is cached under ``<cache_dir>/pathways``. A failure to fetch or
        parse the KGML only prints a warning; the pathway is then returned
        without per-pathway formulas.
        """
        if pathway_id in self.pathways:
            return self.pathways[pathway_id]

        # 1. Load / fetch the flat file
        cache_file = os.path.join(self.cache_dir, "pathways", f"{pathway_id}.txt")
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)

        if os.path.exists(cache_file):
            with open(cache_file, 'r', encoding='utf-8') as f:
                flat_text = f.read()
        else:
            flat_text = kegg_get(pathway_id).read()
            with open(cache_file, 'w', encoding='utf-8') as f:
                f.write(flat_text)

        gene_kos = self._parse_gene_kos_from_text(flat_text)
        pw = Pathway(pathway_id, gene_kos)
        self.pathways[pathway_id] = pw

        # 2. Update compound names from the flat file and save
        self._parse_compound_names_from_text(flat_text)

        # 3. Add reactions from KO annotations (from flat file)
        all_kos = set().union(*gene_kos.values()) if gene_kos else set()
        for ko in all_kos:
            for rn_id in self._ko_to_reactions.get(ko, set()):
                pw.reaction_ids.add(rn_id)
                if rn_id not in self.reactions:
                    self.reactions[rn_id] = Reaction(rn_id, self)

        # 4. Parse KGML and attach per-pathway formulas, link compounds to reactions
        try:
            kgml = get_pathway_kgml(pathway_id, self.cache_dir)
        except Exception as e:
            print(f"Warning: could not fetch/parse KGML for {pathway_id}: {e}")
            kgml = None

        if kgml is not None:
            for kgml_rxn in kgml.reactions:
                self._attach_kgml_reaction(pw, kgml_rxn)

        # 5. Cross-reference maps
        for rn_id in pw.reaction_ids:
            self._pathway_reaction_map.setdefault(rn_id, set()).add(pathway_id)
        for locus in pw.gene_ids:
            self._gene_pathway_map.setdefault(locus, set()).add(pathway_id)

        return pw

    # ------------------------------------------------------------------
    # Convenience methods
    # ------------------------------------------------------------------
    def get_pathway(self, pathway_id):
        """Return the Pathway for ``pathway_id``, loading it on first use."""
        return self.load_pathway(pathway_id)

    def get_reaction(self, reaction_id):
        """Return the Reaction for ``reaction_id``, creating it on first use."""
        if reaction_id not in self.reactions:
            self.reactions[reaction_id] = Reaction(reaction_id, self)
        return self.reactions[reaction_id]

    def get_pathways_for_reaction(self, reaction_id):
        """Return the IDs of the loaded pathways that contain ``reaction_id``."""
        return self._pathway_reaction_map.get(reaction_id, set())

    def get_pathways_for_gene(self, locus_tag):
        """Return the IDs of the loaded pathways that contain ``locus_tag``."""
        return self._gene_pathway_map.get(locus_tag, set())

    def get_genes_for_pathway(self, pathway_id):
        """Return the gene IDs of an already loaded pathway (empty set otherwise)."""
        pw = self.pathways.get(pathway_id)
        return pw.gene_ids if pw else set()

    def get_reactions_for_pathway(self, pathway_id):
        """Return the reaction IDs of an already loaded pathway (empty set otherwise)."""
        pw = self.pathways.get(pathway_id)
        return pw.reaction_ids if pw else set()
|