masster 0.3.18__py3-none-any.whl → 0.3.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/data/libs/README.md +17 -0
- masster/data/libs/ccm.py +533 -0
- masster/data/libs/central_carbon_README.md +17 -0
- masster/data/libs/central_carbon_metabolites.csv +120 -0
- masster/data/libs/urine.py +333 -0
- masster/data/libs/urine_metabolites.csv +51 -0
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/lib.py +32 -25
- masster/sample/load.py +9 -3
- masster/sample/plot.py +113 -27
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +450 -196
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +144 -118
- masster/study/plot.py +691 -277
- masster/study/processing.py +9 -5
- masster/study/study.py +6 -6
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/METADATA +1 -1
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/RECORD +31 -25
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/WHEEL +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/entry_points.txt +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/licenses/LICENSE +0 -0
masster/__init__.py
CHANGED
|
@@ -12,6 +12,7 @@ from masster._version import __version__
|
|
|
12
12
|
|
|
13
13
|
# from masster._version import get_version
|
|
14
14
|
from masster.chromatogram import Chromatogram
|
|
15
|
+
from masster.lib import Lib
|
|
15
16
|
from masster.sample.sample import Sample
|
|
16
17
|
from masster.spectrum import Spectrum
|
|
17
18
|
from masster.study.study import Study
|
|
@@ -19,6 +20,7 @@ from masster.study.study import Study
|
|
|
19
20
|
|
|
20
21
|
__all__ = [
|
|
21
22
|
"Chromatogram",
|
|
23
|
+
"Lib",
|
|
22
24
|
"Sample",
|
|
23
25
|
"Spectrum",
|
|
24
26
|
"Study",
|
masster/_version.py
CHANGED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
metabolite lib CSVs
|
|
2
|
+
|
|
3
|
+
This folder contains programmatically generated metabolite lists used as example libs for the masster package.
|
|
4
|
+
|
|
5
|
+
Files (generated by the scripts in this folder):
|
|
6
|
+
- `urine_metabolites.csv` - best-effort list of metabolites reported in human urine (generated by `urine.py`).
|
|
7
|
+
- `central_carbon_metabolites.csv` - curated central carbon metabolism compound list (generated by `ccm.py`).
|
|
8
|
+
|
|
9
|
+
Generation method:
|
|
10
|
+
- Names were curated in the scripts and resolved to structural identifiers via PubChem PUG-REST.
|
|
11
|
+
- `urine.py` attempts to parse local HMDB XML (if present) before falling back to the HMDB web listing.
|
|
12
|
+
- Both scripts use retries and basic normalization to improve PubChem matching.
|
|
13
|
+
|
|
14
|
+
Notes & recommendations:
|
|
15
|
+
- For authoritative resource lists, download HMDB / ChEBI / KEGG bulk data and map IDs (preferred).
|
|
16
|
+
- Respect HMDB licensing and attribution when using HMDB data.
|
|
17
|
+
- Verify ambiguous or missing entries before use in production analyses.
|
masster/data/libs/ccm.py
ADDED
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
"""Generate a cleaned CSV of central-carbon metabolism compounds.
|
|
2
|
+
|
|
3
|
+
Workflow:
|
|
4
|
+
- Use a curated list of central-carbon/metabolism-relevant names (glycolysis, TCA, PPP,
|
|
5
|
+
amino acids, common organic acids, nucleotides, fatty acids, cofactors, sugars).
|
|
6
|
+
- Query PubChem's PUG-REST for MolecularFormula, CanonicalSMILES and InChIKey for each name
|
|
7
|
+
with retries and basic name normalization to improve matching.
|
|
8
|
+
- Save results to `masster/data/libs/central_carbon_metabolites.csv`.
|
|
9
|
+
- Test loading with `masster.lib.Lib.import_csv`.
|
|
10
|
+
|
|
11
|
+
This is a best-effort programmatic lookup; ambiguous names may not resolve (those rows will
|
|
12
|
+
have empty Formula/SMILES/InChIKey). For authoritative lists, prefer curated databases
|
|
13
|
+
(e.g., HMDB, KEGG) and bulk downloads.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import csv
|
|
19
|
+
import sys
|
|
20
|
+
import time
|
|
21
|
+
import os
|
|
22
|
+
import re
|
|
23
|
+
from urllib.parse import quote
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
import requests
|
|
27
|
+
except Exception:
|
|
28
|
+
requests = None
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
from rdkit import Chem
|
|
32
|
+
except Exception:
|
|
33
|
+
Chem = None
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from rdkit import Chem
|
|
37
|
+
except Exception:
|
|
38
|
+
Chem = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Curated compound names grouped by metabolite class. The order of the groups
# and of the names inside each group is the order of CCM_METABOLITES below.
_CCM_GROUPS = (
    (
        "Central carbon metabolism core (glycolysis, TCA, PPP, gluconeogenesis, pyruvate metabolism)",
        (
            "Glucose",
            "Glucose-6-phosphate",
            "Fructose-6-phosphate",
            "Fructose-1,6-bisphosphate",
            "Glyceraldehyde-3-phosphate",
            "Dihydroxyacetone phosphate",
            "3-Phosphoglycerate",
            "2-Phosphoglycerate",
            "Phosphoenolpyruvate",
            "Pyruvate",
            "Lactate",
            "Acetyl-CoA",
            "Citric acid",
            "Isocitrate",
            "Alpha-ketoglutaric acid",
            "Succinyl-CoA",
            "Succinic acid",
            "Fumaric acid",
            "Malic acid",
            "Oxaloacetic acid",
            "Ribose-5-phosphate",
            "Ribulose-5-phosphate",
            "Sedoheptulose-7-phosphate",
            "Erythrose-4-phosphate",
            "Sedoheptulose-1,7-bisphosphate",
            "Glycerol-3-phosphate",
            "Glycerate",
            "Pentose",
            "Acetaldehyde",
            "Acetic acid",
        ),
    ),
    (
        "Proteinogenic amino acids (20 standard)",
        (
            "Alanine",
            "Arginine",
            "Asparagine",
            "Aspartic acid",
            "Cysteine",
            "Glutamic acid",
            "Glutamine",
            "Glycine",
            "Histidine",
            "Isoleucine",
            "Leucine",
            "Lysine",
            "Methionine",
            "Phenylalanine",
            "Proline",
            "Serine",
            "Threonine",
            "Tryptophan",
            "Tyrosine",
            "Valine",
        ),
    ),
    (
        "Additional amino acid related metabolites",
        (
            "Ornithine",
            "Citrulline",
            "Homocysteine",
            "S-adenosylmethionine",
            "S-adenosylhomocysteine",
        ),
    ),
    (
        "Common organic acids / intermediates & related small metabolites",
        (
            "Formic acid",
            "Propionic acid",
            "Butyric acid",
            "Malonic acid",
            "2-Hydroxyglutarate",
            "3-Hydroxybutyrate",
            "Acetoacetate",
            "Beta-hydroxybutyrate",
            "Pyruvic acid",
            "Lactic acid",
        ),
    ),
    (
        "Fatty acids (common)",
        (
            "Myristic acid",
            "Palmitic acid",
            "Stearic acid",
            "Palmitoleic acid",
            "Oleic acid",
            "Linoleic acid",
            "Alpha-linolenic acid",
            "Arachidonic acid",
        ),
    ),
    (
        "Nucleobases and nucleosides",
        (
            "Adenine",
            "Guanine",
            "Cytosine",
            "Thymine",
            "Uracil",
            "Adenosine",
            "Guanosine",
            "Cytidine",
            "Uridine",
        ),
    ),
    (
        "Nucleotides (mono/di/tri)",
        (
            "AMP",
            "ADP",
            "ATP",
            "GMP",
            "GDP",
            "GTP",
            "CMP",
            "CDP",
            "CTP",
            "UMP",
            "UDP",
            "UTP",
        ),
    ),
    (
        "Cofactors / common metabolites",
        (
            "NAD+",
            "NADH",
            "NADP+",
            "NADPH",
            "FAD",
            "FMN",
            "Coenzyme A",
            "Pantothenic acid",
            "Riboflavin",
            "Niacin",
        ),
    ),
    (
        "Sugar and sugar derivatives",
        (
            "Fructose",
            "Mannose",
            "Mannose-6-phosphate",
            "Ribose",
            "Glucosamine",
            "N-acetylglucosamine",
        ),
    ),
    (
        "Other common metabolites",
        (
            "Choline",
            "Betaine",
            "Carnitine",
            "Phosphocholine",
            "Glycerol",
            "Sorbitol",
            "Inositol",
            "Cholesterol",
            "Pantothenate",
        ),
    ),
)

# Flat list of every curated name, in declaration order.
CCM_METABOLITES = [name for _label, names in _CCM_GROUPS for name in names]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def fetch_from_pubchem(name: str):
    """Look up a compound on PubChem (PUG-REST) by name.

    The name is normalized first (parenthesized text removed, dash/plus
    variants unified, whitespace collapsed, common abbreviations expanded)
    and queried up to three times with exponential backoff. When the
    property record carries no SMILES, three fallbacks are tried in order:
    a lookup by InChIKey, an InChI -> SMILES conversion through RDKit (only
    if ``Chem`` was importable), and a lookup of up to five CIDs registered
    for the InChIKey. If the normalized name never resolves and differs
    from the raw name, the raw name is queried once as a last resort.

    Args:
        name: Compound name or abbreviation.

    Returns:
        ``(formula, smiles, inchikey)``. Individual items may be ``None``;
        ``(None, None, None)`` is returned when ``requests`` is unavailable,
        when ``name`` is empty, or when nothing could be resolved.
    """
    unresolved = (None, None, None)
    # Nothing can be queried without an HTTP client or a name.
    if requests is None or not name:
        return unresolved

    api_root = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"
    property_path = "property/MolecularFormula,CanonicalSMILES,InChI,InChIKey/JSON"

    # Keys MUST be upper-case: normalize_name() upper-cases the candidate
    # before the lookup, so a mixed-case key could never match.
    abbreviations = {
        "AMP": "Adenosine monophosphate",
        "ADP": "Adenosine diphosphate",
        "ATP": "Adenosine triphosphate",
        "GMP": "Guanosine monophosphate",
        "GDP": "Guanosine diphosphate",
        "GTP": "Guanosine triphosphate",
        "NAD+": "Nicotinamide adenine dinucleotide",
        "NADH": "Nicotinamide adenine dinucleotide (reduced)",
        "COA": "Coenzyme A",
    }

    def normalize_name(raw):
        """Return a cleaned-up query string for *raw*."""
        if not raw:
            return raw
        cleaned = re.sub(r"\(.*?\)", "", raw)  # drop parenthesized qualifiers
        # Map the full-width plus and the en dash to their ASCII forms.
        cleaned = cleaned.replace("\uff0b", "+").replace("\u2013", "-")
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        return abbreviations.get(cleaned.upper(), cleaned)

    def get_json(url):
        """GET *url*; return the decoded JSON body on HTTP 200, else None."""
        try:
            response = requests.get(url, timeout=15)
            if response.status_code == 200:
                return response.json()
        except Exception:
            # Network and decoding failures are treated like "no answer" so
            # the caller can retry or fall back.
            return None
        return None

    def first_properties(payload):
        """Return the first record of a PUG-REST property table, or None."""
        try:
            records = payload["PropertyTable"]["Properties"]
        except (KeyError, TypeError):
            return None
        return records[0] if records else None

    def lookup(namespace, identifier):
        """Fetch the property record for *identifier* in *namespace*."""
        return first_properties(get_json(f"{api_root}/{namespace}/{identifier}/{property_path}"))

    def smiles_of(record):
        """Read the SMILES from a property record."""
        # NOTE(review): newer PUG-REST responses appear to label this
        # property "ConnectivitySMILES" (or "SMILES") instead of
        # "CanonicalSMILES" -- confirm against the PubChem documentation.
        return (
            record.get("CanonicalSMILES")
            or record.get("ConnectivitySMILES")
            or record.get("SMILES")
        )

    def cids_for_inchikey(inchikey):
        """Return the CIDs PubChem lists for *inchikey* (possibly empty)."""
        payload = get_json(f"{api_root}/inchikey/{quote(inchikey)}/cids/JSON")
        try:
            return list(payload["IdentifierList"]["CID"])
        except (KeyError, TypeError):
            return []

    def resolve(record):
        """Build the result tuple, filling in a missing SMILES if possible."""
        formula = record.get("MolecularFormula")
        smiles = smiles_of(record)
        inchi = record.get("InChI")
        inchikey = record.get("InChIKey")

        # Fallback 1: dedicated InChIKey endpoint.
        if not smiles and inchikey:
            by_key = lookup("inchikey", quote(inchikey))
            if by_key is not None:
                smiles = smiles_of(by_key) or smiles
                inchi = inchi or by_key.get("InChI")

        # Fallback 2: convert the InChI locally when RDKit is installed.
        if not smiles and inchi and Chem is not None:
            try:
                mol = Chem.MolFromInchi(inchi)
                if mol is not None:
                    smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
            except Exception:
                # Best effort: an unparsable InChI just leaves SMILES empty.
                pass

        # Fallback 3: query the first few CIDs registered for the InChIKey.
        if not smiles and inchikey:
            for cid in cids_for_inchikey(inchikey)[:5]:
                by_cid = lookup("cid", cid)
                if by_cid is not None:
                    smiles = smiles_of(by_cid) or smiles
                if smiles:
                    break

        return (formula, smiles, inchikey)

    query = normalize_name(name)

    attempts = 3
    for attempt in range(attempts):
        record = lookup("name", quote(query))
        if record is not None:
            try:
                return resolve(record)
            except (AttributeError, KeyError, TypeError):
                # Malformed record: treat like a failed attempt and retry.
                pass
        # Back off before the next attempt, but do not sleep after the last.
        if attempt < attempts - 1:
            time.sleep(1 + 2 ** attempt)

    # Final fallback: the raw name, only if it is a different query.
    if name != query:
        record = lookup("name", quote(name))
        if record is not None:
            return (record.get("MolecularFormula"), smiles_of(record), record.get("InChIKey"))

    return unresolved
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# Superscript digits/signs (U+00B9, U+00B2, U+00B3, U+207A, U+207B) -> ASCII.
_SUPERSCRIPT_TO_ASCII = str.maketrans("\u00b9\u00b2\u00b3\u207a\u207b", "123+-")
# A run of superscript characters at the very end of a formula is a charge.
_SUPERSCRIPT_CHARGE_RE = re.compile("[\u00b9\u00b2\u00b3\u207a\u207b]+$")
# "[C6H5O7]3-" / "[C6H5O7]-3": the brackets delimit the formula unambiguously.
_BRACKETED_CHARGE_RE = re.compile(r"^\[([^\[\]]+)\]\s*(\d*[+-]|[+-]\d*)$")
# "C6H5O7 3-": whitespace separates the digit-then-sign charge from the counts.
_SPACED_CHARGE_RE = re.compile(r"^(.*\S)\s+(\d+[+-])$")
# "C6H5O7-3", "C21H28N7O14P2+": sign first, optional magnitude. A sign is
# REQUIRED so that the trailing element count of a neutral formula is never
# mistaken for a charge.
_SIGNED_CHARGE_RE = re.compile(r"^(.*?)\s*([+-]\d*)$")
_ELEMENT_RE = re.compile(r"([A-Z][a-z]?)(\d*)")


def _split_charge(text):
    """Split *text* into ``(base, sign, magnitude)``; None if no charge found."""
    for pattern in (_BRACKETED_CHARGE_RE, _SPACED_CHARGE_RE, _SIGNED_CHARGE_RE):
        match = pattern.match(text)
        if match:
            base, token = match.groups()
            digits = token.strip("+-")
            sign = -1 if "-" in token else 1
            return base.strip(), sign, int(digits) if digits else 1
    return None


def _neutralize_formula(fmt):
    """Remove an explicit charge from a formula by adjusting its H count.

    A negative charge adds that many hydrogens, a positive charge removes
    them. Formulas without a recognizable charge marker are returned
    unchanged.
    """
    if not fmt:
        return fmt
    text = fmt.strip().rstrip(" \t\n\r,.;)")
    # Keep a superscript charge separate from the preceding element count
    # before flattening it to ASCII ("O7" + superscript "3-" -> "O7 3-").
    text = _SUPERSCRIPT_CHARGE_RE.sub(
        lambda m: " " + m.group(0).translate(_SUPERSCRIPT_TO_ASCII), text
    )
    text = text.translate(_SUPERSCRIPT_TO_ASCII)

    parsed = _split_charge(text)
    if parsed is None:
        return fmt
    base, sign, magnitude = parsed

    tokens = _ELEMENT_RE.findall(base)
    if not tokens:
        # Could not parse the element counts: return the base without charge.
        return base

    order = []
    counts = {}
    for element, number in tokens:
        counts[element] = counts.get(element, 0) + (int(number) if number else 1)
        if element not in order:
            order.append(element)

    if sign == -1:
        counts["H"] = counts.get("H", 0) + magnitude
        if "H" not in order:
            # Place H right after C when present, otherwise first.
            order.insert(order.index("C") + 1 if "C" in order else 0, "H")
    elif "H" in counts:
        counts["H"] -= magnitude
        if counts["H"] <= 0:
            del counts["H"]
            order.remove("H")
    # A positive charge on a formula without H is left as the bare base.

    return "".join(
        f"{element}{counts[element] if counts[element] != 1 else ''}"
        for element in order
        if element in counts
    )


def _neutralize_smiles(smiles_str):
    """Return a charge-neutralized SMILES using RDKit, if it is available.

    For every atom with a positive formal charge up to that many attached
    hydrogens are removed; for every atom with a negative formal charge that
    many hydrogens are added. The formal charge is then reset to zero. The
    input is returned unchanged when RDKit is missing, the SMILES cannot be
    parsed, or any RDKit call fails.
    """
    if not smiles_str or Chem is None:
        return smiles_str
    try:
        mol = Chem.MolFromSmiles(smiles_str, sanitize=True)
        if mol is None:
            return smiles_str
        # Explicit hydrogens are needed so they can be removed by index.
        editable = Chem.RWMol(Chem.AddHs(mol))
        doomed = []
        for atom in list(editable.GetAtoms()):
            atom_idx = atom.GetIdx()
            charge = atom.GetFormalCharge()
            if charge > 0:
                hydrogens = [n.GetIdx() for n in atom.GetNeighbors() if n.GetSymbol() == "H"]
                doomed.extend(hydrogens[: min(len(hydrogens), charge)])
            elif charge < 0:
                for _ in range(-charge):
                    new_idx = editable.AddAtom(Chem.Atom("H"))
                    editable.AddBond(atom_idx, new_idx, Chem.BondType.SINGLE)
            editable.GetAtomWithIdx(atom_idx).SetFormalCharge(0)

        # Remove from the highest index down so earlier indices stay valid.
        for h_idx in sorted(set(doomed), reverse=True):
            try:
                editable.RemoveAtom(h_idx)
            except Exception:
                pass

        neutral = editable.GetMol()
        try:
            Chem.SanitizeMol(neutral)
        except Exception:
            # Best effort: continue with the unsanitized molecule.
            pass
        try:
            neutral = Chem.RemoveHs(neutral)
        except Exception:
            pass
        return Chem.MolToSmiles(neutral, isomericSmiles=True)
    except Exception:
        # RDKit raises several unrelated exception types; any failure means
        # "keep what PubChem gave us".
        return smiles_str


def generate_csv(out_path: str = "central_carbon_metabolites.csv", *, names=None, lookup=None):
    """Resolve compound names and write them to a CSV under masster/data/libs.

    Args:
        out_path: Only the file name component is used; the file is always
            written to ``masster/data/libs`` relative to the current working
            directory (the directory is created if needed).
        names: Iterable of compound names. Defaults to ``CCM_METABOLITES``.
        lookup: Callable mapping a name to ``(formula, smiles, inchikey)``.
            Defaults to ``fetch_from_pubchem``; if ``requests`` is not
            installed every name resolves to ``(None, None, None)``.

    Returns:
        The path of the CSV that was written. Its columns are Name, Formula,
        SMILES and InChIKey; unresolved values are written as empty strings.
    """
    if names is None:
        names = CCM_METABOLITES
    if lookup is None:
        if requests is not None:
            lookup = fetch_from_pubchem
        else:
            def lookup(_name):
                return (None, None, None)

    rows = []
    for name in names:
        formula, smiles, inchikey = lookup(name)

        if formula:
            try:
                formula = _neutralize_formula(formula)
            except (AttributeError, TypeError, ValueError):
                # Unexpected formula value: keep it as returned by the lookup.
                pass
        if smiles:
            smiles = _neutralize_smiles(smiles)

        rows.append({
            "Name": name,
            "Formula": formula or "",
            "SMILES": smiles or "",
            "InChIKey": inchikey or "",
        })

    # Ensure the output directory exists (data/libs).
    out_dir = os.path.join("masster", "data", "libs")
    os.makedirs(out_dir, exist_ok=True)
    out_path_full = os.path.join(out_dir, os.path.basename(out_path))

    fieldnames = ["Name", "Formula", "SMILES", "InChIKey"]
    with open(out_path_full, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"Wrote {len(rows)} entries to {out_path_full}")
    return out_path_full
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def test_load_with_lib(csv_path: str):
    """Smoke-test a generated CSV by importing it with masster.lib.Lib.

    Prints a short status message for every outcome. Returns True when
    ``Lib.import_csv`` succeeds, and False when ``masster.lib`` cannot be
    imported or the import of the CSV raises.
    """
    try:
        from masster.lib import Lib
    except Exception as import_error:
        print(f"Cannot import masster.lib.Lib: {import_error}")
        return False

    try:
        library = Lib()
        # polarity=None is passed so the import is not restricted to one polarity.
        library.import_csv(csv_path, polarity=None)
        print(f"Lib loaded: {len(library)} entries")
        try:
            preview = library.lib_df.select(["name", "formula", "adduct", "mz"]).head(8)
            print(preview)
        except Exception:
            # The preview is optional; the expected columns may be absent.
            pass
    except Exception as load_error:
        print(f"Failed to load CSV with Lib.import_csv: {load_error}")
        return False

    return True
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
if __name__ == "__main__":
    # Generate the CSV, then verify it can be loaded; exit code 2 on failure.
    generated_csv = generate_csv()
    if not test_load_with_lib(generated_csv):
        print("Test failed; please inspect messages above.")
        sys.exit(2)
    print("Done.")
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
central_carbon_metabolites.csv
|
|
2
|
+
|
|
3
|
+
This folder contains example compound lists used by the masster package.
|
|
4
|
+
|
|
5
|
+
Files:
|
|
6
|
+
- central_carbon_metabolites.csv: a best-effort list of central carbon metabolism related
|
|
7
|
+
compounds (glycolysis, TCA cycle, pentose phosphate pathway, amino acids, organic acids,
|
|
8
|
+
nucleotides, fatty acids, cofactors). The CSV was generated by `ccm.py` which:
|
|
9
|
+
- Uses a curated name list included in the script.
|
|
10
|
+
- Resolves structural identifiers (MolecularFormula, CanonicalSMILES, InChIKey) via PubChem PUG-REST
|
|
11
|
+
with retries and basic normalization.
|
|
12
|
+
- Writes a CSV with columns: Name, Formula, SMILES, InChIKey.
|
|
13
|
+
|
|
14
|
+
Notes:
|
|
15
|
+
- This is a programmatically generated list. For authoritative lists, use database bulk downloads
|
|
16
|
+
(HMDB, KEGG, ChEBI) and provide explicit IDs. Prefer HMDB or ChEBI for metabolomics work.
|
|
17
|
+
- Some compound names may be ambiguous; verify entries before use.
|