masster 0.3.19__tar.gz → 0.3.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

Files changed (84) hide show
  1. {masster-0.3.19 → masster-0.3.20}/PKG-INFO +1 -1
  2. {masster-0.3.19 → masster-0.3.20}/pyproject.toml +1 -1
  3. {masster-0.3.19 → masster-0.3.20}/src/masster/__init__.py +2 -0
  4. {masster-0.3.19 → masster-0.3.20}/src/masster/_version.py +1 -1
  5. masster-0.3.20/src/masster/data/libs/README.md +17 -0
  6. masster-0.3.20/src/masster/data/libs/ccm.py +533 -0
  7. masster-0.3.20/src/masster/data/libs/central_carbon_README.md +17 -0
  8. masster-0.3.20/src/masster/data/libs/central_carbon_metabolites.csv +120 -0
  9. masster-0.3.20/src/masster/data/libs/urine.py +333 -0
  10. masster-0.3.20/src/masster/data/libs/urine_metabolites.csv +51 -0
  11. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/lib.py +32 -25
  12. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/load.py +7 -1
  13. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/plot.py +111 -26
  14. {masster-0.3.19 → masster-0.3.20}/src/masster/study/helpers.py +230 -6
  15. {masster-0.3.19 → masster-0.3.20}/src/masster/study/plot.py +457 -182
  16. {masster-0.3.19 → masster-0.3.20}/src/masster/study/study.py +4 -0
  17. {masster-0.3.19 → masster-0.3.20}/uv.lock +1 -1
  18. {masster-0.3.19 → masster-0.3.20}/.github/workflows/publish.yml +0 -0
  19. {masster-0.3.19 → masster-0.3.20}/.github/workflows/security.yml +0 -0
  20. {masster-0.3.19 → masster-0.3.20}/.github/workflows/test.yml +0 -0
  21. {masster-0.3.19 → masster-0.3.20}/.gitignore +0 -0
  22. {masster-0.3.19 → masster-0.3.20}/.pre-commit-config.yaml +0 -0
  23. {masster-0.3.19 → masster-0.3.20}/LICENSE +0 -0
  24. {masster-0.3.19 → masster-0.3.20}/Makefile +0 -0
  25. {masster-0.3.19 → masster-0.3.20}/README.md +0 -0
  26. {masster-0.3.19 → masster-0.3.20}/TESTING.md +0 -0
  27. {masster-0.3.19 → masster-0.3.20}/demo/example_batch_process.py +0 -0
  28. {masster-0.3.19 → masster-0.3.20}/demo/example_sample_process.py +0 -0
  29. {masster-0.3.19 → masster-0.3.20}/src/masster/chromatogram.py +0 -0
  30. {masster-0.3.19/src/masster/data/examples → masster-0.3.20/src/masster/data/wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
  31. {masster-0.3.19/src/masster/data/examples → masster-0.3.20/src/masster/data/wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  32. {masster-0.3.19/src/masster/data/examples → masster-0.3.20/src/masster/data/wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  33. {masster-0.3.19/src/masster/data/examples → masster-0.3.20/src/masster/data/wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  34. {masster-0.3.19/src/masster/data/examples → masster-0.3.20/src/masster/data/wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  35. {masster-0.3.19 → masster-0.3.20}/src/masster/logger.py +0 -0
  36. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/__init__.py +0 -0
  37. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/defaults/__init__.py +0 -0
  38. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  39. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/defaults/find_features_def.py +0 -0
  40. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  41. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  42. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/defaults/sample_def.py +0 -0
  43. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/h5.py +0 -0
  44. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/helpers.py +0 -0
  45. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/parameters.py +0 -0
  46. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/processing.py +0 -0
  47. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/quant.py +0 -0
  48. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/sample.py +0 -0
  49. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/sample5_schema.json +0 -0
  50. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/save.py +0 -0
  51. {masster-0.3.19 → masster-0.3.20}/src/masster/sample/sciex.py +0 -0
  52. {masster-0.3.19 → masster-0.3.20}/src/masster/spectrum.py +0 -0
  53. {masster-0.3.19 → masster-0.3.20}/src/masster/study/__init__.py +0 -0
  54. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/__init__.py +0 -0
  55. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/align_def.py +0 -0
  56. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/export_def.py +0 -0
  57. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/fill_chrom_def.py +0 -0
  58. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/fill_def.py +0 -0
  59. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/find_consensus_def.py +0 -0
  60. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/find_ms2_def.py +0 -0
  61. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  62. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/integrate_def.py +0 -0
  63. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/merge_def.py +0 -0
  64. {masster-0.3.19 → masster-0.3.20}/src/masster/study/defaults/study_def.py +0 -0
  65. {masster-0.3.19 → masster-0.3.20}/src/masster/study/export.py +0 -0
  66. {masster-0.3.19 → masster-0.3.20}/src/masster/study/h5.py +0 -0
  67. {masster-0.3.19 → masster-0.3.20}/src/masster/study/helpers_optimized.py +0 -0
  68. {masster-0.3.19 → masster-0.3.20}/src/masster/study/load.py +0 -0
  69. {masster-0.3.19 → masster-0.3.20}/src/masster/study/parameters.py +0 -0
  70. {masster-0.3.19 → masster-0.3.20}/src/masster/study/processing.py +0 -0
  71. {masster-0.3.19 → masster-0.3.20}/src/masster/study/save.py +0 -0
  72. {masster-0.3.19 → masster-0.3.20}/src/masster/study/study5_schema.json +0 -0
  73. {masster-0.3.19 → masster-0.3.20}/tests/conftest.py +0 -0
  74. {masster-0.3.19 → masster-0.3.20}/tests/test_chromatogram.py +0 -0
  75. {masster-0.3.19 → masster-0.3.20}/tests/test_defaults.py +0 -0
  76. {masster-0.3.19 → masster-0.3.20}/tests/test_imports.py +0 -0
  77. {masster-0.3.19 → masster-0.3.20}/tests/test_integration.py +0 -0
  78. {masster-0.3.19 → masster-0.3.20}/tests/test_logger.py +0 -0
  79. {masster-0.3.19 → masster-0.3.20}/tests/test_parameters.py +0 -0
  80. {masster-0.3.19 → masster-0.3.20}/tests/test_sample.py +0 -0
  81. {masster-0.3.19 → masster-0.3.20}/tests/test_spectrum.py +0 -0
  82. {masster-0.3.19 → masster-0.3.20}/tests/test_study.py +0 -0
  83. {masster-0.3.19 → masster-0.3.20}/tests/test_version.py +0 -0
  84. {masster-0.3.19 → masster-0.3.20}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: masster
3
- Version: 0.3.19
3
+ Version: 0.3.20
4
4
  Summary: Mass spectrometry data analysis package
5
5
  Project-URL: homepage, https://github.com/zamboni-lab/masster
6
6
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -1,7 +1,7 @@
1
1
 
2
2
  [project]
3
3
  name = "masster"
4
- version = "0.3.19"
4
+ version = "0.3.20"
5
5
  description = "Mass spectrometry data analysis package"
6
6
  authors = [
7
7
  { name = "Zamboni Lab" }
@@ -12,6 +12,7 @@ from masster._version import __version__
12
12
 
13
13
  # from masster._version import get_version
14
14
  from masster.chromatogram import Chromatogram
15
+ from masster.lib import Lib
15
16
  from masster.sample.sample import Sample
16
17
  from masster.spectrum import Spectrum
17
18
  from masster.study.study import Study
@@ -19,6 +20,7 @@ from masster.study.study import Study
19
20
 
20
21
  __all__ = [
21
22
  "Chromatogram",
23
+ "Lib",
22
24
  "Sample",
23
25
  "Spectrum",
24
26
  "Study",
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
 
4
- __version__ = "0.3.19"
4
+ __version__ = "0.3.20"
5
5
 
6
6
 
7
7
  def get_version():
@@ -0,0 +1,17 @@
1
+ metabolite lib CSVs
2
+
3
+ This folder contains programmatically generated metabolite lists used as example libs for the masster package.
4
+
5
+ Files (generated by the scripts in this folder):
6
+ - `urine_metabolites.csv` - best-effort list of metabolites reported in human urine (generated by `urine.py`).
7
+ - `central_carbon_metabolites.csv` - curated central carbon metabolism compound list (generated by `ccm.py`).
8
+
9
+ Generation method:
10
+ - Names were curated in the scripts and resolved to structural identifiers via PubChem PUG-REST.
11
+ - `urine.py` attempts to parse local HMDB XML (if present) before falling back to the HMDB web listing.
12
+ - Both scripts use retries and basic normalization to improve PubChem matching.
13
+
14
+ Notes & recommendations:
15
+ - For authoritative resource lists, download HMDB / ChEBI / KEGG bulk data and map IDs (preferred).
16
+ - Respect HMDB licensing and attribution when using HMDB data.
17
+ - Verify ambiguous or missing entries before use in production analyses.
@@ -0,0 +1,533 @@
1
+ """Generate a cleaned CSV of central-carbon metabolism compounds.
2
+
3
+ Workflow:
4
+ - Use a curated list of central-carbon/metabolism-relevant names (glycolysis, TCA, PPP,
5
+ amino acids, common organic acids, nucleotides, fatty acids, cofactors, sugars).
6
+ - Query PubChem's PUG-REST for MolecularFormula, CanonicalSMILES and InChIKey for each name
7
+ with retries and basic name normalization to improve matching.
8
+ - Save results to `masster/data/libs/central_carbon_metabolites.csv` (relative to the current working directory).
9
+ - Test loading with `masster.lib.Lib.import_csv`.
10
+
11
+ This is a best-effort programmatic lookup; ambiguous names may not resolve (those rows will
12
+ have empty Formula/SMILES/InChIKey). For authoritative lists, prefer curated databases
13
+ (e.g., HMDB, KEGG) and bulk downloads.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import csv
19
+ import sys
20
+ import time
21
+ import os
22
+ import re
23
+ from urllib.parse import quote
24
+
25
+ try:
26
+ import requests
27
+ except Exception:
28
+ requests = None
29
+
30
+ try:
31
+ from rdkit import Chem
32
+ except Exception:
33
+ Chem = None
34
+
35
+ try:
36
+ from rdkit import Chem
37
+ except Exception:
38
+ Chem = None
39
+
40
+
41
# Curated compound names that generate_csv() resolves through PubChem, one CSV
# row per entry, in the order listed here. Each string is used both as the
# lookup query and as the "Name" column of the output.
#
# NOTE(review): several entries look like synonyms of one another
# ("Pyruvate" / "Pyruvic acid", "Lactate" / "Lactic acid",
# "3-Hydroxybutyrate" / "Beta-hydroxybutyrate", "Pantothenic acid" /
# "Pantothenate") and may resolve to the same or closely related records;
# "Pentose" names a compound class rather than a single compound. Confirm
# whether these are intentional before relying on row uniqueness.
CCM_METABOLITES = [
    # Central carbon metabolism core (glycolysis, TCA, PPP, gluconeogenesis, pyruvate metabolism)
    "Glucose",
    "Glucose-6-phosphate",
    "Fructose-6-phosphate",
    "Fructose-1,6-bisphosphate",
    "Glyceraldehyde-3-phosphate",
    "Dihydroxyacetone phosphate",
    "3-Phosphoglycerate",
    "2-Phosphoglycerate",
    "Phosphoenolpyruvate",
    "Pyruvate",
    "Lactate",
    "Acetyl-CoA",
    "Citric acid",
    "Isocitrate",
    "Alpha-ketoglutaric acid",
    "Succinyl-CoA",
    "Succinic acid",
    "Fumaric acid",
    "Malic acid",
    "Oxaloacetic acid",
    "Ribose-5-phosphate",
    "Ribulose-5-phosphate",
    "Sedoheptulose-7-phosphate",
    "Erythrose-4-phosphate",
    "Sedoheptulose-1,7-bisphosphate",
    "Glycerol-3-phosphate",
    "Glycerate",
    "Pentose",
    "Acetaldehyde",
    "Acetic acid",

    # Proteinogenic amino acids (20 standard)
    "Alanine",
    "Arginine",
    "Asparagine",
    "Aspartic acid",
    "Cysteine",
    "Glutamic acid",
    "Glutamine",
    "Glycine",
    "Histidine",
    "Isoleucine",
    "Leucine",
    "Lysine",
    "Methionine",
    "Phenylalanine",
    "Proline",
    "Serine",
    "Threonine",
    "Tryptophan",
    "Tyrosine",
    "Valine",
    # Additional amino acid related metabolites
    "Ornithine",
    "Citrulline",
    "Homocysteine",
    "S-adenosylmethionine",
    "S-adenosylhomocysteine",

    # Common organic acids / intermediates & related small metabolites
    "Formic acid",
    "Propionic acid",
    "Butyric acid",
    "Malonic acid",
    "2-Hydroxyglutarate",
    "3-Hydroxybutyrate",
    "Acetoacetate",
    "Beta-hydroxybutyrate",
    "Pyruvic acid",
    "Lactic acid",

    # Fatty acids (common)
    "Myristic acid",
    "Palmitic acid",
    "Stearic acid",
    "Palmitoleic acid",
    "Oleic acid",
    "Linoleic acid",
    "Alpha-linolenic acid",
    "Arachidonic acid",

    # Nucleobases and nucleosides
    "Adenine",
    "Guanine",
    "Cytosine",
    "Thymine",
    "Uracil",
    "Adenosine",
    "Guanosine",
    "Cytidine",
    "Uridine",

    # Nucleotides (mono/di/tri)
    "AMP",
    "ADP",
    "ATP",
    "GMP",
    "GDP",
    "GTP",
    "CMP",
    "CDP",
    "CTP",
    "UMP",
    "UDP",
    "UTP",

    # Cofactors / common metabolites
    "NAD+",
    "NADH",
    "NADP+",
    "NADPH",
    "FAD",
    "FMN",
    "Coenzyme A",
    "Pantothenic acid",
    "Riboflavin",
    "Niacin",

    # Sugar and sugar derivatives
    "Fructose",
    "Mannose",
    "Mannose-6-phosphate",
    "Ribose",
    "Glucosamine",
    "N-acetylglucosamine",

    # Other common metabolites
    "Choline",
    "Betaine",
    "Carnitine",
    "Phosphocholine",
    "Glycerol",
    "Sorbitol",
    "Inositol",
    "Cholesterol",
    "Pantothenate",
]
180
+
181
+
182
# Base of every PUG-REST compound request issued by this module.
_PUBCHEM_COMPOUND_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"
# Property list requested from every "/property/" endpoint.
_PUBCHEM_PROPERTIES = "MolecularFormula,CanonicalSMILES,InChI,InChIKey"

# Abbreviation -> full name used as the PubChem query. Keys MUST be upper-case
# because the lookup upper-cases the cleaned name before consulting the table
# (a mixed-case key such as "CoA" would never match).
_PUBCHEM_ABBREVIATIONS = {
    "AMP": "Adenosine monophosphate",
    "ADP": "Adenosine diphosphate",
    "ATP": "Adenosine triphosphate",
    "GMP": "Guanosine monophosphate",
    "GDP": "Guanosine diphosphate",
    "GTP": "Guanosine triphosphate",
    "NAD+": "Nicotinamide adenine dinucleotide",
    "NADH": "Nicotinamide adenine dinucleotide (reduced)",
    "COA": "Coenzyme A",
}


def _normalize_pubchem_name(name):
    """Return a cleaned-up query string for a compound name."""
    if not name:
        return name
    cleaned = re.sub(r"\(.*?\)", "", name)  # drop parenthesised qualifiers
    cleaned = cleaned.replace("\u2013", "-")  # en dash -> ASCII hyphen
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return _PUBCHEM_ABBREVIATIONS.get(cleaned.upper(), cleaned)


def _pubchem_property_url(namespace, identifier):
    """Build the property-table URL for a name / inchikey / cid lookup."""
    return (
        f"{_PUBCHEM_COMPOUND_URL}/{namespace}/{quote(str(identifier))}/property/"
        + f"{_PUBCHEM_PROPERTIES}/JSON"
    )


def _pubchem_get_json(url):
    """GET *url* and return the decoded JSON body, or None on any failure."""
    try:
        response = requests.get(url, timeout=15)
        if response.status_code != 200:
            return None
        return response.json()
    except (requests.RequestException, ValueError):
        # Network problems and undecodable bodies are both "no answer".
        return None


def _first_pubchem_record(payload):
    """Return the first entry of PropertyTable.Properties, or None."""
    if not isinstance(payload, dict):
        return None
    table = payload.get("PropertyTable")
    if not isinstance(table, dict):
        return None
    records = table.get("Properties")
    if not isinstance(records, list) or not records:
        return None
    first = records[0]
    return first if isinstance(first, dict) else None


def _record_smiles(record):
    """Return the SMILES string stored in a property record, if any."""
    # NOTE(review): PubChem is understood to return the requested
    # CanonicalSMILES property under the key "ConnectivitySMILES" in newer
    # responses; accepting both is harmless - confirm against the PUG-REST
    # property table.
    return record.get("CanonicalSMILES") or record.get("ConnectivitySMILES")


def _recover_smiles(inchi, inchikey):
    """Try progressively more expensive ways to obtain a missing SMILES."""
    smiles = None

    # 1) dedicated InChIKey endpoint
    if inchikey:
        by_key = _first_pubchem_record(
            _pubchem_get_json(_pubchem_property_url("inchikey", inchikey))
        )
        if by_key is not None:
            smiles = _record_smiles(by_key)
            inchi = inchi or by_key.get("InChI")

    # 2) local InChI -> SMILES conversion when RDKit is importable
    if not smiles and inchi and Chem is not None:
        try:
            mol = Chem.MolFromInchi(inchi)
            if mol is not None:
                smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
        except Exception:
            # best effort: fall through to the CID lookup
            smiles = None

    # 3) resolve the InChIKey to CIDs and ask for each record (first 5 only)
    if not smiles and inchikey:
        listing = _pubchem_get_json(
            f"{_PUBCHEM_COMPOUND_URL}/inchikey/{quote(inchikey)}/cids/JSON"
        )
        cids = []
        if isinstance(listing, dict) and isinstance(listing.get("IdentifierList"), dict):
            cids = listing["IdentifierList"].get("CID") or []
        for cid in cids[:5]:
            by_cid = _first_pubchem_record(
                _pubchem_get_json(_pubchem_property_url("cid", cid))
            )
            if by_cid is not None:
                smiles = _record_smiles(by_cid)
                if smiles:
                    break

    return smiles


def _identifiers_from_record(record):
    """Turn one property record into a (formula, smiles, inchikey) tuple."""
    inchikey = record.get("InChIKey")
    smiles = _record_smiles(record) or _recover_smiles(record.get("InChI"), inchikey)
    return (record.get("MolecularFormula"), smiles, inchikey)


def fetch_from_pubchem(name: str):
    """Fetch formula, SMILES and InChIKey from PubChem by compound name.

    The name is normalized (parenthesised text removed, en dash folded to a
    hyphen, whitespace collapsed, a few abbreviations expanded) and looked up
    through PUG-REST. The lookup is attempted up to three times, sleeping
    ``1 + 2 ** i`` seconds between attempts. If the normalized query never
    yields a record and differs from the raw name, the raw name is tried once.
    When a record has no SMILES, it is recovered via the InChIKey endpoint,
    an RDKit InChI conversion (if RDKit is available), or a CID lookup.

    Args:
        name: Compound name or abbreviation.

    Returns:
        ``(formula, smiles, inchikey)``; any element may be ``None``.
        ``(None, None, None)`` when ``requests`` is unavailable, the name is
        empty, or nothing could be resolved. Never raises for network errors.
    """
    props = (None, None, None)
    if requests is None or not name or not name.strip():
        return props

    query = _normalize_pubchem_name(name)

    attempts = 3
    if query:
        for attempt in range(attempts):
            record = _first_pubchem_record(
                _pubchem_get_json(_pubchem_property_url("name", query))
            )
            if record is not None:
                return _identifiers_from_record(record)
            # back off before the next try, but do not stall after the last one
            if attempt + 1 < attempts:
                time.sleep(1 + 2 ** attempt)

    # final fallback: the raw name, only when it is a different query
    if query != name:
        record = _first_pubchem_record(
            _pubchem_get_json(_pubchem_property_url("name", name))
        )
        if record is not None:
            return _identifiers_from_record(record)

    return props
323
+
324
+
325
# Unicode superscripts that may decorate a charge, folded to ASCII.
_SUPERSCRIPT_TO_ASCII = str.maketrans({
    "²": "2",
    "³": "3",
    "¹": "1",
    "⁺": "+",
    "⁻": "-",
})
# A trailing run of superscripts is always a charge, never an element count.
_TRAILING_SUPERSCRIPTS_RE = re.compile(r"([¹²³⁺⁻]+)\s*$")
# Trailing charge. "-3", "+2", "+", "-" are accepted anywhere; the
# magnitude-first spellings "3-" / "2+" only when set apart from the formula
# by "]" or whitespace, because in "P2+" the 2 is an element count.
_TRAILING_CHARGE_RE = re.compile(r"(?:(?<=[\]\s])(\d+)([+-])|([+-])(\d*))\s*$")
_ELEMENT_COUNT_RE = re.compile(r"([A-Z][a-z]?)(\d*)")
_PLAIN_FORMULA_RE = re.compile(r"(?:[A-Z][a-z]?\d*)+")


def _neutralize_formula(fmt):
    """Return *fmt* with a trailing charge removed and hydrogens adjusted.

    A negative charge of magnitude n adds n hydrogens; a positive charge
    removes n hydrogens (dropping H entirely if the count reaches zero, and
    leaving the counts alone when the formula has no H). Formulas without an
    explicit signed charge - including every neutral formula that merely ends
    in a digit - are returned unchanged, as is anything that is not a flat
    element/count sequence (parentheses, hydrates, ...).
    """
    if not fmt or not isinstance(fmt, str):
        return fmt

    text = fmt.strip()
    # keep superscript charges separable from a preceding element count
    text = _TRAILING_SUPERSCRIPTS_RE.sub(r" \1", text).translate(_SUPERSCRIPT_TO_ASCII)
    # strip trailing punctuation or separators (commas, periods, parentheses)
    text = text.rstrip(" \t\n\r,.;)")

    match = _TRAILING_CHARGE_RE.search(text)
    if not match:
        return fmt

    digits = match.group(1) or match.group(4)
    sign_char = match.group(2) or match.group(3)
    magnitude = int(digits) if digits else 1

    base = text[: match.start()].strip()
    if base.startswith("[") and base.endswith("]"):
        base = base[1:-1].strip()
    if not _PLAIN_FORMULA_RE.fullmatch(base):
        # not something this simple parser understands; do not mangle it
        return fmt

    order = []
    counts = {}
    for element, number in _ELEMENT_COUNT_RE.findall(base):
        counts[element] = counts.get(element, 0) + (int(number) if number else 1)
        if element not in order:
            order.append(element)

    if sign_char == "-":
        # anion -> protonate
        counts["H"] = counts.get("H", 0) + magnitude
        if "H" not in order:
            # place H after C if present, else at the beginning
            order.insert(order.index("C") + 1 if "C" in order else 0, "H")
    elif "H" in counts:
        # cation -> deprotonate as far as hydrogens are available
        counts["H"] -= magnitude
        if counts["H"] <= 0:
            del counts["H"]
            order.remove("H")

    return "".join(
        f"{element}{counts[element] if counts[element] != 1 else ''}"
        for element in order
        if element in counts
    )


def _neutralize_smiles(smiles_str):
    """Return a SMILES with formal charges cancelled by adding/removing H.

    Requires RDKit; without it (or for an unparsable SMILES) the input is
    returned unchanged. For each atom, a positive formal charge q removes up
    to q attached hydrogens, a negative charge q adds -q hydrogens, and the
    formal charge is reset to zero. If the edited molecule then fails RDKit
    sanitization - presumably because a charged atom had no hydrogen to give
    up - the original SMILES is returned instead of a SMILES for an invalid
    structure.
    """
    if not smiles_str or Chem is None:
        return smiles_str
    try:
        mol = Chem.MolFromSmiles(smiles_str, sanitize=True)
        if mol is None:
            return smiles_str

        # work on a read-write copy with hydrogens as real atoms
        rw = Chem.RWMol(Chem.AddHs(mol))
        doomed_hydrogens = []
        for atom in list(rw.GetAtoms()):
            idx = atom.GetIdx()
            charge = atom.GetFormalCharge()
            if charge > 0:
                h_neighbors = [
                    nbr.GetIdx() for nbr in atom.GetNeighbors() if nbr.GetSymbol() == "H"
                ]
                doomed_hydrogens.extend(h_neighbors[:charge])
            elif charge < 0:
                for _ in range(-charge):
                    new_idx = rw.AddAtom(Chem.Atom("H"))
                    rw.AddBond(idx, new_idx, Chem.BondType.SINGLE)
            rw.GetAtomWithIdx(idx).SetFormalCharge(0)

        # remove from the highest index down so remaining indices stay valid
        for h_idx in sorted(set(doomed_hydrogens), reverse=True):
            try:
                rw.RemoveAtom(h_idx)
            except Exception:
                pass

        neutral = rw.GetMol()
        try:
            Chem.SanitizeMol(neutral)
        except Exception:
            # the edit produced an invalid structure; keep what PubChem gave us
            return smiles_str
        try:
            neutral = Chem.RemoveHs(neutral)
        except Exception:
            pass
        return Chem.MolToSmiles(neutral, isomericSmiles=True)
    except Exception:
        return smiles_str


def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
    """Resolve every name in CCM_METABOLITES and write the results as CSV.

    Each name is looked up with ``fetch_from_pubchem`` (skipped when
    ``requests`` is unavailable), the formula and SMILES are neutralized, and
    one row per name is written with the columns Name, Formula, SMILES,
    InChIKey. Unresolved values are written as empty strings.

    Args:
        out_path: Only the file-name component is used; the file is always
            created inside ``masster/data/libs`` relative to the current
            working directory (the directory is created if needed).

    Returns:
        The path of the file that was written.
    """
    rows = []
    for name in CCM_METABOLITES:
        formula = smiles = inchikey = None
        if requests is not None:
            formula, smiles, inchikey = fetch_from_pubchem(name)
        rows.append({
            "Name": name,
            "Formula": _neutralize_formula(formula) or "",
            "SMILES": _neutralize_smiles(smiles) or "",
            "InChIKey": inchikey or "",
        })

    # Ensure output directory exists (data/libs)
    out_dir = os.path.join('masster', 'data', 'libs')
    os.makedirs(out_dir, exist_ok=True)
    out_path_full = os.path.join(out_dir, os.path.basename(out_path))

    fieldnames = ["Name", "Formula", "SMILES", "InChIKey"]
    with open(out_path_full, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"Wrote {len(rows)} entries to {out_path_full}")
    return out_path_full
500
+
501
+
502
def test_load_with_lib(csv_path: str):
    """Check that ``masster.lib.Lib.import_csv`` can read *csv_path*.

    Prints a short report and returns True when the import succeeds, False
    when ``masster.lib`` cannot be imported or loading the CSV raises.
    """
    try:
        from masster.lib import Lib
    except Exception as import_error:
        print(f"Cannot import masster.lib.Lib: {import_error}")
        return False

    loaded = False
    try:
        library = Lib()
        # polarity=None is passed explicitly, as in the original call
        library.import_csv(csv_path, polarity=None)
        print(f"Lib loaded: {len(library)} entries")
        loaded = True
    except Exception as load_error:
        print(f"Failed to load CSV with Lib.import_csv: {load_error}")

    if loaded:
        # Preview a few rows; a library without these columns is not an error.
        try:
            preview = library.lib_df.select(["name", "formula", "adduct", "mz"]).head(8)
            print(preview)
        except Exception:
            pass

    return loaded
525
+
526
+
527
if __name__ == "__main__":
    # Regenerate the CSV, then confirm masster can read it back.
    csv_file = generate_csv()
    ok = test_load_with_lib(csv_file)
    if ok:
        print("Done.")
    else:
        print("Test failed; please inspect messages above.")
        sys.exit(2)
@@ -0,0 +1,17 @@
1
+ central_carbon_metabolites.csv
2
+
3
+ This folder contains example compound lists used by the masster package.
4
+
5
+ Files:
6
+ - central_carbon_metabolites.csv: a best-effort list of central carbon metabolism related
7
+ compounds (glycolysis, TCA cycle, pentose phosphate pathway, amino acids, organic acids,
8
+ nucleotides, fatty acids, cofactors). The CSV was generated by `ccm.py` which:
9
+ - Uses a curated name list included in the script.
10
+ - Resolves structural identifiers (MolecularFormula, CanonicalSMILES, InChIKey) via PubChem PUG-REST
11
+ with retries and basic normalization.
12
+ - Writes a CSV with columns: Name, Formula, SMILES, InChIKey.
13
+
14
+ Notes:
15
+ - This is a programmatically generated list. For authoritative lists, use database bulk downloads
16
+ (HMDB, KEGG, ChEBI) and provide explicit IDs. Prefer HMDB or ChEBI for metabolomics work.
17
+ - Some compound names may be ambiguous; verify entries before use.