masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/data/libs/ccm.py
CHANGED
|
@@ -5,8 +5,8 @@ Workflow:
|
|
|
5
5
|
amino acids, common organic acids, nucleotides, fatty acids, cofactors, sugars).
|
|
6
6
|
- Query PubChem's PUG-REST for MolecularFormula, CanonicalSMILES and InChIKey for each name
|
|
7
7
|
with retries and basic name normalization to improve matching.
|
|
8
|
-
- Save results to `masster/data/examples/
|
|
9
|
-
- Test loading with `
|
|
8
|
+
- Save results to `masster/data/examples/ccm.csv`.
|
|
9
|
+
- Test loading with `masster.lib.Lib.import_csv`.
|
|
10
10
|
|
|
11
11
|
This is a best-effort programmatic lookup; ambiguous names may not resolve (those rows will
|
|
12
12
|
have empty Formula/SMILES/InChIKey). For authoritative lists, prefer curated databases
|
|
@@ -70,6 +70,7 @@ CCM_METABOLITES = [
|
|
|
70
70
|
"Pentose",
|
|
71
71
|
"Acetaldehyde",
|
|
72
72
|
"Acetic acid",
|
|
73
|
+
|
|
73
74
|
# Proteinogenic amino acids (20 standard)
|
|
74
75
|
"Alanine",
|
|
75
76
|
"Arginine",
|
|
@@ -97,6 +98,7 @@ CCM_METABOLITES = [
|
|
|
97
98
|
"Homocysteine",
|
|
98
99
|
"S-adenosylmethionine",
|
|
99
100
|
"S-adenosylhomocysteine",
|
|
101
|
+
|
|
100
102
|
# Common organic acids / intermediates & related small metabolites
|
|
101
103
|
"Formic acid",
|
|
102
104
|
"Propionic acid",
|
|
@@ -108,6 +110,7 @@ CCM_METABOLITES = [
|
|
|
108
110
|
"Beta-hydroxybutyrate",
|
|
109
111
|
"Pyruvic acid",
|
|
110
112
|
"Lactic acid",
|
|
113
|
+
|
|
111
114
|
# Fatty acids (common)
|
|
112
115
|
"Myristic acid",
|
|
113
116
|
"Palmitic acid",
|
|
@@ -117,6 +120,7 @@ CCM_METABOLITES = [
|
|
|
117
120
|
"Linoleic acid",
|
|
118
121
|
"Alpha-linolenic acid",
|
|
119
122
|
"Arachidonic acid",
|
|
123
|
+
|
|
120
124
|
# Nucleobases and nucleosides
|
|
121
125
|
"Adenine",
|
|
122
126
|
"Guanine",
|
|
@@ -127,6 +131,7 @@ CCM_METABOLITES = [
|
|
|
127
131
|
"Guanosine",
|
|
128
132
|
"Cytidine",
|
|
129
133
|
"Uridine",
|
|
134
|
+
|
|
130
135
|
# Nucleotides (mono/di/tri)
|
|
131
136
|
"AMP",
|
|
132
137
|
"ADP",
|
|
@@ -140,6 +145,7 @@ CCM_METABOLITES = [
|
|
|
140
145
|
"UMP",
|
|
141
146
|
"UDP",
|
|
142
147
|
"UTP",
|
|
148
|
+
|
|
143
149
|
# Cofactors / common metabolites
|
|
144
150
|
"NAD+",
|
|
145
151
|
"NADH",
|
|
@@ -151,6 +157,7 @@ CCM_METABOLITES = [
|
|
|
151
157
|
"Pantothenic acid",
|
|
152
158
|
"Riboflavin",
|
|
153
159
|
"Niacin",
|
|
160
|
+
|
|
154
161
|
# Sugar and sugar derivatives
|
|
155
162
|
"Fructose",
|
|
156
163
|
"Mannose",
|
|
@@ -158,6 +165,7 @@ CCM_METABOLITES = [
|
|
|
158
165
|
"Ribose",
|
|
159
166
|
"Glucosamine",
|
|
160
167
|
"N-acetylglucosamine",
|
|
168
|
+
|
|
161
169
|
# Other common metabolites
|
|
162
170
|
"Choline",
|
|
163
171
|
"Betaine",
|
|
@@ -171,13 +179,40 @@ CCM_METABOLITES = [
|
|
|
171
179
|
]
|
|
172
180
|
|
|
173
181
|
|
|
182
|
+
def canonicalize_smiles(smiles_str: str) -> str:
|
|
183
|
+
"""
|
|
184
|
+
Canonicalize SMILES string using RDKit.
|
|
185
|
+
|
|
186
|
+
Args:
|
|
187
|
+
smiles_str: Input SMILES string
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
Canonical SMILES string, or original string if canonicalization fails
|
|
191
|
+
"""
|
|
192
|
+
if not smiles_str or not smiles_str.strip() or Chem is None:
|
|
193
|
+
return smiles_str
|
|
194
|
+
|
|
195
|
+
try:
|
|
196
|
+
mol = Chem.MolFromSmiles(smiles_str, sanitize=True)
|
|
197
|
+
if mol is None:
|
|
198
|
+
return smiles_str
|
|
199
|
+
|
|
200
|
+
# Generate canonical SMILES with isomeric information
|
|
201
|
+
canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
|
|
202
|
+
return canonical_smiles
|
|
203
|
+
|
|
204
|
+
except Exception:
|
|
205
|
+
# If canonicalization fails, return the original SMILES
|
|
206
|
+
return smiles_str
|
|
207
|
+
|
|
208
|
+
|
|
174
209
|
def fetch_from_pubchem(name: str):
|
|
175
|
-
"""Fetch formula, smiles and
|
|
210
|
+
"""Fetch formula, smiles, inchikey, and CID from PubChem by compound name.
|
|
176
211
|
|
|
177
212
|
Uses basic normalization and retries with exponential backoff. Returns
|
|
178
|
-
(formula, smiles, inchikey) or (None, None, None) on failure.
|
|
213
|
+
(formula, smiles, inchikey, cid) or (None, None, None, None) on failure.
|
|
179
214
|
"""
|
|
180
|
-
props = (None, None, None)
|
|
215
|
+
props = (None, None, None, None)
|
|
181
216
|
|
|
182
217
|
def normalize_name(n: str) -> str:
|
|
183
218
|
if not n:
|
|
@@ -219,6 +254,21 @@ def fetch_from_pubchem(name: str):
|
|
|
219
254
|
return None
|
|
220
255
|
return None
|
|
221
256
|
|
|
257
|
+
def try_query_with_cid(q: str):
|
|
258
|
+
"""Query compound by name and get CID along with properties."""
|
|
259
|
+
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(q)}/cids/JSON"
|
|
260
|
+
try:
|
|
261
|
+
r = requests.get(url, timeout=15)
|
|
262
|
+
if r.status_code == 200:
|
|
263
|
+
j = r.json()
|
|
264
|
+
if 'IdentifierList' in j and 'CID' in j['IdentifierList']:
|
|
265
|
+
cids = j['IdentifierList']['CID']
|
|
266
|
+
if cids:
|
|
267
|
+
return cids[0] # Return the first CID
|
|
268
|
+
except Exception:
|
|
269
|
+
return None
|
|
270
|
+
return None
|
|
271
|
+
|
|
222
272
|
def try_query_inchikey(ik: str):
|
|
223
273
|
url = (
|
|
224
274
|
f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{quote(ik)}/property/"
|
|
@@ -251,8 +301,8 @@ def fetch_from_pubchem(name: str):
|
|
|
251
301
|
r = requests.get(url, timeout=15)
|
|
252
302
|
if r.status_code == 200:
|
|
253
303
|
j = r.json()
|
|
254
|
-
if
|
|
255
|
-
return j[
|
|
304
|
+
if 'IdentifierList' in j and 'CID' in j['IdentifierList']:
|
|
305
|
+
return j['IdentifierList']['CID']
|
|
256
306
|
except Exception:
|
|
257
307
|
return []
|
|
258
308
|
return []
|
|
@@ -262,7 +312,13 @@ def fetch_from_pubchem(name: str):
|
|
|
262
312
|
|
|
263
313
|
# exponential backoff attempts
|
|
264
314
|
attempts = 3
|
|
315
|
+
cid = None
|
|
316
|
+
|
|
265
317
|
for i in range(attempts):
|
|
318
|
+
# First try to get the CID
|
|
319
|
+
if not cid:
|
|
320
|
+
cid = try_query_with_cid(query)
|
|
321
|
+
|
|
266
322
|
j = try_query(query)
|
|
267
323
|
if j:
|
|
268
324
|
try:
|
|
@@ -275,11 +331,7 @@ def fetch_from_pubchem(name: str):
|
|
|
275
331
|
# if SMILES missing, try a lookup by InChIKey (dedicated endpoint)
|
|
276
332
|
if not sm and ik:
|
|
277
333
|
j2 = try_query_inchikey(ik)
|
|
278
|
-
if
|
|
279
|
-
j2
|
|
280
|
-
and "PropertyTable" in j2
|
|
281
|
-
and "Properties" in j2["PropertyTable"]
|
|
282
|
-
):
|
|
334
|
+
if j2 and "PropertyTable" in j2 and "Properties" in j2["PropertyTable"]:
|
|
283
335
|
p2 = j2["PropertyTable"]["Properties"][0]
|
|
284
336
|
sm = p2.get("CanonicalSMILES") or sm
|
|
285
337
|
inchi = inchi or p2.get("InChI")
|
|
@@ -296,38 +348,39 @@ def fetch_from_pubchem(name: str):
|
|
|
296
348
|
# if still no SMILES, try fetching CIDs from InChIKey and query a CID record
|
|
297
349
|
if not sm and ik:
|
|
298
350
|
cids = try_get_cids_from_inchikey(ik)
|
|
299
|
-
for
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
and "Properties" in j3["PropertyTable"]
|
|
305
|
-
):
|
|
351
|
+
for cid_from_ik in (cids or [])[:5]:
|
|
352
|
+
if not cid: # Only set CID if we don't have one yet
|
|
353
|
+
cid = cid_from_ik
|
|
354
|
+
j3 = try_query_cid(cid_from_ik)
|
|
355
|
+
if j3 and "PropertyTable" in j3 and "Properties" in j3["PropertyTable"]:
|
|
306
356
|
p3 = j3["PropertyTable"]["Properties"][0]
|
|
307
357
|
sm = p3.get("CanonicalSMILES") or sm
|
|
308
358
|
if sm:
|
|
309
359
|
break
|
|
310
360
|
|
|
311
|
-
return (mf, sm, ik)
|
|
361
|
+
return (mf, sm, ik, cid)
|
|
312
362
|
except Exception:
|
|
313
363
|
pass
|
|
314
|
-
time.sleep(1 + 2**i)
|
|
364
|
+
time.sleep(1 + 2 ** i)
|
|
315
365
|
|
|
316
366
|
# final fallback: try raw name without normalization
|
|
367
|
+
if not cid:
|
|
368
|
+
cid = try_query_with_cid(name)
|
|
369
|
+
|
|
317
370
|
j = try_query(name)
|
|
318
371
|
if j and "PropertyTable" in j and "Properties" in j["PropertyTable"]:
|
|
319
372
|
p = j["PropertyTable"]["Properties"][0]
|
|
320
|
-
return (p.get("MolecularFormula"), p.get("CanonicalSMILES"), p.get("InChIKey"))
|
|
373
|
+
return (p.get("MolecularFormula"), p.get("CanonicalSMILES"), p.get("InChIKey"), cid)
|
|
321
374
|
|
|
322
375
|
return props
|
|
323
376
|
|
|
324
377
|
|
|
325
|
-
def generate_csv(out_path: str = "
|
|
378
|
+
def generate_csv(out_path: str = "ccm.csv"):
|
|
326
379
|
rows = []
|
|
327
380
|
for name in CCM_METABOLITES:
|
|
328
|
-
formula, smiles, inchikey = (None, None, None)
|
|
381
|
+
formula, smiles, inchikey, cid = (None, None, None, None)
|
|
329
382
|
if requests is not None:
|
|
330
|
-
formula, smiles, inchikey = fetch_from_pubchem(name)
|
|
383
|
+
formula, smiles, inchikey, cid = fetch_from_pubchem(name)
|
|
331
384
|
|
|
332
385
|
# Neutralize charged molecular formulas (e.g., trailing +, -, 2+, 3-)
|
|
333
386
|
# by adjusting the hydrogen count accordingly and removing the explicit charge.
|
|
@@ -336,15 +389,13 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
|
|
|
336
389
|
return fmt
|
|
337
390
|
s = fmt.strip()
|
|
338
391
|
# normalize common unicode superscripts (²³¹⁺⁻) to ascii
|
|
339
|
-
sup_map = str.maketrans(
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
},
|
|
347
|
-
)
|
|
392
|
+
sup_map = str.maketrans({
|
|
393
|
+
"²": "2",
|
|
394
|
+
"³": "3",
|
|
395
|
+
"¹": "1",
|
|
396
|
+
"⁺": "+",
|
|
397
|
+
"⁻": "-",
|
|
398
|
+
})
|
|
348
399
|
s = s.translate(sup_map)
|
|
349
400
|
# Remove enclosing brackets if present, e.g. [C6H5O7]2-
|
|
350
401
|
if s.startswith("[") and s.endswith("]"):
|
|
@@ -361,17 +412,13 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
|
|
|
361
412
|
# determine magnitude and sign for patterns like '2-' or '-2' or '+2' or '3+'
|
|
362
413
|
sign = 1
|
|
363
414
|
mag = 1
|
|
364
|
-
if charge_str[0] in
|
|
415
|
+
if charge_str[0] in '+-':
|
|
365
416
|
# formats like '-2' or '+2' or '-' or '+'
|
|
366
|
-
sign = -1 if charge_str[0] ==
|
|
367
|
-
mag = (
|
|
368
|
-
|
|
369
|
-
if len(charge_str) > 1 and charge_str[1:].isdigit()
|
|
370
|
-
else 1
|
|
371
|
-
)
|
|
372
|
-
elif charge_str[-1] in "+-":
|
|
417
|
+
sign = -1 if charge_str[0] == '-' else 1
|
|
418
|
+
mag = int(charge_str[1:]) if len(charge_str) > 1 and charge_str[1:].isdigit() else 1
|
|
419
|
+
elif charge_str[-1] in '+-':
|
|
373
420
|
# formats like '2-' or '3+'
|
|
374
|
-
sign = -1 if charge_str[-1] ==
|
|
421
|
+
sign = -1 if charge_str[-1] == '-' else 1
|
|
375
422
|
mag = int(charge_str[:-1]) if charge_str[:-1].isdigit() else 1
|
|
376
423
|
|
|
377
424
|
# parse element counts from base formula
|
|
@@ -414,7 +461,7 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
|
|
|
414
461
|
for el in elems:
|
|
415
462
|
if el in counts:
|
|
416
463
|
n = counts[el]
|
|
417
|
-
parts.append(f"{el}{n if n
|
|
464
|
+
parts.append(f"{el}{n if n!=1 else ''}")
|
|
418
465
|
new_formula = "".join(parts)
|
|
419
466
|
return new_formula
|
|
420
467
|
|
|
@@ -443,11 +490,7 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
|
|
|
443
490
|
q = a.GetFormalCharge()
|
|
444
491
|
if q > 0:
|
|
445
492
|
# remove up to q hydrogen neighbors (by index)
|
|
446
|
-
h_neighbors = [
|
|
447
|
-
nbr.GetIdx()
|
|
448
|
-
for nbr in a.GetNeighbors()
|
|
449
|
-
if nbr.GetSymbol() == "H"
|
|
450
|
-
]
|
|
493
|
+
h_neighbors = [nbr.GetIdx() for nbr in a.GetNeighbors() if nbr.GetSymbol() == "H"]
|
|
451
494
|
remove = h_neighbors[: min(len(h_neighbors), q)]
|
|
452
495
|
to_remove.extend(remove)
|
|
453
496
|
elif q < 0:
|
|
@@ -486,21 +529,32 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
|
|
|
486
529
|
smiles = neutralize_smiles(smiles) if smiles else smiles
|
|
487
530
|
except Exception:
|
|
488
531
|
pass
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
532
|
+
|
|
533
|
+
# Canonicalize SMILES after neutralization
|
|
534
|
+
try:
|
|
535
|
+
smiles = canonicalize_smiles(smiles) if smiles else smiles
|
|
536
|
+
except Exception:
|
|
537
|
+
pass
|
|
538
|
+
|
|
539
|
+
# Format the database ID and database name
|
|
540
|
+
db_id = f"CID:{cid}" if cid else ""
|
|
541
|
+
db = "pubchem" if cid else ""
|
|
542
|
+
|
|
543
|
+
rows.append({
|
|
544
|
+
"Name": name,
|
|
545
|
+
"Formula": formula or "",
|
|
546
|
+
"SMILES": smiles or "",
|
|
547
|
+
"InChIKey": inchikey or "",
|
|
548
|
+
"db_id": db_id,
|
|
549
|
+
"db": db,
|
|
550
|
+
})
|
|
497
551
|
|
|
498
552
|
# Ensure output directory exists (data/libs)
|
|
499
|
-
out_dir = os.path.join(
|
|
553
|
+
out_dir = os.path.join('masster', 'data', 'libs')
|
|
500
554
|
os.makedirs(out_dir, exist_ok=True)
|
|
501
555
|
out_path_full = os.path.join(out_dir, os.path.basename(out_path))
|
|
502
556
|
|
|
503
|
-
fieldnames = ["Name", "Formula", "SMILES", "InChIKey"]
|
|
557
|
+
fieldnames = ["Name", "Formula", "SMILES", "InChIKey", "db_id", "db"]
|
|
504
558
|
with open(out_path_full, "w", newline="", encoding="utf-8") as f:
|
|
505
559
|
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
|
506
560
|
writer.writeheader()
|
|
@@ -512,11 +566,11 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
|
|
|
512
566
|
|
|
513
567
|
|
|
514
568
|
def test_load_with_lib(csv_path: str):
|
|
515
|
-
"""Try to load the generated CSV using
|
|
569
|
+
"""Try to load the generated CSV using masster.lib.Lib.import_csv."""
|
|
516
570
|
try:
|
|
517
|
-
from
|
|
571
|
+
from masster.lib import Lib
|
|
518
572
|
except Exception as e:
|
|
519
|
-
print(f"Cannot import
|
|
573
|
+
print(f"Cannot import masster.lib.Lib: {e}")
|
|
520
574
|
return False
|
|
521
575
|
|
|
522
576
|
try:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
central_carbon_metabolites.csv
|
|
2
2
|
|
|
3
|
-
This folder contains example compound lists used by the
|
|
3
|
+
This folder contains example compound lists used by the masster package.
|
|
4
4
|
|
|
5
5
|
Files:
|
|
6
6
|
- central_carbon_metabolites.csv: a best-effort list of central carbon metabolism related
|