masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/data/libs/ccm.py CHANGED
@@ -5,8 +5,8 @@ Workflow:
5
5
  amino acids, common organic acids, nucleotides, fatty acids, cofactors, sugars).
6
6
  - Query PubChem's PUG-REST for MolecularFormula, CanonicalSMILES and InChIKey for each name
7
7
  with retries and basic name normalization to improve matching.
8
- - Save results to `masster/data/examples/central_carbon_metabolites.csv`.
9
- - Test loading with `master.lib.Lib.import_csv`.
8
+ - Save results to `masster/data/examples/ccm.csv`.
9
+ - Test loading with `masster.lib.Lib.import_csv`.
10
10
 
11
11
  This is a best-effort programmatic lookup; ambiguous names may not resolve (those rows will
12
12
  have empty Formula/SMILES/InChIKey). For authoritative lists, prefer curated databases
@@ -70,6 +70,7 @@ CCM_METABOLITES = [
70
70
  "Pentose",
71
71
  "Acetaldehyde",
72
72
  "Acetic acid",
73
+
73
74
  # Proteinogenic amino acids (20 standard)
74
75
  "Alanine",
75
76
  "Arginine",
@@ -97,6 +98,7 @@ CCM_METABOLITES = [
97
98
  "Homocysteine",
98
99
  "S-adenosylmethionine",
99
100
  "S-adenosylhomocysteine",
101
+
100
102
  # Common organic acids / intermediates & related small metabolites
101
103
  "Formic acid",
102
104
  "Propionic acid",
@@ -108,6 +110,7 @@ CCM_METABOLITES = [
108
110
  "Beta-hydroxybutyrate",
109
111
  "Pyruvic acid",
110
112
  "Lactic acid",
113
+
111
114
  # Fatty acids (common)
112
115
  "Myristic acid",
113
116
  "Palmitic acid",
@@ -117,6 +120,7 @@ CCM_METABOLITES = [
117
120
  "Linoleic acid",
118
121
  "Alpha-linolenic acid",
119
122
  "Arachidonic acid",
123
+
120
124
  # Nucleobases and nucleosides
121
125
  "Adenine",
122
126
  "Guanine",
@@ -127,6 +131,7 @@ CCM_METABOLITES = [
127
131
  "Guanosine",
128
132
  "Cytidine",
129
133
  "Uridine",
134
+
130
135
  # Nucleotides (mono/di/tri)
131
136
  "AMP",
132
137
  "ADP",
@@ -140,6 +145,7 @@ CCM_METABOLITES = [
140
145
  "UMP",
141
146
  "UDP",
142
147
  "UTP",
148
+
143
149
  # Cofactors / common metabolites
144
150
  "NAD+",
145
151
  "NADH",
@@ -151,6 +157,7 @@ CCM_METABOLITES = [
151
157
  "Pantothenic acid",
152
158
  "Riboflavin",
153
159
  "Niacin",
160
+
154
161
  # Sugar and sugar derivatives
155
162
  "Fructose",
156
163
  "Mannose",
@@ -158,6 +165,7 @@ CCM_METABOLITES = [
158
165
  "Ribose",
159
166
  "Glucosamine",
160
167
  "N-acetylglucosamine",
168
+
161
169
  # Other common metabolites
162
170
  "Choline",
163
171
  "Betaine",
@@ -171,13 +179,40 @@ CCM_METABOLITES = [
171
179
  ]
172
180
 
173
181
 
182
+ def canonicalize_smiles(smiles_str: str) -> str:
183
+ """
184
+ Canonicalize SMILES string using RDKit.
185
+
186
+ Args:
187
+ smiles_str: Input SMILES string
188
+
189
+ Returns:
190
+ Canonical SMILES string, or original string if canonicalization fails
191
+ """
192
+ if not smiles_str or not smiles_str.strip() or Chem is None:
193
+ return smiles_str
194
+
195
+ try:
196
+ mol = Chem.MolFromSmiles(smiles_str, sanitize=True)
197
+ if mol is None:
198
+ return smiles_str
199
+
200
+ # Generate canonical SMILES with isomeric information
201
+ canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
202
+ return canonical_smiles
203
+
204
+ except Exception:
205
+ # If canonicalization fails, return the original SMILES
206
+ return smiles_str
207
+
208
+
174
209
  def fetch_from_pubchem(name: str):
175
- """Fetch formula, smiles and inchikey from PubChem by compound name.
210
+ """Fetch formula, smiles, inchikey, and CID from PubChem by compound name.
176
211
 
177
212
  Uses basic normalization and retries with exponential backoff. Returns
178
- (formula, smiles, inchikey) or (None, None, None) on failure.
213
+ (formula, smiles, inchikey, cid) or (None, None, None, None) on failure.
179
214
  """
180
- props = (None, None, None)
215
+ props = (None, None, None, None)
181
216
 
182
217
  def normalize_name(n: str) -> str:
183
218
  if not n:
@@ -219,6 +254,21 @@ def fetch_from_pubchem(name: str):
219
254
  return None
220
255
  return None
221
256
 
257
+ def try_query_with_cid(q: str):
258
+ """Query compound by name and get CID along with properties."""
259
+ url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(q)}/cids/JSON"
260
+ try:
261
+ r = requests.get(url, timeout=15)
262
+ if r.status_code == 200:
263
+ j = r.json()
264
+ if 'IdentifierList' in j and 'CID' in j['IdentifierList']:
265
+ cids = j['IdentifierList']['CID']
266
+ if cids:
267
+ return cids[0] # Return the first CID
268
+ except Exception:
269
+ return None
270
+ return None
271
+
222
272
  def try_query_inchikey(ik: str):
223
273
  url = (
224
274
  f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{quote(ik)}/property/"
@@ -251,8 +301,8 @@ def fetch_from_pubchem(name: str):
251
301
  r = requests.get(url, timeout=15)
252
302
  if r.status_code == 200:
253
303
  j = r.json()
254
- if "IdentifierList" in j and "CID" in j["IdentifierList"]:
255
- return j["IdentifierList"]["CID"]
304
+ if 'IdentifierList' in j and 'CID' in j['IdentifierList']:
305
+ return j['IdentifierList']['CID']
256
306
  except Exception:
257
307
  return []
258
308
  return []
@@ -262,7 +312,13 @@ def fetch_from_pubchem(name: str):
262
312
 
263
313
  # exponential backoff attempts
264
314
  attempts = 3
315
+ cid = None
316
+
265
317
  for i in range(attempts):
318
+ # First try to get the CID
319
+ if not cid:
320
+ cid = try_query_with_cid(query)
321
+
266
322
  j = try_query(query)
267
323
  if j:
268
324
  try:
@@ -275,11 +331,7 @@ def fetch_from_pubchem(name: str):
275
331
  # if SMILES missing, try a lookup by InChIKey (dedicated endpoint)
276
332
  if not sm and ik:
277
333
  j2 = try_query_inchikey(ik)
278
- if (
279
- j2
280
- and "PropertyTable" in j2
281
- and "Properties" in j2["PropertyTable"]
282
- ):
334
+ if j2 and "PropertyTable" in j2 and "Properties" in j2["PropertyTable"]:
283
335
  p2 = j2["PropertyTable"]["Properties"][0]
284
336
  sm = p2.get("CanonicalSMILES") or sm
285
337
  inchi = inchi or p2.get("InChI")
@@ -296,38 +348,39 @@ def fetch_from_pubchem(name: str):
296
348
  # if still no SMILES, try fetching CIDs from InChIKey and query a CID record
297
349
  if not sm and ik:
298
350
  cids = try_get_cids_from_inchikey(ik)
299
- for cid in (cids or [])[:5]:
300
- j3 = try_query_cid(cid)
301
- if (
302
- j3
303
- and "PropertyTable" in j3
304
- and "Properties" in j3["PropertyTable"]
305
- ):
351
+ for cid_from_ik in (cids or [])[:5]:
352
+ if not cid: # Only set CID if we don't have one yet
353
+ cid = cid_from_ik
354
+ j3 = try_query_cid(cid_from_ik)
355
+ if j3 and "PropertyTable" in j3 and "Properties" in j3["PropertyTable"]:
306
356
  p3 = j3["PropertyTable"]["Properties"][0]
307
357
  sm = p3.get("CanonicalSMILES") or sm
308
358
  if sm:
309
359
  break
310
360
 
311
- return (mf, sm, ik)
361
+ return (mf, sm, ik, cid)
312
362
  except Exception:
313
363
  pass
314
- time.sleep(1 + 2**i)
364
+ time.sleep(1 + 2 ** i)
315
365
 
316
366
  # final fallback: try raw name without normalization
367
+ if not cid:
368
+ cid = try_query_with_cid(name)
369
+
317
370
  j = try_query(name)
318
371
  if j and "PropertyTable" in j and "Properties" in j["PropertyTable"]:
319
372
  p = j["PropertyTable"]["Properties"][0]
320
- return (p.get("MolecularFormula"), p.get("CanonicalSMILES"), p.get("InChIKey"))
373
+ return (p.get("MolecularFormula"), p.get("CanonicalSMILES"), p.get("InChIKey"), cid)
321
374
 
322
375
  return props
323
376
 
324
377
 
325
- def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
378
+ def generate_csv(out_path: str = "ccm.csv"):
326
379
  rows = []
327
380
  for name in CCM_METABOLITES:
328
- formula, smiles, inchikey = (None, None, None)
381
+ formula, smiles, inchikey, cid = (None, None, None, None)
329
382
  if requests is not None:
330
- formula, smiles, inchikey = fetch_from_pubchem(name)
383
+ formula, smiles, inchikey, cid = fetch_from_pubchem(name)
331
384
 
332
385
  # Neutralize charged molecular formulas (e.g., trailing +, -, 2+, 3-)
333
386
  # by adjusting the hydrogen count accordingly and removing the explicit charge.
@@ -336,15 +389,13 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
336
389
  return fmt
337
390
  s = fmt.strip()
338
391
  # normalize common unicode superscripts (²³¹⁺⁻) to ascii
339
- sup_map = str.maketrans(
340
- {
341
- "²": "2",
342
- "³": "3",
343
- "¹": "1",
344
- "⁺": "+",
345
- "⁻": "-",
346
- },
347
- )
392
+ sup_map = str.maketrans({
393
+ "²": "2",
394
+ "³": "3",
395
+ "¹": "1",
396
+ "⁺": "+",
397
+ "⁻": "-",
398
+ })
348
399
  s = s.translate(sup_map)
349
400
  # Remove enclosing brackets if present, e.g. [C6H5O7]2-
350
401
  if s.startswith("[") and s.endswith("]"):
@@ -361,17 +412,13 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
361
412
  # determine magnitude and sign for patterns like '2-' or '-2' or '+2' or '3+'
362
413
  sign = 1
363
414
  mag = 1
364
- if charge_str[0] in "+-":
415
+ if charge_str[0] in '+-':
365
416
  # formats like '-2' or '+2' or '-' or '+'
366
- sign = -1 if charge_str[0] == "-" else 1
367
- mag = (
368
- int(charge_str[1:])
369
- if len(charge_str) > 1 and charge_str[1:].isdigit()
370
- else 1
371
- )
372
- elif charge_str[-1] in "+-":
417
+ sign = -1 if charge_str[0] == '-' else 1
418
+ mag = int(charge_str[1:]) if len(charge_str) > 1 and charge_str[1:].isdigit() else 1
419
+ elif charge_str[-1] in '+-':
373
420
  # formats like '2-' or '3+'
374
- sign = -1 if charge_str[-1] == "-" else 1
421
+ sign = -1 if charge_str[-1] == '-' else 1
375
422
  mag = int(charge_str[:-1]) if charge_str[:-1].isdigit() else 1
376
423
 
377
424
  # parse element counts from base formula
@@ -414,7 +461,7 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
414
461
  for el in elems:
415
462
  if el in counts:
416
463
  n = counts[el]
417
- parts.append(f"{el}{n if n != 1 else ''}")
464
+ parts.append(f"{el}{n if n!=1 else ''}")
418
465
  new_formula = "".join(parts)
419
466
  return new_formula
420
467
 
@@ -443,11 +490,7 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
443
490
  q = a.GetFormalCharge()
444
491
  if q > 0:
445
492
  # remove up to q hydrogen neighbors (by index)
446
- h_neighbors = [
447
- nbr.GetIdx()
448
- for nbr in a.GetNeighbors()
449
- if nbr.GetSymbol() == "H"
450
- ]
493
+ h_neighbors = [nbr.GetIdx() for nbr in a.GetNeighbors() if nbr.GetSymbol() == "H"]
451
494
  remove = h_neighbors[: min(len(h_neighbors), q)]
452
495
  to_remove.extend(remove)
453
496
  elif q < 0:
@@ -486,21 +529,32 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
486
529
  smiles = neutralize_smiles(smiles) if smiles else smiles
487
530
  except Exception:
488
531
  pass
489
- rows.append(
490
- {
491
- "Name": name,
492
- "Formula": formula or "",
493
- "SMILES": smiles or "",
494
- "InChIKey": inchikey or "",
495
- },
496
- )
532
+
533
+ # Canonicalize SMILES after neutralization
534
+ try:
535
+ smiles = canonicalize_smiles(smiles) if smiles else smiles
536
+ except Exception:
537
+ pass
538
+
539
+ # Format the database ID and database name
540
+ db_id = f"CID:{cid}" if cid else ""
541
+ db = "pubchem" if cid else ""
542
+
543
+ rows.append({
544
+ "Name": name,
545
+ "Formula": formula or "",
546
+ "SMILES": smiles or "",
547
+ "InChIKey": inchikey or "",
548
+ "db_id": db_id,
549
+ "db": db,
550
+ })
497
551
 
498
552
  # Ensure output directory exists (data/libs)
499
- out_dir = os.path.join("master", "data", "libs")
553
+ out_dir = os.path.join('masster', 'data', 'libs')
500
554
  os.makedirs(out_dir, exist_ok=True)
501
555
  out_path_full = os.path.join(out_dir, os.path.basename(out_path))
502
556
 
503
- fieldnames = ["Name", "Formula", "SMILES", "InChIKey"]
557
+ fieldnames = ["Name", "Formula", "SMILES", "InChIKey", "db_id", "db"]
504
558
  with open(out_path_full, "w", newline="", encoding="utf-8") as f:
505
559
  writer = csv.DictWriter(f, fieldnames=fieldnames)
506
560
  writer.writeheader()
@@ -512,11 +566,11 @@ def generate_csv(out_path: str = "central_carbon_metabolites.csv"):
512
566
 
513
567
 
514
568
  def test_load_with_lib(csv_path: str):
515
- """Try to load the generated CSV using master.lib.Lib.import_csv."""
569
+ """Try to load the generated CSV using masster.lib.Lib.import_csv."""
516
570
  try:
517
- from master.lib import Lib
571
+ from masster.lib import Lib
518
572
  except Exception as e:
519
- print(f"Cannot import master.lib.Lib: {e}")
573
+ print(f"Cannot import masster.lib.Lib: {e}")
520
574
  return False
521
575
 
522
576
  try:
@@ -1,6 +1,6 @@
1
1
  central_carbon_metabolites.csv
2
2
 
3
- This folder contains example compound lists used by the master package.
3
+ This folder contains example compound lists used by the masster package.
4
4
 
5
5
  Files:
6
6
  - central_carbon_metabolites.csv: a best-effort list of central carbon metabolism related