masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,20 +3,22 @@
3
3
  Simple XML -> CSV parser for urine metabolites.
4
4
 
5
5
  Produces `masster/data/libs/urine_metabolites.csv` with columns:
6
- name,smiles,inchikey,formula
6
+ name,smiles,inchikey,formula,db_id,db
7
7
 
8
8
  Usage:
9
9
  uv run python masster/data/libs/urine.py [path/to/urine_metabolites.xml]
10
10
 
11
11
  If no argument is given the script uses masster/data/libs/urine_metabolites.xml
12
12
  """
13
-
14
13
  from __future__ import annotations
15
14
 
16
15
  import csv
17
16
  import sys
17
+ import time
18
18
  from pathlib import Path
19
19
  import xml.etree.ElementTree as ET
20
+ import requests
21
+ from urllib.parse import quote
20
22
 
21
23
  try:
22
24
  from rdkit import Chem
@@ -26,26 +28,20 @@ except Exception:
26
28
 
27
29
  # Determine repo root (three levels up: .../masster/data/libs -> repo root)
28
30
  WORKSPACE_ROOT = Path(__file__).resolve().parents[3]
29
- DEFAULT_XML = WORKSPACE_ROOT / "master" / "data" / "libs" / "urine_metabolites.xml"
30
- OUT_CSV = WORKSPACE_ROOT / "master" / "data" / "libs" / "urine_metabolites.csv"
31
+ DEFAULT_XML = WORKSPACE_ROOT / 'masster' / 'data' / 'libs' / 'urine_metabolites.xml'
32
+ OUT_CSV = WORKSPACE_ROOT / 'masster' / 'data' / 'libs' / 'urine_metabolites.csv'
31
33
 
32
34
 
33
35
  def local(tag: str) -> str:
34
- return tag.split("}")[-1] if tag else ""
36
+ return tag.split('}')[-1] if tag else ''
35
37
 
36
38
 
37
- NAME_KEYS = {
38
- "name",
39
- "approved_name",
40
- "primary_name",
41
- "common_name",
42
- "metabolite_name",
43
- "accession",
44
- }
45
- FORMULA_KEYS = {"formula", "chemical_formula", "molecular_formula"}
46
- SMILES_KEYS = {"smiles", "canonical_smiles", "isomeric_smiles"}
47
- INCHIKEY_KEYS = {"inchikey", "inchi_key", "standard_inchikey"}
48
- INCHI_KEYS = {"inchi", "standard_inchi"}
39
+ NAME_KEYS = {'name', 'approved_name', 'primary_name', 'common_name', 'metabolite_name', 'accession'}
40
+ FORMULA_KEYS = {'formula', 'chemical_formula', 'molecular_formula'}
41
+ SMILES_KEYS = {'smiles', 'canonical_smiles', 'isomeric_smiles'}
42
+ INCHIKEY_KEYS = {'inchikey', 'inchi_key', 'standard_inchikey'}
43
+ INCHI_KEYS = {'inchi', 'standard_inchi'}
44
+ PUBCHEM_KEYS = {'pubchem_compound_id', 'pubchem_cid'}
49
45
 
50
46
 
51
47
  def find_first_text(elem, candidates):
@@ -57,7 +53,7 @@ def find_first_text(elem, candidates):
57
53
  s = t.strip()
58
54
  if s:
59
55
  return s
60
- return ""
56
+ return ''
61
57
 
62
58
 
63
59
  def find_name(elem):
@@ -65,14 +61,7 @@ def find_name(elem):
65
61
 
66
62
  Priority: name, approved_name, primary_name, common_name, metabolite_name, accession
67
63
  """
68
- priority = [
69
- "name",
70
- "approved_name",
71
- "primary_name",
72
- "common_name",
73
- "metabolite_name",
74
- "accession",
75
- ]
64
+ priority = ['name', 'approved_name', 'primary_name', 'common_name', 'metabolite_name', 'accession']
76
65
  for key in priority:
77
66
  for child in elem.iter():
78
67
  if local(child.tag).lower() == key:
@@ -81,58 +70,139 @@ def find_name(elem):
81
70
  s = t.strip()
82
71
  if s:
83
72
  return s
84
- return ""
73
+ return ''
85
74
 
86
75
 
87
76
  def inchi_to_smiles(inchi_text: str) -> str:
88
77
  if not inchi_text or Chem is None:
89
- return ""
78
+ return ''
90
79
  try:
91
80
  mol = Chem.MolFromInchi(inchi_text, sanitize=False)
92
81
  if mol is None:
93
82
  mol = Chem.MolFromInchi(inchi_text)
94
83
  if mol is None:
95
- return ""
84
+ return ''
96
85
  try:
97
86
  Chem.SanitizeMol(mol)
98
87
  except Exception:
99
88
  pass
100
89
  return Chem.MolToSmiles(mol, isomericSmiles=True)
101
90
  except Exception:
102
- return ""
91
+ return ''
92
+
93
+
94
def canonicalize_smiles(smiles_str: str) -> str:
    """Return the canonical, isomeric form of a SMILES string via RDKit.

    The input is handed back unchanged whenever canonicalization cannot be
    performed: the value is empty or whitespace-only, ``Chem`` is ``None``
    (presumably because the RDKit import failed -- confirm against the
    module's import guard), RDKit cannot parse the string, or RDKit raises.

    Args:
        smiles_str: Input SMILES string.

    Returns:
        The canonical SMILES string, or ``smiles_str`` itself when
        canonicalization is not possible.
    """
    is_blank = not smiles_str or not smiles_str.strip()
    if is_blank or Chem is None:
        return smiles_str

    try:
        parsed = Chem.MolFromSmiles(smiles_str, sanitize=True)
        # MolFromSmiles signals an unparsable string by returning None.
        canonical = (
            None
            if parsed is None
            else Chem.MolToSmiles(parsed, isomericSmiles=True, canonical=True)
        )
    except Exception:
        # Best effort: any RDKit failure leaves the caller's string untouched.
        return smiles_str

    return smiles_str if canonical is None else canonical
119
+
120
+
121
def query_pubchem_by_inchikey(inchikey: str, max_retries: int = 3, delay: float = 0.2) -> str:
    """Look up the PubChem compound ID (CID) for an InChIKey.

    Transient failures (network errors, HTTP statuses other than 200/404)
    are retried with exponential backoff. Definitive answers -- a 404, or a
    200 response that carries no CID or an unreadable body -- end the lookup
    immediately instead of repeating the same request.

    Args:
        inchikey: The InChIKey to search for; surrounding whitespace is ignored.
        max_retries: Maximum number of request attempts.
        delay: Pause in seconds before every request; also the base of the
            exponential backoff applied between failed attempts.

    Returns:
        The first CID reported by PubChem as a string, or an empty string if
        the key is blank, nothing was found, or every attempt failed.
    """
    key = inchikey.strip() if inchikey else ''
    if not key:
        return ''

    # PubChem PUG REST endpoint: InChIKey -> list of CIDs, as JSON.
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{quote(key)}/cids/JSON"

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1

        try:
            time.sleep(delay)  # Rate limiting: pause before every request
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            # Network-level failure: back off and retry unless out of attempts.
            if not is_last_attempt:
                time.sleep(delay * (2 ** attempt))
            continue
        except Exception:
            # Unexpected failure: keep the never-raises contract and give up.
            return ''

        status = response.status_code
        if status == 404:
            # Not found in PubChem
            return ''
        if status != 200:
            # Other HTTP error (e.g. throttling, server fault): might retry.
            if not is_last_attempt:
                time.sleep(delay * (2 ** attempt))
            continue

        # A 200 response is definitive; asking again would give the same answer.
        try:
            payload = response.json()
        except ValueError:
            # Body was not valid JSON.
            return ''

        identifiers = payload.get('IdentifierList') if isinstance(payload, dict) else None
        cids = identifiers.get('CID') if isinstance(identifiers, dict) else None
        if isinstance(cids, (list, tuple)) and cids:
            return str(cids[0])  # Return the first CID
        return ''

    return ''
103
170
 
104
171
 
105
- def parse_and_write(xml_path: Path, out_csv: Path) -> tuple[int, int, int]:
172
+ def parse_and_write(xml_path: Path, out_csv: Path) -> tuple[int, int, int, int, int]:
106
173
  out_csv.parent.mkdir(parents=True, exist_ok=True)
107
174
  total = 0
108
175
  with_smiles = 0
109
176
  with_inchikey = 0
177
+ with_pubchem_id = 0
178
+ existing_pubchem_ids = 0
110
179
 
111
- with out_csv.open("w", encoding="utf-8", newline="") as outf:
112
- writer = csv.DictWriter(
113
- outf,
114
- fieldnames=["name", "smiles", "inchikey", "formula"],
115
- )
180
+ with out_csv.open('w', encoding='utf-8', newline='') as outf:
181
+ writer = csv.DictWriter(outf, fieldnames=['name', 'smiles', 'inchikey', 'formula', 'db_id', 'db'])
116
182
  writer.writeheader()
117
183
 
118
- context = ET.iterparse(str(xml_path), events=("end",))
184
+ context = ET.iterparse(str(xml_path), events=('end',))
119
185
  for event, elem in context:
120
186
  tag = local(elem.tag).lower()
121
- has_candidate = any(
122
- local(ch.tag).lower()
123
- in NAME_KEYS | FORMULA_KEYS | SMILES_KEYS | INCHIKEY_KEYS
124
- for ch in elem
125
- )
126
- if tag in {"metabolite", "compound", "entry", "record"} or has_candidate:
187
+
188
+ # Only process actual metabolite entries (root level compounds)
189
+ if tag == 'metabolite':
127
190
  name = find_name(elem)
128
191
  formula = find_first_text(elem, FORMULA_KEYS)
192
+
193
+ # Only generate rows for entries that have a formula
194
+ if not formula:
195
+ elem.clear()
196
+ continue
197
+
129
198
  smiles = find_first_text(elem, SMILES_KEYS)
130
199
  inchikey = find_first_text(elem, INCHIKEY_KEYS)
131
200
  inchi = find_first_text(elem, INCHI_KEYS)
201
+ existing_pubchem_id = find_first_text(elem, PUBCHEM_KEYS)
132
202
 
133
203
  if not smiles:
134
204
  for ch in elem.iter():
135
- if ch.text and "smiles" in local(ch.tag).lower():
205
+ if ch.text and 'smiles' in local(ch.tag).lower():
136
206
  s = ch.text.strip()
137
207
  if s:
138
208
  smiles = s
@@ -140,7 +210,7 @@ def parse_and_write(xml_path: Path, out_csv: Path) -> tuple[int, int, int]:
140
210
 
141
211
  if not inchikey:
142
212
  for ch in elem.iter():
143
- if ch.text and "inchikey" in local(ch.tag).lower():
213
+ if ch.text and 'inchikey' in local(ch.tag).lower():
144
214
  s = ch.text.strip()
145
215
  if s:
146
216
  inchikey = s
@@ -151,24 +221,50 @@ def parse_and_write(xml_path: Path, out_csv: Path) -> tuple[int, int, int]:
151
221
  if smi:
152
222
  smiles = smi
153
223
 
154
- if name or formula or smiles or inchikey:
155
- writer.writerow(
156
- {
157
- "name": name,
158
- "smiles": smiles,
159
- "inchikey": inchikey,
160
- "formula": formula,
161
- },
162
- )
163
- total += 1
164
- if smiles:
165
- with_smiles += 1
166
- if inchikey:
167
- with_inchikey += 1
224
+ # Determine database ID - check XML first, then query PubChem if needed
225
+ db_id = ''
226
+ db = ''
227
+ if existing_pubchem_id:
228
+ # Use existing PubChem ID from XML
229
+ db_id = f"CID:{existing_pubchem_id}"
230
+ db = 'pubchem'
231
+ with_pubchem_id += 1
232
+ existing_pubchem_ids += 1
233
+ print(f"Processing {total + 1}: Found existing PubChem CID for {name or 'Unknown'}: {existing_pubchem_id}")
234
+ elif inchikey:
235
+ # Query PubChem only if no existing ID
236
+ print(f"Processing {total + 1}: Querying PubChem for {name or 'Unknown'} ({inchikey[:14]}...)...")
237
+ pubchem_id = query_pubchem_by_inchikey(inchikey)
238
+ if pubchem_id:
239
+ db_id = f"CID:{pubchem_id}"
240
+ db = 'pubchem'
241
+ with_pubchem_id += 1
242
+ print(f" -> Found CID: {pubchem_id}")
243
+ else:
244
+ print(f" -> No PubChem match found")
245
+
246
+ # Canonicalize SMILES if present
247
+ if smiles:
248
+ smiles = canonicalize_smiles(smiles)
249
+
250
+ # Write the row for this metabolite
251
+ writer.writerow({
252
+ 'name': name,
253
+ 'smiles': smiles,
254
+ 'inchikey': inchikey,
255
+ 'formula': formula,
256
+ 'db_id': db_id,
257
+ 'db': db
258
+ })
259
+ total += 1
260
+ if smiles:
261
+ with_smiles += 1
262
+ if inchikey:
263
+ with_inchikey += 1
168
264
 
169
265
  elem.clear()
170
266
 
171
- return total, with_smiles, with_inchikey
267
+ return total, with_smiles, with_inchikey, with_pubchem_id, existing_pubchem_ids
172
268
 
173
269
 
174
270
  def main(args):
@@ -176,15 +272,15 @@ def main(args):
176
272
  if not xml_path.exists():
177
273
  print(f"XML not found: {xml_path}")
178
274
  return 2
179
- total, with_smiles, with_inchikey = parse_and_write(xml_path, OUT_CSV)
275
+ total, with_smiles, with_inchikey, with_pubchem_id, existing_pubchem_ids = parse_and_write(xml_path, OUT_CSV)
180
276
  print(f"Wrote {total} rows to {OUT_CSV}")
181
- print(f"with_smiles={with_smiles} with_inchikey={with_inchikey}")
277
+ print(f"with_smiles={with_smiles} with_inchikey={with_inchikey} with_pubchem_id={with_pubchem_id}")
278
+ print(f"existing_pubchem_ids={existing_pubchem_ids} queried_pubchem_ids={with_pubchem_id - existing_pubchem_ids}")
182
279
 
183
280
  # optional quick import check
184
281
  try:
185
282
  sys.path.insert(0, str(WORKSPACE_ROOT))
186
- from master.lib import Lib # type: ignore
187
-
283
+ from masster.lib import Lib # type: ignore
188
284
  try:
189
285
  lib = Lib(str(OUT_CSV))
190
286
  print(f"Successfully imported {len(lib)} library entries from {OUT_CSV}")
@@ -196,5 +292,5 @@ def main(args):
196
292
  return 0
197
293
 
198
294
 
199
- if __name__ == "__main__":
295
+ if __name__ == '__main__':
200
296
  sys.exit(main(sys.argv[1:]))