masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/data/libs/urine.py
CHANGED
|
@@ -3,20 +3,22 @@
|
|
|
3
3
|
Simple XML -> CSV parser for urine metabolites.
|
|
4
4
|
|
|
5
5
|
Produces `masster/data/libs/urine_metabolites.csv` with columns:
|
|
6
|
-
name,smiles,inchikey,formula
|
|
6
|
+
name,smiles,inchikey,formula,db_id,db
|
|
7
7
|
|
|
8
8
|
Usage:
|
|
9
9
|
uv run python masster/data/libs/urine.py [path/to/urine_metabolites.xml]
|
|
10
10
|
|
|
11
11
|
If no argument is given the script uses masster/data/libs/urine_metabolites.xml
|
|
12
12
|
"""
|
|
13
|
-
|
|
14
13
|
from __future__ import annotations
|
|
15
14
|
|
|
16
15
|
import csv
|
|
17
16
|
import sys
|
|
17
|
+
import time
|
|
18
18
|
from pathlib import Path
|
|
19
19
|
import xml.etree.ElementTree as ET
|
|
20
|
+
import requests
|
|
21
|
+
from urllib.parse import quote
|
|
20
22
|
|
|
21
23
|
try:
|
|
22
24
|
from rdkit import Chem
|
|
@@ -26,26 +28,20 @@ except Exception:
|
|
|
26
28
|
|
|
27
29
|
# Determine repo root (three levels up: .../masster/data/libs -> repo root)
|
|
28
30
|
WORKSPACE_ROOT = Path(__file__).resolve().parents[3]
|
|
29
|
-
DEFAULT_XML = WORKSPACE_ROOT /
|
|
30
|
-
OUT_CSV = WORKSPACE_ROOT /
|
|
31
|
+
DEFAULT_XML = WORKSPACE_ROOT / 'masster' / 'data' / 'libs' / 'urine_metabolites.xml'
|
|
32
|
+
OUT_CSV = WORKSPACE_ROOT / 'masster' / 'data' / 'libs' / 'urine_metabolites.csv'
|
|
31
33
|
|
|
32
34
|
|
|
33
35
|
def local(tag: str) -> str:
|
|
34
|
-
return tag.split(
|
|
36
|
+
return tag.split('}')[-1] if tag else ''
|
|
35
37
|
|
|
36
38
|
|
|
37
|
-
NAME_KEYS = {
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
"accession",
|
|
44
|
-
}
|
|
45
|
-
FORMULA_KEYS = {"formula", "chemical_formula", "molecular_formula"}
|
|
46
|
-
SMILES_KEYS = {"smiles", "canonical_smiles", "isomeric_smiles"}
|
|
47
|
-
INCHIKEY_KEYS = {"inchikey", "inchi_key", "standard_inchikey"}
|
|
48
|
-
INCHI_KEYS = {"inchi", "standard_inchi"}
|
|
39
|
+
NAME_KEYS = {'name', 'approved_name', 'primary_name', 'common_name', 'metabolite_name', 'accession'}
|
|
40
|
+
FORMULA_KEYS = {'formula', 'chemical_formula', 'molecular_formula'}
|
|
41
|
+
SMILES_KEYS = {'smiles', 'canonical_smiles', 'isomeric_smiles'}
|
|
42
|
+
INCHIKEY_KEYS = {'inchikey', 'inchi_key', 'standard_inchikey'}
|
|
43
|
+
INCHI_KEYS = {'inchi', 'standard_inchi'}
|
|
44
|
+
PUBCHEM_KEYS = {'pubchem_compound_id', 'pubchem_cid'}
|
|
49
45
|
|
|
50
46
|
|
|
51
47
|
def find_first_text(elem, candidates):
|
|
@@ -57,7 +53,7 @@ def find_first_text(elem, candidates):
|
|
|
57
53
|
s = t.strip()
|
|
58
54
|
if s:
|
|
59
55
|
return s
|
|
60
|
-
return
|
|
56
|
+
return ''
|
|
61
57
|
|
|
62
58
|
|
|
63
59
|
def find_name(elem):
|
|
@@ -65,14 +61,7 @@ def find_name(elem):
|
|
|
65
61
|
|
|
66
62
|
Priority: name, approved_name, primary_name, common_name, metabolite_name, accession
|
|
67
63
|
"""
|
|
68
|
-
priority = [
|
|
69
|
-
"name",
|
|
70
|
-
"approved_name",
|
|
71
|
-
"primary_name",
|
|
72
|
-
"common_name",
|
|
73
|
-
"metabolite_name",
|
|
74
|
-
"accession",
|
|
75
|
-
]
|
|
64
|
+
priority = ['name', 'approved_name', 'primary_name', 'common_name', 'metabolite_name', 'accession']
|
|
76
65
|
for key in priority:
|
|
77
66
|
for child in elem.iter():
|
|
78
67
|
if local(child.tag).lower() == key:
|
|
@@ -81,58 +70,139 @@ def find_name(elem):
|
|
|
81
70
|
s = t.strip()
|
|
82
71
|
if s:
|
|
83
72
|
return s
|
|
84
|
-
return
|
|
73
|
+
return ''
|
|
85
74
|
|
|
86
75
|
|
|
87
76
|
def inchi_to_smiles(inchi_text: str) -> str:
|
|
88
77
|
if not inchi_text or Chem is None:
|
|
89
|
-
return
|
|
78
|
+
return ''
|
|
90
79
|
try:
|
|
91
80
|
mol = Chem.MolFromInchi(inchi_text, sanitize=False)
|
|
92
81
|
if mol is None:
|
|
93
82
|
mol = Chem.MolFromInchi(inchi_text)
|
|
94
83
|
if mol is None:
|
|
95
|
-
return
|
|
84
|
+
return ''
|
|
96
85
|
try:
|
|
97
86
|
Chem.SanitizeMol(mol)
|
|
98
87
|
except Exception:
|
|
99
88
|
pass
|
|
100
89
|
return Chem.MolToSmiles(mol, isomericSmiles=True)
|
|
101
90
|
except Exception:
|
|
102
|
-
return
|
|
91
|
+
return ''
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def canonicalize_smiles(smiles_str: str) -> str:
    """Return the RDKit canonical (isomeric) form of a SMILES string.

    Args:
        smiles_str: Input SMILES string.

    Returns:
        The canonical SMILES produced by RDKit. The input is handed back
        unchanged when it is empty or blank, when RDKit is not available
        (``Chem`` is None), when RDKit cannot parse it, or when RDKit raises.
    """
    # Blank input is checked first so that ``Chem`` is only consulted when
    # there is actually something to canonicalize.
    has_content = bool(smiles_str) and bool(smiles_str.strip())
    if not has_content or Chem is None:
        return smiles_str

    try:
        parsed = Chem.MolFromSmiles(smiles_str, sanitize=True)
        if parsed is not None:
            # Keep stereo/isotope information in the canonical output.
            return Chem.MolToSmiles(parsed, isomericSmiles=True, canonical=True)
    except Exception:
        # Best effort: any RDKit failure falls back to the original string.
        pass
    return smiles_str
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def query_pubchem_by_inchikey(inchikey: str, max_retries: int = 3, delay: float = 0.2) -> str:
    """Look up the PubChem compound ID (CID) for an InChIKey.

    Sends a GET request to the PubChem PUG REST ``inchikey/<key>/cids/JSON``
    endpoint and returns the first CID listed in the response.

    Args:
        inchikey: The InChIKey to search for; surrounding whitespace is ignored.
        max_retries: Maximum number of request attempts.
        delay: Pause in seconds before every request (rate limiting); it is
            also the base of the exponential backoff after a failed attempt.

    Returns:
        The first matching CID as a string, or '' when the key is empty or
        blank, PubChem answers 404, the response body cannot be interpreted,
        the response lists no CID, or every attempt failed.
    """
    if not inchikey or not inchikey.strip():
        return ''

    # Clean the InChIKey and build the PUG REST lookup URL.
    inchikey = inchikey.strip()
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{quote(inchikey)}/cids/JSON"
    pause = max(delay, 0.0)  # time.sleep() rejects negative values

    for attempt in range(max_retries):
        time.sleep(pause)  # Rate limiting
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            response = None  # transport failure: fall through to the backoff

        if response is not None:
            status = response.status_code
            if status == 404:
                # Not found in PubChem
                return ''
            if status == 200:
                # A definitive answer was received, so never retry from here:
                # re-requesting the same URL cannot produce a different result.
                try:
                    data = response.json()
                except ValueError:
                    # Covers json.JSONDecodeError and requests' own
                    # JSONDecodeError (which is also a RequestException and
                    # must therefore be caught here, not by the retry path).
                    return ''
                id_list = data.get('IdentifierList') if isinstance(data, dict) else None
                cids = id_list.get('CID') if isinstance(id_list, dict) else None
                if isinstance(cids, (list, tuple)) and cids:
                    return str(cids[0])  # Return the first CID
                return ''

        # Any other status or a transport error: back off before retrying.
        if attempt < max_retries - 1:
            time.sleep(pause * (2 ** attempt))

    return ''
|
|
103
170
|
|
|
104
171
|
|
|
105
|
-
def parse_and_write(xml_path: Path, out_csv: Path) -> tuple[int, int, int]:
|
|
172
|
+
def parse_and_write(xml_path: Path, out_csv: Path) -> tuple[int, int, int, int, int]:
|
|
106
173
|
out_csv.parent.mkdir(parents=True, exist_ok=True)
|
|
107
174
|
total = 0
|
|
108
175
|
with_smiles = 0
|
|
109
176
|
with_inchikey = 0
|
|
177
|
+
with_pubchem_id = 0
|
|
178
|
+
existing_pubchem_ids = 0
|
|
110
179
|
|
|
111
|
-
with out_csv.open(
|
|
112
|
-
writer = csv.DictWriter(
|
|
113
|
-
outf,
|
|
114
|
-
fieldnames=["name", "smiles", "inchikey", "formula"],
|
|
115
|
-
)
|
|
180
|
+
with out_csv.open('w', encoding='utf-8', newline='') as outf:
|
|
181
|
+
writer = csv.DictWriter(outf, fieldnames=['name', 'smiles', 'inchikey', 'formula', 'db_id', 'db'])
|
|
116
182
|
writer.writeheader()
|
|
117
183
|
|
|
118
|
-
context = ET.iterparse(str(xml_path), events=(
|
|
184
|
+
context = ET.iterparse(str(xml_path), events=('end',))
|
|
119
185
|
for event, elem in context:
|
|
120
186
|
tag = local(elem.tag).lower()
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
for ch in elem
|
|
125
|
-
)
|
|
126
|
-
if tag in {"metabolite", "compound", "entry", "record"} or has_candidate:
|
|
187
|
+
|
|
188
|
+
# Only process actual metabolite entries (root level compounds)
|
|
189
|
+
if tag == 'metabolite':
|
|
127
190
|
name = find_name(elem)
|
|
128
191
|
formula = find_first_text(elem, FORMULA_KEYS)
|
|
192
|
+
|
|
193
|
+
# Only generate rows for entries that have a formula
|
|
194
|
+
if not formula:
|
|
195
|
+
elem.clear()
|
|
196
|
+
continue
|
|
197
|
+
|
|
129
198
|
smiles = find_first_text(elem, SMILES_KEYS)
|
|
130
199
|
inchikey = find_first_text(elem, INCHIKEY_KEYS)
|
|
131
200
|
inchi = find_first_text(elem, INCHI_KEYS)
|
|
201
|
+
existing_pubchem_id = find_first_text(elem, PUBCHEM_KEYS)
|
|
132
202
|
|
|
133
203
|
if not smiles:
|
|
134
204
|
for ch in elem.iter():
|
|
135
|
-
if ch.text and
|
|
205
|
+
if ch.text and 'smiles' in local(ch.tag).lower():
|
|
136
206
|
s = ch.text.strip()
|
|
137
207
|
if s:
|
|
138
208
|
smiles = s
|
|
@@ -140,7 +210,7 @@ def parse_and_write(xml_path: Path, out_csv: Path) -> tuple[int, int, int]:
|
|
|
140
210
|
|
|
141
211
|
if not inchikey:
|
|
142
212
|
for ch in elem.iter():
|
|
143
|
-
if ch.text and
|
|
213
|
+
if ch.text and 'inchikey' in local(ch.tag).lower():
|
|
144
214
|
s = ch.text.strip()
|
|
145
215
|
if s:
|
|
146
216
|
inchikey = s
|
|
@@ -151,24 +221,50 @@ def parse_and_write(xml_path: Path, out_csv: Path) -> tuple[int, int, int]:
|
|
|
151
221
|
if smi:
|
|
152
222
|
smiles = smi
|
|
153
223
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
total
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
224
|
+
# Determine database ID - check XML first, then query PubChem if needed
|
|
225
|
+
db_id = ''
|
|
226
|
+
db = ''
|
|
227
|
+
if existing_pubchem_id:
|
|
228
|
+
# Use existing PubChem ID from XML
|
|
229
|
+
db_id = f"CID:{existing_pubchem_id}"
|
|
230
|
+
db = 'pubchem'
|
|
231
|
+
with_pubchem_id += 1
|
|
232
|
+
existing_pubchem_ids += 1
|
|
233
|
+
print(f"Processing {total + 1}: Found existing PubChem CID for {name or 'Unknown'}: {existing_pubchem_id}")
|
|
234
|
+
elif inchikey:
|
|
235
|
+
# Query PubChem only if no existing ID
|
|
236
|
+
print(f"Processing {total + 1}: Querying PubChem for {name or 'Unknown'} ({inchikey[:14]}...)...")
|
|
237
|
+
pubchem_id = query_pubchem_by_inchikey(inchikey)
|
|
238
|
+
if pubchem_id:
|
|
239
|
+
db_id = f"CID:{pubchem_id}"
|
|
240
|
+
db = 'pubchem'
|
|
241
|
+
with_pubchem_id += 1
|
|
242
|
+
print(f" -> Found CID: {pubchem_id}")
|
|
243
|
+
else:
|
|
244
|
+
print(f" -> No PubChem match found")
|
|
245
|
+
|
|
246
|
+
# Canonicalize SMILES if present
|
|
247
|
+
if smiles:
|
|
248
|
+
smiles = canonicalize_smiles(smiles)
|
|
249
|
+
|
|
250
|
+
# Write the row for this metabolite
|
|
251
|
+
writer.writerow({
|
|
252
|
+
'name': name,
|
|
253
|
+
'smiles': smiles,
|
|
254
|
+
'inchikey': inchikey,
|
|
255
|
+
'formula': formula,
|
|
256
|
+
'db_id': db_id,
|
|
257
|
+
'db': db
|
|
258
|
+
})
|
|
259
|
+
total += 1
|
|
260
|
+
if smiles:
|
|
261
|
+
with_smiles += 1
|
|
262
|
+
if inchikey:
|
|
263
|
+
with_inchikey += 1
|
|
168
264
|
|
|
169
265
|
elem.clear()
|
|
170
266
|
|
|
171
|
-
return total, with_smiles, with_inchikey
|
|
267
|
+
return total, with_smiles, with_inchikey, with_pubchem_id, existing_pubchem_ids
|
|
172
268
|
|
|
173
269
|
|
|
174
270
|
def main(args):
|
|
@@ -176,15 +272,15 @@ def main(args):
|
|
|
176
272
|
if not xml_path.exists():
|
|
177
273
|
print(f"XML not found: {xml_path}")
|
|
178
274
|
return 2
|
|
179
|
-
total, with_smiles, with_inchikey = parse_and_write(xml_path, OUT_CSV)
|
|
275
|
+
total, with_smiles, with_inchikey, with_pubchem_id, existing_pubchem_ids = parse_and_write(xml_path, OUT_CSV)
|
|
180
276
|
print(f"Wrote {total} rows to {OUT_CSV}")
|
|
181
|
-
print(f"with_smiles={with_smiles} with_inchikey={with_inchikey}")
|
|
277
|
+
print(f"with_smiles={with_smiles} with_inchikey={with_inchikey} with_pubchem_id={with_pubchem_id}")
|
|
278
|
+
print(f"existing_pubchem_ids={existing_pubchem_ids} queried_pubchem_ids={with_pubchem_id - existing_pubchem_ids}")
|
|
182
279
|
|
|
183
280
|
# optional quick import check
|
|
184
281
|
try:
|
|
185
282
|
sys.path.insert(0, str(WORKSPACE_ROOT))
|
|
186
|
-
from
|
|
187
|
-
|
|
283
|
+
from masster.lib import Lib # type: ignore
|
|
188
284
|
try:
|
|
189
285
|
lib = Lib(str(OUT_CSV))
|
|
190
286
|
print(f"Successfully imported {len(lib)} library entries from {OUT_CSV}")
|
|
@@ -196,5 +292,5 @@ def main(args):
|
|
|
196
292
|
return 0
|
|
197
293
|
|
|
198
294
|
|
|
199
|
-
if __name__ ==
|
|
295
|
+
if __name__ == '__main__':
|
|
200
296
|
sys.exit(main(sys.argv[1:]))
|