cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,654 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Mass Resolver — Structure-Based Mass Determination for LCMS Identification
|
|
4
|
+
|
|
5
|
+
Extracts expected species (starting materials, products, reagents) from
|
|
6
|
+
CDX/RXN structure files via ChemScript + RDKit, computes monoisotopic
|
|
7
|
+
exact masses, and builds expected ESI adduct m/z tables.
|
|
8
|
+
|
|
9
|
+
Three tiers of mass resolution:
|
|
10
|
+
1. ChemScript + RDKit (CDX or RXN → SMILES → exact mass)
|
|
11
|
+
2. RDKit only (RXN → exact mass, with SUP abbreviation correction)
|
|
12
|
+
3. CSV MW fallback (average MW from ELN export)
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
from cdxml_toolkit.analysis.deterministic.mass_resolver import extract_expected_masses, ExpectedSpecies
|
|
16
|
+
|
|
17
|
+
species = extract_expected_masses(exp)
|
|
18
|
+
for sp in species:
|
|
19
|
+
print(f"{sp.name}: {sp.exact_mass:.3f} Da")
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
import sys
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from typing import List, Optional, Dict, Tuple
|
|
26
|
+
|
|
27
|
+
from cdxml_toolkit.constants import MW_MATCH_TOLERANCE
|
|
28
|
+
|
|
29
|
+
# --- Optional: structure-based mass determination ---
|
|
30
|
+
try:
|
|
31
|
+
from rdkit import Chem
|
|
32
|
+
from rdkit.Chem import Descriptors
|
|
33
|
+
_HAS_RDKIT = True
|
|
34
|
+
except ImportError:
|
|
35
|
+
_HAS_RDKIT = False
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
|
|
39
|
+
_HAS_CHEMSCRIPT = True
|
|
40
|
+
except ImportError:
|
|
41
|
+
_HAS_CHEMSCRIPT = False
|
|
42
|
+
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
# Constants
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
# Standard ESI adducts: name -> (ESI mode, mass offset from neutral)
|
|
48
|
+
ADDUCTS = {
    # Offsets are *ion* mass shifts to be added to a monoisotopic neutral
    # mass, so they include the electron that is lost (positive mode) or
    # gained (negative mode):
    #   proton        = 1.007825 (1H) - 0.000549 (e-)           = 1.007276
    #   sodium cation = 22.989769 (23Na) - 0.000549 (e-)        = 22.989221
    #   formate anion = 44.997654 (HCOO) + 0.000549 (e-)        = 44.998203
    # Average atomic weights (1.008, 22.990) must not be used here because
    # the neutral masses they are added to are monoisotopic.
    "[M+H]+": ("ES+", 1.007276),
    "[M-H]-": ("ES-", -1.007276),
    "[M+Na]+": ("ES+", 22.989221),
    "[M+formate]-": ("ES-", 44.998203),
}

# Adduct reporting priority: prefer [M+H]+/[M-H]- (proton transfer)
# over [M+Na]+/[M+formate]- (adduct ions).  Lower number = preferred.
ADDUCT_PRIORITY = {
    "[M+H]+": 0,
    "[M-H]-": 0,
    "[M+Na]+": 1,
    "[M+formate]-": 1,
}

# ESI mode preference for breaking ties: ESI+ preferred over ESI-
MODE_PREFERENCE = {"ES+": 0, "ES-": 1}
|
|
66
|
+
|
|
67
|
+
# Lazy-built table mapping SUP abbreviation labels to fragment exact masses.
|
|
68
|
+
# Populated on first use by _get_abbrev_mass_table() (requires RDKit).
|
|
69
|
+
_ABBREV_MASS_TABLE: Optional[Dict[str, float]] = None
|
|
70
|
+
|
|
71
|
+
# Cache of raw FlowER predictions (before deduplication), set by
|
|
72
|
+
# extract_expected_masses() when predict_byproducts=True.
|
|
73
|
+
_last_flower_predictions: List = []
|
|
74
|
+
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
# Data structures
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
|
|
79
|
+
@dataclass
class ExpectedSpecies:
    """One chemical species and the LCMS adduct m/z values predicted for it.

    Attributes:
        name: Display name -- "SM", "DP", a formula, or an IUPAC name.
        role: One of "substrate", "reactant" or "product".
        exact_mass: Neutral mass the adducts are derived from (monoisotopic
            when computed from a structure).
        smiles: SMILES of the species; empty when no structure is available.
        adducts: Mapping of adduct label to expected m/z.  Every instance
            gets its own fresh dict.
        source_file: CDX/RXN path when the species came from a structure
            file, "" when it came from the CSV.
    """
    name: str
    role: str
    exact_mass: float
    smiles: str
    adducts: Dict[str, float] = field(default_factory=dict)
    source_file: str = field(default="")
|
|
88
|
+
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
# Mass computation
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
def compute_masses(smiles: str) -> Optional[Tuple[float, float]]:
    """
    Compute monoisotopic masses from SMILES.

    Returns (neutral_mass, full_mass) where:
    - neutral_mass: mass of the largest fragment (free base / free acid),
      used for LCMS adduct matching
    - full_mass: mass of the entire molecule including any counterions,
      used for matching against CSV MW which may record the salt form

    For non-salt molecules, both values are identical.

    Returns None when RDKit is not installed, when ``smiles`` is empty or
    blank, or when RDKit cannot parse it.
    """
    # RDKit is an optional import for this module; without this guard the
    # public function would die with NameError on ``Chem`` instead of
    # honouring its Optional return contract.
    if not _HAS_RDKIT:
        return None

    # An empty SMILES parses to an atom-less molecule whose "mass" is 0.0,
    # which is not a meaningful species -- report it as unresolvable.
    if not smiles or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    full_mass = Descriptors.ExactMolWt(mol)

    # Split multi-component SMILES (salts) and keep the fragment with the
    # most heavy atoms as the species that is expected to ionise.
    # NOTE(review): the largest fragment is used as drawn; if it carries a
    # formal charge its mass is not strictly the neutral mass -- confirm
    # whether upstream SMILES are always neutralised.
    frags = Chem.GetMolFrags(mol, asMols=True)
    if len(frags) > 1:
        neutral_mol = max(frags, key=lambda m: m.GetNumHeavyAtoms())
        neutral_mass = Descriptors.ExactMolWt(neutral_mol)
    else:
        neutral_mass = full_mass

    return (neutral_mass, full_mass)


# Backward-compatible aliases (were private, now public)
_compute_masses = compute_masses
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def build_adducts(exact_mass: float) -> Dict[str, float]:
    """Return the expected m/z for every entry in ADDUCTS.

    Each value is ``exact_mass`` plus that adduct's mass offset; the ESI
    mode stored alongside the offset is not used here.
    """
    mz_by_adduct: Dict[str, float] = {}
    for label, entry in ADDUCTS.items():
        _mode, shift = entry
        mz_by_adduct[label] = exact_mass + shift
    return mz_by_adduct


# Backward-compatible alias
_build_adducts = build_adducts
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ---------------------------------------------------------------------------
|
|
136
|
+
# CSV name matching
|
|
137
|
+
# ---------------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
def _match_csv_name(neutral_mass: float,
                    full_mass: float,
                    reagents,
                    used_indices: set,
                    tolerance: float = 2.0) -> Optional[str]:
    """
    Match a structure's mass to a CSV reagent row by MW.

    Tries both the neutral (free base) mass and the full (salt) mass
    against each CSV MW.  This handles:
    - Free-form structure vs free-form CSV MW  (neutral ≈ CSV)
    - Salt structure vs salt CSV MW            (full ≈ CSV)
    - Salt structure vs free-form CSV MW       (neutral ≈ CSV)

    Args:
        neutral_mass: Mass of the largest fragment of the structure.
        full_mass: Mass of the whole structure including counterions.
        reagents: Iterable of rows exposing ``.mw`` and ``.name``; may be
            None or empty.
        used_indices: Indices of rows already claimed by earlier calls.
            Updated in place when this call claims a row.
        tolerance: Maximum allowed |mass - MW| in Da (exclusive).  The
            default of 2.0 keeps the historical behaviour.

    Returns:
        The stripped name of the closest unused row strictly within
        ``tolerance``, or None when nothing matches or the closest row
        has no usable name.  Ties keep the earliest row.
    """
    best_raw_name = None
    best_delta = tolerance
    best_idx = -1

    # Only compare against the salt mass when it actually differs.
    if full_mass != neutral_mass:
        candidate_masses = (neutral_mass, full_mass)
    else:
        candidate_masses = (neutral_mass,)

    for idx, reagent in enumerate(reagents or ()):
        if idx in used_indices:
            continue
        mw = reagent.mw
        # Rows without a recorded MW (None) or with a non-positive
        # placeholder cannot be matched by mass.
        if mw is None or mw <= 0:
            continue

        for mass in candidate_masses:
            delta = abs(mass - mw)
            if delta < best_delta:
                best_delta = delta
                best_raw_name = reagent.name
                best_idx = idx

    if best_idx < 0:
        return None

    # A missing or blank name is reported as "no match" (None rather than
    # "") and the row is left unclaimed, as the docstring contract says.
    best_name = (best_raw_name or "").strip()
    if not best_name:
        return None

    used_indices.add(best_idx)
    return best_name
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
# SUP abbreviation mass correction (RDKit RXN loading)
|
|
185
|
+
# ---------------------------------------------------------------------------
|
|
186
|
+
|
|
187
|
+
def _get_abbrev_mass_table() -> Dict[str, float]:
    """Build (once) a table of SUP abbreviation label → fragment exact mass.

    The fragment mass is the monoisotopic mass of the group that gets attached
    to the molecule, i.e. the abbreviation minus its '*' attachment-point atom.
    For example COOH → C(=O)OH fragment → 44.998 Da.

    Used to correct masses when RDKit reads SUP SGroup atoms as plain CH3
    placeholders instead of the real abbreviated group.

    The result is cached in the module-level ``_ABBREV_MASS_TABLE``; when
    RDKit is unavailable the cached table is empty.
    """
    global _ABBREV_MASS_TABLE
    if _ABBREV_MASS_TABLE is not None:
        return _ABBREV_MASS_TABLE

    table: Dict[str, float] = {}

    if _HAS_RDKIT:
        # Monoisotopic mass of 1H.  ExactMolWt below is monoisotopic, so the
        # hydrogen that stands in for '*' must be removed with the same kind
        # of mass (the average atomic weight 1.00794 would skew the result).
        H_mass = 1.007825032

        def _frag_mass(smiles_with_star: str) -> Optional[float]:
            """Exact mass of the fragment (abbreviation minus the * atom)."""
            # Cap the attachment point with H so RDKit can parse a closed
            # shell molecule, then take that H back off.
            full_smi = smiles_with_star.replace("*", "[H]", 1)
            mol = Chem.MolFromSmiles(full_smi)
            if mol is None:
                return None
            return Descriptors.ExactMolWt(mol) - H_mass

        # RDKit built-in abbreviations (COOH, OBn, NHBoc, etc.)
        # Note: abbrev.mol is a query mol with no implicit Hs; extract SMILES
        # from it and re-parse via MolFromSmiles for correct mass computation.
        try:
            from rdkit.Chem import rdAbbreviations
            for abbrev in rdAbbreviations.GetDefaultAbbreviations():
                smi = Chem.MolToSmiles(abbrev.mol)
                fm = _frag_mass(smi)
                if fm is not None:
                    table[abbrev.label] = fm
        except Exception as e:
            # Best effort: the supplementary list below still applies, but
            # say so instead of silently shrinking the table.
            print(f" Warning: RDKit default abbreviations unavailable ({e}); "
                  f"using built-in SUP abbreviation list only",
                  file=sys.stderr)

        # Supplementary abbreviations not in RDKit's default list
        _EXTRA_SMILES: Dict[str, str] = {
            "COOtBu": "*C(=O)OC(C)(C)C",
            "CO2tBu": "*C(=O)OC(C)(C)C",
            "tBuOOC": "*C(=O)OC(C)(C)C",
            "OTs": "*OS(=O)(=O)c1ccc(C)cc1",
            "OTf": "*OS(=O)(=O)C(F)(F)F",
            "OMs": "*OS(=O)(=O)C",
            "OMe": "*OC",
            "OEt": "*OCC",
            "OiPr": "*OC(C)C",
            "OBu": "*OCCCC",
            "OtBu": "*OC(C)(C)C",
            "OAc": "*OC(C)=O",
            "OBn": "*OCc1ccccc1",
            "Ph": "*c1ccccc1",
            "Bn": "*Cc1ccccc1",
            "Boc": "*C(=O)OC(C)(C)C",
            "NBoc": "*NC(=O)OC(C)(C)C",
            "NHBoc": "*NC(=O)OC(C)(C)C",
            "Cbz": "*C(=O)OCc1ccccc1",
            "Fmoc": "*C(=O)OCC1c2ccccc2-c2ccccc21",
            "TMS": "*[Si](C)(C)C",
            "TBS": "*[Si](C)(C)C(C)(C)C",
            "TIPS": "*[Si](C(C)C)(C(C)C)C(C)C",
            "PMB": "*Cc1ccc(OC)cc1",
            "MOM": "*OCOC",
            "Ac": "*C(C)=O",
            "Piv": "*C(=O)C(C)(C)C",
        }
        # RDKit's own definition wins when a label exists in both lists.
        for label, smi in _EXTRA_SMILES.items():
            if label not in table:
                fm = _frag_mass(smi)
                if fm is not None:
                    table[label] = fm

    _ABBREV_MASS_TABLE = table
    return table
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _sup_mass_correction(mol) -> float:
    """Compute total exact-mass correction (Da) for SUP abbreviation groups.

    RDKit reads each SUP SGroup placeholder atom as a plain carbon with
    implicit Hs (e.g., CH3 for degree-1 attachment).  This function computes
    the correction needed to get the true mass of each abbreviated group:
    for every SUP group with a known label it adds
    ``fragment_mass(label) - mass(C + nH placeholder)``.

    Progress and unknown-label warnings are written to stderr.

    Returns 0.0 if RDKit is unavailable, no SGroups exist, or all labels are
    unknown.
    """
    if not _HAS_RDKIT:
        return 0.0

    try:
        sgroups = Chem.GetMolSubstanceGroups(mol)
    except Exception:
        return 0.0

    if not sgroups:
        return 0.0

    table = _get_abbrev_mass_table()
    # Monoisotopic masses (12C and 1H): the masses being corrected come from
    # ExactMolWt, so the placeholder must be removed with monoisotopic values
    # rather than the average atomic weight of hydrogen (1.00794).
    C_mass = 12.000
    H_mass = 1.007825032
    total = 0.0

    for sg in sgroups:
        try:
            if sg.GetProp("TYPE") != "SUP":
                continue
            label = sg.GetProp("LABEL")
        except Exception:
            # SGroup without TYPE/LABEL properties -- nothing to correct.
            continue

        if label not in table:
            print(f" Warning: Unknown SUP abbreviation '{label}' — "
                  f"mass may be incorrect", file=sys.stderr)
            continue

        atom_indices = list(sg.GetAtoms())
        if not atom_indices:
            continue

        # NOTE(review): only the first atom of the SGroup is treated as the
        # carbon placeholder; this assumes a contracted (single-atom) SUP
        # group -- confirm behaviour for expanded groups.
        atom = mol.GetAtomWithIdx(atom_indices[0])
        num_h = atom.GetTotalNumHs()
        placeholder_mass = C_mass + num_h * H_mass
        delta = table[label] - placeholder_mass
        total += delta
        print(f" SUP correction: '{label}' (C+{num_h}H placeholder) "
              f"{delta:+.3f} Da", file=sys.stderr)

    return total
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# ---------------------------------------------------------------------------
|
|
322
|
+
# Structure-file extraction (ChemScript + RDKit)
|
|
323
|
+
# ---------------------------------------------------------------------------
|
|
324
|
+
|
|
325
|
+
def _extract_from_structure(source, exp) -> List[ExpectedSpecies]:
    """Load reaction from CDX/RXN and extract species with exact masses.

    Args:
        source: Path of the CDX or RXN file handed to ChemScript.
        exp: Experiment object; ``sm_mass`` is used to recognise the
            substrate and ``reactants`` (CSV rows) to name other reactants.

    Returns:
        One ExpectedSpecies per reactant/product that has a parseable
        SMILES, reactants first.  An empty list when ChemScript cannot
        load the file (a warning is written to stderr).
    """
    try:
        cs = ChemScriptBridge()
        rxn_data = cs.load_reaction(source)
    except Exception as e:
        print(f" Warning: Could not load reaction from {source}: {e}",
              file=sys.stderr)
        return []

    species = []
    used_csv_indices: set = set()  # track matched CSV rows

    # Process reactants
    for i, rct in enumerate(rxn_data.get("reactants", [])):
        smiles = rct.get("smiles", "")
        if not smiles:
            continue
        masses = _compute_masses(smiles)
        if masses is None:
            continue
        neutral_mass, full_mass = masses

        # Determine role: match against CSV substrate MW.
        # Try both neutral and full mass (CSV may record salt or free form).
        # NOTE(review): every reactant within tolerance of sm_mass is
        # labelled "SM"; confirm that at most one is expected to match.
        is_substrate = bool(exp.sm_mass) and (
            abs(neutral_mass - exp.sm_mass) < MW_MATCH_TOLERANCE or
            abs(full_mass - exp.sm_mass) < MW_MATCH_TOLERANCE)

        if is_substrate:
            role = "substrate"
            name = "SM"
        else:
            role = "reactant"
            # Use CSV reagent name if available, else ChemScript name, else
            # formula.  Chained with ``or`` so that a present-but-empty
            # "formula" value still falls through to the numbered default
            # instead of producing a blank name.
            csv_name = _match_csv_name(neutral_mass, full_mass,
                                       exp.reactants, used_csv_indices)
            name = (csv_name or rct.get("name") or rct.get("formula")
                    or f"Reactant {i+1}")

        sp = ExpectedSpecies(
            name=name, role=role,
            exact_mass=neutral_mass, smiles=smiles,
            source_file=source,
        )
        sp.adducts = _build_adducts(neutral_mass)
        species.append(sp)

    # Process products
    products = rxn_data.get("products", [])
    single_product = len(products) == 1
    for i, prod in enumerate(products):
        smiles = prod.get("smiles", "")
        if not smiles:
            continue
        masses = _compute_masses(smiles)
        if masses is None:
            continue
        neutral_mass, full_mass = masses

        # If there's only one product, label it "DP" (desired product)
        if single_product:
            name = "DP"
        else:
            name = (prod.get("name") or prod.get("formula")
                    or f"Product {i+1}")

        sp = ExpectedSpecies(
            name=name, role="product",
            exact_mass=neutral_mass, smiles=smiles,
            source_file=source,
        )
        sp.adducts = _build_adducts(neutral_mass)
        species.append(sp)

    return species
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
# ---------------------------------------------------------------------------
|
|
402
|
+
# RXN extraction (RDKit only — no ChemScript)
|
|
403
|
+
# ---------------------------------------------------------------------------
|
|
404
|
+
|
|
405
|
+
def _rdkit_template_masses(mol, label: str, rxn_path: str):
    """Return (smiles, neutral_mass, full_mass) for one RXN template, or None.

    None means the template is empty, failed RDKit sanitisation (a warning
    is written to stderr) or produced a SMILES that could not be re-parsed.
    Masses already include the SUP abbreviation correction.
    """
    if mol is None or mol.GetNumAtoms() == 0:
        return None
    try:
        # Sanitize so we can compute MW
        Chem.SanitizeMol(mol)
    except Exception as e:
        # Dropping a species changes what LCMS peaks get identified, so make
        # the omission visible like every other failure path in this module.
        print(f" Warning: Skipping {label} in {os.path.basename(rxn_path)} "
              f"(RDKit sanitization failed: {e})", file=sys.stderr)
        return None

    smiles = Chem.MolToSmiles(mol)
    masses = _compute_masses(smiles)
    if masses is None:
        return None
    neutral_mass, full_mass = masses

    # Correct for SUP (superatom) abbreviation groups — RDKit reads them
    # as CH3 placeholders; apply delta to get the true exact mass.
    correction = _sup_mass_correction(mol)
    if correction:
        neutral_mass += correction
        full_mass += correction

    return smiles, neutral_mass, full_mass


def _extract_from_rxn_rdkit(rxn_path: str, exp) -> List[ExpectedSpecies]:
    """Load RXN file directly with RDKit and extract species with exact masses.

    This is the Tier 2 fallback when ChemScript is unavailable but RDKit is.
    RDKit can read V2000 and V3000 RXN files natively.

    Args:
        rxn_path: Path of the RXN file.
        exp: Experiment object; ``sm_mass`` is used to recognise the
            substrate and ``reactants`` (CSV rows) to name other reactants.

    Returns:
        One ExpectedSpecies per usable reactant/product template, reactants
        first.  An empty list when RDKit's reaction module cannot be
        imported or the file cannot be parsed.
    """
    try:
        from rdkit.Chem import AllChem
    except ImportError:
        return []

    try:
        rxn = AllChem.ReactionFromRxnFile(rxn_path)
        if rxn is None:
            print(f" Warning: RDKit could not parse {rxn_path}",
                  file=sys.stderr)
            return []
    except Exception as e:
        print(f" Warning: RDKit RXN load failed for {rxn_path}: {e}",
              file=sys.stderr)
        return []

    species = []
    used_csv_indices: set = set()

    # Process reactants
    for i in range(rxn.GetNumReactantTemplates()):
        resolved = _rdkit_template_masses(
            rxn.GetReactantTemplate(i), f"reactant {i+1}", rxn_path)
        if resolved is None:
            continue
        smiles, neutral_mass, full_mass = resolved

        # Substrate = reactant whose neutral or salt mass agrees with the
        # CSV substrate MW.
        is_substrate = bool(exp.sm_mass) and (
            abs(neutral_mass - exp.sm_mass) < MW_MATCH_TOLERANCE or
            abs(full_mass - exp.sm_mass) < MW_MATCH_TOLERANCE)

        if is_substrate:
            role = "substrate"
            name = "SM"
        else:
            role = "reactant"
            csv_name = _match_csv_name(neutral_mass, full_mass,
                                       exp.reactants, used_csv_indices)
            name = csv_name or f"Reactant {i+1}"

        sp = ExpectedSpecies(
            name=name, role=role,
            exact_mass=neutral_mass, smiles=smiles,
            source_file=rxn_path,
        )
        sp.adducts = _build_adducts(neutral_mass)
        species.append(sp)

    # Process products
    num_products = rxn.GetNumProductTemplates()
    for i in range(num_products):
        resolved = _rdkit_template_masses(
            rxn.GetProductTemplate(i), f"product {i+1}", rxn_path)
        if resolved is None:
            continue
        smiles, neutral_mass, full_mass = resolved

        # A lone product is the desired product.
        name = "DP" if num_products == 1 else f"Product {i+1}"

        sp = ExpectedSpecies(
            name=name, role="product",
            exact_mass=neutral_mass, smiles=smiles,
            source_file=rxn_path,
        )
        sp.adducts = _build_adducts(neutral_mass)
        species.append(sp)

    if species:
        print(f" Loaded reaction via RDKit from {os.path.basename(rxn_path)} "
              f"({len(species)} species)", file=sys.stderr)
    return species
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
# ---------------------------------------------------------------------------
|
|
517
|
+
# CSV MW fallback
|
|
518
|
+
# ---------------------------------------------------------------------------
|
|
519
|
+
|
|
520
|
+
def _fallback_from_csv(exp) -> List[ExpectedSpecies]:
    """Build the expected-species list from the CSV masses alone.

    Emits an "SM" substrate entry when ``exp.sm_mass`` is truthy and a "DP"
    product entry when ``exp.product_mass`` is truthy, in that order.  The
    entries carry no SMILES and no source file.
    """
    csv_fields = (
        ("SM", "substrate", "sm_mass"),
        ("DP", "product", "product_mass"),
    )

    fallback: List[ExpectedSpecies] = []
    for label, role, attr in csv_fields:
        mass = getattr(exp, attr)
        if not mass:
            continue
        entry = ExpectedSpecies(
            name=label, role=role,
            exact_mass=mass, smiles="",
        )
        entry.adducts = _build_adducts(mass)
        fallback.append(entry)
    return fallback
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
# ---------------------------------------------------------------------------
|
|
544
|
+
# Public API
|
|
545
|
+
# ---------------------------------------------------------------------------
|
|
546
|
+
|
|
547
|
+
def extract_expected_masses(exp, predict_byproducts=False) -> List[ExpectedSpecies]:
    """
    Extract expected species masses from CDX/RXN structure files.

    Uses ChemScript to load the reaction and extract SMILES for each
    component, then RDKit to compute monoisotopic exact masses and handle
    salt splitting.  Falls back to CSV MW values if structure files or
    required libraries are unavailable, or if the structure-based tiers
    yield no species.

    Args:
        exp: Experiment object with cdx_path, rxn_path, reactants, etc.
        predict_byproducts: If True, run FlowER beam search to predict
            reaction byproducts and add them to the expected species list.
            Requires the 'flower' conda environment. Results are cached.

    Returns:
        List of ExpectedSpecies (possibly empty).  As a side effect the
        module-level FlowER prediction cache is reset on every call and
        refilled when byproduct prediction succeeds.
    """
    global _last_flower_predictions

    sources = [s for s in [exp.cdx_path, exp.rxn_path] if s]
    species = None

    # Tier 1: ChemScript + RDKit (can load CDX and RXN)
    if sources and _HAS_CHEMSCRIPT and _HAS_RDKIT:
        for source in sources:
            found = _extract_from_structure(source, exp)
            if found:
                species = found
                break

    # Tier 2: RDKit only — load RXN directly (no ChemScript needed)
    if not species and _HAS_RDKIT and exp.rxn_path:
        species = _extract_from_rxn_rdkit(exp.rxn_path, exp)

    # Tier 3: CSV MW values (least accurate — average MW, not monoisotopic).
    # Tested with ``not species`` rather than ``is None``: Tier 2 reports
    # failure as an empty list, which must still trigger this fallback.
    if not species:
        species = _fallback_from_csv(exp)

    # Optional: FlowER byproduct prediction
    _last_flower_predictions = []
    if predict_byproducts and exp.rxn_path:
        try:
            from experiments.byproduct_prediction.flower_predictor import (
                predict_byproducts as _predict_bp,
            )
            csv_path = getattr(exp, '_csv_path', '') or ''
            bp_species = _predict_bp(
                rxn_path=exp.rxn_path,
                csv_path=csv_path,
            )
            if bp_species:
                print(f" FlowER predicted {len(bp_species)} byproduct(s)",
                      file=sys.stderr)
                # Save full list before deduplication (for CDXML output)
                _last_flower_predictions = list(bp_species)
                # Filter out byproducts that duplicate existing species
                # (SM, DP, or CSV reagents) by exact mass
                existing_masses = [s.exact_mass for s in species]
                from cdxml_toolkit.constants import MASS_TOLERANCE
                kept = []
                for bp in bp_species:
                    if any(abs(bp.exact_mass - em) < MASS_TOLERANCE
                           for em in existing_masses):
                        print(f" Skipping {bp.name} "
                              f"(mass {bp.exact_mass:.1f} duplicates "
                              f"an existing species)", file=sys.stderr)
                        continue
                    # Try to match against CSV reagent names by MW.
                    # NOTE(review): a fresh ``set()`` is passed each time,
                    # so several byproducts may receive the same CSV name;
                    # confirm whether that is intended.
                    if getattr(exp, 'reactants', None) and bp.smiles:
                        masses = _compute_masses(bp.smiles) if _HAS_RDKIT else None
                        if masses:
                            neutral_m, full_m = masses
                        else:
                            neutral_m = full_m = bp.exact_mass
                        csv_name = _match_csv_name(
                            neutral_m, full_m, exp.reactants, set())
                        if csv_name:
                            bp.name = f"BP-{csv_name}"
                    kept.append(bp)
                if len(kept) < len(bp_species):
                    print(f" Kept {len(kept)} byproduct(s) after "
                          f"deduplication", file=sys.stderr)
                species.extend(kept)
        except ImportError:
            print(" FlowER predictor not available — "
                  "skipping byproduct prediction", file=sys.stderr)
        except Exception as e:
            # Byproduct prediction is optional: report and carry on with
            # the species resolved above.
            print(f" FlowER prediction failed: {e}", file=sys.stderr)

    return species
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
def get_last_flower_predictions() -> List[ExpectedSpecies]:
    """Return a copy of the FlowER predictions cached by the most recent
    ``extract_expected_masses(predict_byproducts=True)`` call.

    The cached list is the one stored before deduplication against the
    already-known species.  A new list object is returned each time, so
    callers may reorder or trim it without touching the module cache (the
    contained species objects themselves are shared).
    """
    snapshot = [*_last_flower_predictions]
    return snapshot
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
# ---------------------------------------------------------------------------
|
|
649
|
+
# CLI placeholder
|
|
650
|
+
# ---------------------------------------------------------------------------
|
|
651
|
+
|
|
652
|
+
if __name__ == "__main__":
    # This module is a library; running it directly only prints a pointer.
    _no_cli_notice = (
        "mass_resolver: no standalone CLI — "
        "import extract_expected_masses() from procedure_writer.py"
    )
    print(_no_cli_notice)
|