cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,654 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Mass Resolver — Structure-Based Mass Determination for LCMS Identification
4
+
5
+ Extracts expected species (starting materials, products, reagents) from
6
+ CDX/RXN structure files via ChemScript + RDKit, computes monoisotopic
7
+ exact masses, and builds expected ESI adduct m/z tables.
8
+
9
+ Three tiers of mass resolution:
10
+ 1. ChemScript + RDKit (CDX or RXN → SMILES → exact mass)
11
+ 2. RDKit only (RXN → exact mass, with SUP abbreviation correction)
12
+ 3. CSV MW fallback (average MW from ELN export)
13
+
14
+ Usage:
15
+ from mass_resolver import extract_expected_masses, ExpectedSpecies
16
+
17
+ species = extract_expected_masses(exp)
18
+ for sp in species:
19
+ print(f"{sp.name}: {sp.exact_mass:.3f} Da")
20
+ """
21
+
22
+ import os
23
+ import sys
24
+ from dataclasses import dataclass, field
25
+ from typing import List, Optional, Dict, Tuple
26
+
27
+ from cdxml_toolkit.constants import MW_MATCH_TOLERANCE
28
+
29
+ # --- Optional: structure-based mass determination ---
30
+ try:
31
+ from rdkit import Chem
32
+ from rdkit.Chem import Descriptors
33
+ _HAS_RDKIT = True
34
+ except ImportError:
35
+ _HAS_RDKIT = False
36
+
37
+ try:
38
+ from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
39
+ _HAS_CHEMSCRIPT = True
40
+ except ImportError:
41
+ _HAS_CHEMSCRIPT = False
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Constants
45
+ # ---------------------------------------------------------------------------
46
+
47
# Standard ESI adducts.  Key: adduct label.  Value: (ESI polarity, offset in
# Da that is added to the neutral mass to obtain the expected m/z).
ADDUCTS = {
    "[M+H]+": ("ES+", 1.008),
    "[M-H]-": ("ES-", -1.008),
    "[M+Na]+": ("ES+", 22.990),
    "[M+formate]-": ("ES-", 44.998),
}

# Reporting priority per adduct (lower number = preferred).  Proton-transfer
# ions ([M+H]+ / [M-H]-) rank ahead of the adduct ions ([M+Na]+ /
# [M+formate]-).
ADDUCT_PRIORITY = {
    "[M+H]+": 0,
    "[M-H]-": 0,
    "[M+Na]+": 1,
    "[M+formate]-": 1,
}

# Tie-breaker between ESI polarities: positive mode is preferred over
# negative mode (lower number = preferred).
MODE_PREFERENCE = {
    "ES+": 0,
    "ES-": 1,
}

# SUP abbreviation label -> fragment exact mass.  Stays None until
# _get_abbrev_mass_table() builds it on first use (requires RDKit).
_ABBREV_MASS_TABLE: Optional[Dict[str, float]] = None

# Raw FlowER predictions (before deduplication) from the most recent
# extract_expected_masses() call; reset to an empty list on every call.
_last_flower_predictions: List = []
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # Data structures
77
+ # ---------------------------------------------------------------------------
78
+
79
@dataclass
class ExpectedSpecies:
    """One chemical species expected in an LCMS trace, with its adduct m/z values.

    Attributes:
        name: Display label, e.g. "SM", "DP", a formula, or an IUPAC name.
        role: One of "substrate", "reactant" or "product".
        exact_mass: Neutral mass in Da.  Monoisotopic when derived from a
            structure file.
        smiles: SMILES of the species; may be empty when no structure was
            available.
        adducts: Mapping of adduct label to expected m/z.  Defaults to a
            fresh empty dict per instance.
        source_file: CDX/RXN path the species was read from, or "" when it
            did not come from a structure file (e.g. CSV fallback).
    """

    name: str
    role: str
    exact_mass: float
    smiles: str
    adducts: Dict[str, float] = field(default_factory=dict)
    source_file: str = ""
88
+
89
+ # ---------------------------------------------------------------------------
90
+ # Mass computation
91
+ # ---------------------------------------------------------------------------
92
+
93
def compute_masses(smiles: str) -> Optional[Tuple[float, float]]:
    """Return ``(neutral_mass, full_mass)`` exact masses for a SMILES string.

    ``full_mass`` is the exact mass of everything in the SMILES, including
    any counterions; it is meant for comparison with a CSV MW that may
    record the salt form.  ``neutral_mass`` is the exact mass of the single
    component with the most heavy atoms (the free base / free acid of a
    salt) and is the value used for LCMS adduct matching.  For a
    single-component SMILES the two values are the same.

    Returns None when RDKit cannot parse the SMILES.

    NOTE(review): the largest component is used as-is; no charge
    neutralisation is applied to it here.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    full_mass = Descriptors.ExactMolWt(parsed)

    # A multi-component SMILES (salt) is split into separate molecules.
    components = Chem.GetMolFrags(parsed, asMols=True)
    if len(components) <= 1:
        return (full_mass, full_mass)

    largest = max(components, key=lambda frag: frag.GetNumHeavyAtoms())
    return (Descriptors.ExactMolWt(largest), full_mass)


# Backward-compatible aliases (were private, now public)
_compute_masses = compute_masses
124
+
125
+
126
def build_adducts(exact_mass: float) -> Dict[str, float]:
    """Return the expected m/z for every entry in ``ADDUCTS``.

    Each value is ``exact_mass`` plus that adduct's mass offset; the ESI
    mode stored alongside the offset is not used here.
    """
    expected: Dict[str, float] = {}
    for label, (_mode, shift) in ADDUCTS.items():
        expected[label] = exact_mass + shift
    return expected


# Backward-compatible alias
_build_adducts = build_adducts
133
+
134
+
135
+ # ---------------------------------------------------------------------------
136
+ # CSV name matching
137
+ # ---------------------------------------------------------------------------
138
+
139
def _match_csv_name(neutral_mass: float,
                    full_mass: float,
                    reagents,
                    used_indices: set) -> Optional[str]:
    """Match a structure's mass to a CSV reagent row by MW.

    Both the neutral (free base) mass and the full (salt) mass are compared
    against each row's MW, which covers:
      - free-form structure vs free-form CSV MW   (neutral ~ CSV)
      - salt structure vs salt CSV MW             (full ~ CSV)
      - salt structure vs free-form CSV MW        (neutral ~ CSV)

    Rows are skipped when they are already in ``used_indices``, have a
    missing or non-positive ``mw``, or have no usable name (None or blank).
    A nameless row cannot supply a label, so it must not be allowed to
    out-compete a named row.

    Args:
        neutral_mass: Mass of the structure's main component.
        full_mass: Mass of the whole structure including counterions.
        reagents: Iterable of objects exposing ``.mw`` and ``.name``; None
            is treated as empty.
        used_indices: Set of row indices already matched.  The index of the
            returned row is added to it to prevent double-matching.

    Returns:
        The stripped name of the closest row whose MW differs by strictly
        less than 2 Da, or None if there is no such row.  Ties go to the
        earliest row.
    """
    best_name = None
    best_delta = 2.0  # Da; candidates must be strictly closer than this
    best_idx = -1

    # The full (salt) mass is only a distinct candidate for multi-component
    # structures.
    if full_mass != neutral_mass:
        candidates = (neutral_mass, full_mass)
    else:
        candidates = (neutral_mass,)

    for idx, reagent in enumerate(reagents or ()):
        if idx in used_indices:
            continue

        mw = reagent.mw
        if mw is None or mw <= 0:
            continue

        name = (reagent.name or "").strip()
        if not name:
            continue

        delta = min(abs(mass - mw) for mass in candidates)
        if delta < best_delta:
            best_delta = delta
            best_name = name
            best_idx = idx

    if best_idx >= 0:
        used_indices.add(best_idx)
    return best_name
181
+
182
+
183
+ # ---------------------------------------------------------------------------
184
+ # SUP abbreviation mass correction (RDKit RXN loading)
185
+ # ---------------------------------------------------------------------------
186
+
187
def _get_abbrev_mass_table() -> Dict[str, float]:
    """Build (once) a table of SUP abbreviation label -> fragment exact mass.

    The fragment mass is the monoisotopic mass of the group that gets
    attached to the molecule, i.e. the abbreviation minus its '*'
    attachment-point atom.  For example COOH -> C(=O)OH fragment ->
    44.998 Da.

    Used to correct masses when RDKit reads SUP SGroup atoms as plain CH3
    placeholders instead of the real abbreviated group.

    The result is cached in the module-level ``_ABBREV_MASS_TABLE``; later
    calls return the cached dict.  Without RDKit the table is empty.
    """
    global _ABBREV_MASS_TABLE
    if _ABBREV_MASS_TABLE is not None:
        return _ABBREV_MASS_TABLE

    table: Dict[str, float] = {}

    if _HAS_RDKIT:
        # Monoisotopic mass of 1H.  ExactMolWt works with most-common-isotope
        # masses, so the hydrogen that stands in for '*' must be removed with
        # the same kind of mass (not the 1.00794 average atomic weight).
        H_mass = 1.007825032

        def _frag_mass(smiles_with_star: str) -> Optional[float]:
            """Exact mass of the fragment (abbreviation minus the * atom)."""
            # Cap the attachment point with an explicit H so the SMILES
            # parses as a complete molecule, then take that H back off.
            full_smi = smiles_with_star.replace("*", "[H]", 1)
            mol = Chem.MolFromSmiles(full_smi)
            if mol is None:
                return None
            return Descriptors.ExactMolWt(mol) - H_mass

        # RDKit built-in abbreviations (COOH, OBn, NHBoc, etc.)
        # Note: abbrev.mol is a query mol with no implicit Hs; extract SMILES
        # from it and re-parse via MolFromSmiles for correct mass computation.
        try:
            from rdkit.Chem import rdAbbreviations
            for abbrev in rdAbbreviations.GetDefaultAbbreviations():
                smi = Chem.MolToSmiles(abbrev.mol)
                fm = _frag_mass(smi)
                if fm is not None:
                    table[abbrev.label] = fm
        except Exception:
            # Deliberate best effort: if the built-in list cannot be read,
            # fall through to the supplementary table below.
            pass

        # Supplementary abbreviations not in RDKit's default list
        _EXTRA_SMILES: Dict[str, str] = {
            "COOtBu": "*C(=O)OC(C)(C)C",
            "CO2tBu": "*C(=O)OC(C)(C)C",
            "tBuOOC": "*C(=O)OC(C)(C)C",
            "OTs": "*OS(=O)(=O)c1ccc(C)cc1",
            "OTf": "*OS(=O)(=O)C(F)(F)F",
            "OMs": "*OS(=O)(=O)C",
            "OMe": "*OC",
            "OEt": "*OCC",
            "OiPr": "*OC(C)C",
            "OBu": "*OCCCC",
            "OtBu": "*OC(C)(C)C",
            "OAc": "*OC(C)=O",
            "OBn": "*OCc1ccccc1",
            "Ph": "*c1ccccc1",
            "Bn": "*Cc1ccccc1",
            "Boc": "*C(=O)OC(C)(C)C",
            "NBoc": "*NC(=O)OC(C)(C)C",
            "NHBoc": "*NC(=O)OC(C)(C)C",
            "Cbz": "*C(=O)OCc1ccccc1",
            "Fmoc": "*C(=O)OCC1c2ccccc2-c2ccccc21",
            "TMS": "*[Si](C)(C)C",
            "TBS": "*[Si](C)(C)C(C)(C)C",
            "TIPS": "*[Si](C(C)C)(C(C)C)C(C)C",
            "PMB": "*Cc1ccc(OC)cc1",
            "MOM": "*OCOC",
            "Ac": "*C(C)=O",
            "Piv": "*C(=O)C(C)(C)C",
        }
        # Entries already provided by RDKit's own list take precedence.
        for label, smi in _EXTRA_SMILES.items():
            if label not in table:
                fm = _frag_mass(smi)
                if fm is not None:
                    table[label] = fm

    _ABBREV_MASS_TABLE = table
    return table
265
+
266
+
267
def _sup_mass_correction(mol) -> float:
    """Compute total exact-mass correction (Da) for SUP abbreviation groups.

    RDKit reads each SUP SGroup placeholder atom as a plain carbon with
    implicit Hs (e.g., CH3 for degree-1 attachment).  For every SUP group
    whose label is in the abbreviation mass table, the correction is the
    table's fragment mass minus the mass of that C+nH placeholder, where n
    is the total hydrogen count on the group's first atom.

    Each applied correction, and each unknown label, is reported on stderr.

    Returns 0.0 if RDKit is unavailable, no SGroups exist, or all labels are
    unknown.
    """
    if not _HAS_RDKIT:
        return 0.0

    try:
        sgroups = Chem.GetMolSubstanceGroups(mol)
    except Exception:
        return 0.0

    if not sgroups:
        return 0.0

    table = _get_abbrev_mass_table()
    # Monoisotopic masses (12C, 1H) so the placeholder mass removed here is
    # the same quantity an exact-mass calculation assigned to it; the
    # average hydrogen weight (1.00794) would leave a residual per H.
    C_mass = 12.000
    H_mass = 1.007825032
    total = 0.0

    for sg in sgroups:
        try:
            if sg.GetProp("TYPE") != "SUP":
                continue
            label = sg.GetProp("LABEL")
        except Exception:
            # SGroup without TYPE/LABEL properties: nothing to correct.
            continue

        if label not in table:
            print(f" Warning: Unknown SUP abbreviation '{label}' — "
                  f"mass may be incorrect", file=sys.stderr)
            continue

        atom_indices = list(sg.GetAtoms())
        if not atom_indices:
            continue

        # Only the first atom of the group is treated as the placeholder.
        atom = mol.GetAtomWithIdx(atom_indices[0])
        num_h = atom.GetTotalNumHs()
        placeholder_mass = C_mass + num_h * H_mass
        delta = table[label] - placeholder_mass
        total += delta
        print(f" SUP correction: '{label}' (C+{num_h}H placeholder) "
              f"{delta:+.3f} Da", file=sys.stderr)

    return total
319
+
320
+
321
+ # ---------------------------------------------------------------------------
322
+ # Structure-file extraction (ChemScript + RDKit)
323
+ # ---------------------------------------------------------------------------
324
+
325
def _extract_from_structure(source, exp) -> List[ExpectedSpecies]:
    """Load reaction from CDX/RXN and extract species with exact masses.

    The reaction is loaded through ChemScriptBridge; every reactant and
    product that has a parseable SMILES becomes an ExpectedSpecies whose
    ``exact_mass`` is the neutral (largest-component) mass.

    Naming:
      - A reactant whose neutral or full mass is within MW_MATCH_TOLERANCE
        of ``exp.sm_mass`` is the substrate and is named "SM".
      - Other reactants take the matching CSV reagent name, else the
        ChemScript name, else the formula, else "Reactant N".
      - A lone product is named "DP"; otherwise products take the
        ChemScript name, else the formula, else "Product N".

    Args:
        source: Path of the CDX/RXN file to load.
        exp: Experiment object; ``sm_mass`` and ``reactants`` are read.

    Returns:
        The extracted species, or an empty list if the reaction could not
        be loaded (a warning is printed to stderr in that case).
    """
    try:
        cs = ChemScriptBridge()
        rxn_data = cs.load_reaction(source)
    except Exception as e:
        print(f" Warning: Could not load reaction from {source}: {e}",
              file=sys.stderr)
        return []

    species = []
    used_csv_indices: set = set()  # track matched CSV rows

    # Process reactants
    for i, rct in enumerate(rxn_data.get("reactants", [])):
        smiles = rct.get("smiles", "")
        if not smiles:
            continue
        masses = _compute_masses(smiles)
        if masses is None:
            continue
        neutral_mass, full_mass = masses

        # Determine role: match against CSV substrate MW
        # Try both neutral and full mass (CSV may record salt or free form)
        role = "reactant"
        is_substrate = False
        if exp.sm_mass:
            if (abs(neutral_mass - exp.sm_mass) < MW_MATCH_TOLERANCE or
                    abs(full_mass - exp.sm_mass) < MW_MATCH_TOLERANCE):
                is_substrate = True
        if is_substrate:
            role = "substrate"
            name = "SM"
        else:
            # Use CSV reagent name if available, else ChemScript name.
            # Chained with `or` so an empty name/formula still falls through
            # to the positional label instead of yielding a blank name.
            csv_name = _match_csv_name(neutral_mass, full_mass,
                                       exp.reactants, used_csv_indices)
            name = (csv_name or rct.get("name") or rct.get("formula")
                    or f"Reactant {i+1}")

        sp = ExpectedSpecies(
            name=name, role=role,
            exact_mass=neutral_mass, smiles=smiles,
            source_file=source,
        )
        sp.adducts = _build_adducts(neutral_mass)
        species.append(sp)

    # Process products
    products = rxn_data.get("products", [])
    # If there's only one product, label it "DP" (desired product)
    single_product = len(products) == 1
    for i, prod in enumerate(products):
        smiles = prod.get("smiles", "")
        if not smiles:
            continue
        masses = _compute_masses(smiles)
        if masses is None:
            continue
        neutral_mass, full_mass = masses

        if single_product:
            name = "DP"
        else:
            name = (prod.get("name") or prod.get("formula")
                    or f"Product {i+1}")

        sp = ExpectedSpecies(
            name=name, role="product",
            exact_mass=neutral_mass, smiles=smiles,
            source_file=source,
        )
        sp.adducts = _build_adducts(neutral_mass)
        species.append(sp)

    return species
399
+
400
+
401
+ # ---------------------------------------------------------------------------
402
+ # RXN extraction (RDKit only — no ChemScript)
403
+ # ---------------------------------------------------------------------------
404
+
405
def _extract_from_rxn_rdkit(rxn_path: str, exp) -> List[ExpectedSpecies]:
    """Read an RXN file with RDKit alone and return its species with masses.

    Tier 2 fallback, used when ChemScript is unavailable but RDKit is.
    Every reactant/product template that sanitizes cleanly is converted to
    SMILES and its masses are computed, then adjusted by the SUP
    abbreviation correction.  Reactants matching ``exp.sm_mass`` within
    MW_MATCH_TOLERANCE become "SM"; other reactants take a matching CSV
    reagent name or "Reactant N".  A lone product is "DP", otherwise
    "Product N".

    Returns an empty list when RDKit's reaction module cannot be imported
    or the file cannot be parsed (a warning goes to stderr for the latter).
    """
    try:
        from rdkit.Chem import AllChem
    except ImportError:
        return []

    try:
        reaction = AllChem.ReactionFromRxnFile(rxn_path)
    except Exception as e:
        print(f" Warning: RDKit RXN load failed for {rxn_path}: {e}",
              file=sys.stderr)
        return []
    if reaction is None:
        print(f" Warning: RDKit could not parse {rxn_path}",
              file=sys.stderr)
        return []

    def _describe(template):
        """Return (smiles, neutral_mass, full_mass), or None to skip."""
        if template is None or template.GetNumAtoms() == 0:
            return None
        try:
            # Sanitization is needed before masses can be computed.
            Chem.SanitizeMol(template)
        except Exception:
            return None

        smi = Chem.MolToSmiles(template)
        pair = _compute_masses(smi)
        if pair is None:
            return None
        neutral, full = pair

        # SUP (superatom) groups are read as placeholder atoms; shift both
        # masses by the difference to the real abbreviated group.
        shift = _sup_mass_correction(template)
        if shift:
            neutral += shift
            full += shift
        return smi, neutral, full

    found: List[ExpectedSpecies] = []
    claimed_rows: set = set()

    # Reactants
    for idx in range(reaction.GetNumReactantTemplates()):
        info = _describe(reaction.GetReactantTemplate(idx))
        if info is None:
            continue
        smi, neutral, full = info

        matches_sm = bool(exp.sm_mass) and (
            abs(neutral - exp.sm_mass) < MW_MATCH_TOLERANCE
            or abs(full - exp.sm_mass) < MW_MATCH_TOLERANCE
        )
        if matches_sm:
            role, label = "substrate", "SM"
        else:
            role = "reactant"
            label = _match_csv_name(
                neutral, full, exp.reactants, claimed_rows,
            ) or f"Reactant {idx+1}"

        found.append(ExpectedSpecies(
            name=label, role=role,
            exact_mass=neutral, smiles=smi,
            adducts=_build_adducts(neutral),
            source_file=rxn_path,
        ))

    # Products
    for idx in range(reaction.GetNumProductTemplates()):
        info = _describe(reaction.GetProductTemplate(idx))
        if info is None:
            continue
        smi, neutral, _full = info

        if reaction.GetNumProductTemplates() == 1:
            label = "DP"
        else:
            label = f"Product {idx+1}"

        found.append(ExpectedSpecies(
            name=label, role="product",
            exact_mass=neutral, smiles=smi,
            adducts=_build_adducts(neutral),
            source_file=rxn_path,
        ))

    if found:
        print(f" Loaded reaction via RDKit from {os.path.basename(rxn_path)} "
              f"({len(found)} species)", file=sys.stderr)
    return found
514
+
515
+
516
+ # ---------------------------------------------------------------------------
517
+ # CSV MW fallback
518
+ # ---------------------------------------------------------------------------
519
+
520
def _fallback_from_csv(exp) -> List[ExpectedSpecies]:
    """Build expected species from the experiment's CSV MW values.

    Produces an "SM" substrate entry from ``exp.sm_mass`` and a "DP"
    product entry from ``exp.product_mass``, in that order, skipping
    whichever value is falsy.  The entries carry no SMILES and no source
    file.
    """
    csv_entries = (
        ("SM", "substrate", exp.sm_mass),
        ("DP", "product", exp.product_mass),
    )
    return [
        ExpectedSpecies(
            name=label, role=role,
            exact_mass=mass, smiles="",
            adducts=_build_adducts(mass),
        )
        for label, role, mass in csv_entries
        if mass
    ]
541
+
542
+
543
+ # ---------------------------------------------------------------------------
544
+ # Public API
545
+ # ---------------------------------------------------------------------------
546
+
547
def extract_expected_masses(exp, predict_byproducts=False) -> List[ExpectedSpecies]:
    """
    Extract expected species masses from CDX/RXN structure files.

    Resolution is attempted in three tiers; the first tier that yields at
    least one species wins:
      1. ChemScript + RDKit on ``exp.cdx_path`` then ``exp.rxn_path``.
      2. RDKit alone on ``exp.rxn_path``.
      3. CSV MW values from ``exp`` (least accurate — average MW, not
         monoisotopic).

    A tier that runs but produces nothing (for example an RXN file RDKit
    cannot parse) falls through to the next tier, so the CSV fallback is
    still reached after a failed structure load.

    Args:
        exp: Experiment object with cdx_path, rxn_path, reactants, etc.
        predict_byproducts: If True, run FlowER beam search to predict
            reaction byproducts and add them to the expected species list.
            Requires the 'flower' conda environment. Results are cached.

    Returns:
        List of ExpectedSpecies; possibly empty when no tier produced
        anything.

    Side effects:
        Resets the module-level ``_last_flower_predictions`` on every call
        and, when byproducts are predicted, stores the pre-deduplication
        prediction list there.  Progress and warnings go to stderr.
    """
    global _last_flower_predictions

    sources = [s for s in [exp.cdx_path, exp.rxn_path] if s]
    species = None

    # Tier 1: ChemScript + RDKit (can load CDX and RXN)
    if sources and _HAS_CHEMSCRIPT and _HAS_RDKIT:
        for source in sources:
            species = _extract_from_structure(source, exp)
            if species:
                break

    # Tier 2: RDKit only — load RXN directly (no ChemScript needed)
    if not species and _HAS_RDKIT and exp.rxn_path:
        species = _extract_from_rxn_rdkit(exp.rxn_path, exp)

    # Tier 3: CSV MW values (least accurate — average MW, not monoisotopic).
    # Tested with `not species` because the extractors signal failure with
    # an empty list, not None.
    if not species:
        species = _fallback_from_csv(exp)

    # Optional: FlowER byproduct prediction
    _last_flower_predictions = []
    if predict_byproducts and exp.rxn_path:
        try:
            from experiments.byproduct_prediction.flower_predictor import (
                predict_byproducts as _predict_bp,
            )
            csv_path = getattr(exp, '_csv_path', '') or ''
            bp_species = _predict_bp(
                rxn_path=exp.rxn_path,
                csv_path=csv_path,
            )
            if bp_species:
                print(f" FlowER predicted {len(bp_species)} byproduct(s)",
                      file=sys.stderr)
                # Save full list before deduplication (for CDXML output)
                _last_flower_predictions = list(bp_species)
                # Filter out byproducts that duplicate existing species
                # (SM, DP, or CSV reagents) by exact mass
                existing_masses = [s.exact_mass for s in species]
                from cdxml_toolkit.constants import MASS_TOLERANCE
                kept = []
                for bp in bp_species:
                    if any(abs(bp.exact_mass - em) < MASS_TOLERANCE
                           for em in existing_masses):
                        print(f" Skipping {bp.name} "
                              f"(mass {bp.exact_mass:.1f} duplicates "
                              f"an existing species)", file=sys.stderr)
                        continue
                    # Try to match against CSV reagent names by MW
                    if hasattr(exp, 'reactants') and exp.reactants and bp.smiles:
                        masses = _compute_masses(bp.smiles) if _HAS_RDKIT else None
                        if masses:
                            neutral_m, full_m = masses
                        else:
                            neutral_m = full_m = bp.exact_mass
                        # A fresh set is passed each time, so several
                        # byproducts may take their name from the same row.
                        csv_name = _match_csv_name(
                            neutral_m, full_m, exp.reactants, set())
                        if csv_name:
                            bp.name = f"BP-{csv_name}"
                    kept.append(bp)
                if len(kept) < len(bp_species):
                    print(f" Kept {len(kept)} byproduct(s) after "
                          f"deduplication", file=sys.stderr)
                species.extend(kept)
        except ImportError:
            print(" FlowER predictor not available — "
                  "skipping byproduct prediction", file=sys.stderr)
        except Exception as e:
            # Byproduct prediction is optional; never let it break the
            # main mass extraction.
            print(f" FlowER prediction failed: {e}", file=sys.stderr)

    return species
636
+
637
+
638
def get_last_flower_predictions() -> List[ExpectedSpecies]:
    """Return a copy of the FlowER predictions cached by the last
    ``extract_expected_masses(predict_byproducts=True)`` call.

    The cached list is the one stored before deduplication against the
    existing species.  A new list object is returned on each call, so
    callers may modify it without touching the module-level cache.
    """
    return [*_last_flower_predictions]
646
+
647
+
648
+ # ---------------------------------------------------------------------------
649
+ # CLI placeholder
650
+ # ---------------------------------------------------------------------------
651
+
652
if __name__ == "__main__":
    # Library module: running it directly only prints a pointer to the
    # intended entry point.
    print(
        "mass_resolver: no standalone CLI — "
        + "import extract_expected_masses() from procedure_writer.py"
    )