cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2150 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ reaction_parser.py — Unified reaction semantic layer.
4
+
5
+ Parses ELN export files (any combination of CDX, CDXML, RXN, CSV) into a
6
+ single persisted JSON descriptor listing every chemical species with:
7
+ - Canonical SMILES (full + neutral/salt-split)
8
+ - Role classification (atom_contributing / non_contributing / product)
9
+ - Display name (SM, DP, curated abbreviation, CSV name, or formula)
10
+ - Mass data (exact mass, MW, ESI adducts)
11
+
12
+ The JSON output serves as the single source of truth for downstream tools
13
+ (procedure_writer, scheme_merger, flower_predictor, etc.).
14
+
15
+ CLI:
16
+ python reaction_parser.py experiment.cdxml -o reaction.json
17
+ python reaction_parser.py experiment.cdxml --csv exp.csv --pretty
18
+ python reaction_parser.py --input-dir path/ --experiment KL-7001-004
19
+
20
+ Python API:
21
+ from cdxml_toolkit.perception.reaction_parser import parse_reaction, ReactionDescriptor
22
+ desc = parse_reaction(cdxml="scheme.cdxml", csv="exp.csv")
23
+ desc.to_json("reaction.json")
24
+ """
25
+
26
+ import argparse
27
+ import json
28
+ import os
29
+ import re
30
+ import sys
31
+ import datetime
32
+ from dataclasses import dataclass, field, asdict
33
+ from typing import Any, Dict, List, Optional, Tuple
34
+ from xml.etree import ElementTree as ET
35
+
36
+ from ..constants import MW_MATCH_TOLERANCE, MASS_TOLERANCE
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Logging helper
40
+ # ---------------------------------------------------------------------------
41
_verbose = False


def _log(msg: str) -> None:
    """Print *msg* to stderr, but only while the module flag ``_verbose`` is set."""
    if not _verbose:
        return
    print(msg, file=sys.stderr)
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Data classes
51
+ # ---------------------------------------------------------------------------
52
+
53
@dataclass
class SpeciesDescriptor:
    """One chemical species taking part in the reaction.

    Every field has a default, so a descriptor can be built incrementally.
    ``to_dict`` produces the JSON-ready form with ``None`` entries removed.
    """
    # --- identity and structure ---
    id: str = ""                                # "sp_0", "sp_1", ...
    smiles: Optional[str] = None                # canonical, full (salts together)
    smiles_neutral: Optional[str] = None        # largest fragment (for LCMS)
    name: str = ""                              # display name
    # --- role classification ---
    role: str = ""                              # atom_contributing | non_contributing | product
    role_detail: Optional[str] = None           # from reagent_db: base, catalyst, ...
    rxn_insight_role: Optional[str] = None      # from RXN Insight
    classification_method: str = ""             # role_lookup, rxnmapper, mcs, csv_type, ...
    is_sm: bool = False
    is_dp: bool = False
    # --- mass data ---
    exact_mass: float = 0.0                     # monoisotopic, neutral
    exact_mass_full: float = 0.0                # monoisotopic, full salt
    mw: float = 0.0                             # average MW
    formula: Optional[str] = None
    adducts: Dict[str, float] = field(default_factory=dict)
    # --- provenance ---
    source: str = ""                            # fragment, text_label, csv_only, rxn
    source_id: Optional[str] = None             # CDXML element id
    # --- values carried over from the ELN CSV ---
    csv_equiv: Optional[str] = None
    csv_mass: Optional[str] = None
    csv_name: Optional[str] = None
    csv_volume: Optional[str] = None
    csv_supplier: Optional[str] = None
    # --- v1.1 fields: ELN enrichment ---
    is_substrate: bool = False                  # True = equiv 1.0 in CSV (for scheme layout)
    is_solvent: bool = False                    # from CSV SOLVENT section or reagent_db role
    display_text: Optional[str] = None          # formatted text for scheme: "Cs2CO3 (2 eq.)"
    # --- v1.2 field: original CDXML geometry preservation ---
    # Layout of original_geometry when present:
    #   {
    #     "atoms": [
    #       {"id": 42, "x": 100.0, "y": 200.0, "symbol": "C"},
    #       {"id": 43, "x": 114.4, "y": 207.2, "symbol": "N", "num_hydrogens": 1},
    #       {"id": 44, "x": 128.8, "y": 200.0, "is_abbreviation": True,
    #        "label": "OTs", "label_smiles": "O[S](=O)(C1=CC=C(C)C=C1)=O",
    #        "is_generic": False},
    #       {"id": 45, "x": 140.0, "y": 210.0, "is_generic": True,
    #        "label": "R", "node_type": "GenericNickname"},
    #     ],
    #     "bonds": [
    #       {"begin": 42, "end": 43, "order": 1},
    #       {"begin": 43, "end": 44, "order": 1, "double_position": "Left"},
    #     ],
    #     "bond_length": 14.4,   # average bond length in CDXML points
    #   }
    original_geometry: Optional[Dict[str, Any]] = None

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, leaving out every ``None`` value."""
        slim = {}
        for key, value in asdict(self).items():
            if value is None:
                # Unset optional fields are omitted to keep the JSON compact.
                continue
            slim[key] = value
        return slim
106
+
107
+
108
@dataclass
class ReactionDescriptor:
    """Full description of one parsed reaction, including all of its species."""
    version: str = "1.3"
    experiment: str = ""
    input_files: Dict[str, Optional[str]] = field(default_factory=dict)
    reaction_smiles: Optional[str] = None
    reaction_class: Optional[str] = None
    reaction_name: Optional[str] = None
    classification_confidence: Optional[float] = None
    species: List[SpeciesDescriptor] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)
    # v1.1 fields — scheme layout and ELN enrichment
    conditions: List[str] = field(default_factory=list)  # e.g. ["80 °C", "24 h", "N2"]
    eln_data: Optional[Dict[str, Any]] = None            # run arrow data + procedure

    def to_dict(self) -> dict:
        """Return a JSON-serialisable dict; species go through their own ``to_dict``."""
        return {
            "version": self.version,
            "experiment": self.experiment,
            "input_files": self.input_files,
            "reaction_smiles": self.reaction_smiles,
            "reaction_class": self.reaction_class,
            "reaction_name": self.reaction_name,
            "classification_confidence": self.classification_confidence,
            "species": [entry.to_dict() for entry in self.species],
            "warnings": self.warnings,
            "metadata": self.metadata,
            "conditions": self.conditions,
            "eln_data": self.eln_data,
        }

    @classmethod
    def from_dict(cls, d: dict) -> "ReactionDescriptor":
        """Build a descriptor from a dict such as the one ``to_dict`` emits.

        Species keys that are not ``SpeciesDescriptor`` fields are ignored.
        Missing top-level keys fall back to defaults (``version`` → "1.0").
        """
        known = SpeciesDescriptor.__dataclass_fields__
        parsed_species = [
            SpeciesDescriptor(**{key: val for key, val in raw.items()
                                 if key in known})
            for raw in d.get("species", [])
        ]
        return cls(
            version=d.get("version", "1.0"),
            experiment=d.get("experiment", ""),
            input_files=d.get("input_files", {}),
            reaction_smiles=d.get("reaction_smiles"),
            reaction_class=d.get("reaction_class"),
            reaction_name=d.get("reaction_name"),
            classification_confidence=d.get("classification_confidence"),
            species=parsed_species,
            warnings=d.get("warnings", []),
            metadata=d.get("metadata", {}),
            conditions=d.get("conditions", []),
            eln_data=d.get("eln_data"),
        )

    def to_json(self, path: str, pretty: bool = True) -> None:
        """Write the descriptor to *path* as UTF-8 JSON (2-space indent when *pretty*)."""
        indent = 2 if pretty else None
        with open(path, "w", encoding="utf-8") as handle:
            json.dump(self.to_dict(), handle, indent=indent,
                      ensure_ascii=False)

    @classmethod
    def from_json(cls, path: str) -> "ReactionDescriptor":
        """Load a descriptor from the UTF-8 JSON file at *path*."""
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        return cls.from_dict(payload)

    def get_sm(self) -> Optional[SpeciesDescriptor]:
        """Return the first species flagged ``is_sm``, or None."""
        return next((entry for entry in self.species if entry.is_sm), None)

    def get_dp(self) -> Optional[SpeciesDescriptor]:
        """Return the first species flagged ``is_dp``, or None."""
        return next((entry for entry in self.species if entry.is_dp), None)

    def get_expected_species(self) -> List[dict]:
        """Return ExpectedSpecies-compatible dicts for LCMS matching.

        Only species with a positive ``exact_mass`` and a SMILES are included.
        The neutral SMILES is used when one is set.
        """
        return [
            {
                "name": entry.name,
                "role": _lcms_role(entry),
                "exact_mass": entry.exact_mass,
                "smiles": entry.smiles_neutral or entry.smiles,
                "adducts": dict(entry.adducts),
            }
            for entry in self.species
            if entry.exact_mass > 0 and entry.smiles
        ]

    # -- Field sets used by summary() ----------------------------------------
    DEFAULT_SPECIES_FIELDS = [
        "id", "name", "role", "role_detail", "smiles",
        "display_text", "formula", "mw",
    ]
    DEFAULT_TOP_FIELDS = [
        "experiment", "conditions",
    ]
    DEFAULT_ELN_FIELDS = [
        "product_yield", "reaction_type",
    ]
    ALL_SPECIES_FIELDS = list(SpeciesDescriptor.__dataclass_fields__)
    ALL_TOP_FIELDS = [
        "version", "experiment", "input_files", "reaction_smiles",
        "reaction_class", "reaction_name", "classification_confidence",
        "warnings", "metadata", "conditions",
    ]
    ALL_ELN_FIELDS = [
        "sm_mass", "product_obtained", "product_yield", "procedure_text",
        "procedure_plain", "reaction_type", "start_date", "labbook_name",
        "solvents", "solvent_details",
    ]

    def summary(
        self,
        species_fields: Optional[List[str]] = None,
        top_fields: Optional[List[str]] = None,
        eln_fields: Optional[List[str]] = None,
    ) -> dict:
        """Return a slim summary dict for LLM context.

        Parameters
        ----------
        species_fields : list of str, optional
            Per-species fields to keep.  ``None`` (or an empty list) selects
            DEFAULT_SPECIES_FIELDS; ``["*"]`` selects ALL_SPECIES_FIELDS.
        top_fields : list of str, optional
            Top-level fields to keep.  ``None`` (or an empty list) selects
            DEFAULT_TOP_FIELDS; ``["*"]`` selects ALL_TOP_FIELDS.
        eln_fields : list of str, optional
            ``eln_data`` sub-fields to keep.  ``None`` selects
            DEFAULT_ELN_FIELDS, ``["*"]`` selects ALL_ELN_FIELDS, and ``[]``
            leaves ``eln_data`` out of the result.

        Returns
        -------
        dict
            The selected top-level fields, a ``"species"`` list, and an
            ``"eln_data"`` entry when at least one requested key is present.
        """
        wildcard = ["*"]

        if species_fields == wildcard:
            species_keys = self.ALL_SPECIES_FIELDS
        else:
            species_keys = species_fields or self.DEFAULT_SPECIES_FIELDS

        if top_fields == wildcard:
            top_keys = self.ALL_TOP_FIELDS
        else:
            top_keys = top_fields or self.DEFAULT_TOP_FIELDS

        if eln_fields == wildcard:
            eln_keys = self.ALL_ELN_FIELDS
        elif eln_fields is None:
            eln_keys = self.DEFAULT_ELN_FIELDS
        else:
            # An explicit (possibly empty) list is honoured as given.
            eln_keys = eln_fields

        # Top-level fields
        everything = self.to_dict()
        out: Dict[str, Any] = {key: everything[key]
                               for key in top_keys if key in everything}

        # Species: None-valued fields are already dropped by SpeciesDescriptor.to_dict
        out["species"] = [
            {key: as_dict[key] for key in species_keys if key in as_dict}
            for as_dict in (entry.to_dict() for entry in self.species)
        ]

        # ELN data
        if eln_keys and self.eln_data:
            picked = {key: self.eln_data[key] for key in eln_keys
                      if key in self.eln_data}
            if picked:
                out["eln_data"] = picked

        return out
283
+
284
+
285
def reaction_summary(
    json_path: str,
    species_fields: Optional[List[str]] = None,
    top_fields: Optional[List[str]] = None,
    eln_fields: Optional[List[str]] = None,
) -> dict:
    """Read a reaction JSON file and return its slim summary for LLM context.

    Shorthand for ``ReactionDescriptor.from_json(json_path).summary(...)``.
    The three field arguments are passed through unchanged; see
    :meth:`ReactionDescriptor.summary` for how they are interpreted.

    Available species fields
    ------------------------
    id, smiles, smiles_neutral, name, role, role_detail, rxn_insight_role,
    classification_method, is_sm, is_dp, exact_mass, exact_mass_full, mw,
    formula, adducts, source, source_id, csv_equiv, csv_mass, csv_name,
    csv_volume, csv_supplier, is_substrate, is_solvent, display_text,
    original_geometry

    Available top-level fields
    --------------------------
    version, experiment, input_files, reaction_smiles, reaction_class,
    reaction_name, classification_confidence, warnings, metadata, conditions

    Available eln_data fields
    -------------------------
    sm_mass, product_obtained, product_yield, procedure_text, procedure_plain,
    reaction_type, start_date, labbook_name, solvents, solvent_details
    """
    descriptor = ReactionDescriptor.from_json(json_path)
    selection = {
        "species_fields": species_fields,
        "top_fields": top_fields,
        "eln_fields": eln_fields,
    }
    return descriptor.summary(**selection)
319
+
320
+
321
def _lcms_role(sp: SpeciesDescriptor) -> str:
    """Translate a species into the role string used by ExpectedSpecies.

    The starting-material flag wins ("substrate").  Anything flagged as the
    desired product, or whose role is "product", maps to "product".  Every
    other species is reported as "reactant".
    """
    if sp.is_sm:
        return "substrate"
    if sp.is_dp or sp.role == "product":
        return "product"
    return "reactant"
330
+
331
+
332
+ # ---------------------------------------------------------------------------
333
+ # Condition text splitting
334
+ # ---------------------------------------------------------------------------
335
+
336
+ # Patterns that identify non-chemical condition tokens
337
# Each pattern is anchored at the start of a token and is applied with
# ``match``; a token that hits any of them is treated as a reaction condition
# rather than a chemical name.
_CONDITION_PATTERNS = [
    re.compile(r"^-?\d+\.?\d*\s+.{0,2}C.*$"),  # temperature: "80 °C", "105 C", encoding issues
    re.compile(r"^-?\d+\.?\d*\s*[°\u00b0\ufffd].*$"),  # temperature: "80°C", degree prefix
    re.compile(r"^r\.?t\.?$", re.IGNORECASE),  # room temperature
    re.compile(r"^reflux$", re.IGNORECASE),
    re.compile(r"^refl\.?$", re.IGNORECASE),
    re.compile(r"^\d+\.?\d*\s*(h|hr|hrs|min|d|days?)$", re.IGNORECASE),  # time
    re.compile(r"^\d+\.?\d*\s*mol\s*%$", re.IGNORECASE),  # catalyst loading
    re.compile(r"^overnight$", re.IGNORECASE),
    re.compile(r"^o\.?n\.?$", re.IGNORECASE),
    re.compile(r"^\d+\.?\d*\s*bar$", re.IGNORECASE),  # pressure
    re.compile(r"^N[2\u2082]\s*(atm)?$"),  # N2 atmosphere
    re.compile(r"^Ar\s*(atm)?$"),  # argon atmosphere
    re.compile(r"^MW$", re.IGNORECASE),  # microwave
    re.compile(r"^sealed\s+tube$", re.IGNORECASE),
    re.compile(r"^-?\d+\s+to\s+-?\d+", re.IGNORECASE),  # temp range: "0 to 25"
    # temp range ending in RT or a number: "-78 to RT", "0 °C to rt"
    re.compile(r"^-?\d+\s*(?:°|[\u00b0\ufffd])?\s*(?:C\s+)?to\s+(?:RT|r\.?t\.?|-?\d+)", re.IGNORECASE),
    re.compile(r"^\d+\.?\d*\s*equiv?\.?$", re.IGNORECASE),  # equivalents
    re.compile(r"^\d+\.?\d*\s*eq\.?$", re.IGNORECASE),
    # molarity: "2 M", "0.5 M".  Accepts a decimal part like the other numeric
    # patterns, so fractional concentrations are not mistaken for reagent
    # names.  Stays case-sensitive so that "m" is not read as molar.
    re.compile(r"^\d+\.?\d*\s*M$"),
    re.compile(r"^\d+\.?\d*\s*mL$", re.IGNORECASE),  # volume
    re.compile(r"^then$", re.IGNORECASE),
]
360
+
361
+
362
def _is_condition_token(token: str) -> bool:
    """Tell whether *token* is a reaction condition rather than a chemical name.

    True as soon as any entry of ``_CONDITION_PATTERNS`` matches at the start
    of the token.
    """
    for pattern in _CONDITION_PATTERNS:
        if pattern.match(token):
            return True
    return False
365
+
366
+
367
def split_condition_text(text: str) -> List[str]:
    """Break a merged condition text block into chemical-name tokens.

    The text is processed line by line (merged ``<t>`` blocks use newlines as
    separators).  For each non-blank line:

    * a trailing equivalents annotation such as ``" (2 eq.)"`` is removed;
    * if the whole remaining line is a name known to the reagent database it
      is kept as one token, which protects names containing commas such as
      "1,4-dioxane";
    * otherwise the line is split on commas/semicolons and every non-empty
      piece that is not a condition token (temperature, time, "rt",
      "reflux", ...) is kept.

    Returns the chemical name strings in order of appearance.
    """
    from ..resolve.reagent_db import get_reagent_db
    reagent_db = get_reagent_db()

    equiv_suffix = r"\s*\(\d+\.?\d*\s*eq\.?\)\s*$"
    chemicals: List[str] = []

    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            continue

        # "Cs2CO3 (2 eq.)" → "Cs2CO3"
        stripped = re.sub(equiv_suffix, "", stripped, flags=re.IGNORECASE)

        # Whole-line database hit: keep intact, do not split on commas.
        whole = stripped.strip()
        if reagent_db.entry_for_name(whole.lower()):
            chemicals.append(whole)
            continue

        # A line without a delimiter yields one piece, so a single loop
        # handles both the one-piece and the many-piece case.
        for piece in re.split(r"[;,]\s*", stripped):
            piece = piece.strip()
            if piece and not _is_condition_token(piece):
                chemicals.append(piece)

    return chemicals
417
+
418
+
419
def extract_conditions_from_text(text: str) -> List[str]:
    """Collect the condition tokens (temperature, time, atmosphere) in *text*.

    Counterpart of ``split_condition_text``: each non-blank line has a
    trailing equivalents annotation removed and is split on commas and
    semicolons.  Only the pieces recognised by ``_is_condition_token`` are
    returned, in order of appearance.
    """
    equiv_suffix = r"\s*\(\d+\.?\d*\s*eq\.?\)\s*$"
    found: List[str] = []
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            continue
        # Drop "(2 eq.)"-style suffixes before splitting on delimiters.
        stripped = re.sub(equiv_suffix, "", stripped, flags=re.IGNORECASE)
        pieces = (chunk.strip() for chunk in re.split(r"[;,]\s*", stripped))
        found.extend(piece for piece in pieces
                     if piece and _is_condition_token(piece))
    return found
439
+
440
+
441
+ # ---------------------------------------------------------------------------
442
+ # Text label → SMILES resolution
443
+ # ---------------------------------------------------------------------------
444
+
445
def _resolve_text_label(text: str,
                        use_network: bool = True) -> Optional[str]:
    """Resolve a text label to a SMILES string.

    Resolution chain (first success wins):
      1. reagent_db name → SMILES (curated dictionary), canonicalised with
         RDKit when RDKit is importable and can parse the entry
      2. condensed formula parser (offline)
      3. OPSIN (offline)
      4. PubChem (online, only if *use_network*)

    Stages 2-4 are best-effort: an unavailable module or any exception inside
    a resolver is reported through ``_log`` (visible in verbose mode) and the
    chain moves on to the next stage.

    Parameters
    ----------
    text : str
        Label text; a trailing ``"(N eq.)"`` annotation is ignored.
    use_network : bool
        When False the PubChem stage is skipped.

    Returns
    -------
    str or None
        The SMILES from the first stage that succeeds, or None when the label
        is blank or nothing resolves it.
    """
    from ..resolve.reagent_db import get_reagent_db

    # Normalize: strip equiv annotations, whitespace
    clean = re.sub(r"\s*\(\d+\.?\d*\s*eq\.?\)\s*$", "", text,
                   flags=re.IGNORECASE).strip()
    if not clean:
        # Nothing to look up; avoid querying resolvers (and the network)
        # with an empty name.
        return None

    db = get_reagent_db()

    # 1. Reagent DB name → SMILES
    entry = db.entry_for_name(clean.lower())
    if entry:
        smi = entry.get("smiles")
        if smi:
            # May be a list of SMILES variants — take the first
            if isinstance(smi, list):
                smi = smi[0]
            # Canonicalize when RDKit is available; otherwise (or when RDKit
            # cannot parse the entry) return the database string as stored.
            try:
                from rdkit import Chem
                mol = Chem.MolFromSmiles(smi)
                if mol:
                    return Chem.MolToSmiles(mol)
            except ImportError:
                pass
            return smi

    # 2. Condensed formula parser (offline)
    try:
        from ..resolve.condensed_formula import resolve_condensed_formula
        smi = resolve_condensed_formula(clean)
        if smi:
            return smi
    except Exception as exc:  # best-effort stage; ImportError included
        _log(f"  condensed-formula resolution failed for {clean!r}: {exc}")

    # 3. OPSIN (offline)
    try:
        from .reactant_heuristic import _opsin_name_to_smiles
        smi = _opsin_name_to_smiles(clean)
        if smi:
            return smi
    except Exception as exc:  # best-effort stage; ImportError included
        _log(f"  OPSIN resolution failed for {clean!r}: {exc}")

    # 4. PubChem (online)
    if use_network:
        try:
            from ..resolve.cas_resolver import resolve_name_to_smiles
            smi = resolve_name_to_smiles(clean)
            if smi:
                return smi
        except Exception as exc:  # best-effort stage; ImportError included
            _log(f"  PubChem resolution failed for {clean!r}: {exc}")

    return None
511
+
512
+
513
+ # ---------------------------------------------------------------------------
514
+ # Arrow detection and side assignment (does NOT use <step> attributes)
515
+ # ---------------------------------------------------------------------------
516
+
517
def _find_arrow(page: ET.Element) -> Optional[ET.Element]:
    """Find the main reaction arrow among the direct children of *page*.

    ``<arrow>`` elements take priority.  If there is none, the first
    ``<graphic>`` with ``GraphicType="Line"`` and a non-empty ``ArrowType``
    is used instead.  Returns None when neither kind is present.

    NOTE(review): graphics carrying a ``SupersededBy`` attribute are not
    filtered out.  The previous code had a trailing ``continue`` for them at
    the end of the loop body, which had no effect; it has been removed
    without changing which element is returned.
    """
    # Direct <arrow> elements
    for el in page:
        if el.tag == "arrow":
            return el

    # <graphic> with arrow attributes (ChemDraw CDXML variant)
    for el in page:
        if (el.tag == "graphic"
                and el.get("GraphicType") == "Line"
                and el.get("ArrowType")):
            return el

    return None
538
+
539
+
540
def _arrow_endpoints(arrow: ET.Element) -> Tuple[float, float, float, float]:
    """Return ``(tail_x, tail_y, head_x, head_y)`` for *arrow*.

    Delegates to ``cdxml_utils.arrow_endpoints``; the import is done at call
    time.
    """
    from ..cdxml_utils import arrow_endpoints as _shared_arrow_endpoints

    endpoints = _shared_arrow_endpoints(arrow)
    return endpoints
544
+
545
+
546
def _fragment_centroid(frag: ET.Element) -> Tuple[float, float]:
    """Average the positions of the fragment's direct-child ``<n>`` nodes.

    Nodes without a ``p`` attribute are ignored.  When no node has a
    position the result is ``(0.0, 0.0)``.
    """
    raw_positions = (node.get("p") for node in frag.findall("n"))
    points = [
        (float(fields[0]), float(fields[1]))
        for fields in (raw.split() for raw in raw_positions if raw)
    ]
    if not points:
        return 0.0, 0.0
    count = len(points)
    mean_x = sum(px for px, _ in points) / count
    mean_y = sum(py for _, py in points) / count
    return mean_x, mean_y
558
+
559
+
560
def _text_anchor(t_elem: ET.Element) -> Tuple[float, float]:
    """Return an approximate (x, y) position for a text element.

    The ``p`` attribute is used when present.  Otherwise the centre of
    ``BoundingBox`` is used, and ``(0.0, 0.0)`` when the element has neither.
    """
    position = t_elem.get("p")
    if position:
        coords = position.split()
        return float(coords[0]), float(coords[1])

    box = t_elem.get("BoundingBox")
    if not box:
        return 0.0, 0.0

    # Midpoint of the first/third and second/fourth bounding-box values.
    edges = list(map(float, box.split()))
    centre_x = (edges[0] + edges[2]) / 2
    centre_y = (edges[1] + edges[3]) / 2
    return centre_x, centre_y
571
+
572
+
573
def _extract_geometry(frag_elem) -> Optional[Dict[str, Any]]:
    """Capture the drawn geometry of a CDXML ``<fragment>`` element.

    The result is meant for ``SpeciesDescriptor.original_geometry`` and holds:

    * ``"atoms"`` — one dict per direct-child ``<n>`` that has an ``id`` and
      is not an ``ExternalConnectionPoint``: id, x, y and symbol, plus
      ``num_hydrogens`` when the node declares ``NumHydrogens``.  Nodes with
      ``NodeType="Fragment"`` are flagged as abbreviations; nodes whose type
      is GenericNickname, Nickname or Unspecified are flagged as generic.
      Both carry their label text when one is found.
    * ``"bonds"`` — one dict per ``<b>`` whose two ends are both recorded
      atoms.
    * ``"bond_length"`` — mean length of the non-zero-length bonds, rounded
      to two decimals; left out when there are none.

    Returns None when no atom was recorded.
    """
    generic_node_types = {"GenericNickname", "Nickname", "Unspecified"}

    def _label_of(node):
        # Text of the first <t> child whose direct <s> runs contain any text.
        for t_child in node.findall("t"):
            joined = "".join(run.text for run in t_child.findall("s")
                             if run.text)
            if joined:
                return joined
        return None

    atoms = []
    positions = {}  # atom id → (x, y); also serves as the set of known ids
    for node in frag_elem.findall("n"):
        raw_id = node.get("id")
        if raw_id is None:
            continue
        atom_id = int(raw_id)
        node_type = node.get("NodeType")
        if node_type == "ExternalConnectionPoint":
            continue

        coords = node.get("p", "0 0").split()
        x, y = float(coords[0]), float(coords[1])
        # A missing Element attribute means carbon; numbers absent from the
        # symbol table also fall back to "C".
        atomic_number = int(node.get("Element", "6"))
        symbol = _ELEM_SYMBOLS.get(atomic_number, "C")
        hydrogens = node.get("NumHydrogens")

        atom: Dict[str, Any] = {"id": atom_id, "x": x, "y": y,
                                "symbol": symbol}
        if hydrogens is not None:
            atom["num_hydrogens"] = int(hydrogens)

        if node_type == "Fragment":
            # Abbreviation groups (real superatom abbreviations)
            label = _label_of(node)
            atom["is_abbreviation"] = True
            atom["is_generic"] = False
            if label:
                atom["label"] = label
                # Attach the abbreviation's SMILES when the lookup table is
                # importable and knows the label.
                try:
                    from ..resolve.superatom_table import lookup_smiles
                    label_smiles = lookup_smiles(label)
                    if label_smiles:
                        atom["label_smiles"] = label_smiles
                except ImportError:
                    pass
        elif node_type in generic_node_types:
            # Generic variable groups (R, X, Ar, R1, etc.): prefer the
            # GenericNickname attribute, then fall back to the label text.
            label = node.get("GenericNickname") or _label_of(node)
            atom["is_abbreviation"] = False
            atom["is_generic"] = True
            atom["node_type"] = node_type
            if label:
                atom["label"] = label

        atoms.append(atom)
        positions[atom_id] = (x, y)

    bonds = []
    lengths = []
    for bond in frag_elem.findall("b"):
        begin = int(bond.get("B", "0"))
        end = int(bond.get("E", "0"))
        if begin not in positions or end not in positions:
            # Skip bonds that touch an atom we did not record.
            continue
        bond_entry: Dict[str, Any] = {
            "begin": begin,
            "end": end,
            "order": int(bond.get("Order", "1")),
        }
        double_position = bond.get("DoublePosition")
        if double_position:
            bond_entry["double_position"] = double_position
        bonds.append(bond_entry)

        # Length contributes to the average only when non-zero.
        dx = positions[begin][0] - positions[end][0]
        dy = positions[begin][1] - positions[end][1]
        length = (dx * dx + dy * dy) ** 0.5
        if length > 0:
            lengths.append(length)

    if not atoms:
        return None

    geometry: Dict[str, Any] = {"atoms": atoms, "bonds": bonds}
    if lengths:
        geometry["bond_length"] = round(sum(lengths) / len(lengths), 2)
    return geometry
680
+
681
+
682
# Element number → symbol mapping for _extract_geometry.
# Covers the whole periodic table (Z = 1..118) so that atoms outside the
# common organic set (Ni, Sn, Se, Al, ...) keep their real symbol instead of
# falling back to the "C" default used by the lookup.
_ELEM_SYMBOLS = {
    atomic_number: symbol
    for atomic_number, symbol in enumerate(
        (
            "H He Li Be B C N O F Ne "
            "Na Mg Al Si P S Cl Ar K Ca "
            "Sc Ti V Cr Mn Fe Co Ni Cu Zn "
            "Ga Ge As Se Br Kr Rb Sr Y Zr "
            "Nb Mo Tc Ru Rh Pd Ag Cd In Sn "
            "Sb Te I Xe Cs Ba La Ce Pr Nd "
            "Pm Sm Eu Gd Tb Dy Ho Er Tm Yb "
            "Lu Hf Ta W Re Os Ir Pt Au Hg "
            "Tl Pb Bi Po At Rn Fr Ra Ac Th "
            "Pa U Np Pu Am Cm Bk Cf Es Fm "
            "Md No Lr Rf Db Sg Bh Hs Mt Ds "
            "Rg Cn Nh Fl Mc Lv Ts Og"
        ).split(),
        start=1,
    )
}
689
+
690
+
691
def _get_text_content(t_elem: ET.Element) -> str:
    """Return the plain text of a ``<t>`` element.

    Concatenates the text of every ``<s>`` descendant in document order and
    strips surrounding whitespace from the result.
    """
    runs = (style_run.text for style_run in t_elem.iter("s"))
    return "".join(chunk for chunk in runs if chunk).strip()
698
+
699
+
700
+ # ---------------------------------------------------------------------------
701
+ # CDXML extraction (fragments + text labels)
702
+ # ---------------------------------------------------------------------------
703
+
704
def _extract_from_cdxml(cdxml_path: str,
                        use_network: bool = True,
                        ) -> Tuple[List[SpeciesDescriptor], List[str], List[str]]:
    """Extract species from a CDXML scheme.

    Returns (species_list, warnings, conditions).
    Determines product vs reactant by position relative to the arrow,
    NOT from ``<step>`` attributes: a fragment whose centroid x lies to the
    right of the arrow head is a ``"product"``, everything else is a
    ``"candidate"`` (reactant or reagent, classified later).  Conditions are
    the tokens returned by ``extract_conditions_from_text`` for text labels
    that are not to the right of the arrow head.

    Args:
        cdxml_path: Path of the CDXML file to parse.
        use_network: Forwarded to ``_resolve_text_label`` when resolving
            text-label tokens to SMILES.

    Only direct children of the first ``<page>`` are inspected.  The
    returned warnings list is non-empty only for the two early-exit cases
    (no page / no arrow).
    """
    from ..cdxml_utils import parse_cdxml

    tree = parse_cdxml(cdxml_path)
    root = tree.getroot()
    page = root.find(".//page")
    if page is None:
        return [], ["No <page> element found in CDXML"], []

    # Find the arrow
    arrow = _find_arrow(page)
    if arrow is None:
        return [], ["No reaction arrow found in CDXML"], []

    tail_x, tail_y, head_x, head_y = _arrow_endpoints(arrow)
    # Ensure tail is left of head
    if tail_x > head_x:
        tail_x, head_x = head_x, tail_x
        tail_y, head_y = head_y, tail_y

    _log(f"  Arrow: tail=({tail_x:.1f}, {tail_y:.1f}), "
         f"head=({head_x:.1f}, {head_y:.1f})")

    # Collect the arrow element id (and any graphic superseded by it).
    # The graphic scan only runs when the arrow actually has an id;
    # otherwise ``el.get("SupersededBy") == aid`` would compare None with
    # None and sweep up every graphic lacking that attribute.
    arrow_ids = set()
    aid = arrow.get("id")
    if aid:
        arrow_ids.add(aid)
        for el in page:
            if el.tag == "graphic" and el.get("SupersededBy") == aid:
                gid = el.get("id")
                if gid:
                    arrow_ids.add(gid)

    species = []
    warnings = []
    sp_idx = 0

    # Try to import frag_to_smiles (prefer resolved version for abbreviations)
    _frag_to_smiles_resolved = None
    _frag_to_smiles_plain = None
    try:
        from ..rdkit_utils import frag_to_smiles_resolved as _frag_to_smiles_resolved
        from ..rdkit_utils import frag_to_smiles as _frag_to_smiles_plain
        from ..rdkit_utils import frag_to_mw as _frag_to_mw
    except ImportError:
        _frag_to_mw = None

    # Process all fragments
    for el in page:
        if el.tag != "fragment":
            continue

        eid = el.get("id", "")
        if eid in arrow_ids:
            continue

        cx, cy = _fragment_centroid(el)

        # Determine role by position relative to arrow
        if cx > head_x:
            pos_role = "product"
        else:
            pos_role = "candidate"  # reactant or reagent — classified later

        # Extract SMILES — prefer resolved (abbreviation-expanded) version
        smi = None
        if _frag_to_smiles_resolved is not None:
            smi = _frag_to_smiles_resolved(el)

        # Fallback to plain SMILES (may have [*] for abbreviations)
        if smi is None and _frag_to_smiles_plain is not None:
            smi = _frag_to_smiles_plain(el)

        # If still has unresolved abbreviations, try ChemScript
        if smi is not None and '*' in smi:
            cs_smi = _try_chemscript_smiles(el, cdxml_path)
            if cs_smi and '*' not in cs_smi:
                smi = cs_smi

        if smi is None:
            # Try ChemScript fallback for total failures
            smi = _try_chemscript_smiles(el, cdxml_path)

        mw = 0.0
        if _frag_to_mw is not None:
            mw_val = _frag_to_mw(el)
            if mw_val is not None:
                mw = mw_val

        # Extract original geometry (coordinates + abbreviation data)
        geom = _extract_geometry(el)

        sp = SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            smiles=smi,
            name="",
            role=pos_role,
            source="fragment",
            source_id=eid,
            mw=mw,
            original_geometry=geom,
        )
        species.append(sp)
        sp_idx += 1
        _log(f"  Fragment id={eid}: smiles={smi}, pos_role={pos_role}, mw={mw:.1f}")

    # Process text labels (may contain reagent names and condition tokens)
    cdxml_conditions: List[str] = []

    for el in page:
        if el.tag != "t":
            continue

        eid = el.get("id", "")
        if eid in arrow_ids:
            continue

        text = _get_text_content(el)
        if not text:
            continue

        # Only the x coordinate of the anchor is needed here.
        tx, _ty = _text_anchor(el)

        # Skip text to the right of the arrow (product labels)
        if tx > head_x:
            continue

        # Extract condition tokens from this text block
        conds = extract_conditions_from_text(text)
        cdxml_conditions.extend(conds)

        # Split merged condition text into individual chemical tokens
        tokens = split_condition_text(text)
        if not tokens:
            continue

        for token in tokens:
            smi = _resolve_text_label(token, use_network=use_network)

            sp = SpeciesDescriptor(
                id=f"sp_{sp_idx}",
                smiles=smi,
                name=token,  # provisional — may be overwritten by display names
                role="candidate",
                source="text_label",
                source_id=eid,
            )
            species.append(sp)
            sp_idx += 1
            _log(f"  Text id={eid}: token='{token}', smiles={smi}")

    return species, warnings, cdxml_conditions
868
+
869
+
870
+ def _try_chemscript_smiles(frag_elem: ET.Element,
871
+ cdxml_path: str) -> Optional[str]:
872
+ """Try to extract SMILES from a fragment via ChemScript.
873
+
874
+ Wraps the fragment in a minimal CDXML, writes to temp file,
875
+ and calls ChemScript to export SMILES.
876
+ """
877
+ try:
878
+ from ..chemdraw.chemscript_bridge import ChemScriptBridge
879
+ from ..constants import CDXML_MINIMAL_HEADER, CDXML_FOOTER
880
+ except ImportError:
881
+ return None
882
+
883
+ import tempfile
884
+
885
+ # Build minimal CDXML containing just this fragment
886
+ frag_xml = ET.tostring(frag_elem, encoding="unicode")
887
+ cdxml_str = f"{CDXML_MINIMAL_HEADER}<page>{frag_xml}</page>{CDXML_FOOTER}"
888
+
889
+ try:
890
+ with tempfile.NamedTemporaryFile(suffix=".cdxml", delete=False,
891
+ mode="w", encoding="utf-8") as f:
892
+ f.write(cdxml_str)
893
+ tmp_path = f.name
894
+ try:
895
+ cs = ChemScriptBridge()
896
+ smi = cs.write_data(tmp_path, "smiles")
897
+ return smi.strip() if smi else None
898
+ finally:
899
+ try:
900
+ os.unlink(tmp_path)
901
+ except OSError:
902
+ pass
903
+ except Exception:
904
+ return None
905
+
906
+
907
+ # ---------------------------------------------------------------------------
908
+ # RXN file extraction
909
+ # ---------------------------------------------------------------------------
910
+
911
def _extract_from_rxn(rxn_path: str) -> Tuple[List[SpeciesDescriptor], List[str]]:
    """Extract species from an RXN file.

    Tier 1: ChemScript ``load_reaction()`` → SMILES for each component.
    Tier 2: RDKit ``ReactionFromRxnFile()`` → MOL templates → SMILES.

    Reactant-side components get role ``"candidate"``; product-side
    components get role ``"product"``.  Ids are ``sp_0``, ``sp_1``, … in
    reactants-then-products order.

    .. warning::
        Neither tier handles V2000 S-group superatom abbreviations
        (``M  STY ... SUP`` / ``M  SMT ... label``).  Findmolecule RXN
        exports commonly use these for groups like COOH, COOtBu, etc.
        The placeholder atom is read as bare C, producing an incorrect
        SMILES.  **Best practice:** parse CDX (via ChemDraw COM) + CSV
        together; RXN is a supplementary source only.

    Returns (species_list, warnings).
    """
    warnings: List[str] = []

    # Tier 1: ChemScript
    try:
        from ..chemdraw.chemscript_bridge import ChemScriptBridge
        cs = ChemScriptBridge()
        result = cs.load_reaction(rxn_path)
        if result and result.get("ok"):
            # Build into a tier-local list so that an exception part-way
            # through cannot leave half of the ChemScript species mixed
            # into the RDKit result (with clashing sp_N ids).
            cs_species = []
            for key, role in (("reactants", "candidate"),
                              ("products", "product")):
                for comp in result.get(key, []):
                    cs_species.append(SpeciesDescriptor(
                        id=f"sp_{len(cs_species)}",
                        smiles=comp.get("smiles"),
                        name=comp.get("name", ""),
                        role=role,
                        source="rxn",
                        formula=comp.get("formula"),
                    ))
            _log(f"  RXN via ChemScript: {len(cs_species)} species")
            return cs_species, warnings
    except Exception as e:
        _log(f"  ChemScript RXN load failed: {e}")

    # Tier 2: RDKit
    species = []
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem

        rxn = AllChem.ReactionFromRxnFile(rxn_path)
        if rxn is None:
            warnings.append(f"RDKit could not parse RXN file: {rxn_path}")
            return [], warnings

        template_sets = (
            (rxn.GetNumReactantTemplates(), rxn.GetReactantTemplate, "candidate"),
            (rxn.GetNumProductTemplates(), rxn.GetProductTemplate, "product"),
        )
        for count, get_template, role in template_sets:
            for i in range(count):
                mol = get_template(i)
                if mol is None or mol.GetNumAtoms() == 0:
                    continue
                try:
                    Chem.SanitizeMol(mol)
                except Exception as e:
                    # Best-effort: still emit SMILES for the unsanitized
                    # template, but leave a trace in the log.
                    _log(f"  RDKit sanitize failed for {role} template {i}: {e}")
                species.append(SpeciesDescriptor(
                    id=f"sp_{len(species)}",
                    smiles=Chem.MolToSmiles(mol),
                    role=role,
                    source="rxn",
                ))

        _log(f"  RXN via RDKit: {len(species)} species")
    except ImportError:
        warnings.append("Neither ChemScript nor RDKit available for RXN parsing")
    except Exception as e:
        warnings.append(f"RXN parsing failed: {e}")

    return species, warnings
1018
+
1019
+
1020
+ # ---------------------------------------------------------------------------
1021
+ # CSV matching
1022
+ # ---------------------------------------------------------------------------
1023
+
1024
def _match_csv_data(species: List[SpeciesDescriptor],
                    csv_path: str) -> Tuple[List[SpeciesDescriptor], List[str], Any]:
    """Match CSV reagent data to species by MW or name.

    Supplements species with CSV metadata (equiv, mass, name, substrate flag).
    Species not matched to any structural source are added as csv_only.

    Matching runs in three passes over the CSV reactants, each CSV row and
    each non-product species being matched at most once:

    1. Name match — species name (or its reagent-DB display form) equals
       the CSV name (or its display form), case-insensitively.
    2. MW match — closest species MW within ``MW_MATCH_TOLERANCE``.
    3. DB match — species with neither SMILES nor MW whose reagent-DB
       SMILES has an RDKit MW within tolerance (skipped without RDKit).

    Finally the CSV product is matched to a product species by MW.

    Returns (updated_species, warnings, exp_data).  ``exp_data`` is ``None``
    when the CSV parser is unavailable or the file cannot be parsed.
    The ``species`` list is modified in place and also returned.
    """
    warnings = []

    try:
        from .eln_csv_parser import parse_eln_csv
    except ImportError:
        warnings.append("eln_csv_parser not available for CSV parsing")
        return species, warnings, None

    exp_data = parse_eln_csv(csv_path)
    if exp_data is None:
        warnings.append(f"Could not parse CSV: {csv_path}")
        return species, warnings, None

    from ..resolve.reagent_db import get_reagent_db
    db = get_reagent_db()

    # Build match tracking
    matched_species = set()   # species indices already matched
    matched_csv = set()       # CSV reagent indices already matched

    # --- Pass 1: Name match ---
    # Species names and roles are not touched by matching, so normalise
    # each eligible name once instead of once per CSV row.
    name_keys = {}  # species index → (lowercased name, lowercased display)
    for si, sp in enumerate(species):
        if sp.role == "product":
            continue
        sp_name_lower = (sp.name or "").strip().lower()
        if not sp_name_lower:
            continue
        name_keys[si] = (sp_name_lower, db.resolve_display(sp.name).lower())

    for ci, rgt in enumerate(exp_data.reactants):
        csv_name_lower = rgt.name.strip().lower()
        csv_display_lower = db.resolve_display(rgt.name).lower()

        for si, (sp_name_lower, sp_display_lower) in name_keys.items():
            if si in matched_species:
                continue
            if (sp_name_lower == csv_name_lower
                    or sp_display_lower == csv_display_lower):
                _apply_csv_match(species[si], rgt)
                matched_species.add(si)
                matched_csv.add(ci)
                _log(f"  CSV name match: '{rgt.name}' → sp_{si}")
                break

    # --- Pass 2: MW match (species with known MW) ---
    for ci, rgt in enumerate(exp_data.reactants):
        if ci in matched_csv:
            continue
        if rgt.mw <= 0:
            continue

        best_si = None
        best_delta = MW_MATCH_TOLERANCE

        for si, sp in enumerate(species):
            if si in matched_species:
                continue
            if sp.role == "product":
                continue
            if sp.mw <= 0:
                continue

            delta = abs(sp.mw - rgt.mw)
            if delta < best_delta:
                best_delta = delta
                best_si = si

        if best_si is not None:
            _apply_csv_match(species[best_si], rgt)
            matched_species.add(best_si)
            matched_csv.add(ci)
            _log(f"  CSV MW match: '{rgt.name}' (MW={rgt.mw:.1f}) "
                 f"→ sp_{best_si} (MW={species[best_si].mw:.1f})")

    # --- Pass 3: MW match via SMILES from reagent_db ---
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors
        has_rdkit = True
    except ImportError:
        has_rdkit = False

    if has_rdkit:
        for ci, rgt in enumerate(exp_data.reactants):
            if ci in matched_csv:
                continue
            if rgt.mw <= 0:
                continue

            for si, sp in enumerate(species):
                if si in matched_species:
                    continue
                if sp.role == "product":
                    continue
                if sp.smiles or sp.mw > 0:
                    continue  # already has structural data

                # Try to get SMILES from reagent_db for this text label
                sp_name = (sp.name or "").strip()
                if not sp_name:
                    continue
                entry = db.entry_for_name(sp_name.lower())
                if not entry:
                    continue
                smi = entry.get("smiles")
                if not smi:
                    continue
                if isinstance(smi, list):
                    smi = smi[0]

                mol = Chem.MolFromSmiles(smi)
                if mol is None:
                    continue
                text_mw = Descriptors.MolWt(mol)
                delta = abs(text_mw - rgt.mw)
                if delta < MW_MATCH_TOLERANCE:
                    sp.smiles = Chem.MolToSmiles(mol)
                    sp.mw = text_mw
                    _apply_csv_match(sp, rgt)
                    matched_species.add(si)
                    matched_csv.add(ci)
                    _log(f"  CSV MW→DB match: '{rgt.name}' → sp_{si} "
                         f"via DB SMILES '{sp_name}'")
                    break

    # --- Add unmatched CSV reagents as csv_only species ---
    sp_idx = max((int(sp.id.split("_")[1]) for sp in species), default=-1) + 1
    for ci, rgt in enumerate(exp_data.reactants):
        if ci in matched_csv:
            continue
        sp = SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            name=rgt.name,
            role="candidate",
            source="csv_only",
            mw=rgt.mw,
            csv_name=rgt.name,
            csv_equiv=rgt.equiv,
            csv_mass=rgt.mass,
        )
        # Route through the same helper as matched species so csv_only
        # entries also receive volume, supplier and both substrate flags
        # (is_sm and is_substrate) instead of is_sm alone.
        _apply_csv_match(sp, rgt)
        # Try to resolve SMILES from name
        smi = _resolve_text_label(rgt.name, use_network=False)
        if smi:
            sp.smiles = smi
        species.append(sp)
        sp_idx += 1
        _log(f"  CSV-only species: '{rgt.name}' (MW={rgt.mw:.1f})")

    # --- Match product to CSV ---
    if exp_data.product and exp_data.product.mw > 0:
        for sp in species:
            if sp.role != "product":
                continue
            if sp.mw > 0:
                delta = abs(sp.mw - exp_data.product.mw)
                if delta < MW_MATCH_TOLERANCE:
                    sp.csv_name = exp_data.product.name
                    sp.is_dp = True
                    _log(f"  Product CSV match: '{exp_data.product.name}'")
                    break

    return species, warnings, exp_data
1198
+
1199
+
1200
def _apply_csv_match(sp: SpeciesDescriptor, rgt) -> None:
    """Copy one CSV reagent row's metadata onto the species *sp*.

    Name, equivalents and mass are always copied.  Volume and supplier are
    copied only when the row has a truthy value for them.  A truthy
    ``is_substrate`` on the row sets both ``is_sm`` and ``is_substrate``.
    """
    sp.csv_name = rgt.name
    sp.csv_equiv = rgt.equiv
    sp.csv_mass = rgt.mass

    # Optional columns: absent or empty values leave the species untouched.
    for column in ("volume", "supplier"):
        value = getattr(rgt, column, None)
        if value:
            setattr(sp, f"csv_{column}", value)

    if getattr(rgt, "is_substrate", False):
        sp.is_sm = True  # Mark from CSV substrate flag
        sp.is_substrate = True
1212
+
1213
+
1214
+ # ---------------------------------------------------------------------------
1215
+ # Species classification
1216
+ # ---------------------------------------------------------------------------
1217
+
1218
def _classify_species(species: List[SpeciesDescriptor],
                      use_rxnmapper: bool = True,
                      use_rxn_insight: bool = True,
                      ) -> Optional[float]:
    """Classify non-product species using the tiered pipeline.

    Every species whose role is ``"candidate"`` is handed to
    ``classify_reagents`` together with the SMILES of the first product
    that has one.  The resulting classification, method and role detail
    are written back onto the species (``"unclassified"`` when the
    classifier leaves the classification empty).

    Args:
        species: Species to classify; modified in place.
        use_rxnmapper: Deprecated and ignored (kept for API compat).
        use_rxn_insight: When true, also run the optional RXN Insight
            enrichment via ``_try_rxn_insight``.

    Returns Schneider FP score (if classification ran), or None.
    ``None`` is also returned when there is no product SMILES or no
    candidate species.
    """
    from .reactant_heuristic import ReagentInfo, classify_reagents

    # Find product SMILES (needed for classification)
    product_smiles = next(
        (sp.smiles for sp in species if sp.role == "product" and sp.smiles),
        None,
    )
    if not product_smiles:
        _log("  WARNING: No product SMILES found, cannot classify reagents")
        return None

    # Build ReagentInfo list for the classification pipeline
    reagents = []
    sp_to_ri = {}  # map species index → ReagentInfo index
    for i, sp in enumerate(species):
        if sp.role != "candidate":
            continue
        sp_to_ri[i] = len(reagents)
        reagents.append(ReagentInfo(
            source_id=sp.source_id or sp.id,
            source_type=sp.source,
            name=sp.name or None,
            smiles=sp.smiles,
            position="reactant",
            classification="",
            classification_method="",
        ))

    if not reagents:
        return None

    # Run 2-tier classification (Schneider FP → DB enrichment)
    classify_reagents(reagents, product_smiles)

    # Apply results back to species; the first non-None score is reported.
    schneider_score = None
    for sp_i, ri_i in sp_to_ri.items():
        ri = reagents[ri_i]
        sp = species[sp_i]
        sp.role = ri.classification or "unclassified"
        sp.classification_method = ri.classification_method
        sp.role_detail = ri.role
        if ri.schneider_score is not None and schneider_score is None:
            schneider_score = ri.schneider_score

    # --- Optional RXN Insight enrichment ---
    # Called for its side effect on the species; the returned reaction
    # class/name pair is not consumed here.
    if use_rxn_insight:
        _try_rxn_insight(species, product_smiles)

    return schneider_score
1285
+
1286
+
1287
def _try_rxn_insight(species: List[SpeciesDescriptor],
                     product_smiles: str,
                     ) -> Tuple[Optional[str], Optional[str]]:
    """Run the optional RXN Insight classifier and annotate species with its roles.

    A reaction SMILES is assembled from every non-product species SMILES
    and *product_smiles*.  Roles reported per component are written to
    ``rxn_insight_role`` on the species with the same (canonical) SMILES.

    Returns (reaction_class, reaction_name), or (None, None) when the
    classifier cannot be imported, there is nothing on the left-hand side,
    the classifier raises, or it returns an empty result.
    """
    try:
        from experiments.role_classification.rxn_role_classifier import (
            classify_roles_enriched,
        )
    except ImportError:
        return None, None

    # Left-hand side: all reactant/reagent SMILES, in species order.
    lhs_parts = [sp.smiles for sp in species
                 if sp.role != "product" and sp.smiles]
    if not lhs_parts:
        return None, None

    rxn_smi = ">>".join((".".join(lhs_parts), product_smiles))

    try:
        result = classify_roles_enriched(rxn_smi)
    except Exception as e:
        _log(f"  RXN Insight failed: {e}")
        return None, None

    if not result:
        return None, None

    rxn_class = result.get("reaction_class")
    rxn_name = result.get("reaction_name")

    # Canonicalise with RDKit when present; otherwise compare SMILES verbatim.
    try:
        from rdkit import Chem
    except ImportError:
        Chem = None

    def _canon(smi):
        if Chem is None:
            return smi
        mol = Chem.MolFromSmiles(smi)
        return Chem.MolToSmiles(mol) if mol else smi

    # Canonical SMILES → role; a later component overrides an earlier one.
    comp_map = {
        _canon(comp.get("smiles", "")): comp.get("role")
        for comp in result.get("components", [])
    }

    for sp in species:
        if not sp.smiles or sp.role == "product":
            continue
        insight_role = comp_map.get(_canon(sp.smiles))
        if insight_role:
            sp.rxn_insight_role = insight_role

    _log(f"  RXN Insight: class={rxn_class}, name={rxn_name}")
    return rxn_class, rxn_name
1347
+
1348
+
1349
+ # ---------------------------------------------------------------------------
1350
+ # SM / DP identification and display names
1351
+ # ---------------------------------------------------------------------------
1352
+
1353
def _identify_sm_dp(species: List[SpeciesDescriptor]) -> None:
    """Flag the desired product (``is_dp``) and starting material (``is_sm``).

    DP: the sole product; with several products, an existing ``is_dp`` flag
    is respected, otherwise the product with the largest MW is flagged.

    SM, in priority order:
      0. species already flagged ``is_sm`` (non-product) — the one with the
         largest MW keeps the flag, the rest are cleared;
      1. largest-MW ``atom_contributing`` non-solvent with MW > 50;
      2. largest-MW non-product non-solvent with MW > 50.
    """
    # --- Desired product ---
    products = [sp for sp in species if sp.role == "product"]
    if len(products) == 1:
        products[0].is_dp = True
    elif products and not any(sp.is_dp for sp in products):
        # Several products, none matched to the CSV product: take the heaviest.
        max(products, key=lambda sp: sp.mw).is_dp = True

    # --- Starting material ---
    # Priority 0: substrates already flagged (from the CSV).
    flagged = [sp for sp in species if sp.is_sm and sp.role != "product"]
    if flagged:
        primary = max(flagged, key=lambda sp: sp.mw)
        # Only the primary SM keeps its flag (multi-substrate reactions).
        for other in flagged:
            if other is not primary:
                other.is_sm = False
        return

    def _heaviest(role_ok):
        # The MW > 50 cutoff excludes small counterions such as HCl (36).
        pool = [sp for sp in species
                if role_ok(sp.role) and not sp.is_solvent and sp.mw > 50]
        return max(pool, key=lambda sp: sp.mw) if pool else None

    # Priority 1, then priority 2.
    sm = _heaviest(lambda role: role == "atom_contributing")
    if sm is None:
        sm = _heaviest(lambda role: role != "product")
    if sm is not None:
        sm.is_sm = True
1397
+
1398
+
1399
def _apply_display_names(species: List[SpeciesDescriptor]) -> None:
    """Set ``name`` on every species using a fixed precedence of sources.

    The first source that yields a value wins:
      1. reagent-DB display name looked up by SMILES;
      2. reagent-DB display name resolved from the current name;
      3. reagent-DB display name looked up by the lowercased CSV name;
      4. the CSV name itself;
      5. the molecular formula;
      6. the SMILES string.
    If none applies the existing name is left as it is.  SM / DP species
    follow the same rules (no special "SM"/"DP" override; compound labels
    are a layout-layer decision).
    """
    from ..resolve.reagent_db import get_reagent_db
    db = get_reagent_db()

    def _pick_name(sp):
        """Return the winning name for *sp*, or None to keep the current one."""
        if sp.smiles:
            by_smiles = db.display_for_smiles(sp.smiles)
            if by_smiles:
                return by_smiles

        if sp.name:
            by_name = db.resolve_display(sp.name)
            if by_name:
                return by_name

        if sp.csv_name:
            # Abbreviation from the DB beats the full CSV name.
            by_csv = db.display_for_name(sp.csv_name.lower())
            return by_csv if by_csv else sp.csv_name

        if sp.formula:
            return sp.formula

        if sp.smiles:
            return sp.smiles

        return None

    for sp in species:
        chosen = _pick_name(sp)
        if chosen is not None:
            sp.name = chosen
1447
+
1448
+
1449
+
1450
+
1451
def _detect_solvents(species: List[SpeciesDescriptor],
                     exp_data: Optional[Any] = None) -> None:
    """Mark solvent species from CSV SOLVENT section and reagent_db role.

    1. Every species whose ``role_detail`` is ``"solvent"`` gets
       ``is_solvent = True``.
    2. If *exp_data* is given, each named entry of ``exp_data.solvents`` is
       matched case-insensitively against the species' name, CSV name,
       resolved display name and display text; the first matching species
       is flagged as a solvent.
    3. CSV solvents that matched nothing are appended to *species* as
       ``csv_only`` solvent entries (one per distinct lowercased name),
       with SMILES taken from the reagent DB when an entry exists.

    The *species* list and its members are modified in place.
    """
    from ..resolve.reagent_db import get_reagent_db
    db = get_reagent_db()

    # From reagent_db role_detail
    for sp in species:
        if sp.role_detail == "solvent":
            sp.is_solvent = True

    if exp_data is None:
        return

    # From CSV SOLVENT section — match by name to existing species
    csv_solvents = getattr(exp_data, "solvents", None) or []
    matched_solvent_names = set()

    for solv in csv_solvents:
        solv_name = solv.name.strip()
        if not solv_name:
            continue
        solv_lower = solv_name.lower()
        solv_keys = {solv_lower, db.resolve_display(solv_name).lower()}

        for sp in species:
            candidates = {
                (sp.name or "").strip().lower(),
                (sp.csv_name or "").strip().lower(),
                db.resolve_display(sp.name or "").lower(),
                (sp.display_text or "").strip().lower(),
            } - {""}
            if candidates & solv_keys:
                sp.is_solvent = True
                matched_solvent_names.add(solv_lower)
                break

    # Add unmatched solvents as csv_only species
    sp_idx = max((int(sp.id.split("_")[1]) for sp in species), default=-1) + 1
    for solv in csv_solvents:
        solv_name = solv.name.strip()
        # Recompute the key for THIS solvent.  Reusing the variable left
        # over from the matching loop would record the wrong name, letting
        # duplicates through and suppressing a later unmatched solvent.
        solv_lower = solv_name.lower()
        if not solv_name or solv_lower in matched_solvent_names:
            continue
        # Check if this is a known reagent
        smi = None
        entry = db.entry_for_name(solv_lower)
        if entry:
            smi_val = entry.get("smiles")
            if isinstance(smi_val, list):
                smi_val = smi_val[0] if smi_val else None
            smi = smi_val
        sp = SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            name=solv_name,
            role="non_contributing",
            role_detail="solvent",
            source="csv_only",
            smiles=smi,
            is_solvent=True,
        )
        species.append(sp)
        sp_idx += 1
        matched_solvent_names.add(solv_lower)  # prevent duplicate csv_only entries
        _log(f"  CSV solvent added: '{solv_name}'")
1515
+
1516
+
1517
+ def _format_equiv(equiv_str: str) -> str:
1518
+ """Format equivalents for display: '2.0' → '2', '0.05' → '0.05'."""
1519
+ if not equiv_str:
1520
+ return ""
1521
+ try:
1522
+ val = float(equiv_str)
1523
+ if val == int(val):
1524
+ return str(int(val))
1525
+ return equiv_str.strip()
1526
+ except (ValueError, TypeError):
1527
+ return equiv_str.strip()
1528
+
1529
+
1530
def _build_display_texts(species: List[SpeciesDescriptor]) -> None:
    """Fill in ``display_text`` (name plus optional equiv annotation) per species.

    display_text is what would appear on a rendered scheme:
      - no name at all: ``None``
      - substrates, SM, DP and solvents: the bare name
      - anything else with a CSV equiv other than 1: "Cs2CO3 (2 eq.)"
      - everything else: the bare name
    """
    for sp in species:
        label = sp.name or ""
        if not label:
            sp.display_text = None
            continue

        # Species that never carry an equiv suffix.
        name_only = (sp.is_substrate or sp.is_sm or sp.is_dp
                     or sp.is_solvent or not sp.csv_equiv)
        if not name_only:
            equiv_str = _format_equiv(sp.csv_equiv)
            # equiv = 1 is implied and therefore suppressed.
            if equiv_str and equiv_str != "1":
                label = f"{label} ({equiv_str} eq.)"

        sp.display_text = label
1558
+
1559
+
1560
+ def _populate_eln_data(desc: "ReactionDescriptor",
1561
+ exp_data: Optional[Any]) -> None:
1562
+ """Populate desc.eln_data from parsed CSV ExperimentData."""
1563
+ if exp_data is None:
1564
+ return
1565
+
1566
+ eln = {}
1567
+
1568
+ # SM mass from substrate species
1569
+ sm = desc.get_sm()
1570
+ if sm and sm.csv_mass:
1571
+ eln["sm_mass"] = sm.csv_mass.strip()
1572
+
1573
+ # Product yield data
1574
+ product = getattr(exp_data, "product", None)
1575
+ if product:
1576
+ if hasattr(product, "obtained_mass") and product.obtained_mass:
1577
+ eln["product_obtained"] = product.obtained_mass.strip()
1578
+ if hasattr(product, "yield_pct") and product.yield_pct:
1579
+ eln["product_yield"] = product.yield_pct.strip()
1580
+
1581
+ # Procedure text (HTML + plain text)
1582
+ procedure = getattr(exp_data, "procedure_html", "")
1583
+ if procedure:
1584
+ eln["procedure_text"] = procedure
1585
+ procedure_plain = getattr(exp_data, "procedure_text", "")
1586
+ if procedure_plain:
1587
+ eln["procedure_plain"] = procedure_plain
1588
+
1589
+ # Experiment metadata
1590
+ reaction_type = getattr(exp_data, "reaction_type", "")
1591
+ if reaction_type:
1592
+ eln["reaction_type"] = reaction_type
1593
+ start_date = getattr(exp_data, "start_date", "")
1594
+ if start_date:
1595
+ eln["start_date"] = start_date
1596
+ labbook = getattr(exp_data, "labbook_name", "")
1597
+ if labbook:
1598
+ eln["labbook_name"] = labbook
1599
+
1600
+ # Solvents list (names only, backward compat)
1601
+ solvents = getattr(exp_data, "solvents", [])
1602
+ if solvents:
1603
+ eln["solvents"] = [s.name.strip() for s in solvents if s.name.strip()]
1604
+ # Full solvent details with volume/concentration
1605
+ eln["solvent_details"] = [
1606
+ {
1607
+ "name": s.name.strip(),
1608
+ "volume": getattr(s, "volume", "").strip(),
1609
+ "concentration": getattr(s, "concentration", "").strip(),
1610
+ }
1611
+ for s in solvents if s.name.strip()
1612
+ ]
1613
+
1614
+ if eln:
1615
+ desc.eln_data = eln
1616
+
1617
+
1618
+ # ---------------------------------------------------------------------------
1619
+ # Mass computation
1620
+ # ---------------------------------------------------------------------------
1621
+
1622
def _compute_all_masses(species: List[SpeciesDescriptor]) -> None:
    """Derive mass-related fields from each species' SMILES using RDKit.

    For every species whose SMILES parses, sets ``exact_mass_full``,
    ``exact_mass`` / ``smiles_neutral`` (largest fragment by heavy-atom
    count when the SMILES has several fragments) and ``adducts``; fills
    ``mw`` when it is not positive and ``formula`` when it is empty.
    Species without SMILES, or with unparsable SMILES, are left untouched.
    Without RDKit nothing is computed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
    except ImportError:
        return

    # Adduct label → mass shift applied to the neutral exact mass
    # (for LCMS matching).
    adduct_shifts = (
        ("[M+H]+", 1.00728),
        ("[M-H]-", -1.00728),
        ("[M+Na]+", 22.98922),
        ("[M+formate]-", 44.99820),
    )

    for sp in species:
        if not sp.smiles:
            continue
        mol = Chem.MolFromSmiles(sp.smiles)
        if mol is None:
            continue

        # Full mass (including counterions)
        sp.exact_mass_full = Descriptors.ExactMolWt(mol)

        # Average MW (for CSV matching) — only when not already known
        if sp.mw <= 0:
            sp.mw = Descriptors.MolWt(mol)

        if not sp.formula:
            sp.formula = rdMolDescriptors.CalcMolFormula(mol)

        # Salt splitting: neutral = largest fragment
        pieces = Chem.GetMolFrags(mol, asMols=True)
        if len(pieces) > 1:
            parent = max(pieces, key=lambda m: m.GetNumHeavyAtoms())
            sp.exact_mass = Descriptors.ExactMolWt(parent)
            sp.smiles_neutral = Chem.MolToSmiles(parent)
        else:
            sp.exact_mass = sp.exact_mass_full
            sp.smiles_neutral = sp.smiles

        sp.adducts = {
            label: sp.exact_mass + shift for label, shift in adduct_shifts
        }
1668
+
1669
+
1670
+ # ---------------------------------------------------------------------------
1671
+ # Deduplication
1672
+ # ---------------------------------------------------------------------------
1673
+
1674
def _deduplicate_species(species: List[SpeciesDescriptor]) -> List[SpeciesDescriptor]:
    """Collapse duplicate species and renumber the survivors.

    The work is done in two passes:

    1. **SMILES pass.** Every non-empty SMILES is canonicalized with RDKit
       (when RDKit is importable; otherwise the raw string is used) so that
       different spellings of the same molecule compare equal.  The first
       occurrence of each canonical SMILES is kept and every later
       duplicate is folded into it with ``_merge_into``.  Entries without a
       SMILES are carried over unchanged.
    2. **MW pass (RDKit only).** An entry without SMILES but with a
       truthy ``mw`` is folded into the SMILES-bearing entry whose
       RDKit molecular weight is closest, provided the difference is
       strictly below ``MW_MATCH_TOLERANCE``.  The whole pass is skipped
       when any two SMILES-bearing entries lie within that tolerance of
       each other, because the match would then be ambiguous.

    Side effects: ``smiles`` of each processed entry is overwritten with
    its canonical form, merged-into entries are mutated in place, and the
    ``id`` of every returned entry is reassigned to ``sp_<position>``.

    Args:
        species: Species in discovery order.

    Returns:
        The deduplicated list (the input object itself when it is empty).
    """
    if not species:
        return species

    # RDKit is optional: without it SMILES are compared verbatim and the
    # MW pass is skipped.
    try:
        from rdkit import Chem
    except ImportError:
        Chem = None
    try:
        from rdkit.Chem import Descriptors
    except ImportError:
        Descriptors = None

    def _canon(smi: str) -> str:
        if Chem is None:
            return smi
        mol = Chem.MolFromSmiles(smi)
        # Unparseable SMILES fall back to the raw string.
        return Chem.MolToSmiles(mol) if mol else smi

    _ROLE_PRIO = {"product": 0, "atom_contributing": 1,
                  "non_contributing": 2, "candidate": 3,
                  "unclassified": 4}

    # --- Pass 1: merge by canonical SMILES ---
    seen: Dict[str, int] = {}  # canonical SMILES -> index in result
    result = []
    for entry in species:
        if not entry.smiles:
            result.append(entry)
            continue

        key = _canon(entry.smiles)
        # Store the canonical form so later stages see one spelling.
        entry.smiles = key

        if key in seen:
            _merge_into(result[seen[key]], entry, _ROLE_PRIO)
        else:
            seen[key] = len(result)
            result.append(entry)

    # --- Pass 2: merge SMILES-less entries by molecular weight ---
    merged_indices: set = set()
    if Chem is not None and Descriptors is not None:
        smiles_mws: List[Tuple[int, float]] = []  # (index in result, MW)
        for idx, entry in enumerate(result):
            if not entry.smiles:
                continue
            mol = Chem.MolFromSmiles(entry.smiles)
            if mol is not None:
                smiles_mws.append((idx, Descriptors.MolWt(mol)))

        # Some pair lies within tolerance exactly when some *adjacent* pair
        # of the sorted values does, so one pass over the sorted list
        # replaces the all-pairs scan.
        ordered = sorted(mw for _, mw in smiles_mws)
        mw_ambiguous = any(
            upper - lower < MW_MATCH_TOLERANCE
            for lower, upper in zip(ordered, ordered[1:])
        )

        if not mw_ambiguous:
            for idx, entry in enumerate(result):
                if entry.smiles or not entry.mw:
                    continue
                best_delta = MW_MATCH_TOLERANCE
                best_idx = -1
                # Targets are always SMILES-bearing entries, which are never
                # themselves removed, so no "already merged" filter is needed.
                for target_idx, target_mw in smiles_mws:
                    delta = abs(target_mw - entry.mw)
                    if delta < best_delta:
                        best_delta = delta
                        best_idx = target_idx
                if best_idx >= 0:
                    target = result[best_idx]
                    _merge_into(target, entry, _ROLE_PRIO)
                    merged_indices.add(idx)
                    _log(f" Dedup MW-merge: {entry.name or entry.csv_name} → "
                         f"{target.name or target.csv_name} "
                         f"(delta={best_delta:.1f} Da)")
        elif any(not sp.smiles and sp.mw for sp in result):
            _log(" Dedup: skipping MW-merge (ambiguous MW among species)")

    if merged_indices:
        result = [sp for i, sp in enumerate(result) if i not in merged_indices]

    # Re-index so ids are contiguous after removals.
    for i, sp in enumerate(result):
        sp.id = f"sp_{i}"

    return result
1788
+
1789
+
1790
def _merge_into(existing: "SpeciesDescriptor", incoming: "SpeciesDescriptor",
                role_prio: Dict[str, int]) -> None:
    """Fold the metadata of *incoming* into *existing* (mutated in place).

    Rules, each applied independently:

    * CSV fields (``csv_name``, ``csv_equiv``, ``csv_mass``) are copied as a
      group, and only when *existing* has no ``csv_name`` yet.
    * ``name``, ``role_detail``, ``smiles`` and ``mw`` are filled in only
      when *existing* holds a falsy value and *incoming* a truthy one.
    * The boolean markers ``is_sm``, ``is_dp``, ``is_substrate`` and
      ``is_solvent`` are switched on when *incoming* has them set.
    * ``source`` and ``role`` take the incoming value when it ranks
      strictly better (lower number) than the existing one.

    Args:
        existing: Entry that survives the merge.
        incoming: Duplicate entry whose information is absorbed.
        role_prio: Role name -> rank (lower wins); unknown roles rank 5.
    """
    # CSV columns travel together, keyed on csv_name.
    if incoming.csv_name and not existing.csv_name:
        existing.csv_name = incoming.csv_name
        existing.csv_equiv = incoming.csv_equiv
        existing.csv_mass = incoming.csv_mass

    # Fill-if-empty attributes.
    for attr in ("name", "role_detail", "smiles", "mw"):
        if not getattr(existing, attr) and getattr(incoming, attr):
            setattr(existing, attr, getattr(incoming, attr))

    # SM / DP markers are asserted whenever the incoming entry carries them.
    for marker in ("is_sm", "is_dp"):
        if getattr(incoming, marker):
            setattr(existing, marker, True)

    # Substrate / solvent markers are only switched on, never rewritten.
    for marker in ("is_substrate", "is_solvent"):
        if getattr(incoming, marker) and not getattr(existing, marker):
            setattr(existing, marker, True)

    # Prefer source with more info: fragment > rxn > text_label > csv_only
    source_rank = {"fragment": 0, "rxn": 1, "text_label": 2, "csv_only": 3}
    incoming_src = source_rank.get(incoming.source, 9)
    existing_src = source_rank.get(existing.source, 9)
    if incoming_src < existing_src:
        existing.source = incoming.source

    # Keep higher role (product > atom_contributing > non_contributing)
    incoming_role = role_prio.get(incoming.role, 5)
    existing_role = role_prio.get(existing.role, 5)
    if incoming_role < existing_role:
        existing.role = incoming.role
1823
+
1824
+
1825
+ # ---------------------------------------------------------------------------
1826
+ # Build reaction SMILES
1827
+ # ---------------------------------------------------------------------------
1828
+
1829
def _build_reaction_smiles(species: List[SpeciesDescriptor]) -> Optional[str]:
    """Assemble a ``reactants>>products`` reaction SMILES.

    Species without a SMILES are ignored.  Those whose ``role`` is
    ``"product"`` go to the right-hand side; every other species goes to
    the left-hand side.  Components on each side are joined with ``.`` in
    list order.

    Returns:
        The reaction SMILES, or ``None`` when either side would be empty.
    """
    drawable = [sp for sp in species if sp.smiles]
    reactant_side = [sp.smiles for sp in drawable if sp.role != "product"]
    product_side = [sp.smiles for sp in drawable if sp.role == "product"]

    # A one-sided "reaction" is not meaningful.
    if not (product_side and reactant_side):
        return None

    return ">>".join((".".join(reactant_side), ".".join(product_side)))
1846
+
1847
+
1848
+ # ---------------------------------------------------------------------------
1849
+ # Main public API
1850
+ # ---------------------------------------------------------------------------
1851
+
1852
def parse_reaction(
    cdxml: Optional[str] = None,
    cdx: Optional[str] = None,
    csv: Optional[str] = None,
    rxn: Optional[str] = None,
    input_dir: Optional[str] = None,
    experiment: Optional[str] = None,
    use_rxnmapper: bool = False,
    use_rxn_insight: bool = True,
    use_network: bool = True,
    verbose: bool = False,
) -> ReactionDescriptor:
    """Parse reaction from ELN files and return a ReactionDescriptor.

    Accepts any combination of input files.  Species are extracted from
    the CDXML when one is available (given directly or obtained by
    converting ``cdx``); the RXN file is read only when there is no CDXML.
    When a CSV is given it is matched against the extracted species and
    its experiment data feeds solvent detection and the ELN fields.

    When both ``input_dir`` and ``experiment`` are given, missing ``cdx``,
    ``csv`` and ``rxn`` paths are filled from the first discovered file of
    each kind.  Discovery failures are logged and otherwise ignored.

    Args:
        cdxml: Path to CDXML file (polished or raw)
        cdx: Path to CDX file (converted to CDXML internally)
        csv: Path to Findmolecule ELN CSV
        rxn: Path to RXN file
        input_dir: Directory to auto-discover files from
        experiment: Experiment name (with input_dir)
        use_rxnmapper: Deprecated; forwarded unchanged to the classifier.
        use_rxn_insight: Enable RXN Insight enrichment
        use_network: Enable PubChem name resolution
        verbose: Print diagnostic messages to stderr

    Returns:
        ReactionDescriptor with all species and metadata.

    Side effects:
        Sets the module-level ``_verbose`` flag to *verbose*.
    """
    global _verbose
    _verbose = verbose

    # --- Step 0: Auto-discover files if input_dir given ---
    if input_dir and experiment:
        try:
            try:
                # Installed-package layout: the helper lives in the sibling
                # ``analysis.deterministic`` sub-package.
                from ..analysis.deterministic.discover_experiment_files import (
                    discover_experiment_files,
                )
            except ImportError:
                # Legacy flat-script layout: helper importable from sys.path.
                from discover_experiment_files import discover_experiment_files
            disc = discover_experiment_files(input_dir, experiment)
            if not cdxml and disc.cdx_files:
                cdx = cdx or disc.cdx_files[0]
            if not csv and disc.csv_files:
                csv = csv or disc.csv_files[0]
            if not rxn and disc.rxn_files:
                rxn = rxn or disc.rxn_files[0]
        except Exception as e:
            # Discovery is best-effort; explicit paths still work without it.
            _log(f" File discovery failed: {e}")
    elif input_dir:
        _log(" File discovery skipped: input_dir given without experiment")

    # --- Step 0b: CDX → CDXML conversion ---
    if cdx and not cdxml:
        cdxml = _convert_cdx_to_cdxml(cdx)

    desc = ReactionDescriptor(
        experiment=experiment or _stem(cdxml or cdx or rxn or csv or "unknown"),
        input_files={
            "cdxml": cdxml,
            "csv": csv,
            "rxn": rxn,
            "cdx": cdx,
        },
    )

    # Metadata
    desc.metadata["parser_version"] = "1.3"
    desc.metadata["timestamp"] = datetime.datetime.now().isoformat(
        timespec="seconds")
    desc.metadata["rdkit_available"] = _check_rdkit()
    desc.metadata["chemscript_available"] = _check_chemscript()

    # --- Step 1: Extract species from structural source ---
    species: List[SpeciesDescriptor] = []
    warnings: List[str] = []
    cdxml_conditions: List[str] = []

    if cdxml:
        _log(f"Extracting from CDXML: {os.path.basename(cdxml)}")
        sp, w, conds = _extract_from_cdxml(cdxml, use_network=use_network)
        species.extend(sp)
        warnings.extend(w)
        cdxml_conditions.extend(conds)
    elif rxn:
        _log(f"Extracting from RXN: {os.path.basename(rxn)}")
        sp, w = _extract_from_rxn(rxn)
        species.extend(sp)
        warnings.extend(w)

    # --- Step 2: Match CSV data (also returns exp_data for ELN enrichment) ---
    exp_data = None
    if csv:
        _log(f"Matching CSV: {os.path.basename(csv)}")
        species, w, exp_data = _match_csv_data(species, csv)
        warnings.extend(w)

    # --- Step 3: Deduplicate ---
    species = _deduplicate_species(species)

    # --- Step 4: Compute masses (needed before classification MW checks) ---
    _compute_all_masses(species)

    # --- Step 5: Classify roles ---
    _log("Classifying species roles...")
    confidence = _classify_species(
        species,
        use_rxnmapper=use_rxnmapper,
        use_rxn_insight=use_rxn_insight,
    )
    desc.classification_confidence = confidence

    # --- Step 6: Identify SM and DP ---
    _identify_sm_dp(species)

    # --- Step 6.5: Detect solvents (from CSV + reagent_db) ---
    _detect_solvents(species, exp_data=exp_data)

    # --- Step 7: Apply display names ---
    _apply_display_names(species)

    # --- Step 8: Build display_text ---
    _build_display_texts(species)

    # --- Step 9: Build reaction SMILES ---
    desc.reaction_smiles = _build_reaction_smiles(species)

    desc.species = species
    desc.warnings = warnings

    # --- Step 10: Populate ELN data (run arrow, procedure, solvents) ---
    _populate_eln_data(desc, exp_data)

    # --- Step 11: Populate conditions (from CDXML text extraction) ---
    desc.conditions = cdxml_conditions

    _log(f"Parsed {len(species)} species, "
         f"{sum(1 for s in species if s.is_sm)} SM, "
         f"{sum(1 for s in species if s.is_dp)} DP")

    return desc
1996
+
1997
+
1998
+ # ---------------------------------------------------------------------------
1999
+ # Helper utilities
2000
+ # ---------------------------------------------------------------------------
2001
+
2002
def _stem(path: str) -> str:
    """Return the last path component of *path* with its final extension removed."""
    filename = os.path.basename(path)
    root, _extension = os.path.splitext(filename)
    return root
2005
+
2006
+
2007
def _check_rdkit() -> bool:
    """Return True when ``rdkit.Chem`` can be imported, False otherwise."""
    try:
        from rdkit import Chem  # noqa: F401
    except ImportError:
        available = False
    else:
        available = True
    return available
2013
+
2014
+
2015
def _check_chemscript() -> bool:
    """Return True when the package's ``ChemScriptBridge`` can be imported.

    The import is relative, so it fails (and False is returned) whenever
    the module is not loaded as part of its parent package.
    """
    try:
        from ..chemdraw.chemscript_bridge import ChemScriptBridge  # noqa: F401
    except ImportError:
        available = False
    else:
        available = True
    return available
2021
+
2022
+
2023
def _convert_cdx_to_cdxml(cdx_path: str) -> Optional[str]:
    """Convert a CDX file to CDXML by running ``cdx_converter.py``.

    The output is written next to the input, with the extension replaced
    by ``.cdxml``.  If that file already exists it is reused and no
    conversion is run.

    The converter script is searched for beside this module first and
    then in the sibling ``chemdraw`` package directory, which is where the
    packaged distribution ships it.

    Args:
        cdx_path: Path to the ``.cdx`` file.

    Returns:
        Path of the CDXML file, or ``None`` when the converter script is
        missing, exits non-zero, produces no output file, or raises
        (including the 60 s timeout).  Failures are logged, never raised.
    """
    import subprocess

    out_path = os.path.splitext(cdx_path)[0] + ".cdxml"
    if os.path.isfile(out_path):
        return out_path

    script_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = (
        # Legacy flat layout: converter sits beside this module.
        os.path.join(script_dir, "cdx_converter.py"),
        # Package layout: converter lives in the sibling chemdraw package.
        os.path.join(os.path.dirname(script_dir), "chemdraw",
                     "cdx_converter.py"),
    )
    converter = next((c for c in candidates if os.path.isfile(c)), None)

    if converter is None:
        _log(f" cdx_converter.py not found at {' or '.join(candidates)}")
        return None

    try:
        result = subprocess.run(
            [sys.executable, converter, cdx_path, "-o", out_path],
            capture_output=True, text=True, timeout=60)
        if result.returncode == 0 and os.path.isfile(out_path):
            _log(f" Converted CDX → CDXML: {out_path}")
            return out_path
        else:
            _log(f" CDX conversion failed: {result.stderr}")
            return None
    except Exception as e:
        # Best-effort: callers fall back to other inputs when this is None.
        _log(f" CDX conversion error: {e}")
        return None
2052
+
2053
+
2054
+ # ---------------------------------------------------------------------------
2055
+ # CLI
2056
+ # ---------------------------------------------------------------------------
2057
+
2058
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the reaction parser CLI.

    Arguments are registered from a table so that their order (and hence
    the ``--help`` layout) is easy to read: input files first, then output
    controls, then behaviour switches.
    """
    examples = """
Examples:
  python reaction_parser.py experiment.cdxml -o reaction.json
  python reaction_parser.py experiment.cdxml --csv exp.csv --pretty
  python reaction_parser.py --input-dir path/ --experiment KL-7001-004
"""
    parser = argparse.ArgumentParser(
        description="Parse reaction from ELN files into a persisted JSON descriptor.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=examples,
    )

    # (flags, add_argument keyword arguments), in display order.
    argument_table = [
        # Input files
        (("cdxml_positional",),
         dict(nargs="?", default=None, help="Input CDXML file (positional)")),
        (("--cdxml",),
         dict(dest="cdxml_named", default=None, help="Input CDXML file (named)")),
        (("--cdx",),
         dict(default=None, help="Input CDX file (converted to CDXML)")),
        (("--csv",),
         dict(default=None, help="Findmolecule ELN CSV file")),
        (("--rxn",),
         dict(default=None, help="RXN file")),
        (("--input-dir",),
         dict(default=None, help="Experiment directory (auto-discover files)")),
        (("--experiment",),
         dict(default=None, help="Experiment name (with --input-dir)")),
        # Output
        (("-o", "--output"),
         dict(default=None, help="Output JSON file (default: stdout)")),
        (("--pretty",),
         dict(action="store_true", help="Pretty-print JSON output")),
        # Options
        (("--no-rxnmapper",),
         dict(action="store_true",
              help="Deprecated (RXNMapper no longer used for classification)")),
        (("--no-rxn-insight",),
         dict(action="store_true", help="Skip RXN Insight enrichment")),
        (("--no-network",),
         dict(action="store_true",
              help="Skip PubChem name resolution (offline only)")),
        (("--json-errors",),
         dict(action="store_true",
              help="Output structured JSON errors to stderr")),
        (("-v", "--verbose"),
         dict(action="store_true",
              help="Print diagnostic messages to stderr")),
    ]
    for flags, options in argument_table:
        parser.add_argument(*flags, **options)

    return parser
2101
+
2102
+
2103
def main(argv=None) -> int:
    """Command-line entry point.

    Parses *argv* (``sys.argv[1:]`` when ``None``), runs ``parse_reaction``
    and writes the descriptor as JSON to ``--output`` or to stdout.

    Args:
        argv: Argument list, or ``None`` to use the process arguments.

    Returns:
        0 on success, 1 when parsing or writing raises.  Invalid argument
        combinations are reported through ``parser.error`` and therefore
        exit via ``SystemExit`` instead of returning.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    # Resolve CDXML from positional or named argument
    cdxml = args.cdxml_positional or args.cdxml_named
    has_explicit_input = any([cdxml, args.cdx, args.csv, args.rxn])

    if not has_explicit_input and not args.input_dir:
        parser.error("No input files specified")

    # File discovery only runs when both --input-dir and --experiment are
    # given; with --input-dir alone there would be nothing to parse and an
    # empty descriptor would be emitted with a success exit code.
    if not has_explicit_input and not args.experiment:
        parser.error("--input-dir requires --experiment "
                     "when no input files are given")

    try:
        desc = parse_reaction(
            cdxml=cdxml,
            cdx=args.cdx,
            csv=args.csv,
            rxn=args.rxn,
            input_dir=args.input_dir,
            experiment=args.experiment,
            use_rxnmapper=not args.no_rxnmapper,
            use_rxn_insight=not args.no_rxn_insight,
            use_network=not args.no_network,
            verbose=args.verbose,
        )

        if args.output:
            desc.to_json(args.output, pretty=args.pretty)
            print(f"Wrote {args.output} ({len(desc.species)} species)",
                  file=sys.stderr)
        else:
            output = json.dumps(desc.to_dict(),
                                indent=2 if args.pretty else None,
                                ensure_ascii=False)
            print(output)

        return 0

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        if args.json_errors:
            err = {"error": "parse_failed", "detail": str(e)}
            print(json.dumps(err), file=sys.stderr)
        else:
            print(f"ERROR: {e}", file=sys.stderr)
        return 1
2147
+
2148
+
2149
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())