cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2948 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ scheme_reader.py — Read CDXML reaction schemes into structured descriptions.
4
+
5
+ The semantic inverse of the DSL renderer: takes a CDXML file containing a
6
+ reaction scheme (single or multi-step) and produces a structured JSON with
7
+ a species registry, reaction graph, topology classification, and a natural
8
+ language narrative suitable for LLM consumption.
9
+
10
+ Two parsing strategies (tried in order):
11
+ 1. Step-attribute path — reads <scheme><step> attributes
12
+ (ReactionStepReactants/Products/Above/Below).
13
+ 2. Geometry-based fallback — assigns roles by spatial position relative
14
+ to arrows.
15
+
16
+ CLI:
17
+ python -m cdxml_toolkit.perception.scheme_reader scheme.cdxml -o description.json
18
+ python -m cdxml_toolkit.perception.scheme_reader scheme.cdxml --narrative-only
19
+
20
+ Python API:
21
+ from cdxml_toolkit.perception.scheme_reader import read_scheme
22
+ desc = read_scheme("scheme.cdxml")
23
+ print(desc.narrative)
24
+ desc.to_json("description.json")
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import argparse
30
+ import json
31
+ import os
32
+ import re
33
+ import sys
34
+ from collections import defaultdict
35
+ from dataclasses import dataclass, field, asdict
36
+ from typing import Any, Dict, List, Optional, Tuple, Set
37
+ from xml.etree import ElementTree as ET
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Logging
42
+ # ---------------------------------------------------------------------------
43
# Module-wide switch: diagnostics are printed only while this is truthy.
_verbose = False


def _log(msg: str) -> None:
    """Write a tagged diagnostic line to stderr when verbose mode is enabled.

    Does nothing while the module-level ``_verbose`` flag is falsy.
    """
    if not _verbose:
        return
    print(f" [scheme_reader] {msg}", file=sys.stderr)
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Data model
53
+ # ---------------------------------------------------------------------------
54
+
55
@dataclass
class SpeciesRecord:
    """A single chemical entity (drawn structure or text label) in a scheme."""
    id: str = ""                          # registry key, e.g. "species_0"
    cdxml_element_id: str = ""            # id of the originating CDXML element
    element_type: str = ""                # "fragment" or "text"
    smiles: Optional[str] = None          # canonical SMILES, abbreviations resolved
    smiles_raw: Optional[str] = None      # SMILES without abbreviation expansion
    name: Optional[str] = None            # display name / text label content
    formula: Optional[str] = None         # molecular formula
    mw: Optional[float] = None            # average molecular weight
    label: Optional[str] = None           # compound number, e.g. "1" or "2a"
    iupac_name: Optional[str] = None      # IUPAC name (from ChemScript or PubChem)
    aligned_iupac: Optional[str] = None   # aligned IUPAC name (from aligned_namer)
    # Text species only. One of: "chemical", "condition_ref", "footnote",
    # "yield", "compound_label", "citation", "bioactivity".
    text_category: Optional[str] = None
    is_solvent: bool = False              # True if reagent_db role == "solvent"
    equiv_text: Optional[str] = None      # e.g. "1.2 eq", "5 mol%"

    def to_dict(self) -> dict:
        """Return the record as a dict without ``None`` or ``False`` values.

        Empty strings and numeric zeros are kept; only the ``None`` and
        ``False`` singletons are dropped (identity comparison).
        """
        compact = {}
        for key, value in asdict(self).items():
            if value is None or value is False:
                continue
            compact[key] = value
        return compact
78
+
79
+
80
@dataclass
class StepRecord:
    """A single reaction step (one arrow) extracted from the scheme."""
    step_index: int = 0                                   # 0-based position
    reactant_ids: List[str] = field(default_factory=list)
    product_ids: List[str] = field(default_factory=list)
    reagent_ids: List[str] = field(default_factory=list)
    conditions: List[str] = field(default_factory=list)
    condition_text_raw: List[str] = field(default_factory=list)
    yield_text: Optional[str] = None
    arrow_style: str = "solid"                            # "solid", "dashed", "failed"
    arrow_cdxml_id: Optional[str] = None
    molecular_diff_text: Optional[str] = None             # e.g. "bromo → phenyl"

    def to_dict(self) -> dict:
        """Return the step as a dict, omitting ``None``, ``[]`` and ``""`` values."""
        compact = {}
        for key, value in asdict(self).items():
            if value is None or value == [] or value == "":
                continue
            compact[key] = value
        return compact
98
+
99
+
100
@dataclass
class ScopeEntry:
    """A single row/cell of a substrate scope table."""
    entry_id: str = ""                        # "scope_0", "scope_1", ...
    species_id: str = ""                      # SpeciesRecord.id of the scope structure
    label: Optional[str] = None               # compound number, e.g. "5.70a"
    conditions_variant: Optional[str] = None  # e.g. "X = I" or "X = Br"
    yield_text: Optional[str] = None          # e.g. "39%"
    mass_text: Optional[str] = None           # e.g. "22 mg"
    notes: Optional[str] = None               # e.g. "Scale-up: 130 mg, 16%"

    def to_dict(self) -> dict:
        """Return the entry as a dict with ``None``-valued fields removed."""
        compact = {}
        for key, value in asdict(self).items():
            if value is not None:
                compact[key] = value
        return compact
113
+
114
+
115
@dataclass
class SchemeDescription:
    """Complete structured description of a reaction scheme.

    Instances serialise to JSON via :meth:`to_dict` / :meth:`to_json` and are
    rebuilt with :meth:`from_dict` / :meth:`from_json`.  Every key that
    ``to_dict`` can emit is read back by ``from_dict``.
    """
    version: str = "1.0"
    source_file: str = ""
    topology: str = "linear"
    # One of "synthesis", "sar_design", "biological_pathway", "target_array",
    # "literature_comparison", "composite", "investigation", "unknown",
    # "substrate_scope".
    content_type: str = ""
    num_steps: int = 0
    species: Dict[str, SpeciesRecord] = field(default_factory=dict)
    steps: List[StepRecord] = field(default_factory=list)
    scope_entries: List[ScopeEntry] = field(default_factory=list)
    sub_schemes: List["SchemeDescription"] = field(default_factory=list)
    narrative: str = ""
    warnings: List[str] = field(default_factory=list)
    # --- Spatial assignment metadata (v1.1) ---
    layout_pattern: Optional[str] = None    # detected layout from spatial engine
    parse_method: str = ""                  # "geometry" or "step_attribute"
    assignment_confidences: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a JSON-serialisable dict.

        The core keys are always present; optional keys (``content_type``,
        ``scope_entries``, ``sub_schemes``, ``layout_pattern``,
        ``parse_method``, ``assignment_confidences``) appear only when set.
        """
        d = {
            "version": self.version,
            "source_file": self.source_file,
            "topology": self.topology,
            "num_steps": self.num_steps,
            "species": {k: v.to_dict() for k, v in self.species.items()},
            "steps": [s.to_dict() for s in self.steps],
            "narrative": self.narrative,
            "warnings": self.warnings,
        }
        if self.content_type:
            d["content_type"] = self.content_type
        if self.scope_entries:
            d["scope_entries"] = [e.to_dict() for e in self.scope_entries]
        if self.sub_schemes:
            d["sub_schemes"] = [s.to_dict() for s in self.sub_schemes]
        if self.layout_pattern:
            d["layout_pattern"] = self.layout_pattern
        if self.parse_method:
            d["parse_method"] = self.parse_method
        if self.assignment_confidences:
            d["assignment_confidences"] = self.assignment_confidences
        return d

    def to_json(self, path: str, pretty: bool = True) -> None:
        """Write :meth:`to_dict` to *path* as UTF-8 JSON (indented if *pretty*)."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2 if pretty else None,
                      ensure_ascii=False)

    @classmethod
    def from_json(cls, path: str) -> "SchemeDescription":
        """Load a description from a JSON file written by :meth:`to_json`."""
        with open(path, "r", encoding="utf-8") as f:
            return cls.from_dict(json.load(f))

    @classmethod
    def from_dict(cls, d: dict) -> "SchemeDescription":
        """Rebuild a description from a dict shaped like :meth:`to_dict` output.

        Unknown keys inside species/step/scope payloads are ignored and
        missing keys fall back to the dataclass defaults.  The spatial
        metadata (``layout_pattern``, ``parse_method``,
        ``assignment_confidences``) is restored as well, so a
        ``to_dict`` -> ``from_dict`` round trip is lossless.
        """
        def _build(record_cls, payload):
            # Keep only keys that are real fields of the target dataclass.
            known = record_cls.__dataclass_fields__
            return record_cls(
                **{k: v for k, v in payload.items() if k in known})

        species = {key: _build(SpeciesRecord, payload)
                   for key, payload in d.get("species", {}).items()}
        steps = [_build(StepRecord, payload)
                 for payload in d.get("steps", [])]
        scope_entries = [_build(ScopeEntry, payload)
                         for payload in d.get("scope_entries", [])]
        sub_schemes = [cls.from_dict(sd)
                       for sd in d.get("sub_schemes", [])]
        return cls(
            version=d.get("version", "1.0"),
            source_file=d.get("source_file", ""),
            topology=d.get("topology", "linear"),
            content_type=d.get("content_type", ""),
            num_steps=d.get("num_steps", 0),
            species=species,
            steps=steps,
            scope_entries=scope_entries,
            sub_schemes=sub_schemes,
            narrative=d.get("narrative", ""),
            # Copy so the new object never aliases the caller's containers.
            warnings=list(d.get("warnings") or []),
            layout_pattern=d.get("layout_pattern"),
            parse_method=d.get("parse_method", ""),
            assignment_confidences=dict(
                d.get("assignment_confidences") or {}),
        )

    def to_scheme_descriptor(self) -> "SchemeDescriptor":
        """Convert to a DSL SchemeDescriptor for round-trip rendering.

        Species with a SMILES or a name become ``StructureRef`` entries.
        For each step, fragment reagents that have a SMILES go above the
        arrow as structures; other named reagents and the step's condition
        strings go below the arrow as text.
        """
        from ..render.schema import (SchemeDescriptor, StepDescriptor,
                                     ArrowContent, StructureRef)

        structures = {}
        for sp_id, sp in self.species.items():
            if sp.smiles or sp.name:
                structures[sp_id] = StructureRef(
                    id=sp_id,
                    smiles=sp.smiles,
                    # The name is only passed when there is no SMILES.
                    name=sp.name if not sp.smiles else None,
                    label=sp.label,
                )

        dsl_steps = []
        for step in self.steps:
            above = ArrowContent()
            below = ArrowContent()

            for rid in step.reagent_ids:
                sp = self.species.get(rid)
                if sp and sp.element_type == "fragment" and sp.smiles:
                    above.structures.append(rid)
                elif sp and sp.name:
                    below.text.append(sp.name)

            below.text.extend(step.conditions)

            sd = StepDescriptor(
                substrates=list(step.reactant_ids),
                products=list(step.product_ids),
                above_arrow=above if (above.structures or above.text) else None,
                below_arrow=below if below.text else None,
                yield_=step.yield_text,
                arrow_style=step.arrow_style,
            )
            dsl_steps.append(sd)

        # Topology -> DSL layout name; unknown topologies use "sequential".
        layout_map = {
            "linear": "linear" if len(dsl_steps) <= 1 else "sequential",
            "divergent": "divergent",
            "convergent": "convergent",
            "parallel": "stacked-rows",
            "mixed": "sequential",
        }

        return SchemeDescriptor(
            structures=structures,
            steps=dsl_steps,
            layout=layout_map.get(self.topology, "sequential"),
        )
255
+
256
+
257
+ # ---------------------------------------------------------------------------
258
+ # Internal intermediate structure
259
+ # ---------------------------------------------------------------------------
260
+
261
@dataclass
class _RawStep:
    """One parsed step expressed in raw CDXML element ids.

    Produced by the parsers before the species registry exists, so every
    member refers to CDXML element ids rather than species ids.
    """
    step_elem_id: str = ""                                        # id of the source step (or arrow)
    reactant_elem_ids: List[str] = field(default_factory=list)    # elements on the reactant side
    product_elem_ids: List[str] = field(default_factory=list)     # elements on the product side
    above_arrow_ids: List[str] = field(default_factory=list)      # objects placed above the arrow
    below_arrow_ids: List[str] = field(default_factory=list)      # objects placed below the arrow
    arrow_elem_id: Optional[str] = None                           # id of the step's arrow, if any
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Text extraction helpers
274
+ # ---------------------------------------------------------------------------
275
+
276
def _get_text_content(t_elem: ET.Element) -> str:
    """Return the plain text of a ``<t>`` element.

    Concatenates the ``.text`` of every ``<s>`` descendant that has any
    (in document order) and strips surrounding whitespace from the result.
    """
    return "".join(run.text for run in t_elem.iter("s") if run.text).strip()
283
+
284
+
285
# Number (optionally decimal) followed by a percent sign, e.g. "85%" or "12.5 %".
_YIELD_RE = re.compile(r"(\d+(?:\.\d+)?\s*%)")
# The abbreviation "quant" / "quant." as a whole word, any case.
_QUANT_RE = re.compile(r"\bquant\.?\b", re.IGNORECASE)
# Compound labels: 1-3 digit number with optional letter suffix ("2a"),
# parenthesised roman numeral ("(iv)"), or a single letter.
_LABEL_RE = re.compile(r"^[1-9]\d{0,2}[a-z]?$|^\([ivx]+\)$|^[a-z]$",
                       re.IGNORECASE)


def _extract_yield_from_text(text: str) -> Optional[str]:
    """Pull a yield out of free text.

    Returns the first percentage found (as written, including any space
    before ``%``), else the literal ``"quant."`` when the text contains the
    word "quant"/"quant.", else ``None``.
    """
    percent = _YIELD_RE.search(text)
    if percent is not None:
        return percent.group(1)
    return "quant." if _QUANT_RE.search(text) else None
299
+
300
+
301
+ # ---------------------------------------------------------------------------
302
+ # Arrow helpers
303
+ # ---------------------------------------------------------------------------
304
+
305
def _arrow_endpoints(arrow: ET.Element) -> Tuple[float, float, float, float]:
    """Return ``(tail_x, tail_y, head_x, head_y)`` for an arrow element.

    Delegates to ``cdxml_utils.arrow_endpoints``; the import is done at
    call time inside the function.
    """
    from ..cdxml_utils import arrow_endpoints as _endpoints_of
    return _endpoints_of(arrow)
309
+
310
+
311
def _resolve_arrow(page: ET.Element, arrow_id: str,
                   id_map: Dict[str, ET.Element]) -> Optional[ET.Element]:
    """Find the arrow element for *arrow_id*, honouring ``SupersededBy``.

    Lookup order:

    1. ``id_map[arrow_id]`` if it is an ``<arrow>``.
    2. If it is a ``<graphic>`` with a ``SupersededBy`` id present in
       *id_map*, the element that id maps to.
    3. Otherwise scan the direct children of *page* for a ``<graphic>``
       with this id and return the page child whose id equals its
       ``SupersededBy`` value.

    Only a single ``SupersededBy`` hop is followed.  Returns ``None`` when
    nothing matches.
    """
    mapped = id_map.get(arrow_id)
    if mapped is not None:
        if mapped.tag == "arrow":
            return mapped
        if mapped.tag == "graphic":
            successor_id = mapped.get("SupersededBy", "")
            successor = id_map.get(successor_id) if successor_id else None
            if successor is not None:
                return successor

    # Fall back to the page itself for the graphic -> arrow link.
    for graphic in page:
        if graphic.tag != "graphic" or graphic.get("id") != arrow_id:
            continue
        successor_id = graphic.get("SupersededBy", "")
        if not successor_id:
            continue
        for candidate in page:
            if candidate.get("id") == successor_id:
                return candidate
    return None
332
+
333
+
334
def _detect_arrow_style(arrow: Optional[ET.Element]) -> str:
    """Classify an arrow as ``"solid"``, ``"dashed"`` or ``"failed"``.

    ``None`` is treated as solid.  ``NoGo="Cross"`` (an X drawn on the
    arrow) takes precedence and means a failed reaction.  Otherwise the
    arrow is dashed when ``LineType`` is dash/dashed/dot or
    ``ArrowheadType`` is dashed (both compared case-insensitively).
    """
    if arrow is None:
        return "solid"
    if arrow.get("NoGo") == "Cross":
        return "failed"
    dashed_line = arrow.get("LineType", "").lower() in ("dash", "dashed", "dot")
    dashed_head = arrow.get("ArrowheadType", "").lower() == "dashed"
    return "dashed" if (dashed_line or dashed_head) else "solid"
350
+
351
+
352
def _find_all_arrows(page: ET.Element) -> List[ET.Element]:
    """Find all reaction arrows among the direct children of *page*.

    Two passes are made so that ``<arrow>`` elements always come first:

    1. every ``<arrow>`` element (de-duplicated by ``id``);
    2. every ``<graphic>`` with ``GraphicType="Line"`` and a non-empty
       ``ArrowType`` (legacy arrow representation).

    A legacy graphic whose ``SupersededBy`` attribute names an ``<arrow>``
    already collected in pass 1 is skipped: both elements describe the same
    drawn arrow, and returning both would count it twice.  Graphics that
    point at a missing or non-arrow element are still returned.
    """
    arrows: List[ET.Element] = []
    seen_ids: Set[str] = set()

    for el in page:
        if el.tag != "arrow":
            continue
        eid = el.get("id", "")
        if eid not in seen_ids:
            arrows.append(el)
            seen_ids.add(eid)

    # Snapshot of ids that belong to real <arrow> elements only.
    arrow_ids = set(seen_ids)

    for el in page:
        if el.tag != "graphic":
            continue
        if el.get("GraphicType") != "Line" or not el.get("ArrowType"):
            continue
        successor_id = el.get("SupersededBy", "")
        if successor_id and successor_id in arrow_ids:
            continue  # same arrow, already represented by its <arrow> element
        eid = el.get("id", "")
        if eid not in seen_ids:
            arrows.append(el)
            seen_ids.add(eid)
    return arrows
371
+
372
+
373
+ # ---------------------------------------------------------------------------
374
+ # Step-attribute parsing (primary path)
375
+ # ---------------------------------------------------------------------------
376
+
377
def _parse_from_step_attributes(page: ET.Element,
                                id_map: Dict[str, ET.Element],
                                scheme_filter: Optional[Set[str]] = None,
                                ) -> List[_RawStep]:
    """Build raw steps from the attributes of ``<scheme><step>`` elements.

    All ``<scheme>`` children of *page* are visited (stacked-rows layouts
    may have several); if there are none directly under the page, schemes
    at any depth are used instead.

    Parameters
    ----------
    page : ET.Element
        The page element to search.
    id_map : dict
        Element-id → element mapping; used only to log ids that a step
        references but that are not present.
    scheme_filter : set of str, optional
        When given, only ``<scheme>`` elements whose ``id`` is in the set
        are processed.

    Returns
    -------
    list of _RawStep
        One entry per ``<step>``, in document order.  When a step lists
        several arrows only the first id is kept.
    """
    def _id_list(step_el: ET.Element, attr: str) -> List[str]:
        # Whitespace-separated id list; a missing attribute gives [].
        return step_el.get(attr, "").split()

    scheme_elems = page.findall("scheme") or page.findall(".//scheme")

    parsed: List[_RawStep] = []
    for scheme_el in scheme_elems:
        if (scheme_filter is not None
                and scheme_el.get("id", "") not in scheme_filter):
            continue

        for step_el in scheme_el.findall("step"):
            step_id = step_el.get("id", "")
            reactants = _id_list(step_el, "ReactionStepReactants")
            products = _id_list(step_el, "ReactionStepProducts")
            above = _id_list(step_el, "ReactionStepObjectsAboveArrow")
            below = _id_list(step_el, "ReactionStepObjectsBelowArrow")
            arrows = _id_list(step_el, "ReactionStepArrows")

            # Report dangling references; the ids are kept regardless.
            for eid in (*reactants, *products, *above, *below):
                if eid not in id_map:
                    _log(f"Warning: element id {eid} in step {step_id} "
                         f"not found in page")

            parsed.append(_RawStep(
                step_elem_id=step_id,
                reactant_elem_ids=reactants,
                product_elem_ids=products,
                above_arrow_ids=above,
                below_arrow_ids=below,
                arrow_elem_id=arrows[0] if arrows else None,
            ))

    return parsed
440
+
441
+
442
+ # ---------------------------------------------------------------------------
443
+ # Orphan transition-arrow recovery (serpentine layouts)
444
+ # ---------------------------------------------------------------------------
445
+
446
def _recover_orphan_transition_steps(
    page: ET.Element,
    raw_steps: List[_RawStep],
    id_map: Dict[str, ET.Element],
) -> List[_RawStep]:
    """Add synthetic steps for vertical arrows that no step references.

    Serpentine layouts contain vertical "transition" arrows that sit outside
    every ``<scheme><step>`` element, so the step-attribute parser never
    sees them and the rows look disconnected.  For each sufficiently long,
    predominantly vertical ``<arrow>`` on the page that no existing step
    claims, this function:

    * picks the nearest fragment at or above the arrow midpoint (measured
      from the tail) as reactant and the nearest at or below it (measured
      from the head) as product;
    * discards the arrow unless both lie within 5x the arrow's vertical
      extent and the reactant is already a product of an existing step;
    * gathers unclaimed text within 2x that extent of the arrow midpoint,
      excluding text closer than 0.6x to either fragment (likely labels);
    * inserts a ``_RawStep`` right after the step that produced the reactant.

    Parameters
    ----------
    page : ET.Element
        The ``<page>`` element of the parsed CDXML.
    raw_steps : list of _RawStep
        Steps found by the step-attribute parser; modified in place.
    id_map : dict
        Element-id → element mapping for the page.

    Returns
    -------
    list of _RawStep
        The same *raw_steps* list object, possibly with extra entries.
    """
    from ..cdxml_utils import arrow_endpoints as _ae, fragment_centroid

    if not raw_steps:
        return raw_steps

    import math

    def _separation(ax, ay, bx, by):
        # Euclidean distance between (ax, ay) and (bx, by).
        return ((ax - bx)**2 + (ay - by)**2)**0.5

    def _closest(candidates, x, y):
        # First candidate with the strictly smallest distance to (x, y).
        winner, winner_dist = None, float("inf")
        for cand in candidates:
            gap = _separation(cand["cx"], cand["cy"], x, y)
            if gap < winner_dist:
                winner, winner_dist = cand, gap
        return winner, winner_dist

    def _text_anchor(t_el):
        # Anchor from the "p" attribute, else BoundingBox centre, else None.
        point = t_el.get("p")
        if point:
            coords = point.split()
            return float(coords[0]), float(coords[1])
        box = t_el.get("BoundingBox", "")
        if not box:
            return None
        nums = [float(v) for v in box.split()]
        return (nums[0] + nums[2]) / 2, (nums[1] + nums[3]) / 2

    # ---- Bookkeeping over the steps that already exist -------------------
    claimed_arrow_ids: Set[str] = set()
    claimed_elem_ids: Set[str] = set()
    existing_product_eids: Set[str] = set()
    product_to_step_idx: Dict[str, int] = {}
    step_arrow_lengths: List[float] = []
    for idx, rs in enumerate(raw_steps):
        claimed_elem_ids.update(rs.reactant_elem_ids)
        claimed_elem_ids.update(rs.product_elem_ids)
        claimed_elem_ids.update(rs.above_arrow_ids)
        claimed_elem_ids.update(rs.below_arrow_ids)
        existing_product_eids.update(rs.product_elem_ids)
        for pid in rs.product_elem_ids:
            product_to_step_idx[pid] = idx  # later steps overwrite earlier
        if not rs.arrow_elem_id:
            continue
        claimed_arrow_ids.add(rs.arrow_elem_id)
        known_arrow = id_map.get(rs.arrow_elem_id)
        if known_arrow is not None:
            atx, aty, ahx, ahy = _ae(known_arrow)
            step_arrow_lengths.append(math.hypot(ahx - atx, ahy - aty))

    # Length cut-off: 30 pt floor, raised to 40% of the median step arrow so
    # that tiny annotation arrows are ignored.
    min_arrow_len = 30.0
    if step_arrow_lengths:
        median_len = sorted(step_arrow_lengths)[len(step_arrow_lengths) // 2]
        min_arrow_len = max(min_arrow_len, 0.4 * median_len)

    # ---- Unclaimed, long, mostly-vertical arrows --------------------------
    orphan_arrows = []
    for el in page:
        if el.tag != "arrow":
            continue
        eid = el.get("id", "")
        if eid in claimed_arrow_ids:
            continue
        tx, ty, hx, hy = _ae(el)
        dx, dy = hx - tx, hy - ty
        if abs(dy) <= abs(dx):
            continue  # not predominantly vertical
        if math.hypot(dx, dy) < min_arrow_len:
            continue  # too short to be a reaction arrow
        orphan_arrows.append({
            "element": el,
            "id": eid,
            "tail_x": tx, "tail_y": ty,
            "head_x": hx, "head_y": hy,
            "mid_x": (tx + hx) / 2, "mid_y": (ty + hy) / 2,
        })

    if not orphan_arrows:
        return raw_steps

    # ---- Positions of every fragment and text object on the page ---------
    frag_data = []
    for el in page:
        if el.tag != "fragment":
            continue
        centre = fragment_centroid(el)
        if centre:
            frag_data.append({
                "id": el.get("id", ""),
                "cx": centre[0], "cy": centre[1],
            })

    text_data = []
    for el in page:
        if el.tag != "t":
            continue
        tid = el.get("id", "")
        anchor = _text_anchor(el)
        if anchor is None:
            continue
        text_data.append({"id": tid, "cx": anchor[0], "cy": anchor[1]})

    # ---- Resolve each orphan arrow into a synthetic step -----------------
    new_entries: List[Tuple[int, _RawStep]] = []  # (insert_after_idx, step)

    for oa in orphan_arrows:
        mid_y = oa["mid_y"]
        # CDXML y grows downward: reactant candidates are at or above the
        # midpoint, product candidates at or below it.
        upper = [fd for fd in frag_data if not fd["cy"] > mid_y]
        lower = [fd for fd in frag_data if not fd["cy"] < mid_y]
        best_reactant, best_r_dist = _closest(
            upper, oa["tail_x"], oa["tail_y"])
        best_product, best_p_dist = _closest(
            lower, oa["head_x"], oa["head_y"])

        if best_reactant is None or best_product is None:
            continue  # can't resolve both ends

        # Both ends must be within 5x the arrow's vertical extent.
        arrow_len = abs(oa["head_y"] - oa["tail_y"])
        if best_r_dist > 5 * arrow_len or best_p_dist > 5 * arrow_len:
            continue

        reactant_id = best_reactant["id"]
        product_id = best_product["id"]

        # Only bridge rows: the reactant has to be the product of a step we
        # already know, otherwise this is probably an annotation arrow.
        if reactant_id not in existing_product_eids:
            continue

        reach = 2.0 * arrow_len
        label_radius = arrow_len * 0.6
        condition_ids = []
        for td in text_data:
            if td["id"] in claimed_elem_ids:
                continue
            if _separation(td["cx"], td["cy"],
                           oa["mid_x"], oa["mid_y"]) > reach:
                continue  # too far from the arrow body
            # Text hugging either structure is taken to be its label.
            if _separation(td["cx"], td["cy"],
                           best_reactant["cx"],
                           best_reactant["cy"]) < label_radius:
                continue
            if _separation(td["cx"], td["cy"],
                           best_product["cx"],
                           best_product["cy"]) < label_radius:
                continue
            condition_ids.append(td["id"])

        step = _RawStep(
            step_elem_id=oa["id"],
            reactant_elem_ids=[reactant_id],
            product_elem_ids=[product_id],
            above_arrow_ids=[],
            below_arrow_ids=condition_ids,
            arrow_elem_id=oa["id"],
        )

        # Goes right after the step that made the reactant.
        insert_after = product_to_step_idx.get(reactant_id, len(raw_steps) - 1)
        new_entries.append((insert_after, step))

        _log(f"Recovered orphan transition step from arrow {oa['id']}: "
             f"reactant={reactant_id} -> product={product_id} "
             f"(conditions: {len(condition_ids)} text element(s))")

    # Insert from the back so earlier insertion indices stay valid.
    new_entries.sort(key=lambda x: x[0], reverse=True)
    for insert_after, step in new_entries:
        raw_steps.insert(insert_after + 1, step)

    return raw_steps
683
+
684
+
685
+ # ---------------------------------------------------------------------------
686
+ # Geometry-based fallback
687
+ # ---------------------------------------------------------------------------
688
+
689
def _parse_from_geometry(page: ET.Element,
                         id_map: Dict[str, ET.Element],
                         ) -> List[_RawStep]:
    """Parse steps using spatial position relative to arrows.

    Fallback for CDXML files without <scheme><step> attributes.

    Each arrow found by ``_find_all_arrows`` becomes one ``_RawStep``.
    Arrows are normalised so the tail is the left-hand end, then sorted by
    tail x.  Direct ``fragment`` children of *page* are assigned by the x
    coordinate of their centroid; direct ``t`` children are assigned
    above/below an arrow when their x lies within the arrow span (padded by
    20 units on each side).

    Args:
        page: The CDXML ``<page>`` element to scan.
        id_map: Not used by this function; kept in the signature.

    Returns:
        One ``_RawStep`` per arrow, ordered left to right.  Empty when the
        page has no arrows.
    """
    from ..cdxml_utils import fragment_centroid

    def _text_position(text_el):
        """Return (x, y) for a <t> element, or None if no usable position."""
        anchor = text_el.get("p")
        if anchor:
            try:
                x_str, y_str = anchor.split()[:2]
                return float(x_str), float(y_str)
            except ValueError:
                # Malformed anchor point: try the bounding box instead of
                # aborting the whole parse on one bad element.
                pass
        bb = text_el.get("BoundingBox", "")
        if bb:
            try:
                left, top, right, bottom = (float(v) for v in bb.split()[:4])
            except ValueError:
                return None
            return (left + right) / 2, (top + bottom) / 2
        return None

    arrows = _find_all_arrows(page)
    if not arrows:
        return []

    # Get arrow data sorted by tail x-position
    arrow_data = []
    for arrow in arrows:
        tx, ty, hx, hy = _arrow_endpoints(arrow)
        # Ensure tail is left of head for horizontal arrows
        if tx > hx:
            tx, ty, hx, hy = hx, hy, tx, ty
        arrow_data.append({
            "element": arrow,
            "id": arrow.get("id", ""),
            "tail_x": tx, "tail_y": ty,
            "head_x": hx, "head_y": hy,
            "mid_x": (tx + hx) / 2,
            "mid_y": (ty + hy) / 2,
        })
    arrow_data.sort(key=lambda a: a["tail_x"])

    # Collect all fragments and text elements on the page
    fragments = []
    texts = []
    for el in page:
        if el.tag == "fragment":
            centroid = fragment_centroid(el)
            # A fragment without a centroid is placed at the origin.
            cx, cy = centroid if centroid else (0.0, 0.0)
            fragments.append({
                "element": el,
                "id": el.get("id", ""),
                "cx": cx, "cy": cy,
            })
        elif el.tag == "t":
            position = _text_position(el)
            if position is None:
                continue
            texts.append({
                "element": el,
                "id": el.get("id", ""),
                "cx": position[0], "cy": position[1],
            })

    # Build raw steps by assigning elements to their nearest arrow
    raw_steps: List[_RawStep] = []
    last_idx = len(arrow_data) - 1

    for arrow_idx, ad in enumerate(arrow_data):
        step = _RawStep(
            step_elem_id=ad["id"],
            arrow_elem_id=ad["id"],
        )

        # Left boundary: the previous arrow's head, or unbounded for the
        # first arrow.  Right boundary: the next arrow's tail, or unbounded
        # for the last arrow.
        left_bound = arrow_data[arrow_idx - 1]["head_x"] if arrow_idx > 0 else -1e9
        right_bound = (arrow_data[arrow_idx + 1]["tail_x"]
                       if arrow_idx < last_idx else 1e9)

        for frag in fragments:
            cx = frag["cx"]
            fid = frag["id"]

            if left_bound <= cx < ad["tail_x"]:
                # Left of tail → reactant
                step.reactant_elem_ids.append(fid)
            elif ad["head_x"] < cx <= right_bound:
                # Right of head → product
                step.product_elem_ids.append(fid)
            elif ad["tail_x"] <= cx <= ad["head_x"]:
                # Between tail and head → above/below based on y
                if frag["cy"] < ad["mid_y"]:
                    step.above_arrow_ids.append(fid)
                else:
                    step.below_arrow_ids.append(fid)

        for txt in texts:
            # Only assign text within the arrow's (padded) x-span
            if ad["tail_x"] - 20 <= txt["cx"] <= ad["head_x"] + 20:
                if txt["cy"] < ad["mid_y"]:
                    step.above_arrow_ids.append(txt["id"])
                else:
                    step.below_arrow_ids.append(txt["id"])

        raw_steps.append(step)

    # Shared intermediates: when a step has no reactants of its own, the
    # products of the step before it are reused as its reactants.
    for current, following in zip(raw_steps, raw_steps[1:]):
        if not following.reactant_elem_ids:
            following.reactant_elem_ids.extend(current.product_elem_ids)

    return raw_steps
814
+
815
+
816
+ # ---------------------------------------------------------------------------
817
+ # Spatial-engine bridge (geometry-first primary path)
818
+ # ---------------------------------------------------------------------------
819
+
820
def _parse_from_spatial_engine(
    page: ET.Element,
    id_map: Dict[str, ET.Element],
) -> Optional[List[_RawStep]]:
    """Build raw steps by delegating to the ``spatial_assignment`` engine.

    Returns the converted ``_RawStep`` list, or ``None`` when
    ``build_arrow_vectors`` finds no arrows on *page*.

    Side effect: metadata for the caller is stored on the function object as
    ``_parse_from_spatial_engine._last_meta``.  It is an empty dict when no
    arrows were found; otherwise it holds ``"layout_pattern"`` and
    ``"confidences"`` (element id -> confidence).

    *id_map* is not used by this function.
    """
    from .spatial_assignment import (
        build_arrow_vectors, classify_layout, assign_elements,
    )

    def _to_raw(sa_step) -> _RawStep:
        # spatial_assignment.RawStep -> scheme_reader._RawStep; the arrow id
        # doubles as the step id and the id lists are copied.
        converted = _RawStep(
            step_elem_id=sa_step.arrow_id,
            arrow_elem_id=sa_step.arrow_id,
        )
        converted.reactant_elem_ids = list(sa_step.reactant_ids)
        converted.product_elem_ids = list(sa_step.product_ids)
        converted.above_arrow_ids = list(sa_step.above_arrow_ids)
        converted.below_arrow_ids = list(sa_step.below_arrow_ids)
        return converted

    arrow_vectors = build_arrow_vectors(page)
    if not arrow_vectors:
        _parse_from_spatial_engine._last_meta = {}  # type: ignore[attr-defined]
        return None

    layout = classify_layout(arrow_vectors)
    engine_steps, assignments = assign_elements(arrow_vectors, page, layout)

    raw_steps: List[_RawStep] = [_to_raw(s) for s in engine_steps]

    # Publish metadata for the caller
    per_element_confidence = {a.element_id: a.confidence for a in assignments}
    _parse_from_spatial_engine._last_meta = {  # type: ignore[attr-defined]
        "layout_pattern": layout.value,
        "confidences": per_element_confidence,
    }

    return raw_steps
863
+
864
+
865
+ # ---------------------------------------------------------------------------
866
+ # Name resolution helpers
867
+ # ---------------------------------------------------------------------------
868
+
869
+ def _name_from_smiles(smiles: str) -> Optional[str]:
870
+ """Look up a display name for a SMILES string via reagent_db."""
871
+ try:
872
+ from ..resolve.reagent_db import get_reagent_db
873
+ db = get_reagent_db()
874
+ entry = db.entry_for_smiles(smiles)
875
+ if entry:
876
+ return entry.get("display") or entry.get("name")
877
+ except Exception:
878
+ pass
879
+ return None
880
+
881
+
882
+ # ---------------------------------------------------------------------------
883
+ # Text classification patterns
884
+ # ---------------------------------------------------------------------------
885
+
886
+ # Condition reference letters: "a", "b, c", "d,e", "a, b, c, d"
887
+ _CONDITION_REF_RE = re.compile(
888
+ r"^[a-z](\s*[,/]\s*[a-z])*$"
889
+ )
890
+
891
+ # Condition ref with "or": "a or b"
892
+ _CONDITION_REF_OR_RE = re.compile(
893
+ r"^[a-z]\s+or\s+[a-z]$", re.IGNORECASE
894
+ )
895
+
896
+ # Footnote text: "(a) morpholine (1.2 eq), Pd2(dba)3 (5 mol%), ..."
897
+ # Requires letter enclosed in parens — the standard format for condition footnotes
898
+ # in reaction scheme literature.
899
+ _FOOTNOTE_RE = re.compile(
900
+ r"^\(([a-z])\)\s+\S",
901
+ re.IGNORECASE
902
+ )
903
+
904
+ # Pure yield text: "72%", "(85%)", "92% yield", "quant.", "(quant.)"
905
+ _YIELD_ONLY_RE = re.compile(
906
+ r"^\(?\d+(?:\.\d+)?\s*%\s*(yield)?\)?$|"
907
+ r"^\(?quant\.?\)?$",
908
+ re.IGNORECASE
909
+ )
910
+
911
+ # Compound labels: "1", "2a", "15", "SM-1", "DP-2", "(iii)"
912
+ # Extends _LABEL_RE with prefix patterns (SM-, DP-, etc.)
913
+ _COMPOUND_LABEL_RE = re.compile(
914
+ r"^[1-9]\d{0,2}[a-z]?$|" # numeric: "1", "2a", "15b"
915
+ r"^\([ivx]+\)$|" # roman: "(i)", "(iii)"
916
+ r"^(SM|DP|P|CP|Int)-?\d+[a-z]?$", # prefixed: "SM-1", "DP-2", "P1"
917
+ re.IGNORECASE
918
+ )
919
+
920
+ # Literature citations: "Author et al. J. Org. Chem. 1994, 59, 1937"
921
+ _CITATION_RE = re.compile(
922
+ r"[A-Z][a-z]+\s+et\s+al\.", re.IGNORECASE
923
+ )
924
+ _JOURNAL_RE = re.compile(
925
+ r"(J\.\s*(Org|Med|Am)\.\s*Chem|Angew\.\s*Chem|Org\.\s*Lett|"
926
+ r"Tetrahedron|Bioorg\.\s*Med|Chem\.\s*Commun|ChemMedChem|"
927
+ r"Proc\.\s*Natl|Biochem\.\s*Biophys|Chem\.\s*Ber|"
928
+ r"Org\.\s*Process\.\s*Res|Digital\s*Discovery|RSC|"
929
+ r"JACS|ACS\s*Catal|Nat\.\s*Chem)",
930
+ re.IGNORECASE
931
+ )
932
+
933
+ # Bioactivity data: "IC50 = 23nM", "EC50 (RPMI-8226) = 190nM", "Ki = 5 µM"
934
+ _BIOACTIVITY_RE = re.compile(
935
+ r"(IC50|EC50|Ki|Kd|MIC|ED50|GI50|CC50)\s*[=(]",
936
+ re.IGNORECASE
937
+ )
938
+
939
+
940
+ def _classify_text_species(text: str) -> str:
941
+ """Classify a text element into a category.
942
+
943
+ Returns one of: "condition_ref", "footnote", "yield",
944
+ "compound_label", "citation", "bioactivity", "chemical" (default).
945
+ """
946
+ stripped = text.strip()
947
+
948
+ # Condition reference letters (single or comma/slash-separated)
949
+ if _CONDITION_REF_RE.match(stripped):
950
+ return "condition_ref"
951
+ if _CONDITION_REF_OR_RE.match(stripped):
952
+ return "condition_ref"
953
+
954
+ # Pure yield annotations (before footnote check — footnotes may end with %)
955
+ if _YIELD_ONLY_RE.match(stripped):
956
+ return "yield"
957
+
958
+ # Compound labels — short numeric/prefixed identifiers
959
+ if _COMPOUND_LABEL_RE.match(stripped):
960
+ return "compound_label"
961
+
962
+ # Footnote text: "(a) reagent, conditions..." or "(b) NBS, DMF..."
963
+ # Must be long enough to contain actual conditions (not just "(a)")
964
+ if len(stripped) > 5 and _FOOTNOTE_RE.match(stripped):
965
+ return "footnote"
966
+
967
+ # Literature citations
968
+ if _CITATION_RE.search(stripped) or _JOURNAL_RE.search(stripped):
969
+ return "citation"
970
+
971
+ # Bioactivity annotations
972
+ if _BIOACTIVITY_RE.search(stripped):
973
+ return "bioactivity"
974
+
975
+ return "chemical"
976
+
977
+
978
# Single-letter names that PubChem falsely resolves (d → deuterium, etc.).
# Holds every lowercase ASCII letter "a".."z"; callers compare against the
# lower-cased token.
_LETTER_SMILES_BLACKLIST = frozenset("abcdefghijklmnopqrstuvwxyz")
980
+
981
+
982
+ # ---------------------------------------------------------------------------
983
+ # Species registry building
984
+ # ---------------------------------------------------------------------------
985
+
986
+
987
+ def _extract_variable_labels(frag_el: ET.Element) -> List[str]:
988
+ """Extract variable position labels from a fragment's child nodes.
989
+
990
+ Looks for GenericNickname and Unspecified node types that carry
991
+ text labels (R3, R4, Linker, etc.).
992
+ """
993
+ labels = []
994
+ for node in frag_el.iter("n"):
995
+ node_type = node.get("NodeType", "")
996
+ if node_type in ("GenericNickname", "Unspecified"):
997
+ # Get the text label
998
+ t_el = node.find("t")
999
+ if t_el is not None:
1000
+ text = _get_text_content(t_el)
1001
+ if text and text.strip():
1002
+ labels.append(text.strip())
1003
+ elif node_type == "GenericNickname":
1004
+ # Fallback to GenericNickname attribute
1005
+ gn = node.get("GenericNickname", "")
1006
+ if gn:
1007
+ labels.append(gn)
1008
+ return labels
1009
+
1010
+
1011
def _build_static_species_registry(
    page: ET.Element,
    id_map: Dict[str, ET.Element],
    use_network: bool = True,
    use_chemscript: bool = False,
) -> Dict[str, SpeciesRecord]:
    """Register every fragment and text block on a page as a species.

    Intended for CDXML documents that contain no reaction arrows (target
    arrays, standalone structures), so no step information is needed.

    Direct ``fragment`` children of *page* are registered first, in document
    order, followed by direct ``t`` children whose stripped text is at least
    two characters long.  IDs are ``species_0``, ``species_1``, ...

    Args:
        page: The CDXML ``<page>`` element.
        id_map: Element-id lookup, forwarded to ``_find_nearby_label``.
        use_network: Not used by this function.
        use_chemscript: When true, try ChemScript for SMILES and IUPAC
            names; failure to load it is silently ignored.

    Returns:
        Species records keyed by species ID, shaped like the first return
        value of ``_build_species_registry``.
    """
    from ..rdkit_utils import frag_to_smiles_resolved, frag_to_smiles, frag_to_mw

    def _attempt(func, *args):
        # Best-effort call: any failure is reported as "no value".
        try:
            return func(*args)
        except Exception:
            return None

    # Optional ChemScript
    _frag_to_smiles_cs = None
    _cs_bridge = None
    if use_chemscript:
        try:
            from ..rdkit_utils import frag_to_smiles_chemscript
            _frag_to_smiles_cs = frag_to_smiles_chemscript
        except ImportError:
            pass
        try:
            from ..chemdraw.chemscript_bridge import ChemScriptBridge
            _cs_bridge = ChemScriptBridge()
        except Exception:
            pass

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
        _has_rdkit = True
    except ImportError:
        _has_rdkit = False

    registry: Dict[str, SpeciesRecord] = {}
    next_index = 0

    # ---- Pass 1: drawn structures -------------------------------------
    for frag in page:
        if frag.tag != "fragment":
            continue

        species_id = f"species_{next_index}"
        next_index += 1

        # SMILES cascade (same as _build_species_registry): ChemScript,
        # then resolved, then raw.  All three are always computed.
        smiles_cs = (_attempt(_frag_to_smiles_cs, frag)
                     if _frag_to_smiles_cs is not None else None)
        smiles_resolved = _attempt(frag_to_smiles_resolved, frag)
        smiles_raw = _attempt(frag_to_smiles, frag)
        smiles = smiles_cs or smiles_resolved or smiles_raw

        mol_weight = _attempt(frag_to_mw, frag)

        # Molecular formula, derived from the chosen SMILES
        formula = None
        if smiles and _has_rdkit:
            mol = Chem.MolFromSmiles(smiles)
            if mol:
                formula = rdMolDescriptors.CalcMolFormula(mol)

        label = _find_nearby_label(frag, page, id_map)

        # Display name from reagent_db
        name = _name_from_smiles(smiles) if smiles else None

        # IUPAC name via ChemScript
        iupac_name = None
        if _cs_bridge and smiles:
            iupac_name = _attempt(_cs_bridge.get_name, smiles)

        # Generic/variable group metadata is folded into the name
        var_labels = _extract_variable_labels(frag)
        if var_labels:
            var_str = ", ".join(var_labels)
            prefix = name if name else "scaffold"
            name = f"{prefix} (variable: {var_str})"

        registry[species_id] = SpeciesRecord(
            id=species_id,
            cdxml_element_id=frag.get("id", ""),
            element_type="fragment",
            smiles=smiles,
            smiles_raw=None if smiles_raw == smiles else smiles_raw,
            name=name,
            iupac_name=iupac_name,
            formula=formula,
            mw=round(mol_weight, 2) if mol_weight else None,
            label=label,
        )

    # ---- Pass 2: standalone text elements -----------------------------
    for text_el in page:
        if text_el.tag != "t":
            continue
        text_content = _get_text_content(text_el)
        stripped = text_content.strip() if text_content else ""
        # Skip empty and trivially short text
        if len(stripped) < 2:
            continue

        species_id = f"species_{next_index}"
        next_index += 1

        registry[species_id] = SpeciesRecord(
            id=species_id,
            cdxml_element_id=text_el.get("id", ""),
            element_type="text",
            name=text_content,
            text_category=_classify_text_species(stripped),
        )

    return registry
1157
+
1158
+
1159
def _build_species_registry(
    raw_steps: List[_RawStep],
    id_map: Dict[str, ET.Element],
    page: ET.Element,
    use_network: bool = True,
    use_chemscript: bool = False,
) -> Tuple[Dict[str, SpeciesRecord], Dict[str, List[str]]]:
    """Build species records for all referenced elements.

    Every element ID referenced by any step (reactants, products, above and
    below arrow) is visited once, in sorted ID order.  Fragments become one
    record each.  Text elements are classified with
    ``_classify_text_species``; "chemical" text is split into one record per
    chemical token, all other text becomes a single record.

    Args:
        raw_steps: Steps whose element IDs should be registered.
        id_map: CDXML element ID -> element.
        page: The CDXML page, used for nearby-label detection.
        use_network: Forwarded to ``_resolve_text_label``.
        use_chemscript: When true, try ChemScript for SMILES and IUPAC names.

    Returns:
        (species_dict, elem_to_species_ids) where elem_to_species_ids maps
        CDXML element IDs to lists of species IDs (one-to-many for split
        text blocks).
    """
    from ..rdkit_utils import frag_to_smiles_resolved, frag_to_smiles, frag_to_mw

    # Optional ChemScript-based SMILES (best abbreviation resolution)
    _frag_to_smiles_cs = None
    _cs_bridge = None
    if use_chemscript:
        try:
            from ..rdkit_utils import frag_to_smiles_chemscript
            _frag_to_smiles_cs = frag_to_smiles_chemscript
            _log("ChemScript SMILES resolution enabled")
        except ImportError:
            _log("ChemScript not available, using RDKit resolution")
        # Also get ChemScript bridge for IUPAC name generation
        try:
            from ..chemdraw.chemscript_bridge import ChemScriptBridge
            _cs_bridge = ChemScriptBridge()
            _log("ChemScript IUPAC naming enabled")
        except Exception:
            pass

    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
        _has_rdkit = True
    except ImportError:
        _has_rdkit = False

    # Loop-invariant patterns, compiled once instead of once per token.
    # Whole text is just an equivalents annotation: "(1.2 eq)"
    pure_equiv_re = re.compile(r"^\(?\d+\.?\d*\s*eq\.?\)?$", re.IGNORECASE)
    # Trailing equiv/mol% annotation on a line
    _equiv_re = re.compile(
        r'\s*\((\d+\.?\d*\s*(?:eq\.?|equiv\.?|mol\s*%))\)'
        r'\s*$', re.IGNORECASE)
    # ", " (comma+space) or ";" — protects "1,4-dioxane" (no space after comma)
    part_split_re = re.compile(r'\s*;\s*|,\s+')
    # Trailing ratio annotation like "(3:1)"
    _ratio_re = re.compile(r'\s*\(\d+:\d+\)\s*$')
    # Qualifier suffixes: "(cat.)", "(xs)", "(excess)", "(aq.)", "(anhyd.)" etc.
    qualifier_re = re.compile(
        r'\s*\((cat\.?|xs|excess|anhyd\.?|'
        r'aq\.?|anhydrous|catalytic|sat\.?|'
        r'conc\.?|dil\.?)\)\s*$',
        re.IGNORECASE)
    # Substrings that mark a token as a reaction name or workup text
    _NON_CHEM_KEYWORDS = (
        "formylation", "coupling", "reaction",
        "addition", "reduction", "oxidation",
        "cyclization", "rearrangement", "workup",
        "work-up", "quench", "extraction",
    )

    # Collect all unique element IDs
    all_elem_ids: Set[str] = set()
    for step in raw_steps:
        all_elem_ids.update(step.reactant_elem_ids)
        all_elem_ids.update(step.product_elem_ids)
        all_elem_ids.update(step.above_arrow_ids)
        all_elem_ids.update(step.below_arrow_ids)

    species_dict: Dict[str, SpeciesRecord] = {}
    elem_to_species: Dict[str, List[str]] = {}
    species_counter = 0

    for elem_id in sorted(all_elem_ids):
        if elem_id in elem_to_species:
            continue  # already registered (shared intermediate)

        el = id_map.get(elem_id)
        if el is None:
            _log(f"Element {elem_id} not found in id_map, skipping")
            continue

        sp_id = f"species_{species_counter}"
        species_counter += 1

        if el.tag == "fragment":
            # Extract SMILES — try ChemScript first (best abbreviation
            # expansion), then superatom-table resolution, then raw.
            smiles_cs = None
            smiles_resolved = None
            smiles_raw = None
            if _frag_to_smiles_cs is not None:
                try:
                    smiles_cs = _frag_to_smiles_cs(el)
                except Exception as e:
                    _log(f"frag_to_smiles_chemscript failed for {elem_id}: {e}")
            try:
                smiles_resolved = frag_to_smiles_resolved(el)
            except Exception as e:
                _log(f"frag_to_smiles_resolved failed for {elem_id}: {e}")
            try:
                smiles_raw = frag_to_smiles(el)
            except Exception as e:
                _log(f"frag_to_smiles failed for {elem_id}: {e}")

            smiles = smiles_cs or smiles_resolved or smiles_raw

            # Compute MW
            mw = None
            try:
                mw = frag_to_mw(el)
            except Exception:
                pass

            # Compute formula from SMILES
            formula = None
            if smiles and _has_rdkit:
                mol = Chem.MolFromSmiles(smiles)
                if mol:
                    formula = rdMolDescriptors.CalcMolFormula(mol)

            # Detect compound label from nearby text
            label = _find_nearby_label(el, page, id_map)

            # Try to get a name from reagent_db by SMILES
            name = _name_from_smiles(smiles) if smiles else None

            # IUPAC name via ChemScript (when available)
            iupac_name = None
            if _cs_bridge and smiles:
                try:
                    iupac_name = _cs_bridge.get_name(smiles)
                except Exception:
                    pass  # ChemScript fails on some structures (charges, etc.)

            record = SpeciesRecord(
                id=sp_id,
                cdxml_element_id=elem_id,
                element_type="fragment",
                smiles=smiles,
                smiles_raw=smiles_raw if smiles_raw != smiles else None,
                name=name,
                iupac_name=iupac_name,
                formula=formula,
                mw=round(mw, 2) if mw else None,
                label=label,
            )

        elif el.tag == "t":
            text_content = _get_text_content(el)
            if not text_content:
                continue

            # Skip pure annotation text that isn't a chemical name:
            #   - equiv annotations: "(1.2 eq)"
            # These are captured as step metadata, not species.
            stripped = text_content.strip()
            if pure_equiv_re.match(stripped):
                _log(f"Skipping equiv annotation: {stripped!r}")
                species_counter -= 1  # reclaim ID
                continue

            # Classify text species
            text_cat = _classify_text_species(stripped)
            _log(f"Text species {elem_id} classified as {text_cat}: "
                 f"{stripped[:60]!r}")

            if text_cat == "chemical":
                # Split multi-line text blocks into individual species.
                # Each chemical entity becomes its own SpeciesRecord;
                # condition tokens (temp, time, atmosphere) are skipped.
                from .reaction_parser import (
                    _resolve_text_label, _is_condition_token)
                from ..resolve.reagent_db import get_reagent_db
                _reagent_db = get_reagent_db()

                lines = [ln.strip() for ln in text_content.split("\n")
                         if ln.strip()]
                split_records: List[SpeciesRecord] = []

                for line in lines:
                    # Extract equiv/mol% annotation
                    eq_match = _equiv_re.search(line)
                    line_equiv = eq_match.group(1) if eq_match else None
                    clean_line = _equiv_re.sub("", line).strip()
                    if not clean_line:
                        continue

                    # Further split "/" mixtures into separate entities.
                    # e.g. "dioxane/H2O (3:1)" → ["dioxane", "H2O"]
                    # Strip trailing ratio annotations like "(3:1)" first.
                    parts: List[str] = []
                    for chunk in part_split_re.split(clean_line):
                        chunk = _ratio_re.sub("", chunk).strip()
                        if "/" in chunk:
                            parts.extend(
                                s.strip() for s in chunk.split("/")
                                if s.strip())
                        else:
                            parts.append(chunk)

                    for pi, part in enumerate(parts):
                        part = part.strip()
                        if not part:
                            continue
                        # Skip condition tokens
                        if _is_condition_token(part):
                            continue
                        # Skip yield annotations ("72%", "quant.")
                        if _YIELD_ONLY_RE.match(part):
                            continue
                        # Skip compound labels ("3a", "SM-1")
                        if _COMPOUND_LABEL_RE.match(part):
                            continue
                        # Skip single letters (false resolutions)
                        if part.lower() in _LETTER_SMILES_BLACKLIST:
                            continue

                        part = qualifier_re.sub('', part).strip()
                        if not part:
                            continue

                        # Skip reaction names / workup text that got
                        # through as "chemical": any token containing one
                        # of the non-chemical keywords as a substring.
                        _lower = part.lower()
                        if any(kw in _lower for kw in _NON_CHEM_KEYWORDS):
                            continue
                        # Skip "then ..." workup prefixes
                        if _lower.startswith("then "):
                            continue

                        # Resolve SMILES (best-effort)
                        smi = None
                        try:
                            smi = _resolve_text_label(
                                part, use_network=use_network)
                        except Exception:
                            pass

                        # Compute MW / formula
                        mw_val = None
                        formula_val = None
                        if smi and _has_rdkit:
                            mol = Chem.MolFromSmiles(smi)
                            if mol:
                                formula_val = (
                                    rdMolDescriptors.CalcMolFormula(mol))
                                mw_val = round(Descriptors.MolWt(mol), 2)

                        # Detect solvent via reagent_db role
                        is_solvent = (
                            _reagent_db.role_for_name(part) == "solvent")

                        cur_id = f"species_{species_counter}"
                        species_counter += 1
                        split_records.append(SpeciesRecord(
                            id=cur_id,
                            cdxml_element_id=elem_id,
                            element_type="text",
                            smiles=smi,
                            name=part,
                            formula=formula_val,
                            mw=mw_val,
                            text_category="chemical",
                            is_solvent=is_solvent,
                            # Attach equiv only to first part of a line
                            equiv_text=line_equiv if pi == 0 else None,
                        ))

                if split_records:
                    # The split records carry the IDs allocated above and
                    # species_counter already points past the last one.
                    # The pre-allocated sp_id stays unused, which keeps the
                    # numbering identical to earlier output.
                    for rec in split_records:
                        species_dict[rec.id] = rec
                        elem_to_species.setdefault(elem_id, []).append(
                            rec.id)
                    continue  # skip the generic record/assignment below

            # Either non-chemical text (condition_ref, citation,
            # bioactivity, ...) or "chemical" text from which no chemical
            # tokens were extracted (e.g. a pure condition block): keep one
            # record holding the raw text.
            record = SpeciesRecord(
                id=sp_id,
                cdxml_element_id=elem_id,
                element_type="text",
                name=text_content,
                text_category=text_cat,
            )

        else:
            # Unknown element type — skip but warn
            _log(f"Element {elem_id} has unexpected tag '{el.tag}', skipping")
            continue

        species_dict[sp_id] = record
        elem_to_species.setdefault(elem_id, []).append(sp_id)

    return species_dict, elem_to_species
1482
+
1483
+
1484
def _find_nearby_label(frag: ET.Element, page: ET.Element,
                       id_map: Dict[str, ET.Element]) -> Optional[str]:
    """Find a compound label text element near the bottom of a fragment.

    Labels are typically short text elements ("1", "2a", "3") positioned
    directly below the fragment bounding box.

    A direct ``t`` child of *page* qualifies when its ``p`` anchor lies at
    most 25 units below the bottom edge of the bounding box, lies within
    half the fragment width plus 15 units of the fragment's horizontal
    centre, and its text matches ``_LABEL_RE``.  Among qualifying elements
    the one with the smallest Manhattan distance to the bottom-centre wins.

    Text elements without a ``p`` attribute, or with one that cannot be
    parsed as two numbers, are ignored.

    Args:
        frag: The fragment to find a label for.
        page: The CDXML page whose text elements are searched.
        id_map: Not used by this function.

    Returns:
        The label text as stored in the element, or ``None`` when the
        fragment has no bounding box or no text element qualifies.
    """
    from ..cdxml_utils import fragment_bbox

    bbox = fragment_bbox(frag)
    if bbox is None:
        return None

    min_x, min_y, max_x, max_y = bbox
    frag_center_x = (min_x + max_x) / 2
    frag_width = max_x - min_x

    best_label = None
    best_dist = float("inf")

    for el in page:
        if el.tag != "t":
            continue
        p = el.get("p")
        if not p:
            continue
        try:
            tx, ty = (float(v) for v in p.split()[:2])
        except ValueError:
            # Fewer than two coordinates or non-numeric: skip this element
            # rather than abort label detection.
            continue

        # Label should be below the fragment (within ~25pt)
        if ty < max_y or ty > max_y + 25:
            continue
        # Label should be horizontally near the fragment center
        if abs(tx - frag_center_x) > frag_width / 2 + 15:
            continue

        text = _get_text_content(el)
        if text and _LABEL_RE.match(text):
            dist = abs(tx - frag_center_x) + abs(ty - max_y)
            if dist < best_dist:
                best_dist = dist
                best_label = text

    return best_label
1528
+
1529
+
1530
+ # ---------------------------------------------------------------------------
1531
+ # Step record building
1532
+ # ---------------------------------------------------------------------------
1533
+
1534
def _build_step_records(
    raw_steps: List[_RawStep],
    elem_to_species: Dict[str, List[str]],
    species_dict: Dict[str, "SpeciesRecord"],
    id_map: Dict[str, ET.Element],
    page: ET.Element,
) -> List[StepRecord]:
    """Turn raw steps into ``StepRecord`` objects keyed by species IDs.

    For each raw step, reactant and product element IDs are translated to
    species IDs through *elem_to_species*.  Elements above and below the
    arrow contribute reagents, raw condition text, parsed conditions and a
    yield, and the arrow's style and CDXML id are recorded.

    Returns one ``StepRecord`` per raw step, with ``step_index`` equal to
    its position in *raw_steps*.
    """
    from .reaction_parser import (split_condition_text,
                                  extract_conditions_from_text)

    # Categories that should NOT be added to reagent_ids
    _NON_REAGENT_CATS = frozenset({
        "condition_ref", "yield", "compound_label",
        "footnote", "citation", "bioactivity",
    })

    def _is_reagent_species(sp_id: str) -> bool:
        """Return True if a species should be listed as a reagent."""
        sp = species_dict.get(sp_id)
        # Unknown species are kept (shouldn't happen) and fragments are
        # always reagents; only text species are filtered by category.
        if sp is None or sp.element_type != "text":
            return True
        return sp.text_category not in _NON_REAGENT_CATS

    def _reagents_of(eid: str) -> List[str]:
        return [s for s in elem_to_species.get(eid, [])
                if _is_reagent_species(s)]

    def _set_yield_once(step: StepRecord, text: str) -> None:
        # The first yield found for a step is kept.
        found = _extract_yield_from_text(text)
        if found and step.yield_text is None:
            step.yield_text = found

    records: List[StepRecord] = []

    for idx, raw in enumerate(raw_steps):
        step = StepRecord(step_index=idx)

        # Map element IDs to species IDs
        for eid in raw.reactant_elem_ids:
            step.reactant_ids.extend(elem_to_species.get(eid, []))
        for eid in raw.product_elem_ids:
            step.product_ids.extend(elem_to_species.get(eid, []))

        # ---- elements above the arrow ----
        for eid in raw.above_arrow_ids:
            el = id_map.get(eid)
            if el is None:
                continue

            if el.tag == "fragment":
                step.reagent_ids.extend(elem_to_species.get(eid, []))
                continue
            if el.tag != "t":
                continue

            text = _get_text_content(el)
            if not text:
                continue

            # Text above arrow: only add chemical species as reagents
            chemical_ids = _reagents_of(eid)
            if chemical_ids:
                step.reagent_ids.extend(chemical_ids)
            elif not elem_to_species.get(eid, []):
                # No species at all → condition metadata
                step.condition_text_raw.append(text)

            # For yield text above arrow, extract yield
            if _YIELD_ONLY_RE.match(text.strip()):
                _set_yield_once(step, text)

        # ---- elements below the arrow ----
        for eid in raw.below_arrow_ids:
            el = id_map.get(eid)
            if el is None:
                continue

            if el.tag == "fragment":
                step.reagent_ids.extend(elem_to_species.get(eid, []))
                continue
            if el.tag != "t":
                continue

            text = _get_text_content(el)
            if not text:
                continue

            step.condition_text_raw.append(text)

            # Extract yield from text
            _set_yield_once(step, text)

            # Split into conditions vs chemical names
            step.conditions.extend(extract_conditions_from_text(text))

            # Only add chemical text species as reagents
            step.reagent_ids.extend(_reagents_of(eid))

        # Detect arrow style
        if raw.arrow_elem_id:
            arrow_el = _resolve_arrow(page, raw.arrow_elem_id, id_map)
            step.arrow_style = _detect_arrow_style(arrow_el)
            step.arrow_cdxml_id = raw.arrow_elem_id

        records.append(step)

    return records
1640
+
1641
+
1642
+ # ---------------------------------------------------------------------------
1643
+ # Footnote resolution
1644
+ # ---------------------------------------------------------------------------
1645
+
1646
+ def _collect_footnotes(
1647
+ page: ET.Element,
1648
+ registered_elem_ids: Set[str],
1649
+ ) -> Dict[str, str]:
1650
+ """Scan page for footnote text elements and return {letter: conditions_text}.
1651
+
1652
+ Footnotes are standalone text blocks like:
1653
+ "(a) morpholine (1.2 eq), Pd2(dba)3 (5 mol%), ..."
1654
+ "(b) NBS (1.1 eq), DMF, 0 C, 2 h, 95%"
1655
+
1656
+ Only text elements NOT already registered as species are considered.
1657
+ """
1658
+ footnotes: Dict[str, str] = {}
1659
+ for el in page:
1660
+ if el.tag != "t":
1661
+ continue
1662
+ eid = el.get("id", "")
1663
+ if eid in registered_elem_ids:
1664
+ continue
1665
+ text = _get_text_content(el)
1666
+ if not text or len(text.strip()) <= 5:
1667
+ continue
1668
+ stripped = text.strip()
1669
+ m = _FOOTNOTE_RE.match(stripped)
1670
+ if m:
1671
+ letter = m.group(1).lower()
1672
+ # Extract the conditions part (everything after "(letter) ")
1673
+ cond_text = re.sub(r"^\([a-z]\)\s+", "", stripped,
1674
+ count=1, flags=re.IGNORECASE)
1675
+ if cond_text:
1676
+ footnotes[letter] = cond_text
1677
+ _log(f"Footnote '{letter}': {cond_text[:60]!r}")
1678
+ return footnotes
1679
+
1680
+
1681
def _resolve_footnote_conditions(
    steps: List[StepRecord],
    species_dict: Dict[str, "SpeciesRecord"],
    footnotes: Dict[str, str],
) -> None:
    """Enrich steps that use condition_ref letters with their footnote text.

    Heuristic pairing: all species whose ``text_category`` is
    ``"condition_ref"`` are ordered by CDXML element ID and assigned to the
    steps one-to-one by position (first reference -> first step, ...).
    A reference such as ``"a, b"`` is split into single letters.  For each
    letter with a footnote, the step gets:

    * ``"(letter) <footnote>"`` appended to ``condition_text_raw``;
    * ``yield_text`` set from the footnote if it was still ``None``;
    * the conditions parsed from the footnote appended to ``conditions``.

    The function mutates *steps* in place and returns ``None``.  It is a
    no-op when there are no footnotes or no condition_ref species.

    NOTE(review): the positional pairing assumes condition_ref labels
    appear in the same order as the steps; it does not use arrow geometry.
    """
    if not footnotes:
        return

    def _element_order(ref):
        # Numeric IDs are compared as numbers so that "9" sorts before
        # "10"; non-numeric IDs sort after them, alphabetically.
        elem_id = ref[0]
        text = "" if elem_id is None else str(elem_id)
        if text.isdigit():
            return (0, int(text), text)
        return (1, 0, text)

    all_cond_refs = sorted(
        ((sp.cdxml_element_id, sp.name.strip().lower())
         for sp in species_dict.values()
         if sp.text_category == "condition_ref"),
        key=_element_order,  # element ID order is roughly positional
    )
    if not all_cond_refs:
        return

    # Imported only once there is actual work to do.
    from .reaction_parser import extract_conditions_from_text

    # zip() stops at the shorter sequence: surplus steps or surplus
    # references are simply left unpaired.
    for step, (_elem_id, ref_text) in zip(steps, all_cond_refs):
        letters = [tok for tok in re.split(r"[,/\s]+", ref_text)
                   if len(tok) == 1]

        for letter in letters:
            fn_text = footnotes.get(letter)
            if not fn_text:
                continue

            _log(f"Step {step.step_index}: resolving footnote "
                 f"'{letter}' → {fn_text[:60]!r}")

            step.condition_text_raw.append(f"({letter}) {fn_text}")

            # An already-known yield takes precedence over the footnote's.
            y = _extract_yield_from_text(fn_text)
            if y and step.yield_text is None:
                step.yield_text = y

            step.conditions.extend(extract_conditions_from_text(fn_text))
1772
+
1773
+
1774
+ # ---------------------------------------------------------------------------
1775
+ # Cross-scheme linkage (for wrap-repeat layouts)
1776
+ # ---------------------------------------------------------------------------
1777
+
1778
def _smiles_to_inchi(smiles: str) -> Optional[str]:
    """Convert a SMILES string to an InChI string for structure matching.

    InChI gives a normalised identifier, so two differently written SMILES
    for the same structure (common when ChemScript re-processes redrawn
    copies of one intermediate) can still be matched.

    Returns ``None`` -- never a SMILES fallback -- when *smiles* is empty,
    RDKit cannot be imported, the SMILES cannot be parsed, or InChI
    generation fails or yields an empty string.  Callers that want a
    SMILES fallback must implement it themselves.

    NOTE(review): standard InChI still encodes stereo layers; confirm how
    much stereo tolerance this comparison really provides.
    """
    if not smiles:
        return None
    try:
        from rdkit import Chem
        from rdkit.Chem.inchi import MolToInchi
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return MolToInchi(mol) or None
    except Exception:
        # Deliberate best-effort: a missing or failing RDKit must not
        # abort scheme parsing; the caller treats None as "no InChI".
        return None
1797
+
1798
+
1799
def _link_repeated_species(steps: List[StepRecord],
                           species: Dict[str, SpeciesRecord]) -> None:
    """Link repeated structures across separate <scheme> elements.

    Wrap-repeat layouts re-draw intermediates with new element IDs.
    E.g., the product of step 2 (species_X, SMILES=AAA) appears as the
    reactant of step 3 (species_Y, SMILES=AAA) with a different ID.
    Each such reactant ID is replaced, in ``step.reactant_ids``, by the ID
    of the matching product species.

    Matching uses InChI first (via ``_smiles_to_inchi``) and falls back to
    an exact SMILES match.  When several products share a key, the product
    of the latest step wins.

    A reactant is left untouched when:

    * it has no species record or no SMILES;
    * its ID is already the product of some step (it is already linked,
      so re-pointing it would break the existing chain);
    * the only match is a product of the same step (that would turn the
      step into "X -> X").

    Mutates *steps* in place; returns ``None``.
    """
    inchi_cache: Dict[str, Optional[str]] = {}

    def _inchi(smiles: str) -> Optional[str]:
        # Each distinct SMILES is converted at most once.
        if smiles not in inchi_cache:
            inchi_cache[smiles] = _smiles_to_inchi(smiles)
        return inchi_cache[smiles]

    # Product lookup keyed by InChI (primary) and SMILES (fallback).
    product_by_inchi: Dict[str, str] = {}   # InChI -> species_id
    product_by_smiles: Dict[str, str] = {}  # SMILES -> species_id
    produced: Set[str] = set()              # every ID that is some product
    for step in steps:
        for pid in step.product_ids:
            produced.add(pid)
            sp = species.get(pid)
            if sp and sp.smiles:
                product_by_smiles[sp.smiles] = pid
                inchi = _inchi(sp.smiles)
                if inchi:
                    product_by_inchi[inchi] = pid

    for step in steps:
        own_products = set(step.product_ids)
        relinked = []
        for rid in step.reactant_ids:
            sp = species.get(rid)
            matched_id = None
            if sp and sp.smiles and rid not in produced:
                candidates = (
                    product_by_inchi.get(_inchi(sp.smiles)),  # InChI first
                    product_by_smiles.get(sp.smiles),         # then SMILES
                )
                for candidate in candidates:
                    if (candidate and candidate != rid
                            and candidate not in own_products):
                        matched_id = candidate
                        break

            if matched_id:
                _log(f"Linking repeated species: {rid} -> {matched_id} "
                     f"(SMILES: {sp.smiles[:40]})")
                relinked.append(matched_id)
            else:
                relinked.append(rid)
        step.reactant_ids = relinked
1850
+
1851
+
1852
+ # ---------------------------------------------------------------------------
1853
+ # Topology detection
1854
+ # ---------------------------------------------------------------------------
1855
+
1856
def _detect_topology(steps: List[StepRecord]) -> str:
    """Classify scheme topology from the reaction graph.

    The graph is built from each step's ``reactant_ids`` and
    ``product_ids`` only (reagents are ignored).  Step *i* feeds step *j*
    when a product of *i* is a reactant of *j*.

    Returns one of: "linear", "divergent", "convergent", "parallel",
    "cycle", "mixed".

    * 0 or 1 step                      -> "linear"
    * several disconnected components  -> "parallel", or "mixed" if any
      branching/cycle was also seen
    * a directed cycle                 -> "cycle" (or "mixed" with branching)
    * one reactant used by steps with different products -> "divergent"
    * a step fed by two or more other steps, or one species produced by
      several steps                    -> "convergent"
    * any feed link, in any step order -> "linear"
    * connected but no feed link       -> "parallel"
    """
    n_steps = len(steps)
    if n_steps <= 1:
        return "linear"

    # species id -> indices of the steps consuming / producing it
    reactant_of: Dict[str, Set[int]] = defaultdict(set)
    product_of: Dict[str, Set[int]] = defaultdict(set)
    for i, step in enumerate(steps):
        for rid in step.reactant_ids:
            reactant_of[rid].add(i)
        for pid in step.product_ids:
            product_of[pid].add(i)

    # Directed step graph: i -> j when a product of i is a reactant of j.
    successors: Dict[int, Set[int]] = defaultdict(set)
    predecessors: Dict[int, Set[int]] = defaultdict(set)
    for sp_id, producers in product_of.items():
        consumers = reactant_of.get(sp_id)
        if not consumers:
            continue
        for i in producers:
            for j in consumers:
                if i != j:
                    successors[i].add(j)
                    predecessors[j].add(i)
    # Independent of the order in which the steps are listed.
    has_links = bool(successors)

    # Divergent: same reactant in multiple steps with different products.
    divergent = False
    for step_indices in reactant_of.values():
        if len(step_indices) > 1:
            product_sets = {frozenset(steps[i].product_ids)
                            for i in step_indices}
            if len(product_sets) > 1:
                divergent = True
                break

    # Convergent: one species produced by several steps, or one step that
    # consumes the products of several different source steps.
    convergent = (
        any(len(idx) > 1 for idx in product_of.values())
        or any(len(srcs) > 1 for srcs in predecessors.values())
    )

    # Undirected adjacency: two steps are connected if they share any
    # species, in any role.
    involved: Dict[str, Set[int]] = defaultdict(set)
    for mapping in (reactant_of, product_of):
        for sp_id, idx in mapping.items():
            involved[sp_id] |= idx
    adj: Dict[int, Set[int]] = defaultdict(set)
    for idx in involved.values():
        for si in idx:
            adj[si].update(idx - {si})

    # Count connected components (parallel, unrelated reactions).
    visited: Set[int] = set()
    components = 0
    for start in range(n_steps):
        if start in visited:
            continue
        components += 1
        stack = [start]
        while stack:
            node = stack.pop()
            if node in visited:
                continue
            visited.add(node)
            stack.extend(nb for nb in adj.get(node, ()) if nb not in visited)

    # Cycle detection on the directed graph: three-colour DFS, written
    # iteratively so long schemes cannot hit the recursion limit.
    WHITE, GRAY, BLACK = 0, 1, 2
    color = [WHITE] * n_steps

    def _cycle_reachable_from(root: int) -> bool:
        color[root] = GRAY
        trail = [(root, iter(successors.get(root, ())))]
        while trail:
            node, pending = trail[-1]
            for nxt in pending:
                if color[nxt] == GRAY:
                    return True  # back edge -> cycle
                if color[nxt] == WHITE:
                    color[nxt] = GRAY
                    trail.append((nxt, iter(successors.get(nxt, ()))))
                    break
            else:
                color[node] = BLACK
                trail.pop()
        return False

    has_cycle = any(color[i] == WHITE and _cycle_reachable_from(i)
                    for i in range(n_steps))

    if components > 1:
        if divergent or convergent or has_cycle:
            return "mixed"
        return "parallel"
    if has_cycle:
        if divergent or convergent:
            return "mixed"
        return "cycle"
    if divergent and convergent:
        return "mixed"
    if divergent:
        return "divergent"
    if convergent:
        return "convergent"
    if has_links:
        return "linear"  # sequential chain
    return "parallel"  # no links found between steps
1980
+
1981
+
1982
+ # ---------------------------------------------------------------------------
1983
+ # Content type heuristic detection
1984
+ # ---------------------------------------------------------------------------
1985
+
1986
# Enzyme names that are recognised anywhere in a text (also as part of a
# longer token, e.g. "methyltransferases").
_NAMED_ENZYME_RE = re.compile(
    r"(synthase|transferase|reductase|oxidase|kinase|"
    r"isomerase|mutase|ligase|lyase|dehydrogenase)",
    re.IGNORECASE
)
# Any whole word ending in "-ase"; filtered through the stop list below.
_ASE_WORD_RE = re.compile(r"\b(\w*ase)\b", re.IGNORECASE)
# Common words that end in "-ase" but are not enzymes.  "base" and
# "phase" in particular occur in ordinary reaction-condition text.
_NON_ENZYME_ASE_WORDS = frozenset({
    "ase", "base", "freebase", "database", "phase", "biphase", "mesophase",
    "interphase", "metaphase", "anaphase", "prophase", "telophase",
    "case", "showcase", "lowercase", "uppercase", "chase", "purchase",
    "ease", "cease", "lease", "release", "please", "tease", "grease",
    "crease", "increase", "decrease", "disease", "erase", "phrase",
    "paraphrase", "vase",
})


def _mentions_enzyme(text: str) -> bool:
    """Return True if *text* contains something that reads as an enzyme name."""
    if _NAMED_ENZYME_RE.search(text):
        return True
    return any(word.lower() not in _NON_ENZYME_ASE_WORDS
               for word in _ASE_WORD_RE.findall(text))


def _detect_content_type(steps: List[StepRecord],
                         species: Dict[str, SpeciesRecord]) -> str:
    """Classify the scheme content type using heuristics.

    Rules, checked in this order:

    1. no steps                                  -> "target_array"
    2. >= 3 species with category "bioactivity"  -> "literature_comparison"
    3. >= 3 "citation" species and <= 2 steps    -> "literature_comparison"
    4. >= 2 text species naming an enzyme        -> "biological_pathway"
    5. >= 2 fragment species                     -> "synthesis"
    6. otherwise                                 -> "unknown"

    The declared result vocabulary also contains "sar_design" and
    "investigation"; this function never returns them.

    Enzyme detection accepts the well-known enzyme classes and any other
    word ending in "-ase", except everyday words such as "base" or
    "phase" that would otherwise turn ordinary reaction conditions into a
    false "biological_pathway".
    """
    # No steps -> static figure (target_array or standalone structure).
    if not steps:
        return "target_array"

    # Count text species by category.
    cats = defaultdict(int)
    for sp in species.values():
        if sp.text_category:
            cats[sp.text_category] += 1

    # Bioactivity-heavy -> literature comparison (SAR data display).
    if cats.get("bioactivity", 0) >= 3:
        return "literature_comparison"

    # Citation-heavy with few actual steps -> literature comparison.
    if cats.get("citation", 0) >= 3 and len(steps) <= 2:
        return "literature_comparison"

    # Biological pathway markers (enzyme names in text).
    enzyme_count = sum(
        1 for sp in species.values()
        if sp.element_type == "text" and sp.name
        and _mentions_enzyme(sp.name)
    )
    if enzyme_count >= 2:
        return "biological_pathway"

    # Default: synthesis (the most common case).  ``steps`` is known to be
    # non-empty here, so only the fragment count needs checking.
    n_frag = sum(1 for sp in species.values()
                 if sp.element_type == "fragment")
    if n_frag >= 2:
        return "synthesis"

    return "unknown"
2040
+
2041
+
2042
+ # ---------------------------------------------------------------------------
2043
+ # Narrative generation
2044
+ # ---------------------------------------------------------------------------
2045
+
2046
def _species_display(sp: SpeciesRecord, include_smiles: bool = True) -> str:
    """Best available display string for a species.

    Primary text priority: label > aligned_iupac > name > formula >
    SMILES (first 40 characters) > species id.  For ``name`` only the
    first non-blank line is used (names can be multi-line condition
    blocks); a name consisting solely of blank lines is skipped so the
    next candidate is used instead of an empty string.

    A qualifier is appended after the primary text:

    * ``(aligned_iupac)`` when a label is the primary text and an aligned
      name exists;
    * otherwise ``(SMILES: ...)`` when *include_smiles* is true, the
      species has a SMILES, and the SMILES is not already the primary text.
    """
    name_line = ""
    if sp.name:
        name_line = next(
            (ln.strip() for ln in sp.name.split("\n") if ln.strip()), "")

    if sp.label:
        primary = sp.label
    elif sp.aligned_iupac:
        primary = sp.aligned_iupac
    elif name_line:
        primary = name_line
    elif sp.formula:
        primary = sp.formula
    elif sp.smiles:
        primary = sp.smiles[:40]
    else:
        primary = sp.id

    parts = [primary]

    # When a label is used as primary, add the aligned name as qualifier.
    if sp.label and sp.aligned_iupac:
        parts.append(f"({sp.aligned_iupac})")
    elif include_smiles and sp.smiles:
        # Add SMILES only if it is not already the main display text.
        if primary != sp.smiles and primary != sp.smiles[:40]:
            parts.append(f"(SMILES: {sp.smiles})")

    return " ".join(parts)
2077
+
2078
+
2079
def _generate_composite_narrative(
        sub_schemes: List["SchemeDescription"]) -> str:
    """Build the narrative text for a composite (multi-panel) scheme.

    The text starts with a one-line summary, then has one section per
    sub-scheme (numbered from 1).  A section shows the sub-scheme's own
    narrative, each line indented, or, when that narrative is empty, a
    one-line summary of its step count, species count and topology.
    Every section is followed by a blank line.
    """
    count = len(sub_schemes)
    out = [f"Composite scheme with {count} independent sub-schemes:", ""]
    for number, scheme in enumerate(sub_schemes, 1):
        out.append(f"--- Sub-scheme {number} ---")
        if scheme.narrative:
            # Indent the nested narrative line by line.
            out.extend(f" {row}" for row in scheme.narrative.split("\n"))
        else:
            summary = (f" {scheme.num_steps} step(s), "
                       f"{len(scheme.species)} species, "
                       f"topology: {scheme.topology}")
            out.append(summary)
        out.append("")
    return "\n".join(out)
2099
+
2100
+
2101
def _generate_narrative(desc: SchemeDescription) -> str:
    """Render a scheme description as plain text meant for LLM consumption.

    The first line summarises step count, topology and content type.
    Each step then gets one line of the form::

        Step N: <reactants> with <reagents> -> <products> (<conditions>)
                [<yield>] [<arrow annotation>] [<molecular diff>]

    Parts with no data are omitted.  Species IDs that are missing from
    ``desc.species`` are skipped.  Parsed ``conditions`` are preferred;
    the raw condition text is used only when no parsed conditions exist.
    """
    topology_words = {
        "linear": "linear",
        "divergent": "divergent",
        "convergent": "convergent",
        "parallel": "parallel (unrelated)",
        "mixed": "mixed-topology",
    }
    content_words = {
        "synthesis": "reaction scheme",
        "sar_design": "SAR design diagram",
        "biological_pathway": "biological pathway",
        "target_array": "target structure",
        "literature_comparison": "literature comparison",
        "investigation": "mechanistic investigation",
    }
    # Unknown topologies are shown verbatim; unknown content types fall
    # back to the generic "reaction scheme".
    topo_label = topology_words.get(desc.topology, desc.topology)
    ct_label = content_words.get(desc.content_type, "reaction scheme")

    if desc.num_steps == 1:
        opening = f"Single-step {ct_label}."
    elif desc.num_steps == 0:
        opening = f"Static {ct_label} (no reaction steps)."
    else:
        opening = f"{desc.num_steps}-step {topo_label} {ct_label}."
    output = [opening]

    def _names(species_ids, **display_options):
        # Display strings for the IDs that resolve to a species record.
        resolved = (desc.species.get(sid) for sid in species_ids)
        return [_species_display(sp, **display_options)
                for sp in resolved if sp]

    for step in desc.steps:
        pieces = [f"\nStep {step.step_index + 1}:"]

        reactants = _names(step.reactant_ids)
        if reactants:
            pieces.append(" + ".join(reactants))

        reagents = _names(step.reagent_ids, include_smiles=False)
        if reagents:
            pieces.append(f"with {', '.join(reagents)}")

        pieces.append("->")

        products = _names(step.product_ids)
        if products:
            pieces.append(" + ".join(products))

        if step.conditions:
            pieces.append(f"({', '.join(step.conditions)})")
        elif step.condition_text_raw:
            # No parsed conditions: flatten the raw text blocks into their
            # non-empty, stripped lines.
            raw_lines = [
                row.strip()
                for block_text in step.condition_text_raw
                for row in block_text.split("\n")
                if row.strip()
            ]
            if raw_lines:
                pieces.append(f"({'; '.join(raw_lines)})")

        if step.yield_text:
            pieces.append(f"[{step.yield_text}]")

        if step.arrow_style == "failed":
            pieces.append("[FAILED]")
        elif step.arrow_style == "dashed":
            pieces.append("[tentative/planned]")

        if step.molecular_diff_text:
            pieces.append(f"[{step.molecular_diff_text}]")

        output.append(" ".join(pieces))

    return "\n".join(output)
2198
+
2199
+
2200
+ # ---------------------------------------------------------------------------
2201
+ # Substrate scope table detection
2202
+ # ---------------------------------------------------------------------------
2203
+
2204
+ # Regex for scope table yield/result annotations
2205
_SCOPE_YIELD_RE = re.compile(r'(\d+(?:\.\d+)?)\s*%')
_SCOPE_MASS_RE = re.compile(r'(\d+(?:\.\d+)?)\s*mg')
_SCOPE_X_RE = re.compile(r'(?:X|R\d*)\s*=\s*(\w+)', re.IGNORECASE)
# Compound labels.  The look-arounds require the label to stand on its
# own: it may not be glued to a preceding word character or dot, nor be
# followed by another letter.  This keeps quantities such as "10mL" or
# "1.2eq" and formula fragments such as "Pd2dba3" from being read as
# labels ("10m", "1.2e", "2d").
_SCOPE_LABEL_RE = re.compile(
    r'(?<![\w.])(\d+\.\d+[a-z](?:-[a-z])?(?:\')?)(?![A-Za-z])'  # "5.70a", "5.70k'", "4.1a-f"
    r'|'
    r'(?<![\w.])(\d+[a-z](?:\')?)(?![A-Za-z])'                  # "3a", "4b'"
)


def _parse_scope_annotation(text: str) -> Optional[dict]:
    """Parse a scope table text annotation into structured fields.

    Returns ``None`` when *text* is empty, shorter than 3 characters, or
    contains none of: a compound label, an ``X = ...`` / ``R<n> = ...``
    variable assignment, a percentage, or a mass in mg.

    Otherwise returns a dict.  ``"notes"`` is always present (``None``
    when there is nothing to note); the other keys are present only when
    found:

    * ``"label"`` -- first compound label; if none is found but a
      variable assignment is, a leading bare number such as "4.22";
    * ``"conditions_variant"`` -- all variable assignments, ", "-joined;
    * ``"yield_text"`` -- first percentage, e.g. "39%";
    * ``"mass_text"`` -- first mass, e.g. "22 mg";
    * ``"notes"`` -- "Reaction failed" and/or "Scale-up: ..." joined by "; ".
    """
    if not text or len(text) < 3:
        return None

    label_match = _SCOPE_LABEL_RE.search(text)
    variant_matches = list(_SCOPE_X_RE.finditer(text))
    yield_match = _SCOPE_YIELD_RE.search(text)
    mass_match = _SCOPE_MASS_RE.search(text)

    # Must contain at least one scope signal.
    if not (label_match or variant_matches or yield_match or mass_match):
        return None

    result = {}

    if label_match:
        result["label"] = label_match.group(1) or label_match.group(2)
    elif variant_matches:
        # Numeric-only label (e.g. "4.22") when followed by an X=/R= variant.
        m_num = re.match(r'(\d+\.\d+)\s+', text)
        if m_num:
            result["label"] = m_num.group(1)

    if variant_matches:
        result["conditions_variant"] = ", ".join(
            m.group(0) for m in variant_matches)

    if yield_match:
        result["yield_text"] = yield_match.group(1) + "%"

    if mass_match:
        result["mass_text"] = mass_match.group(1) + " mg"

    # Notes: special annotations like "Reaction failed", "Scale-up:", etc.
    notes_parts = []
    if re.search(r'\bfailed\b', text, re.IGNORECASE):
        notes_parts.append("Reaction failed")
    scale_up = re.search(r'[Ss]cale-up[:\s]*(\d+\s*mg[,\s]*\d+\s*%)', text)
    if scale_up:
        notes_parts.append(f"Scale-up: {scale_up.group(1).strip()}")
    result["notes"] = "; ".join(notes_parts) if notes_parts else None

    return result
2273
+
2274
+
2275
def _scope_inside_fragment(el, parent_map) -> bool:
    """Return True if any ancestor of *el* (per *parent_map*) is a <fragment>."""
    node = parent_map.get(el)
    while node is not None:
        if node.tag == "fragment":
            return True
        node = parent_map.get(node)
    return False


def _scope_text_bbox(candidates) -> Optional[List[float]]:
    """Return ``[x1, y1, x2, y2]`` from the first usable candidate element.

    For each element, ``BoundingBox`` is tried first (needs at least four
    numbers), then ``p`` (first two numbers; a nominal 50 x 10 box is
    placed at that point).  Malformed attributes are skipped and the
    search continues.  Returns ``None`` if nothing usable is found.
    """
    for el in candidates:
        if el is None:
            continue
        bb_str = el.get("BoundingBox", "")
        if bb_str:
            try:
                vals = [float(v) for v in bb_str.split()]
            except ValueError:
                vals = []
            if len(vals) >= 4:
                return vals
        pos = el.get("p", "")
        if pos:
            try:
                x, y = (float(v) for v in pos.split()[:2])
            except ValueError:
                # Non-numeric or fewer than two coordinates.
                continue
            return [x, y, x + 50, y + 10]
    return None


def _scope_annotations_for_text(te: dict) -> List[dict]:
    """Parse one collected text element into zero or more scope annotations.

    Single-line text is parsed as one entry.  Multi-line text is split
    into one entry per line only if at least two lines carry their own
    label, or at least two carry their own X=/R= variant (a table of
    variants, e.g. "4.22 X = H\\n4.26 X = Me").  Otherwise the whole box
    is one entry (e.g. "5.70a\\nX = I\\n22 mg, 39%").
    """
    def _tag(parsed: dict) -> dict:
        # Remember which text element the annotation came from.
        parsed["_text_id"] = te["id"]
        parsed["_cx"] = te["cx"]
        parsed["_cy"] = te["cy"]
        return parsed

    full_text = te["text"]
    lines = [ln.strip() for ln in full_text.split("\n") if ln.strip()]

    if len(lines) > 1:
        per_line = [_parse_scope_annotation(ln) for ln in lines]
        n_labels = sum(1 for p in per_line if p and p.get("label"))
        n_variants = sum(1 for p in per_line
                         if p and p.get("conditions_variant"))
        if n_labels >= 2 or n_variants >= 2:
            return [_tag(p) for p in per_line if p]

    whole = _parse_scope_annotation(full_text)
    return [_tag(whole)] if whole else []


def _detect_scope_table(
    page: ET.Element,
    id_map: Dict[str, ET.Element],
    raw_steps: List,
    species_dict: Dict[str, SpeciesRecord],
    elem_to_species: Dict[str, List[str]],
    use_network: bool = True,
    use_chemscript: bool = False,
) -> Tuple[List[ScopeEntry], Dict[str, SpeciesRecord]]:
    """Detect substrate scope table entries from text annotations.

    Scope detection only runs when the page contains at least one
    ``<bracketedgroup>`` element.  It then collects every ``<t>`` element
    that

    * is not nested inside a ``<fragment>`` (atom labels),
    * has text content and a resolvable position, and
    * is not already claimed -- neither its own ID nor its parent's ID
      appears in a step's reactant/product/above-arrow/below-arrow
      element IDs or as the ``cdxml_element_id`` of a known species,

    and parses each one with ``_parse_scope_annotation``.

    Returns ``(scope_entries, new_species)``.  ``new_species`` is always
    an empty dict in this implementation.  *id_map*, *elem_to_species*,
    *use_network* and *use_chemscript* are accepted for interface
    compatibility and are not used.
    """
    # <bracketedgroup> elements are the primary scope signal.
    bracketed_groups = list(page.iter("bracketedgroup"))
    if not bracketed_groups:
        return [], {}

    _log(f"Found {len(bracketed_groups)} bracketedgroup element(s)")

    # All element IDs claimed by steps or by already-registered species.
    claimed: Set[str] = set()
    for step in raw_steps:
        claimed.update(step.reactant_elem_ids)
        claimed.update(step.product_elem_ids)
        claimed.update(step.above_arrow_ids)
        claimed.update(step.below_arrow_ids)
    for sp in species_dict.values():
        claimed.add(sp.cdxml_element_id)

    # Standard ElementTree does not track parents; build child -> parent.
    parent_map: Dict[ET.Element, ET.Element] = {
        child: parent for parent in page.iter() for child in parent
    }

    text_elements = []
    for t_el in page.iter("t"):
        if _scope_inside_fragment(t_el, parent_map):
            continue

        text_content = _get_text_content(t_el)
        if not text_content:
            continue

        # Position from the <t> element itself, else from its parent.
        t_parent = parent_map.get(t_el)
        bb = _scope_text_bbox((t_el, t_parent))
        if not bb:
            continue

        t_el_id = t_el.get("id", "")
        t_parent_id = t_parent.get("id", "") if t_parent is not None else ""
        # Skip text already claimed by steps (conditions text).
        if (t_el_id and t_el_id in claimed) or \
                (t_parent_id and t_parent_id in claimed):
            continue

        text_elements.append({
            "text": text_content,
            # Parent ID is preferred for display when available.
            "id": t_parent_id if t_parent_id else t_el_id,
            "cx": (bb[0] + bb[2]) / 2,
            "cy": (bb[1] + bb[3]) / 2,
            "bb": bb,
        })

    scope_annotations = []
    for te in text_elements:
        scope_annotations.extend(_scope_annotations_for_text(te))

    if not scope_annotations:
        return [], {}

    _log(f"Found {len(scope_annotations)} scope annotation(s)")

    # One entry per annotation; no spatial clustering is performed.
    scope_entries: List[ScopeEntry] = [
        ScopeEntry(
            entry_id=f"scope_{i}",
            label=ann.get("label"),
            conditions_variant=ann.get("conditions_variant"),
            yield_text=ann.get("yield_text"),
            mass_text=ann.get("mass_text"),
            notes=ann.get("notes"),
        )
        for i, ann in enumerate(scope_annotations)
    ]
    new_species: Dict[str, SpeciesRecord] = {}

    return scope_entries, new_species
2474
+
2475
+
2476
+ # ---------------------------------------------------------------------------
2477
+ # Aligned IUPAC naming enrichment
2478
+ # ---------------------------------------------------------------------------
2479
+
2480
+ # Common heterocyclic ring names for parent normalization, ordered
2481
+ # largest-first so that "benzimidazole" matches before "imidazole".
2482
_KNOWN_RING_NAMES = [
    'benzimidazole', 'isoquinoline', 'quinazoline', 'naphthalene',
    'quinoline', 'carbazole', 'acridine',
    'morpholine', 'piperidine', 'piperazine', 'pyrimidine',
    'pyridine', 'thiophene', 'imidazole', 'thiazole', 'oxazole',
    'indole', 'furan', 'benzene',
]


def _find_preferred_parent(desc: "SchemeDescription") -> str:
    """Pre-scan all principal species to find the dominant naming parent.

    For every step, the highest-MW reactant and the highest-MW product
    that have a SMILES are taken as "principal" species.  Each unique
    principal SMILES is decomposed with ``decompose_name``; the parent
    names and full names it reports are searched (by substring) for the
    ring names in ``_KNOWN_RING_NAMES`` or their stems ("quinoline" /
    "quinolin").  Every ring found counts once per compound, so a name
    containing "isoquinoline" counts for both "isoquinoline" and
    "quinoline".

    The ring found in the most compounds wins.  Ties are broken by
    (1) the ring being present in the final product (the principal
    product of the last step), then (2) the longer ring name.

    Returns the root ring name (e.g. "quinoline"), or "" if the name
    decomposer cannot be imported, no principal species has a SMILES, or
    no known ring is found.
    """
    try:
        from ..naming.name_decomposer import decompose_name
    except ImportError:
        return ""

    from collections import Counter

    def _principal(species_ids):
        # Highest-MW species with a SMILES among the IDs, or None.
        with_smiles = [desc.species[sid] for sid in species_ids
                       if sid in desc.species and desc.species[sid].smiles]
        if not with_smiles:
            return None
        return max(with_smiles, key=lambda s: s.mw or 0)

    # Unique principal SMILES across all steps (dict used as ordered set).
    principal_smiles: Dict[str, None] = {}
    for step in desc.steps:
        for role_ids in (step.reactant_ids, step.product_ids):
            best = _principal(role_ids)
            if best is not None:
                principal_smiles[best.smiles] = None

    if not principal_smiles:
        return ""

    # SMILES of the final product (last step's principal product).
    final_product_smiles = ""
    if desc.steps:
        final = _principal(desc.steps[-1].product_ids)
        if final is not None:
            final_product_smiles = final.smiles or ""

    # A stem is a prefix of its ring name, so matching the stem also
    # covers the full name; computed once instead of per text.
    ring_stems = [(ring, ring.rstrip('e')) for ring in _KNOWN_RING_NAMES]

    ring_counts: Counter = Counter()
    final_prod_rings: set = set()  # rings in the final product

    for smi in principal_smiles:
        try:
            r = decompose_name(smi)
        except Exception:
            # Best-effort: a compound that cannot be decomposed is skipped.
            continue

        # Parent names plus full names; the latter cover rings that only
        # show up inside complex names such as
        # "4-(4-phenylquinolin-2-yl)morpholine".  Empty fields are skipped.
        texts: set = set()
        if r.canonical_parent:
            texts.add(r.canonical_parent.lower())
        if r.canonical_name:
            texts.add(r.canonical_name.lower())
        for alt in r.alternatives:
            if not alt.valid:
                continue
            if alt.parent_name:
                texts.add(alt.parent_name.lower())
            if alt.name:
                texts.add(alt.name.lower())

        compound_rings = {ring for ring, stem in ring_stems
                          if any(stem in text for text in texts)}
        ring_counts.update(compound_rings)

        # Remember which rings the final product has.
        if smi == final_product_smiles:
            final_prod_rings = set(compound_rings)

    if not ring_counts:
        return ""

    # Most compounds first; then final-product membership; then length.
    best_ring = max(
        ring_counts,
        key=lambda r: (ring_counts[r],
                       1 if r in final_prod_rings else 0,
                       len(r)))
    _log(f"Preferred naming parent: {best_ring} "
         f"(in {ring_counts[best_ring]}/{len(principal_smiles)} compounds"
         f"{', final-product' if best_ring in final_prod_rings else ''})")
    return best_ring
2588
+
2589
+
2590
def _enrich_aligned_names(desc: "SchemeDescription") -> None:
    """Populate aligned_iupac on species and molecular_diff_text on steps.

    For each step, finds the principal SM/product pair (largest MW),
    runs ``find_aligned_names`` to get MCS-based aligned IUPAC names,
    and fills ``format_molecular_diff`` text on the step.

    Uses a global "preferred parent" strategy: pre-scans all principal
    species to find the dominant ring system, then passes it as a hint
    to every step so the entire scheme uses a consistent naming backbone.

    Gracefully degrades if aligned_namer is unavailable.

    Parameters
    ----------
    desc : SchemeDescription
        Scheme description whose ``species`` records and ``steps`` are
        updated in place.

    Returns
    -------
    None
        The function only mutates ``desc``.  A failure while naming one
        step is reported through ``_log`` and does not affect other steps.
    """
    try:
        from ..naming.aligned_namer import (
            find_aligned_names,
            format_molecular_diff,
        )
    except ImportError:
        # Optional dependency missing: leave the description untouched.
        return

    def _principal(species_ids):
        """Return the heaviest species with a SMILES among *species_ids*, or None."""
        candidates = [desc.species[sid] for sid in species_ids
                      if sid in desc.species and desc.species[sid].smiles]
        if not candidates:
            return None
        # Missing MW counts as 0; on ties max() keeps the first candidate.
        return max(candidates, key=lambda s: s.mw or 0)

    # Find the globally preferred naming parent
    preferred_parent = _find_preferred_parent(desc)

    for step_no, step in enumerate(desc.steps, start=1):
        # Principal pair: largest MW (the "core" substrate, not additives)
        sm_sp = _principal(step.reactant_ids)
        prod_sp = _principal(step.product_ids)
        if sm_sp is None or prod_sp is None:
            continue

        if not sm_sp.smiles or not prod_sp.smiles:
            continue

        try:
            # An empty preferred-parent string is passed as None (no hint).
            ar = find_aligned_names(sm_sp.smiles, prod_sp.smiles,
                                    preferred_parent=preferred_parent or None)

            # Only set aligned_iupac if not already assigned by a previous
            # step.  This preserves naming consistency for intermediates.
            if ar.best_sm_name and not sm_sp.aligned_iupac:
                sm_sp.aligned_iupac = ar.best_sm_name
            if ar.best_prod_name and not prod_sp.aligned_iupac:
                prod_sp.aligned_iupac = ar.best_prod_name

            diff_text = format_molecular_diff(
                sm_sp.smiles, prod_sp.smiles, ar)
            if diff_text:
                step.molecular_diff_text = diff_text
        except Exception as exc:
            # Non-critical enrichment — don't break scheme reading, but
            # leave a diagnostic trace instead of silently discarding the
            # failure.
            _log(f"Aligned naming skipped for step {step_no}: "
                 f"{type(exc).__name__}: {exc}")
2648
+
2649
+
2650
+ # ---------------------------------------------------------------------------
2651
+ # Main API
2652
+ # ---------------------------------------------------------------------------
2653
+
2654
def read_scheme(
    cdxml_path: str,
    use_network: bool = True,
    use_chemscript: bool = False,
    verbose: bool = False,
    segment: bool = False,
    _scheme_filter: Optional[Set[str]] = None,
) -> SchemeDescription:
    """Read a CDXML reaction scheme and return a structured description.

    Primary path: uses <scheme><step> attributes if present.
    Fallback: geometry-based arrow detection.

    Parameters
    ----------
    cdxml_path : str
        Path to CDXML file.
    use_network : bool
        Allow PubChem network lookups for text label resolution.
    use_chemscript : bool
        Use ChemScript for SMILES extraction (best abbreviation resolution,
        requires ChemDraw 16+ on Windows).  Falls back to RDKit-based
        resolution if ChemScript is unavailable.
    verbose : bool
        Print debug info to stderr.
    segment : bool
        Auto-segment multi-panel CDXML files into independent sub-schemes.
        When True, the returned SchemeDescription may have a non-empty
        ``sub_schemes`` list with independent sub-scheme descriptions.
    _scheme_filter : set of str, optional
        Internal parameter used by the segmenter.  If provided, only
        process ``<scheme>`` elements whose ``id`` is in this set.

    Returns
    -------
    SchemeDescription
        Complete structured description with species, steps, topology,
        and narrative.

    Notes
    -----
    * The call overwrites the module-level ``_verbose`` flag with
      ``verbose``.
    * If the document has no ``<page>`` element, a description carrying
      only ``source_file`` and a warning is returned.
    * If neither parsing strategy yields steps, the structures on the page
      are still registered; ``content_type`` is ``"target_array"`` when any
      species were found and ``"unknown"`` otherwise.
    * With ``segment=True`` and more than one detected segment, the result
      is a composite (``content_type="composite"``,
      ``topology="parallel"``) built by calling this function once per
      segment.
    """
    # Verbosity is stored as module-global state for the duration of the call
    # (and beyond); presumably read by ``_log`` — confirm against its definition.
    global _verbose
    _verbose = verbose

    from ..cdxml_utils import parse_cdxml, build_id_map

    tree = parse_cdxml(cdxml_path)
    root = tree.getroot()
    page = root.find(".//page")
    if page is None:
        # Nothing to analyse: report the problem as a warning rather than raising.
        return SchemeDescription(
            source_file=os.path.abspath(cdxml_path),
            warnings=["No <page> element found in CDXML"],
        )

    # Built up front; the segmentation branch below returns without using it.
    id_map = build_id_map(page)

    # -----------------------------------------------------------------------
    # Auto-segmentation: detect independent sub-schemes
    # -----------------------------------------------------------------------
    # Skipped on recursive calls, which always pass a _scheme_filter.
    if segment and _scheme_filter is None:
        from .scheme_segmenter import segment_scheme as _segment_scheme
        seg_result = _segment_scheme(cdxml_path, verbose=verbose)
        if seg_result.is_multi_panel and seg_result.num_segments > 1:
            _log(f"Multi-panel detected: {seg_result.num_segments} segments")
            sub_schemes = []
            all_species: Dict[str, SpeciesRecord] = {}
            all_steps: List[StepRecord] = []
            for seg in seg_result.segments:
                filter_ids = set(seg.scheme_element_ids)
                # Re-read the same file restricted to this segment's <scheme>
                # elements; segment=False prevents re-segmentation.
                sub_desc = read_scheme(
                    cdxml_path,
                    use_network=use_network,
                    use_chemscript=use_chemscript,
                    verbose=verbose,
                    segment=False,
                    _scheme_filter=filter_ids,
                )
                sub_desc.source_file = os.path.abspath(cdxml_path)
                sub_schemes.append(sub_desc)
                # dict.update: a species key repeated in a later segment
                # replaces the earlier record in the merged view.
                all_species.update(sub_desc.species)
                all_steps.extend(sub_desc.steps)

            # Build composite description
            total_steps = sum(s.num_steps for s in sub_schemes)
            composite = SchemeDescription(
                source_file=os.path.abspath(cdxml_path),
                topology="parallel",
                content_type="composite",
                num_steps=total_steps,
                species=all_species,
                steps=all_steps,
                sub_schemes=sub_schemes,
                narrative=_generate_composite_narrative(sub_schemes),
            )
            return composite
        # Single-panel result: fall through to the normal parsing path.

    # Dual-strategy parsing:
    #   - Geometry engine: works on all files, including pycdxml-converted CDX
    #   - Step-attribute path: uses ChemDraw's <scheme><step> when available
    #
    # Preference: use step attributes when available (they encode the
    # author's explicit grouping).  Use geometry engine as primary when
    # step attributes are missing (pycdxml output, manual drawings).
    # The geometry engine also provides layout_pattern and confidence metadata
    # regardless of which strategy is used for the final assignment.
    # Defaults; layout_pattern and confidence_map are reassigned just below.
    parse_method = ""
    layout_pattern = None
    confidence_map: Dict[str, float] = {}

    # Always run geometry engine (for metadata + fallback)
    # Note: this call receives no scheme filter, unlike the step-attribute
    # call further down.
    geo_steps = _parse_from_spatial_engine(page, id_map)
    # Metadata is read from a ``_last_meta`` attribute on the function
    # object; an empty dict is used when that attribute is absent.
    _sa_meta = getattr(_parse_from_spatial_engine, "_last_meta", {})
    layout_pattern = _sa_meta.get("layout_pattern")
    confidence_map = _sa_meta.get("confidences", {})
    if geo_steps:
        _log(f"Spatial engine: {len(geo_steps)} step(s), "
             f"layout={layout_pattern}")

    # Try step-attribute path
    attr_steps = _parse_from_step_attributes(page, id_map,
                                             scheme_filter=_scheme_filter)
    if attr_steps:
        _log(f"Step-attribute path: {len(attr_steps)} step(s)")

    # Choose strategy
    if attr_steps:
        raw_steps = attr_steps
        parse_method = "step_attribute"
    elif geo_steps:
        raw_steps = geo_steps
        parse_method = "geometry"
    else:
        raw_steps = []

    if not raw_steps:
        # No reaction steps — still enumerate all structures on the page
        species_dict = _build_static_species_registry(
            page, id_map, use_network=use_network,
            use_chemscript=use_chemscript)
        content_type = "target_array" if species_dict else "unknown"
        desc = SchemeDescription(
            source_file=os.path.abspath(cdxml_path),
            content_type=content_type,
            species=species_dict,
            warnings=["No reaction steps found "
                      "(no <step> attributes, no arrows)"],
        )
        desc.narrative = _generate_narrative(desc)
        return desc

    # Recover orphan transition arrows (serpentine vertical connectors
    # that the renderer places outside <scheme><step> elements)
    if parse_method == "step_attribute":
        pre_count = len(raw_steps)
        raw_steps = _recover_orphan_transition_steps(page, raw_steps, id_map)
        if len(raw_steps) > pre_count:
            _log(f"Recovered {len(raw_steps) - pre_count} orphan "
                 f"transition step(s)")

    _log(f"Found {len(raw_steps)} step(s)")

    # Build species registry
    species_dict, elem_to_species = _build_species_registry(
        raw_steps, id_map, page,
        use_network=use_network,
        use_chemscript=use_chemscript,
    )
    _log(f"Built registry with {len(species_dict)} species")

    # Convert to step records
    steps = _build_step_records(raw_steps, elem_to_species, species_dict,
                                id_map, page)

    # Resolve footnote conditions (e.g. "(a) Pd2(dba)3, BINAP, ...")
    # for steps that use condition_ref letters
    # Element IDs already registered as species are handed to the footnote
    # collector.
    registered_eids = set(elem_to_species.keys())
    footnotes = _collect_footnotes(page, registered_eids)
    if footnotes:
        _resolve_footnote_conditions(steps, species_dict, footnotes)
        _log(f"Resolved {len(footnotes)} footnote(s)")

    # Link repeated structures across separate <scheme> elements
    # (wrap-repeat layouts re-draw intermediates with new element IDs)
    _link_repeated_species(steps, species_dict)

    # Detect topology
    topology = _detect_topology(steps)
    _log(f"Detected topology: {topology}")

    # Detect content type
    content_type = _detect_content_type(steps, species_dict)
    _log(f"Detected content type: {content_type}")

    # Detect substrate scope table (orphaned structures + yield annotations)
    scope_entries, scope_species = _detect_scope_table(
        page, id_map, raw_steps, species_dict, elem_to_species,
        use_network=use_network, use_chemscript=use_chemscript,
    )
    if scope_species:
        species_dict.update(scope_species)
    if scope_entries:
        _log(f"Detected {len(scope_entries)} scope table entries")
        # Only a plain "synthesis" classification is upgraded; any other
        # detected content type is left as is.
        if content_type == "synthesis":
            content_type = "substrate_scope"

    desc = SchemeDescription(
        source_file=os.path.abspath(cdxml_path),
        topology=topology,
        content_type=content_type,
        num_steps=len(steps),
        species=species_dict,
        steps=steps,
        scope_entries=scope_entries,
        layout_pattern=layout_pattern,
        parse_method=parse_method,
        assignment_confidences=confidence_map,
    )

    # Add warnings for low-confidence assignments
    # (confidences come from the geometry engine's metadata; values below
    # 0.5 are flagged)
    for elem_id, conf in confidence_map.items():
        if conf < 0.5:
            desc.warnings.append(
                f"Low confidence ({conf:.2f}) assigning element {elem_id}")

    # Enrich with aligned IUPAC names + molecular diffs
    # (runs before narrative generation, which receives the enriched desc)
    _enrich_aligned_names(desc)

    # Generate narrative
    desc.narrative = _generate_narrative(desc)

    return desc
2884
+
2885
+
2886
+ # ---------------------------------------------------------------------------
2887
+ # CLI
2888
+ # ---------------------------------------------------------------------------
2889
+
2890
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point: read a CDXML scheme and emit JSON or narrative.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments without the program name.  ``None`` lets
        argparse read them from ``sys.argv``.

    Returns
    -------
    int
        Exit status: ``0`` on success, ``1`` when the input file does not
        exist.  Invalid arguments make argparse raise ``SystemExit``.
    """
    parser = argparse.ArgumentParser(
        prog="scheme_reader",
        description="Read a CDXML reaction scheme and produce structured JSON.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""\
examples:
  python -m cdxml_toolkit.scheme_reader scheme.cdxml
  python -m cdxml_toolkit.scheme_reader scheme.cdxml -o description.json
  python -m cdxml_toolkit.scheme_reader scheme.cdxml --narrative-only
""",
    )
    parser.add_argument("input", help="Input CDXML file with reaction scheme")
    parser.add_argument("-o", "--output",
                        help="Output JSON path (default: stdout)")
    # --pretty is on by default; --no-pretty writes to the same destination
    # to switch it off.
    parser.add_argument("--pretty", action="store_true", default=True,
                        help="Pretty-print JSON (default: yes)")
    parser.add_argument("--no-pretty", dest="pretty", action="store_false")
    parser.add_argument("--no-network", action="store_true",
                        help="Disable network lookups (PubChem, OPSIN)")
    parser.add_argument("--chemscript", action="store_true",
                        help="Use ChemScript for SMILES (best abbreviation "
                             "resolution, requires ChemDraw 16+ on Windows)")
    parser.add_argument("--narrative-only", action="store_true",
                        help="Print only the narrative text to stdout")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print debug info to stderr")

    args = parser.parse_args(argv)

    if not os.path.isfile(args.input):
        print(f"Error: file not found: {args.input}", file=sys.stderr)
        return 1

    desc = read_scheme(
        args.input,
        use_network=not args.no_network,
        use_chemscript=args.chemscript,
        verbose=args.verbose,
    )

    if args.narrative_only:
        print(desc.narrative)
        return 0

    if args.output:
        desc.to_json(args.output, pretty=args.pretty)
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        out = json.dumps(desc.to_dict(), indent=2 if args.pretty else None,
                         ensure_ascii=False)
        payload = out + "\n"
        # Emit UTF-8 bytes so non-ASCII output does not depend on the
        # console encoding.  A replaced stdout (e.g. io.StringIO) has no
        # ``buffer`` attribute, so fall back to a plain text write there
        # instead of failing with AttributeError.
        raw_stdout = getattr(sys.stdout, "buffer", None)
        if raw_stdout is not None:
            raw_stdout.write(payload.encode("utf-8"))
        else:
            sys.stdout.write(payload)

    return 0
2945
+
2946
+
2947
# Script entry point: hand main()'s return code to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())