cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2103 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ reaction_from_image.py — Build a full ChemDraw reaction scheme from a screenshot.
4
+
5
+ Takes a screenshot of a reaction scheme (e.g. from SciFinder, a paper, or a patent)
6
+ and produces a CDXML reaction scheme with proper arrow, conditions text, and
7
+ ACS Document 1996 styling.
8
+
9
+ Architecture
10
+ ------------
11
+ This is an *orchestration* tool. It does NOT try to auto-detect arrows or OCR
12
+ conditions text from the image — that is unreliable and unnecessary. Instead,
13
+ an LLM (or user) looks at the screenshot and provides a small JSON descriptor
14
+ that tells the tool:
15
+
16
+ 1. Which detected structures are reactants vs products (by left-to-right index)
17
+ 2. What conditions text goes above / below the arrow
18
+
19
+ The tool then:
20
+ a. Extracts molecular structures from the image via DECIMER
21
+ (delegates to structure_from_image.py)
22
+ b. Assigns them as reactants or products per the descriptor
23
+ c. Lays out the molecules, arrow, and conditions text
24
+ d. Builds a valid CDXML document via cdxml_builder.py
25
+ e. Applies subscript formatting to chemical formulae in conditions
26
+
27
+ Usage
28
+ -----
29
+ Minimal (LLM provides JSON descriptor on stdin):
30
+ python reaction_from_image.py --image scheme.png --descriptor desc.json -o scheme.cdxml
31
+
32
+ Descriptor JSON format:
33
+ {
34
+ "reactant_indices": [0, 1],
35
+ "product_indices": [2],
36
+ "conditions_above": ["Pd2dba3", "BINAP"],
37
+ "conditions_below": ["Dioxane", "24 h, reflux"]
38
+ }
39
+
40
+ - reactant_indices / product_indices refer to the left-to-right order of
41
+ detected structures (0-indexed). The tool extracts all structures first,
42
+ then assigns roles based on these indices.
43
+ - conditions_above / conditions_below are plain text strings.
44
+ - If a condition string matches a known abbreviation (e.g. "Cs2CO3"),
45
+ it is kept verbatim. Unknown abbreviations are reproduced as-is
46
+ (we never trust LLM-generated SMILES for reagents).
47
+
48
+ Abbreviation dictionary
49
+ -----------------------
50
+ A curated dictionary maps common reagent/ligand/catalyst abbreviations to
51
+ themselves (for subscript formatting) or to display names. This is NOT
52
+ for structure resolution — it's only to decide whether a name is "known"
53
+ and how to display it. If an abbreviation isn't in the dictionary, the
54
+ exact text from the descriptor is used verbatim.
55
+
56
+ The dictionary lives in ABBREVIATIONS below and can be extended over time.
57
+ """
58
+
59
+ import argparse
60
+ import json
61
+ import math
62
+ import os
63
+ import re
64
+ import sys
65
+ from copy import deepcopy
66
+ from typing import Dict, List, Optional, Tuple
67
+ from xml.sax.saxutils import escape as xml_escape
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Shared reagent database
71
+ # ---------------------------------------------------------------------------
72
+
73
+ from ..resolve.reagent_db import get_reagent_db
74
+ from ..text_formatting import needs_subscript, build_formatted_s_xml
75
+ from ..constants import (
76
+ ACS_BOND_LENGTH, EXPAND_SCALE_BOND,
77
+ CDXML_HEADER, CDXML_FOOTER,
78
+ ACS_LABEL_FONT, ACS_LABEL_SIZE, ACS_LABEL_FACE,
79
+ ACS_CAPTION_SIZE, ACS_HASH_SPACING, ACS_MARGIN_WIDTH,
80
+ ACS_LINE_WIDTH, ACS_BOLD_WIDTH, ACS_BOND_LENGTH_STR,
81
+ ACS_BOND_SPACING, ACS_CHAIN_ANGLE_STR,
82
+ )
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Resolve abbreviation display text
87
+ # ---------------------------------------------------------------------------
88
+
89
def resolve_abbreviation(text: str) -> str:
    """Return the display form of *text* according to the shared reagent database.

    The lookup is delegated entirely to ``get_reagent_db().resolve_display``;
    this wrapper performs no processing of its own.  The module's policy is
    that text the database does not know is handed back verbatim — unknown
    abbreviations are never invented or transformed.
    """
    reagent_db = get_reagent_db()
    return reagent_db.resolve_display(text)
96
+
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # Condition classification: chemistry vs. non-chemistry text
100
+ # ---------------------------------------------------------------------------
101
+
102
+ def _is_non_chemistry_text(text: str) -> bool:
103
+ """Return True if *text* is a non-chemistry condition string that should
104
+ **always** be rendered as a text label (never expanded to a structure).
105
+
106
+ Examples that return True:
107
+ "24 h, reflux", "120 °C", "rt", "overnight", "10 mol%",
108
+ "1.5 equiv", "N2", "Ar", "sealed tube"
109
+ """
110
+ t = text.strip()
111
+ tl = t.lower()
112
+
113
+ # Temperature patterns
114
+ if re.search(r'-?\d+\s*°', t):
115
+ return True
116
+ if tl in ("rt", "room temperature", "room temp", "room temp."):
117
+ return True
118
+
119
+ # Time patterns
120
+ if re.search(r'\d+\s*(h|hr|hrs|min|d|days?)\b', tl):
121
+ return True
122
+ if tl in ("overnight", "o/n", "on"):
123
+ return True
124
+
125
+ # Percentage / equivalents / concentration
126
+ if re.search(r'\d+\s*(mol\s*)?%', tl):
127
+ return True
128
+ if re.search(r'[\d.]+\s*equiv', tl):
129
+ return True
130
+ if re.search(r'[\d.]+\s*M\b', t): # case-sensitive M
131
+ return True
132
+
133
+ # Physical conditions (single keywords)
134
+ _PHYS = {
135
+ "reflux", "sealed tube", "microwave", "mw", "ultrasound",
136
+ "sonication", "inert atmosphere", "dark", "hv", "light",
137
+ }
138
+ if tl in _PHYS:
139
+ return True
140
+
141
+ # Inert gas (very short abbreviations)
142
+ if tl in ("n2", "ar", "argon", "nitrogen"):
143
+ return True
144
+
145
+ # Compound phrases with comma → likely mixed ("24 h, reflux")
146
+ if "," in t:
147
+ return True
148
+
149
+ # "then ..." / step instructions
150
+ if tl.startswith("then "):
151
+ return True
152
+
153
+ return False
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # Expand conditions: resolve names to structures (ChemScript → PubChem)
158
+ # ---------------------------------------------------------------------------
159
+
160
def _extract_fragment_from_cdxml(cdxml_str: str) -> Optional[Tuple[str, float, float, float, float]]:
    """Pull the first ``<fragment>`` out of a CDXML document string.

    Only a ``<fragment>`` that is a direct child of the first ``<page>``
    directly under the document root is considered.

    Returns ``(frag_xml, xmin, ymin, xmax, ymax)``, where the extents come from
    ``_measure_fragment_xml``.  Returns ``None`` when the input is empty or
    does not contain ``<CDXML``, when no page/fragment is found, or when the
    measured fragment has zero horizontal extent.
    """
    import xml.etree.ElementTree as ET

    if not cdxml_str or "<CDXML" not in cdxml_str:
        return None

    page = ET.fromstring(cdxml_str).find("page")
    fragment = page.find("fragment") if page is not None else None
    if fragment is None:
        return None

    serialized = ET.tostring(fragment, encoding="unicode")
    left, top, right, bottom = _measure_fragment_xml(serialized)
    # A zero-width measurement is treated as "nothing usable was extracted".
    if left == right:
        return None
    return (serialized, left, top, right, bottom)
180
+
181
+
182
def _scale_fragment_xml(
    frag_xml: str,
    scale: float,
    xmin: float, ymin: float, xmax: float, ymax: float,
) -> Tuple[str, float, float, float, float]:
    """Shrink or grow a fragment about the centre of the given bounding box.

    Every ``p="x y"`` attribute and every four-number ``BoundingBox="..."``
    attribute in *frag_xml* is rewritten textually (via regex) with its
    coordinates scaled by *scale* relative to the box centre.  The result is
    re-measured with ``_measure_fragment_xml``.

    Returns ``(scaled_xml, new_xmin, new_ymin, new_xmax, new_ymax)``.
    """
    mid_x = (xmin + xmax) / 2.0
    mid_y = (ymin + ymax) / 2.0

    def _rewrite_point(match: "re.Match") -> str:
        px = float(match.group(1))
        py = float(match.group(2))
        new_x = mid_x + (px - mid_x) * scale
        new_y = mid_y + (py - mid_y) * scale
        return f'p="{new_x:.3f} {new_y:.3f}"'

    def _rewrite_bbox(match: "re.Match") -> str:
        coords = [float(token) for token in match.group(1).split()]
        # Even positions are x values, odd positions are y values.
        resized = (
            mid_x + (coords[0] - mid_x) * scale,
            mid_y + (coords[1] - mid_y) * scale,
            mid_x + (coords[2] - mid_x) * scale,
            mid_y + (coords[3] - mid_y) * scale,
        )
        joined = " ".join(f"{value:.3f}" for value in resized)
        return f'BoundingBox="{joined}"'

    with_points = re.sub(r'\bp="([-\d.]+)\s+([-\d.]+)"', _rewrite_point, frag_xml)
    scaled_xml = re.sub(
        r'\bBoundingBox="((?:[-\d.]+ ?){4})"', _rewrite_bbox, with_points
    )
    left, top, right, bottom = _measure_fragment_xml(scaled_xml)
    return scaled_xml, left, top, right, bottom
214
+
215
+
216
def _resolve_condition_to_fragment(
    text: str,
    cs_bridge,
    verbose: bool = False,
) -> Optional[Tuple[str, float, float, float, float]]:
    """Try to turn one condition string into a drawable CDXML fragment.

    The string is first canonicalised with ``resolve_abbreviation``; then:

    1. If ``_is_non_chemistry_text`` accepts it, give up (it stays a label).
    2. Ask ChemScript for the name: ``cs_bridge.name_to_cdxml(canonical)``.
    3. Otherwise look the name up as SMILES (``resolve_name_to_smiles``) and
       convert with ``cs_bridge.smiles_to_cdxml(smiles)``.
    4. If both fail, return ``None`` so the caller renders the text verbatim.

    Any exception raised by stage 2 or 3 is swallowed (and logged when
    *verbose* is set) so that resolution is best-effort.  A fragment wider
    than ``EXPAND_MAX_WIDTH`` is scaled down to exactly that width so that
    conditions do not dominate the scheme.

    Returns ``(fragment_xml, xmin, ymin, xmax, ymax)`` or ``None``.
    """
    def log(msg: str):
        if verbose:
            print(f"[expand] {msg}", file=sys.stderr)

    canonical = resolve_abbreviation(text)

    if _is_non_chemistry_text(canonical):
        log(f" '{canonical}' → non-chemistry text, keeping as label")
        return None

    def _via_chemscript_name():
        # Stage 1: direct name → structure through ChemScript.
        try:
            found = _extract_fragment_from_cdxml(cs_bridge.name_to_cdxml(canonical))
            if found is not None:
                log(f" '{canonical}' → ChemScript name OK")
            return found
        except Exception as exc:
            log(f" '{canonical}' → ChemScript name failed: {exc}")
            return None

    def _via_pubchem_smiles():
        # Stage 2: name → SMILES lookup, then SMILES → structure.
        try:
            from ..resolve.cas_resolver import resolve_name_to_smiles
            smiles = resolve_name_to_smiles(canonical)
            if not smiles:
                return None
            log(f" '{canonical}' → PubChem SMILES: {smiles[:60]}")
            found = _extract_fragment_from_cdxml(cs_bridge.smiles_to_cdxml(smiles))
            if found is not None:
                log(f" '{canonical}' → PubChem+ChemScript OK")
            return found
        except Exception as exc:
            log(f" '{canonical}' → PubChem fallback failed: {exc}")
            return None

    resolved = _via_chemscript_name()
    if resolved is None:
        resolved = _via_pubchem_smiles()

    if resolved is None:
        log(f" '{canonical}' → unresolved, keeping as text label")
        return None

    # Cap the width of oversized structures.
    frag_xml, xmin, ymin, xmax, ymax = resolved
    width = xmax - xmin
    if width > EXPAND_MAX_WIDTH:
        factor = EXPAND_MAX_WIDTH / width
        frag_xml, xmin, ymin, xmax, ymax = _scale_fragment_xml(
            frag_xml, factor, xmin, ymin, xmax, ymax
        )
        log(f" '{canonical}' scaled to {factor:.2f}x (w={width:.1f} → {xmax - xmin:.1f})")

    return (frag_xml, xmin, ymin, xmax, ymax)
284
+
285
+
286
def _resolve_all_conditions(
    conditions: List[str],
    cs_bridge,
    verbose: bool = False,
) -> List[Tuple[str, Optional[Tuple[str, float, float, float, float]]]]:
    """Resolve every condition string, preserving input order.

    Each output entry is ``(display_text, fragment_info_or_None)``:
    ``display_text`` is the result of ``resolve_abbreviation`` and the second
    member is whatever ``_resolve_condition_to_fragment`` returns for the same
    raw string.
    """
    return [
        (
            resolve_abbreviation(raw),
            _resolve_condition_to_fragment(raw, cs_bridge, verbose),
        )
        for raw in conditions
    ]
300
+
301
+
302
+ # ---------------------------------------------------------------------------
303
+ # Fragment ID reassignment (prevent collisions with reactant/product IDs)
304
+ # ---------------------------------------------------------------------------
305
+
306
+ def _reassign_fragment_ids(frag_xml: str, ids: "_IDGen") -> Tuple[str, int]:
307
+ """Rewrite all element IDs in *frag_xml* using *ids* so they are unique
308
+ within the overall CDXML document.
309
+
310
+ Returns ``(new_xml, top_level_fragment_id)``.
311
+ """
312
+ import xml.etree.ElementTree as ET
313
+
314
+ root = ET.fromstring(frag_xml)
315
+ old_to_new: Dict[str, str] = {}
316
+
317
+ # First pass: assign new IDs
318
+ for el in root.iter():
319
+ old_id = el.get("id")
320
+ if old_id is not None:
321
+ new_id = str(ids.next())
322
+ old_to_new[old_id] = new_id
323
+
324
+ # Second pass: rewrite id, B (bond begin), E (bond end), Z
325
+ for el in root.iter():
326
+ for attr in ("id", "B", "E", "SupersededBy"):
327
+ val = el.get(attr)
328
+ if val and val in old_to_new:
329
+ el.set(attr, old_to_new[val])
330
+ # Z attribute also needs a unique value
331
+ if el.get("Z") is not None:
332
+ el.set("Z", str(ids.next()))
333
+
334
+ top_id = int(old_to_new.get(root.get("id", "0"), "0"))
335
+ new_xml = ET.tostring(root, encoding="unicode")
336
+ return new_xml, top_id
337
+
338
+
339
+ # ---------------------------------------------------------------------------
340
+ # Layout constants (ACS Document 1996)
341
+ # ---------------------------------------------------------------------------
342
+
343
# All values below are plain floats used as distances/coordinates when the
# scheme is laid out (presumably CDXML points — TODO confirm against
# constants.py).
INTER_MOL_GAP = 18.0          # horizontal gap between molecules on same side
ARROW_MARGIN = 20.0           # gap between molecules and arrow ends
ARROW_LENGTH = 80.0           # default arrow shaft length
PAGE_LEFT = 80.0              # left margin for first reactant
VERTICAL_CENTER = 500.0       # y-coordinate for vertical centre of the scheme
CONDITIONS_GAP_ABOVE = 10.0   # clear gap between bottom of above-text and arrow shaft
CONDITIONS_GAP_BELOW = 10.0   # clear gap between arrow shaft and top of below-text
CONDITIONS_LINE_HEIGHT = 12.0  # line height for conditions text
CONDITIONS_DESCENDER = 3.0    # extra space below baseline for descenders (g, p, y)

# Expanded conditions layout constants (used when condition names are drawn
# as structures next to the arrow instead of plain text)
EXPAND_STRUCTURE_GAP = 10.0    # horizontal gap between adjacent condition structures
EXPAND_ABOVE_CLEARANCE = 12.0  # clearance from arrow shaft to bottom of above-structures
EXPAND_BELOW_CLEARANCE = 12.0  # clearance from arrow shaft to top of below-structures
EXPAND_MAX_WIDTH = 80.0        # max width for a single condition structure before scaling
358
+
359
+
360
+ # ---------------------------------------------------------------------------
361
+ # CDXML header helper (uses shared template from constants.py)
362
+ # ---------------------------------------------------------------------------
363
+
364
def _format_cdxml_header(bbox: str) -> str:
    """Fill the shared ``CDXML_HEADER`` template for this document.

    *bbox* is substituted for the template's ``bbox`` field; every other
    field is taken from the ACS Document 1996 style constants imported from
    ``constants.py``.
    """
    style_fields = {
        "bbox": bbox,
        "label_font": ACS_LABEL_FONT,
        "label_size": ACS_LABEL_SIZE,
        "label_face": ACS_LABEL_FACE,
        "caption_size": ACS_CAPTION_SIZE,
        "hash_spacing": ACS_HASH_SPACING,
        "margin_width": ACS_MARGIN_WIDTH,
        "line_width": ACS_LINE_WIDTH,
        "bold_width": ACS_BOLD_WIDTH,
        "bond_length": ACS_BOND_LENGTH_STR,
        "bond_spacing": ACS_BOND_SPACING,
        "chain_angle": ACS_CHAIN_ANGLE_STR,
    }
    return CDXML_HEADER.format(**style_fields)
380
+
381
+
382
+ # ---------------------------------------------------------------------------
383
+ # ID generator
384
+ # ---------------------------------------------------------------------------
385
+
386
+ class _IDGen:
387
+ """Simple incrementing integer ID generator."""
388
+ def __init__(self, start: int = 1000):
389
+ self._n = start
390
+
391
+ def next(self) -> int:
392
+ v = self._n
393
+ self._n += 1
394
+ return v
395
+
396
+
397
+ # ---------------------------------------------------------------------------
398
+ # Molecule bounding box helpers
399
+ # ---------------------------------------------------------------------------
400
+
401
+ def _mol_extent(mol: Dict) -> Tuple[float, float, float, float]:
402
+ """Return (min_x, min_y, max_x, max_y) of atoms."""
403
+ xs = [a["x"] for a in mol["atoms"]]
404
+ ys = [a["y"] for a in mol["atoms"]]
405
+ return min(xs), min(ys), max(xs), max(ys)
406
+
407
+
408
+ def _translate_mol(mol: Dict, dx: float, dy: float) -> Dict:
409
+ """Translate all atom coordinates by (dx, dy). Returns a new dict."""
410
+ mol = deepcopy(mol)
411
+ for a in mol["atoms"]:
412
+ a["x"] += dx
413
+ a["y"] += dy
414
+ return mol
415
+
416
+
417
+ # ---------------------------------------------------------------------------
418
+ # Fragment (molecule) XML builder — adapted from cdxml_builder.py
419
+ # ---------------------------------------------------------------------------
420
+
421
# Element numbers for heteroatoms.
# Maps an element symbol to the number written into the CDXML ``Element``
# attribute by _build_fragment.  A symbol missing from this table gets no
# ``Element`` attribute at all.
ELEMENT_NUMBERS: Dict[str, int] = {
    "H": 1, "B": 5, "C": 6, "N": 7, "O": 8,
    "F": 9, "Si": 14, "P": 15, "S": 16, "Cl": 17,
    "Se": 34, "Br": 35, "I": 53, "Cs": 55,
}

# Symbols that get a wider (7.0 instead of 6.0) label box in _label_bbox and
# ``LabelAlignment="Left"`` in _build_fragment.
WIDE_SYMBOLS = {"Br", "Cl", "Si", "Se", "Cs"}

# Bond ``order`` value -> CDXML ``Order`` attribute text.  ``None`` means the
# attribute is omitted (single bond).  Order 4 is written as "1.5"
# (presumably aromatic — confirm against the structure extractor's output).
BOND_ORDER_ATTR: Dict[int, Optional[str]] = {
    1: None, 2: "2", 3: "3", 4: "1.5",
}

# Bond ``cfg`` value -> CDXML ``Display`` attribute text.
# NOTE(review): codes 1 and 4 both map to "WedgeBegin"; confirm that 4 is
# intended to render as a solid wedge.
BOND_STEREO_ATTR: Dict[int, str] = {
    1: "WedgeBegin", 4: "WedgeBegin", 6: "WedgedHashBegin",
}
437
+
438
+
439
def _label_bbox(x: float, y: float, symbol: str) -> str:
    """Format an estimated ``BoundingBox`` string for an atom label.

    The box is centred horizontally on *x*, 7.0 wide for symbols listed in
    ``WIDE_SYMBOLS`` and 6.0 otherwise.  It spans from ``y - 7.52`` down to
    *y*.  The result is ``"left top right bottom"`` with two decimals each.
    """
    width = 7.0 if symbol in WIDE_SYMBOLS else 6.0
    left = x - width / 2.0
    right = left + width
    top = y - 7.52
    bottom = y
    return f"{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}"
446
+
447
+
448
def _build_fragment(
    atoms: List[Dict],
    bonds: List[Dict],
    ids: _IDGen,
) -> Tuple[str, int]:
    """Build a <fragment> XML string. Returns (xml_string, fragment_id).

    Parameters
    ----------
    atoms : list of atom dicts.  Keys read here: ``index``, ``x``, ``y``
        (required) and ``symbol`` (default "C"), ``charge``, ``num_hydrogens``,
        ``cfg`` (all default 0).
    bonds : list of bond dicts.  Keys read here: ``atom1``, ``atom2``
        (required; matched against the atoms' ``index``) and ``order``
        (default 1), ``cfg`` (default 0), ``double_pos`` (default "").
    ids : shared ID generator; supplies the fragment id and every node/bond
        ``id`` and ``Z`` value.

    Uncharged carbons are written as bare self-closing ``<n/>`` nodes; every
    other atom gets a nested ``<t>``/``<s>`` text label showing its symbol.
    """
    frag_id = ids.next()
    # Maps an atom's own "index" to the CDXML node id generated for it, so
    # bonds can reference their end points.
    atom_id_map: Dict[int, int] = {}

    # Fragment bounding box is taken from atom centres only (labels ignored).
    xs = [a["x"] for a in atoms]
    ys = [a["y"] for a in atoms]
    bb = f"{min(xs):.2f} {min(ys):.2f} {max(xs):.2f} {max(ys):.2f}"

    lines = [f'<fragment id="{frag_id}" BoundingBox="{bb}" Z="{ids.next()}">']

    for a in atoms:
        aid = ids.next()
        atom_id_map[a["index"]] = aid
        sym = a.get("symbol", "C")
        ax, ay = a["x"], a["y"]
        z = ids.next()
        attrs = [f'id="{aid}"', f'p="{ax:.2f} {ay:.2f}"', f'Z="{z}"']

        is_carbon = (sym == "C")
        charge = a.get("charge", 0)

        # NOTE(review): the source this block was recovered from had lost its
        # indentation.  The nesting of the NumHydrogens / NeedsClean / AS
        # appends under ``if not is_carbon`` (rather than under ``if el_num``
        # or at loop level) is a reconstruction — confirm against the
        # original file before relying on it.
        if not is_carbon:
            el_num = ELEMENT_NUMBERS.get(sym, 0)
            if el_num:
                attrs.append(f'Element="{el_num}"')
            nh = a.get("num_hydrogens", 0)
            attrs.append(f'NumHydrogens="{nh}"')
            attrs.append('NeedsClean="yes"')
            attrs.append('AS="N"')

        if charge:
            attrs.append(f'Charge="{charge}"')

        cfg = a.get("cfg", 0)
        if cfg:
            attrs.append(f'Stereo="{cfg}"')

        if is_carbon and not charge:
            # Plain carbon: no visible label, self-closing node.
            lines.append(f'<n {" ".join(attrs)}/>')
        else:
            # Labelled node: fixed offsets position the text relative to the
            # atom centre.
            lx = ax - 3.25
            ly = ay + 3.52
            bbox = _label_bbox(ax, ay, sym)
            label_text = xml_escape(sym)
            label_align = ""
            if sym in WIDE_SYMBOLS:
                label_align = ' LabelAlignment="Left"'
            lines.append(f'<n {" ".join(attrs)}>')
            lines.append(
                f'<t p="{lx:.2f} {ly:.2f}" BoundingBox="{bbox}" '
                f'LabelJustification="Left"{label_align}>'
            )
            lines.append(
                f'<s font="3" size="10" color="0" face="96">{label_text}</s>'
            )
            lines.append("</t>")
            lines.append("</n>")

    for b in bonds:
        bid = ids.next()
        z = ids.next()
        # An end point whose index is unknown is written as node id 0.
        a1 = atom_id_map.get(b["atom1"], 0)
        a2 = atom_id_map.get(b["atom2"], 0)
        order = b.get("order", 1)
        cfg = b.get("cfg", 0)
        attrs = [f'id="{bid}"', f'Z="{z}"', f'B="{a1}"', f'E="{a2}"']

        # Single bonds map to None and therefore carry no Order attribute.
        order_attr = BOND_ORDER_ATTR.get(order)
        if order_attr:
            attrs.append(f'Order="{order_attr}"')

        double_pos = b.get("double_pos", "")
        if double_pos:
            attrs.append(f'DoublePosition="{double_pos}"')

        # A recognised stereo code wins; otherwise plain single bonds are
        # tagged BS="N".
        if cfg and cfg in BOND_STEREO_ATTR:
            attrs.append(f'Display="{BOND_STEREO_ATTR[cfg]}"')
        elif order == 1:
            attrs.append('BS="N"')

        lines.append(f'<b {" ".join(attrs)}/>')

    lines.append("</fragment>")
    return "\n".join(lines), frag_id
537
+
538
+
539
+ # ---------------------------------------------------------------------------
540
+ # Arrow XML builder
541
+ # ---------------------------------------------------------------------------
542
+
543
def _build_arrow(
    tail_x: float, tail_y: float,
    head_x: float, head_y: float,
    ids: _IDGen,
) -> Tuple[str, int]:
    """Serialise a solid, full-headed ``<arrow>`` from tail to head.

    Two values are drawn from *ids*: first the arrow's ``id``, then its ``Z``.
    The bounding box spans both end points horizontally and is padded by 4.0
    above and below.  The Center3D/MajorAxisEnd3D/MinorAxisEnd3D attributes
    are derived from the arrow midpoint using fixed offsets (100 below the
    tail's y, then 80 along each axis).

    Returns ``(xml_string, arrow_id)``.
    """
    arrow_id = ids.next()
    z_order = ids.next()

    left = min(tail_x, head_x)
    top = min(tail_y, head_y) - 4.0
    right = max(tail_x, head_x)
    bottom = max(tail_y, head_y) + 4.0
    centre_x = (tail_x + head_x) / 2.0
    centre_y = tail_y + 100.0

    attributes = [
        f'id="{arrow_id}"',
        f'BoundingBox="{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}"',
        f'Z="{z_order}"',
        'FillType="None"',
        'ArrowheadHead="Full"',
        'ArrowheadType="Solid"',
        'HeadSize="1000"',
        'ArrowheadCenterSize="875"',
        'ArrowheadWidth="250"',
        f'Head3D="{head_x:.2f} {head_y:.2f} 0"',
        f'Tail3D="{tail_x:.2f} {tail_y:.2f} 0"',
        f'Center3D="{centre_x:.2f} {centre_y:.2f} 0"',
        f'MajorAxisEnd3D="{centre_x + 80:.2f} {centre_y:.2f} 0"',
        f'MinorAxisEnd3D="{centre_x:.2f} {centre_y + 80:.2f} 0"',
    ]
    return "<arrow " + " ".join(attributes) + "/>", arrow_id
575
+
576
+
577
+ # ---------------------------------------------------------------------------
578
+ # Conditions text XML builder (with subscript support)
579
+ # ---------------------------------------------------------------------------
580
+
581
def _build_conditions_text(
    text_lines: List[str],
    x: float,
    baseline_y: float,
    ids: _IDGen,
) -> Tuple[str, int]:
    """Serialise a centred multi-line ``<t>`` caption for the arrow.

    One entry of *text_lines* becomes one displayed line.  When at least one
    line is flagged by ``needs_subscript``, every line is rendered through
    ``build_formatted_s_xml`` (e.g. Pd2dba3 → Pd₂dba₃) and the lines are
    separated by a plain ``<s>`` run holding a newline.  Otherwise all lines
    are XML-escaped and placed in a single ``<s>`` run.

    Parameters
    ----------
    x : horizontal centre of the text block
    baseline_y : y-coordinate for the baseline of the FIRST line
        (CDXML <t> p="x y" uses first-line baseline)
    ids : shared ID generator (one value for ``id``, then one for ``Z``)

    Returns (xml_string, text_element_id).
    """
    text_id = ids.next()
    z_order = ids.next()

    # Rough bounding-box estimate: 5.8 units per character of the longest
    # line, 8.0 above the first baseline and 3.0 below the last one.
    longest = max((len(entry) for entry in text_lines), default=1)
    line_count = len(text_lines)
    est_width = longest * 5.8
    above_baseline = 8.0
    below_baseline = 3.0

    left = x - est_width / 2.0
    top = baseline_y - above_baseline
    right = x + est_width / 2.0
    bottom = baseline_y + (line_count - 1) * CONDITIONS_LINE_HEIGHT + below_baseline

    if any(needs_subscript(entry) for entry in text_lines):
        # Per-line formatted runs, joined by a plain run that carries the
        # line break.
        line_break = '<s font="3" size="10" color="0" face="96">\n</s>'
        runs = line_break.join(build_formatted_s_xml(entry) for entry in text_lines)
    else:
        # No formatting required anywhere: one run with embedded newlines.
        escaped = "\n".join(xml_escape(entry) for entry in text_lines)
        runs = f'<s font="3" size="10" color="0" face="96">{escaped}</s>'

    pieces = [
        f'<t id="{text_id}" p="{x:.2f} {baseline_y:.2f}" ',
        f'BoundingBox="{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}" ',
        f'Z="{z_order}" ',
        'CaptionJustification="Center" ',
        'Justification="Center" ',
        'LineHeight="auto">',
        runs,
        '</t>',
    ]
    return "".join(pieces), text_id
648
+
649
+
650
+ # ---------------------------------------------------------------------------
651
+ # Layout expanded conditions (structures + text) above/below arrow
652
+ # ---------------------------------------------------------------------------
653
+
654
# One entry per condition: (display_text, fragment_info_or_None), where
# fragment_info is (fragment_xml, xmin, ymin, xmax, ymax) when the condition
# was resolved to a structure and None when it stays a text label.
ExpandedItems = List[Tuple[str, Optional[Tuple[str, float, float, float, float]]]]
655
+
656
+
657
def _layout_expanded_conditions(
    resolved_items: "ExpandedItems",
    arrow_tail_x: float,
    arrow_head_x: float,
    arrow_y: float,
    position: str,  # "above" or "below"
    ids: "_IDGen",
    verbose: bool = False,
) -> Tuple[List[str], List[int]]:
    """Arrange condition items in one horizontal row next to the arrow.

    The row is centred on the arrow midpoint, with ``EXPAND_STRUCTURE_GAP``
    between neighbours.  Resolved structures are translated into place and
    given fresh IDs; unresolved items are emitted as single-line text labels.

    Parameters
    ----------
    resolved_items : list of (display_text, fragment_info_or_None)
    arrow_tail_x, arrow_head_x : arrow horizontal extent
    arrow_y : arrow y-coordinate
    position : "above" places the row over the arrow; any other value is
        handled as "below"
    ids : shared ID generator
    verbose : print debug info to stderr

    Returns
    -------
    (xml_strings, element_ids), both in the order of *resolved_items*
    """
    def log(msg: str):
        if verbose:
            print(f"[expand-layout] {msg}", file=sys.stderr)

    if not resolved_items:
        return [], []

    is_above = position == "above"

    def _footprint(label: str, frag_info) -> Tuple[float, float]:
        # (width, height): measured for structures, estimated for text.
        if frag_info is not None:
            _xml, fx_min, fy_min, fx_max, fy_max = frag_info
            return fx_max - fx_min, fy_max - fy_min
        return max(len(label) * 5.8, 20.0), 12.0

    footprints = [_footprint(label, frag) for label, frag in resolved_items]
    row_width = (
        sum(width for width, _height in footprints)
        + (len(footprints) - 1) * EXPAND_STRUCTURE_GAP
    )

    # Left edge of the row such that it is centred over the arrow midpoint.
    arrow_mid_x = (arrow_tail_x + arrow_head_x) / 2.0
    cursor_x = arrow_mid_x - row_width / 2.0

    # Shared vertical anchor: the row's bottom edge when above the arrow,
    # its top edge when below.
    if is_above:
        anchor_y = arrow_y - EXPAND_ABOVE_CLEARANCE
    else:
        anchor_y = arrow_y + EXPAND_BELOW_CLEARANCE

    xml_parts: List[str] = []
    id_parts: List[int] = []

    for (display_text, frag_info), (w, h) in zip(resolved_items, footprints):
        slot_cx = cursor_x + w / 2.0

        if frag_info is not None:
            frag_xml, xmin, ymin, xmax, ymax = frag_info
            # Rest the structure's edge on the anchor line.
            if is_above:
                target_cy = anchor_y - h / 2.0
            else:
                target_cy = anchor_y + h / 2.0

            shift_x = slot_cx - (xmin + xmax) / 2.0
            shift_y = target_cy - (ymin + ymax) / 2.0
            moved = _translate_fragment_xml(frag_xml, shift_x, shift_y)
            placed_xml, placed_id = _reassign_fragment_ids(moved, ids)
            xml_parts.append(placed_xml)
            id_parts.append(placed_id)
            log(f" Structure '{display_text}' at cx={slot_cx:.1f} cy={target_cy:.1f}")
        else:
            # Text label fallback: the baseline sits 8.0 away from the anchor
            # line, on the side facing away from the arrow.
            ascender = 8.0
            if is_above:
                baseline_y = anchor_y - ascender
            else:
                baseline_y = anchor_y + ascender

            label_xml, label_id = _build_conditions_text(
                [display_text], slot_cx, baseline_y, ids
            )
            xml_parts.append(label_xml)
            id_parts.append(label_id)
            log(f" Text '{display_text}' at cx={slot_cx:.1f} baseline={baseline_y:.1f}")

        cursor_x += w + EXPAND_STRUCTURE_GAP

    return xml_parts, id_parts
769
+
770
+
771
+ # ---------------------------------------------------------------------------
772
+ # Core: build reaction scheme CDXML
773
+ # ---------------------------------------------------------------------------
774
+
775
def build_reaction_scheme(
    structures: List[Dict],
    reactant_indices: List[int],
    product_indices: List[int],
    conditions_above: List[str],
    conditions_below: List[str],
    verbose: bool = False,
    expanded_above: Optional["ExpandedItems"] = None,
    expanded_below: Optional["ExpandedItems"] = None,
) -> str:
    """
    Assemble a CDXML reaction scheme from extracted structures + descriptor.

    Reactants are placed left-to-right starting at PAGE_LEFT, followed by a
    horizontal arrow and then the products.  Every molecule is centred
    vertically on VERTICAL_CENTER.  Conditions are emitted either as
    pre-resolved expanded items (when ``expanded_above`` / ``expanded_below``
    is not None) or as a text block centred on the arrow midpoint.

    Parameters
    ----------
    structures       : list of structure dicts from extract_structures_from_image
    reactant_indices : which structures (by index) are reactants
    product_indices  : which structures (by index) are products
    conditions_above : text lines for above the arrow
    conditions_below : text lines for below the arrow
    verbose          : print layout info to stderr
    expanded_above   : pre-resolved expanded conditions for above (from --expand)
    expanded_below   : pre-resolved expanded conditions for below (from --expand)

    Returns
    -------
    CDXML document string

    Raises
    ------
    ValueError
        If an index is outside ``structures`` or a selected structure has an
        empty/missing ``"atoms"`` entry.
    """
    def log(msg: str):
        if verbose:
            print(f"[reaction_from_image] {msg}", file=sys.stderr)

    # Validate indices
    n = len(structures)
    for idx in reactant_indices + product_indices:
        if idx < 0 or idx >= n:
            raise ValueError(
                f"Structure index {idx} out of range (0–{n-1}). "
                f"Image yielded {n} structures."
            )

    # Separate reactant and product molecules
    reactant_mols = [structures[i] for i in reactant_indices]
    product_mols = [structures[i] for i in product_indices]

    # Check that all molecules have atoms
    for side, mols, label in [
        (reactant_indices, reactant_mols, "reactant"),
        (product_indices, product_mols, "product"),
    ]:
        for idx, mol in zip(side, mols):
            if not mol.get("atoms"):
                raise ValueError(
                    f"Structure {idx} ({label}) has no atoms — DECIMER may have "
                    f"failed on this region. SMILES: {mol.get('smiles', '(none)')}"
                )

    # ------------------------------------------------------------------
    # Layout: position molecules left-to-right
    #
    #   [Reactant1] [gap] [Reactant2] [margin] →arrow→ [margin] [Product1]
    #
    # All molecules are centred vertically at VERTICAL_CENTER.
    # ------------------------------------------------------------------

    ids = _IDGen(1000)

    def place_row(mols: List[Dict], start_x: float, label: str):
        """Translate each molecule so the row starts at start_x.

        Returns (translated molecules, x position after the last gap).
        """
        placed: List[Dict] = []
        x = start_x
        for mol in mols:
            x0, y0, x1, y1 = _mol_extent(mol)
            mol_w = x1 - x0
            # Left edge goes to x, vertical centre to VERTICAL_CENTER.
            placed.append(
                _translate_mol(mol, x - x0, VERTICAL_CENTER - (y0 + y1) / 2.0)
            )
            log(f"{label} placed at x=[{x:.1f}, {x + mol_w:.1f}]")
            x += mol_w + INTER_MOL_GAP
        return placed, x

    positioned_reactants, cursor_x = place_row(reactant_mols, PAGE_LEFT, "Reactant")

    # Arrow position: the trailing inter-molecule gap is replaced by the
    # arrow margin.
    arrow_tail_x = cursor_x - INTER_MOL_GAP + ARROW_MARGIN
    arrow_head_x = arrow_tail_x + ARROW_LENGTH
    arrow_y = VERTICAL_CENTER

    log(f"Arrow: tail={arrow_tail_x:.1f}, head={arrow_head_x:.1f}, y={arrow_y:.1f}")

    positioned_products, _ = place_row(
        product_mols, arrow_head_x + ARROW_MARGIN, "Product"
    )

    # Resolve abbreviations in conditions text
    resolved_above = [resolve_abbreviation(line) for line in conditions_above]
    resolved_below = [resolve_abbreviation(line) for line in conditions_below]

    log(f"Conditions above: {resolved_above}")
    log(f"Conditions below: {resolved_below}")

    # ------------------------------------------------------------------
    # Build XML elements
    # ------------------------------------------------------------------

    fragment_xmls: List[str] = []
    reactant_frag_ids: List[int] = []
    product_frag_ids: List[int] = []

    for mol in positioned_reactants:
        frag_xml, frag_id = _build_fragment(mol["atoms"], mol["bonds"], ids)
        fragment_xmls.append(frag_xml)
        reactant_frag_ids.append(frag_id)

    for mol in positioned_products:
        frag_xml, frag_id = _build_fragment(mol["atoms"], mol["bonds"], ids)
        fragment_xmls.append(frag_xml)
        product_frag_ids.append(frag_id)

    # Conditions: expanded structures or text labels
    above_xmls: List[str] = []
    below_xmls: List[str] = []
    above_ids: List[int] = []
    below_ids: List[int] = []

    arrow_mid_x = (arrow_tail_x + arrow_head_x) / 2.0

    if expanded_above is not None:
        # --expand mode: structures + text fallback
        ax, ai = _layout_expanded_conditions(
            expanded_above, arrow_tail_x, arrow_head_x, arrow_y,
            "above", ids, verbose,
        )
        above_xmls.extend(ax)
        above_ids.extend(ai)
    elif resolved_above:
        # The baseline passed on is moved up by one line height per
        # additional line of text.
        n_above = len(resolved_above)
        baseline_y = (arrow_y - CONDITIONS_GAP_ABOVE - CONDITIONS_DESCENDER
                      - (n_above - 1) * CONDITIONS_LINE_HEIGHT)
        txt_xml, txt_id = _build_conditions_text(
            resolved_above, arrow_mid_x, baseline_y, ids
        )
        above_xmls.append(txt_xml)
        above_ids.append(txt_id)

    if expanded_below is not None:
        bx, bi = _layout_expanded_conditions(
            expanded_below, arrow_tail_x, arrow_head_x, arrow_y,
            "below", ids, verbose,
        )
        below_xmls.extend(bx)
        below_ids.extend(bi)
    elif resolved_below:
        ascender = 8.0
        baseline_y = arrow_y + CONDITIONS_GAP_BELOW + ascender
        txt_xml, txt_id = _build_conditions_text(
            resolved_below, arrow_mid_x, baseline_y, ids
        )
        below_xmls.append(txt_xml)
        below_ids.append(txt_id)

    # Arrow
    arrow_xml, arrow_id = _build_arrow(
        arrow_tail_x, arrow_y, arrow_head_x, arrow_y, ids
    )

    # Scheme / step
    scheme_id = ids.next()
    step_id = ids.next()
    step_attrs = [
        f'id="{step_id}"',
        f'ReactionStepReactants="{" ".join(str(i) for i in reactant_frag_ids)}"',
        f'ReactionStepProducts="{" ".join(str(i) for i in product_frag_ids)}"',
        f'ReactionStepArrows="{arrow_id}"',
    ]
    if above_ids:
        step_attrs.append(
            f'ReactionStepObjectsAboveArrow="{" ".join(str(i) for i in above_ids)}"'
        )
    if below_ids:
        step_attrs.append(
            f'ReactionStepObjectsBelowArrow="{" ".join(str(i) for i in below_ids)}"'
        )

    scheme_xml = f'<scheme id="{scheme_id}"><step {" ".join(step_attrs)}/></scheme>'

    # ------------------------------------------------------------------
    # Compute overall bounding box
    # ------------------------------------------------------------------
    # Seed with the arrow so the box is defined even when no molecule is
    # selected, and so a one-sided scheme still encloses its arrow.
    all_xs: List[float] = [arrow_tail_x, arrow_head_x]
    all_ys: List[float] = [arrow_y]
    for mol in positioned_reactants + positioned_products:
        for a in mol["atoms"]:
            all_xs.append(a["x"])
            all_ys.append(a["y"])

    # Include expanded condition fragments in bounding box
    for frag_xml_str in above_xmls + below_xmls:
        if frag_xml_str.lstrip().startswith("<fragment"):
            fx0, fy0, fx1, fy1 = _measure_fragment_xml(frag_xml_str)
            if fx0 != fx1:
                all_xs.extend([fx0, fx1])
                all_ys.extend([fy0, fy1])

    margin = 20.0
    doc_bbox = (
        f"{min(all_xs) - margin:.2f} {min(all_ys) - margin:.2f} "
        f"{max(all_xs) + margin:.2f} {max(all_ys) + margin:.2f}"
    )

    # Page size — generous
    page_w = max(all_xs) + 100
    page_h = max(all_ys) + 100
    page_id = ids.next()

    # ------------------------------------------------------------------
    # Assemble document
    # ------------------------------------------------------------------
    parts = [
        _format_cdxml_header(doc_bbox),
        f'<page id="{page_id}" BoundingBox="0 0 {page_w:.0f} {page_h:.0f}" '
        f'HeaderPosition="36" FooterPosition="36" '
        f'PrintTrimMarks="yes" HeightPages="1" WidthPages="2">',
    ]
    parts.extend(fragment_xmls)
    parts.extend(above_xmls)
    parts.extend(below_xmls)
    parts.append(arrow_xml)
    parts.append(scheme_xml)
    parts.append("</page>")
    parts.append(CDXML_FOOTER)

    return "\n".join(parts)
1016
+
1017
+
1018
+ # ---------------------------------------------------------------------------
1019
+ # Fragment XML translation helper (for ChemScript fragments)
1020
+ # ---------------------------------------------------------------------------
1021
+
1022
+ def _translate_fragment_xml(frag_xml: str, dx: float, dy: float) -> str:
1023
+ """Shift all coordinate attributes in a fragment XML string by (dx, dy).
1024
+
1025
+ Handles: p="x y" and BoundingBox="x1 y1 x2 y2"
1026
+ """
1027
+ def shift_p(m: "re.Match") -> str:
1028
+ x, y = float(m.group(1)), float(m.group(2))
1029
+ return f'p="{x + dx:.3f} {y + dy:.3f}"'
1030
+
1031
+ def shift_bb(m: "re.Match") -> str:
1032
+ vals = [float(v) for v in m.group(1).split()]
1033
+ shifted = [
1034
+ f"{vals[0] + dx:.3f}", f"{vals[1] + dy:.3f}",
1035
+ f"{vals[2] + dx:.3f}", f"{vals[3] + dy:.3f}",
1036
+ ]
1037
+ return f'BoundingBox="{" ".join(shifted)}"'
1038
+
1039
+ frag_xml = re.sub(r'\bp="([-\d.]+)\s+([-\d.]+)"', shift_p, frag_xml)
1040
+ frag_xml = re.sub(r'\bBoundingBox="((?:[-\d.]+ ?){4})"', shift_bb, frag_xml)
1041
+ return frag_xml
1042
+
1043
+
1044
+ def _measure_fragment_xml(frag_xml: str) -> Tuple[float, float, float, float]:
1045
+ """Measure (xmin, ymin, xmax, ymax) from all p="x y" attributes in fragment XML."""
1046
+ xs, ys = [], []
1047
+ for m in re.finditer(r'\bp="([-\d.]+)\s+([-\d.]+)"', frag_xml):
1048
+ xs.append(float(m.group(1)))
1049
+ ys.append(float(m.group(2)))
1050
+ if not xs:
1051
+ return (0, 0, 0, 0)
1052
+ return min(xs), min(ys), max(xs), max(ys)
1053
+
1054
+
1055
+ def _best_smiles_component(smiles: str) -> str:
1056
+ """For a multi-component SMILES (dot-separated), return the largest
1057
+ drug-like component (most heavy atoms, filtering out pure alkyne chains)."""
1058
+ components = smiles.split(".")
1059
+ if len(components) <= 1:
1060
+ return smiles
1061
+
1062
+ best = ""
1063
+ best_score = -1
1064
+ for comp in components:
1065
+ comp = comp.strip()
1066
+ if not comp:
1067
+ continue
1068
+ # Reject pure-alkyne chains
1069
+ if re.fullmatch(r'[C#]+', comp):
1070
+ continue
1071
+ # Score by number of heavy-atom characters
1072
+ score = sum(1 for c in comp if c.isalpha() and c.isupper())
1073
+ if score > best_score:
1074
+ best = comp
1075
+ best_score = score
1076
+
1077
+ return best or smiles
1078
+
1079
+
1080
+ # ---------------------------------------------------------------------------
1081
+ # ChemScript cleanup: SMILES → ChemDraw-native fragment XML
1082
+ # ---------------------------------------------------------------------------
1083
+
1084
+ def _open_chemscript_bridge(verbose: bool = False):
1085
+ """Import and open a ChemScriptBridge instance. Caller must call .close()."""
1086
+ import importlib.util
1087
+ _dir = os.path.dirname(os.path.abspath(__file__))
1088
+ try:
1089
+ spec = importlib.util.spec_from_file_location(
1090
+ "chemscript_bridge", os.path.join(_dir, "chemscript_bridge.py")
1091
+ )
1092
+ csb_mod = importlib.util.module_from_spec(spec)
1093
+ spec.loader.exec_module(csb_mod)
1094
+ except Exception as exc:
1095
+ raise ImportError(
1096
+ f"Could not import chemscript_bridge.py: {exc}\n"
1097
+ "ChemDraw and chemscript_bridge are required."
1098
+ ) from exc
1099
+ if verbose:
1100
+ print("[reaction_from_image] Opening ChemScript bridge...",
1101
+ file=sys.stderr)
1102
+ return csb_mod.ChemScriptBridge()
1103
+
1104
+
1105
def _chemscript_fragment_xmls(
    structures: List[Dict],
    verbose: bool = False,
) -> Dict[int, Tuple[str, float, float, float, float]]:
    """
    For each structure with a valid SMILES, produce a ChemScript-cleaned
    fragment XML string + its bounding box.

    A structure is skipped (no entry in the result) when it has no SMILES,
    when the ChemScript conversion raises or returns no CDXML, when the
    returned CDXML cannot be parsed or lacks a ``page``/``fragment``
    element, or when the measured fragment has zero width.  For a
    dot-separated SMILES only the component chosen by
    ``_best_smiles_component`` is converted.

    The bridge opened here is always closed before returning.

    Returns dict: structure_index → (fragment_xml, xmin, ymin, xmax, ymax)
    """
    import xml.etree.ElementTree as ET

    def log(msg: str):
        if verbose:
            print(f"[reaction_from_image] {msg}", file=sys.stderr)

    cs = _open_chemscript_bridge(verbose)

    result: Dict[int, Tuple[str, float, float, float, float]] = {}
    try:
        for i, entry in enumerate(structures):
            # `or ""` also covers an explicit None stored under "smiles".
            smiles = (entry.get("smiles") or "").strip()
            if not smiles:
                continue
            if "." in smiles:
                smiles = _best_smiles_component(smiles)

            log(f"  ChemScript [{i}]: {smiles[:60]}...")
            try:
                cdxml_str = cs.smiles_to_cdxml(smiles)
            except Exception as exc:
                log(f"  ChemScript failed for [{i}]: {exc}")
                continue

            if not cdxml_str or "<CDXML" not in cdxml_str:
                log(f"  ChemScript returned empty CDXML for [{i}]")
                continue

            # Parse and extract the first <fragment>.  A malformed document
            # is treated like any other per-structure failure so it does not
            # abort the remaining structures.
            try:
                root = ET.fromstring(cdxml_str)
            except ET.ParseError as exc:
                log(f"  ChemScript returned unparseable CDXML for [{i}]: {exc}")
                continue
            page_el = root.find("page")
            if page_el is None:
                continue
            frag_el = page_el.find("fragment")
            if frag_el is None:
                continue

            frag_xml = ET.tostring(frag_el, encoding="unicode")

            # Measure bounding box from atom positions
            xmin, ymin, xmax, ymax = _measure_fragment_xml(frag_xml)
            if xmin == xmax:
                continue

            result[i] = (frag_xml, xmin, ymin, xmax, ymax)
            log(f"  ChemScript [{i}]: OK, bbox w={xmax-xmin:.1f} h={ymax-ymin:.1f}")
    finally:
        cs.close()

    return result
1165
+
1166
+
1167
+ # ---------------------------------------------------------------------------
1168
+ # Build reaction scheme using ChemScript fragments
1169
+ # ---------------------------------------------------------------------------
1170
+
1171
def build_reaction_scheme_chemscript(
    structures: List[Dict],
    cs_fragments: Dict[int, Tuple[str, float, float, float, float]],
    reactant_indices: List[int],
    product_indices: List[int],
    conditions_above: List[str],
    conditions_below: List[str],
    verbose: bool = False,
    expanded_above: Optional["ExpandedItems"] = None,
    expanded_below: Optional["ExpandedItems"] = None,
) -> str:
    """
    Assemble a CDXML reaction scheme using ChemScript-cleaned fragment XML.

    Same layout logic as build_reaction_scheme but uses native ChemDraw
    fragments instead of building from atom/bond dicts.

    Parameters
    ----------
    structures       : extracted structure dicts (not read by this function)
    cs_fragments     : structure index → (fragment_xml, xmin, ymin, xmax, ymax)
    reactant_indices : which structures (by index) are reactants
    product_indices  : which structures (by index) are products
    conditions_above : text lines for above the arrow
    conditions_below : text lines for below the arrow
    verbose          : print layout info to stderr
    expanded_above   : pre-resolved expanded conditions for above
    expanded_below   : pre-resolved expanded conditions for below

    Returns
    -------
    CDXML document string

    Raises
    ------
    ValueError
        If a reactant or product index has no entry in ``cs_fragments``.
    """
    def log(msg: str):
        if verbose:
            print(f"[reaction_from_image] {msg}", file=sys.stderr)

    # Validate indices — must have ChemScript fragments for all
    for idx in reactant_indices + product_indices:
        if idx not in cs_fragments:
            raise ValueError(
                f"Structure {idx} has no ChemScript fragment — "
                f"cleanup may have failed for this structure."
            )

    ids = _IDGen(1000)

    # ------------------------------------------------------------------
    # Layout: translate ChemScript fragments to final positions
    # ------------------------------------------------------------------

    def place_row(indices: List[int], start_x: float, label: str):
        """Translate the fragments for `indices` into a row starting at start_x.

        Returns (fragment xmls, fragment ids, extents, x after the last gap).
        """
        row_xmls: List[str] = []
        row_ids: List[int] = []
        row_extents: List[Tuple[float, float, float, float]] = []
        x = start_x
        for idx in indices:
            frag_xml, xmin, ymin, xmax, ymax = cs_fragments[idx]
            mol_w = xmax - xmin
            dx = (x + mol_w / 2.0) - (xmin + xmax) / 2.0
            dy = VERTICAL_CENTER - (ymin + ymax) / 2.0
            translated = _translate_fragment_xml(frag_xml, dx, dy)

            # Re-measure to get actual final extent
            row_extents.append(_measure_fragment_xml(translated))

            # Each fragment comes from its own ChemScript document, so the
            # ids it carries can repeat between fragments.  Renumber from
            # the shared generator (as the expanded-conditions layout does)
            # and use the returned id for the <step> references.
            final_xml, frag_id = _reassign_fragment_ids(translated, ids)
            row_xmls.append(final_xml)
            row_ids.append(frag_id)

            log(f"{label} [{idx}] placed at x=[{x:.1f}, {x + mol_w:.1f}]")
            x += mol_w + INTER_MOL_GAP
        return row_xmls, row_ids, row_extents, x

    reactant_frag_xmls, reactant_frag_ids, reactant_extents, cursor_x = place_row(
        reactant_indices, PAGE_LEFT, "Reactant"
    )

    # Arrow: the trailing inter-molecule gap is replaced by the arrow margin.
    arrow_tail_x = cursor_x - INTER_MOL_GAP + ARROW_MARGIN
    arrow_head_x = arrow_tail_x + ARROW_LENGTH
    arrow_y = VERTICAL_CENTER
    log(f"Arrow: tail={arrow_tail_x:.1f}, head={arrow_head_x:.1f}, y={arrow_y:.1f}")

    product_frag_xmls, product_frag_ids, product_extents, _ = place_row(
        product_indices, arrow_head_x + ARROW_MARGIN, "Product"
    )

    # Resolve abbreviations
    resolved_above = [resolve_abbreviation(line) for line in conditions_above]
    resolved_below = [resolve_abbreviation(line) for line in conditions_below]
    log(f"Conditions above: {resolved_above}")
    log(f"Conditions below: {resolved_below}")

    # Conditions: expanded structures or text labels
    above_xmls: List[str] = []
    below_xmls: List[str] = []
    above_ids: List[int] = []
    below_ids: List[int] = []

    arrow_mid_x = (arrow_tail_x + arrow_head_x) / 2.0

    if expanded_above is not None:
        ax, ai = _layout_expanded_conditions(
            expanded_above, arrow_tail_x, arrow_head_x, arrow_y,
            "above", ids, verbose,
        )
        above_xmls.extend(ax)
        above_ids.extend(ai)
    elif resolved_above:
        # The baseline passed on is moved up by one line height per
        # additional line of text.
        n_above = len(resolved_above)
        baseline_y = (arrow_y - CONDITIONS_GAP_ABOVE - CONDITIONS_DESCENDER
                      - (n_above - 1) * CONDITIONS_LINE_HEIGHT)
        txt_xml, txt_id = _build_conditions_text(
            resolved_above, arrow_mid_x, baseline_y, ids
        )
        above_xmls.append(txt_xml)
        above_ids.append(txt_id)

    if expanded_below is not None:
        bx, bi = _layout_expanded_conditions(
            expanded_below, arrow_tail_x, arrow_head_x, arrow_y,
            "below", ids, verbose,
        )
        below_xmls.extend(bx)
        below_ids.extend(bi)
    elif resolved_below:
        ascender = 8.0
        baseline_y = arrow_y + CONDITIONS_GAP_BELOW + ascender
        txt_xml, txt_id = _build_conditions_text(
            resolved_below, arrow_mid_x, baseline_y, ids
        )
        below_xmls.append(txt_xml)
        below_ids.append(txt_id)

    # Arrow XML
    arrow_xml, arrow_id = _build_arrow(
        arrow_tail_x, arrow_y, arrow_head_x, arrow_y, ids
    )

    # Scheme / step
    scheme_id = ids.next()
    step_id = ids.next()
    step_attrs = [
        f'id="{step_id}"',
        f'ReactionStepReactants="{" ".join(str(i) for i in reactant_frag_ids)}"',
        f'ReactionStepProducts="{" ".join(str(i) for i in product_frag_ids)}"',
        f'ReactionStepArrows="{arrow_id}"',
    ]
    if above_ids:
        step_attrs.append(
            f'ReactionStepObjectsAboveArrow="{" ".join(str(i) for i in above_ids)}"'
        )
    if below_ids:
        step_attrs.append(
            f'ReactionStepObjectsBelowArrow="{" ".join(str(i) for i in below_ids)}"'
        )
    scheme_xml = f'<scheme id="{scheme_id}"><step {" ".join(step_attrs)}/></scheme>'

    # Bounding box.  Seeded with the arrow so it is defined even when no
    # fragment is selected, and so a one-sided scheme still encloses its arrow.
    all_x0 = min(arrow_tail_x, arrow_head_x)
    all_x1 = max(arrow_tail_x, arrow_head_x)
    all_y0 = all_y1 = arrow_y
    for ex0, ey0, ex1, ey1 in reactant_extents + product_extents:
        all_x0 = min(all_x0, ex0)
        all_y0 = min(all_y0, ey0)
        all_x1 = max(all_x1, ex1)
        all_y1 = max(all_y1, ey1)

    # Include expanded condition fragments in bounding box
    for frag_xml_str in above_xmls + below_xmls:
        if frag_xml_str.lstrip().startswith("<fragment"):
            fx0, fy0, fx1, fy1 = _measure_fragment_xml(frag_xml_str)
            if fx0 != fx1:
                all_x0 = min(all_x0, fx0)
                all_y0 = min(all_y0, fy0)
                all_x1 = max(all_x1, fx1)
                all_y1 = max(all_y1, fy1)

    margin = 20.0
    doc_bbox = (
        f"{all_x0 - margin:.2f} {all_y0 - margin:.2f} "
        f"{all_x1 + margin:.2f} {all_y1 + margin:.2f}"
    )
    page_w = all_x1 + 100
    page_h = all_y1 + 100
    page_id = ids.next()

    # Assemble
    parts = [
        _format_cdxml_header(doc_bbox),
        f'<page id="{page_id}" BoundingBox="0 0 {page_w:.0f} {page_h:.0f}" '
        f'HeaderPosition="36" FooterPosition="36" '
        f'PrintTrimMarks="yes" HeightPages="1" WidthPages="2">',
    ]
    parts.extend(reactant_frag_xmls)
    parts.extend(product_frag_xmls)
    parts.extend(above_xmls)
    parts.extend(below_xmls)
    parts.append(arrow_xml)
    parts.append(scheme_xml)
    parts.append("</page>")
    parts.append(CDXML_FOOTER)

    return "\n".join(parts)
1385
+
1386
+
1387
+ # ---------------------------------------------------------------------------
1388
+ # High-level pipeline: image + descriptor → CDXML
1389
+ # ---------------------------------------------------------------------------
1390
+
1391
def reaction_from_image(
    image_path: str,
    descriptor: Dict,
    page: int = 0,
    segment: bool = True,
    hand_drawn: bool = False,
    verbose: bool = False,
    merge_gap: Optional[int] = None,
    cleanup: bool = False,
    expand: bool = False,
) -> str:
    """
    Full pipeline: image + reaction descriptor → CDXML reaction scheme.

    Parameters
    ----------
    image_path : path to screenshot PNG/JPG/PDF
    descriptor : dict with reactant_indices, product_indices, conditions_above/below
                 (each key defaults to an empty list when absent)
    page       : PDF page number
    segment    : whether to segment the image
    hand_drawn : use hand-drawn DECIMER model
    verbose    : print progress
    merge_gap  : pixel gap for merging nearby boxes (None = adaptive)
    cleanup    : run ChemScript cleanup on structures (ChemDraw-native quality)
    expand     : expand conditions to molecular structures where possible

    Returns
    -------
    CDXML document string
    """
    # Import structure_from_image (sibling module)
    from . import structure_from_image as sfi

    # Step 1: Extract structures (DECIMER)
    if verbose:
        print(f"[reaction_from_image] Extracting structures from {image_path}...",
              file=sys.stderr)

    structures = sfi._extract_structures_raw(
        image_path,
        page=page,
        segment=segment,
        hand_drawn=hand_drawn,
        verbose=verbose,
        merge_gap=merge_gap,
    )

    if verbose:
        print(f"[reaction_from_image] Extracted {len(structures)} structure(s)",
              file=sys.stderr)
        for i, s in enumerate(structures):
            print(f"  [{i}] SMILES={s.get('smiles', '?')}, "
                  f"bbox={s.get('bbox', '?')}, "
                  f"atoms={len(s.get('atoms', []))}",
                  file=sys.stderr)

    reactant_indices = descriptor.get("reactant_indices", [])
    product_indices = descriptor.get("product_indices", [])
    conditions_above = descriptor.get("conditions_above", [])
    conditions_below = descriptor.get("conditions_below", [])

    # Step 2: Optionally expand conditions to structures
    expanded_above = None
    expanded_below = None
    if expand:
        cs_bridge = _open_chemscript_bridge(verbose)
        try:
            if verbose:
                print("[reaction_from_image] Resolving conditions to structures...",
                      file=sys.stderr)
            expanded_above = _resolve_all_conditions(
                conditions_above, cs_bridge, verbose
            )
            expanded_below = _resolve_all_conditions(
                conditions_below, cs_bridge, verbose
            )
        except Exception as exc:
            # Best effort: a side that failed to resolve stays None and is
            # rendered as plain text by the scheme builder.
            if verbose:
                print(f"[reaction_from_image] Expand failed: {exc}", file=sys.stderr)
        finally:
            # The cleanup step below opens its own bridge, so this one is
            # no longer needed and must be released here.
            cs_bridge.close()

    # Step 3: Build reaction scheme
    if cleanup:
        # Use ChemScript for publication-quality structures
        if verbose:
            print("[reaction_from_image] Running ChemScript cleanup...",
                  file=sys.stderr)
        cs_fragments = _chemscript_fragment_xmls(structures, verbose=verbose)

        cdxml = build_reaction_scheme_chemscript(
            structures=structures,
            cs_fragments=cs_fragments,
            reactant_indices=reactant_indices,
            product_indices=product_indices,
            conditions_above=conditions_above,
            conditions_below=conditions_below,
            verbose=verbose,
            expanded_above=expanded_above,
            expanded_below=expanded_below,
        )
    else:
        # Use RDKit coordinates (faster, no ChemDraw dependency)
        cdxml = build_reaction_scheme(
            structures=structures,
            reactant_indices=reactant_indices,
            product_indices=product_indices,
            conditions_above=conditions_above,
            conditions_below=conditions_below,
            verbose=verbose,
            expanded_above=expanded_above,
            expanded_below=expanded_below,
        )

    return cdxml
1505
+
1506
+
1507
+ # ---------------------------------------------------------------------------
1508
+ # High-level pipeline: image + descriptor → JSON (ReactionDescriptor)
1509
+ # ---------------------------------------------------------------------------
1510
+
1511
+ def reaction_from_image_to_json(
1512
+ image_path: str,
1513
+ descriptor: Dict,
1514
+ output_path: Optional[str] = None,
1515
+ page: int = 0,
1516
+ segment: bool = True,
1517
+ hand_drawn: bool = False,
1518
+ verbose: bool = False,
1519
+ merge_gap: Optional[int] = None,
1520
+ use_network: bool = True,
1521
+ ) -> "ReactionDescriptor":
1522
+ """
1523
+ Full pipeline: image + reaction descriptor → ReactionDescriptor JSON.
1524
+
1525
+ Same extraction as :func:`reaction_from_image`, but returns a
1526
+ :class:`ReactionDescriptor` (the standard JSON source of truth) instead
1527
+ of a CDXML string. The agent can then render a scheme downstream via
1528
+ the scheme DSL if needed.
1529
+
1530
+ Parameters
1531
+ ----------
1532
+ image_path : path to screenshot PNG/JPG/PDF
1533
+ descriptor : dict with reactant_indices, product_indices, conditions_above/below
1534
+ output_path : if given, write JSON to this path
1535
+ page : PDF page number
1536
+ segment : whether to segment the image
1537
+ hand_drawn : use hand-drawn DECIMER model
1538
+ verbose : print progress
1539
+ merge_gap : pixel gap for merging nearby boxes (None = adaptive)
1540
+ use_network : allow PubChem lookups for condition name resolution
1541
+
1542
+ Returns
1543
+ -------
1544
+ ReactionDescriptor
1545
+ """
1546
+ from ..perception.reaction_parser import (
1547
+ ReactionDescriptor, SpeciesDescriptor,
1548
+ _resolve_text_label, _compute_all_masses,
1549
+ )
1550
+ from . import structure_from_image as sfi
1551
+ import datetime
1552
+
1553
+ def _log(msg: str):
1554
+ if verbose:
1555
+ print(f"[reaction_from_image_to_json] {msg}", file=sys.stderr)
1556
+
1557
+ # Step 1: Extract structures (DECIMER)
1558
+ _log(f"Extracting structures from {image_path}...")
1559
+ structures = sfi._extract_structures_raw(
1560
+ image_path,
1561
+ page=page,
1562
+ segment=segment,
1563
+ hand_drawn=hand_drawn,
1564
+ verbose=verbose,
1565
+ merge_gap=merge_gap,
1566
+ )
1567
+ _log(f"Extracted {len(structures)} structure(s)")
1568
+
1569
+ reactant_indices = set(descriptor.get("reactant_indices", []))
1570
+ product_indices = set(descriptor.get("product_indices", []))
1571
+ conditions_above = descriptor.get("conditions_above", [])
1572
+ conditions_below = descriptor.get("conditions_below", [])
1573
+
1574
+ db = get_reagent_db()
1575
+ species_list: List[SpeciesDescriptor] = []
1576
+ warnings: List[str] = []
1577
+ sp_idx = 0
1578
+
1579
+ # Step 2: Build SpeciesDescriptor for each extracted structure
1580
+ for entry in structures:
1581
+ smiles = entry.get("smiles", "").strip()
1582
+ idx = entry.get("index", 0)
1583
+
1584
+ if not smiles:
1585
+ warnings.append(f"Structure at index {idx}: DECIMER returned no SMILES")
1586
+ continue
1587
+
1588
+ # Canonicalize SMILES
1589
+ canon_smiles = smiles
1590
+ try:
1591
+ from rdkit import Chem
1592
+ mol = Chem.MolFromSmiles(smiles)
1593
+ if mol:
1594
+ canon_smiles = Chem.MolToSmiles(mol)
1595
+ except ImportError:
1596
+ pass
1597
+
1598
+ # Determine role from descriptor indices
1599
+ if idx in reactant_indices:
1600
+ role = "atom_contributing"
1601
+ is_sm = (idx == min(reactant_indices)) # first reactant = SM
1602
+ elif idx in product_indices:
1603
+ role = "product"
1604
+ is_sm = False
1605
+ else:
1606
+ role = "non_contributing"
1607
+ is_sm = False
1608
+
1609
+ is_dp = (role == "product" and
1610
+ (len(product_indices) == 1 or idx == min(product_indices)))
1611
+
1612
+ # Try to get a display name from reagent DB (by SMILES)
1613
+ display = db.display_for_smiles(canon_smiles) if canon_smiles else None
1614
+ name = display or canon_smiles
1615
+
1616
+ # Role detail from reagent DB (by SMILES)
1617
+ role_detail = db.role_for_smiles(canon_smiles) if canon_smiles else None
1618
+
1619
+ # Build original_geometry from extracted atoms/bonds
1620
+ atoms = entry.get("atoms", [])
1621
+ bonds = entry.get("bonds", [])
1622
+ original_geometry = None
1623
+ if atoms:
1624
+ original_geometry = {
1625
+ "atoms": [
1626
+ {k: v for k, v in a.items()
1627
+ if k in ("index", "x", "y", "symbol", "num_hydrogens", "charge")}
1628
+ for a in atoms
1629
+ ],
1630
+ "bonds": [
1631
+ {k: v for k, v in b.items()
1632
+ if k in ("index", "atom1", "atom2", "order", "cfg", "double_pos")}
1633
+ for b in bonds
1634
+ ],
1635
+ }
1636
+
1637
+ sp = SpeciesDescriptor(
1638
+ id=f"sp_{sp_idx}",
1639
+ smiles=canon_smiles,
1640
+ name=name,
1641
+ role=role,
1642
+ role_detail=role_detail,
1643
+ classification_method="image_descriptor",
1644
+ is_sm=is_sm,
1645
+ is_dp=is_dp,
1646
+ source="image",
1647
+ display_text=name,
1648
+ original_geometry=original_geometry,
1649
+ )
1650
+ species_list.append(sp)
1651
+ sp_idx += 1
1652
+
1653
+ # Step 3: Resolve conditions text to species or condition strings
1654
+ condition_strings: List[str] = []
1655
+ all_conditions = conditions_above + conditions_below
1656
+
1657
+ for cond_text in all_conditions:
1658
+ cond_text = cond_text.strip()
1659
+ if not cond_text:
1660
+ continue
1661
+
1662
+ # Non-chemistry text goes straight to conditions
1663
+ if _is_non_chemistry_text(cond_text):
1664
+ condition_strings.append(cond_text)
1665
+ continue
1666
+
1667
+ # Try to resolve to SMILES
1668
+ smi = _resolve_text_label(cond_text, use_network=use_network)
1669
+ if smi:
1670
+ # Get display name and role from reagent DB
1671
+ display = db.display_for_name(cond_text) or cond_text
1672
+ role_detail = db.role_for_name(cond_text)
1673
+
1674
+ sp = SpeciesDescriptor(
1675
+ id=f"sp_{sp_idx}",
1676
+ smiles=smi,
1677
+ name=display,
1678
+ role="non_contributing",
1679
+ role_detail=role_detail,
1680
+ classification_method="name_resolution",
1681
+ source="text_label",
1682
+ display_text=display,
1683
+ )
1684
+ species_list.append(sp)
1685
+ sp_idx += 1
1686
+ else:
1687
+ # Could not resolve — store as condition text
1688
+ condition_strings.append(cond_text)
1689
+
1690
+ # Step 4: Compute masses, formulas, adducts for all species
1691
+ _compute_all_masses(species_list)
1692
+
1693
+ # Step 5: Build the reaction SMILES
1694
+ reaction_smiles = None
1695
+ try:
1696
+ reactant_smis = [sp.smiles for sp in species_list
1697
+ if sp.role == "atom_contributing" and sp.smiles]
1698
+ product_smis = [sp.smiles for sp in species_list
1699
+ if sp.role == "product" and sp.smiles]
1700
+ if reactant_smis and product_smis:
1701
+ reaction_smiles = (
1702
+ ".".join(reactant_smis) + ">>" + ".".join(product_smis)
1703
+ )
1704
+ except Exception:
1705
+ pass
1706
+
1707
+ desc = ReactionDescriptor(
1708
+ version="1.3",
1709
+ experiment="",
1710
+ input_files={"image": os.path.abspath(image_path)},
1711
+ reaction_smiles=reaction_smiles,
1712
+ species=species_list,
1713
+ warnings=warnings,
1714
+ metadata={
1715
+ "parser_version": "reaction_from_image 1.0",
1716
+ "timestamp": datetime.datetime.now().isoformat(),
1717
+ "source": "image",
1718
+ },
1719
+ conditions=condition_strings,
1720
+ )
1721
+
1722
+ if output_path:
1723
+ desc.to_json(output_path)
1724
+ _log(f"Wrote {output_path}")
1725
+
1726
+ return desc
1727
+
1728
+
1729
+ # ---------------------------------------------------------------------------
1730
+ # CLI
1731
+ # ---------------------------------------------------------------------------
1732
+
1733
def _structures_json_to_descriptor(
    structures: List[Dict],
    descriptor: Dict,
    source_path: str,
    verbose: bool = False,
    use_network: bool = True,
) -> "ReactionDescriptor":
    """Build a ReactionDescriptor from pre-extracted structures + descriptor.

    Used when --structures-json is combined with --format json in the CLI.

    Args:
        structures: Structure dicts as loaded from the structures JSON file.
            The keys read here are ``smiles``, ``index``, ``atoms`` and
            ``bonds``; missing or ``null`` values are treated as empty.
        descriptor: Reaction descriptor dict.  The keys read here are
            ``reactant_indices``, ``product_indices``, ``conditions_above``
            and ``conditions_below``; missing or ``null`` values are treated
            as empty lists.
        source_path: Path of the structures JSON file; its absolute form is
            recorded under ``input_files["structures_json"]``.
        verbose: Accepted for call-site compatibility; not used by this
            function.
        use_network: Forwarded to ``_resolve_text_label`` when resolving
            condition text to SMILES.  Defaults to ``True``, which is the
            value this function previously hard-coded.

    Returns:
        A ``ReactionDescriptor`` holding one species per structure that has a
        SMILES string, one species per condition text that resolved to a
        SMILES, and the remaining condition texts as plain strings.
    """
    from ..perception.reaction_parser import (
        ReactionDescriptor, SpeciesDescriptor,
        _resolve_text_label, _compute_all_masses,
    )
    import datetime

    # RDKit is optional: without it the SMILES strings are used as extracted.
    try:
        from rdkit import Chem
    except ImportError:
        Chem = None

    atom_keys = ("index", "x", "y", "symbol", "num_hydrogens", "charge")
    bond_keys = ("index", "atom1", "atom2", "order", "cfg", "double_pos")

    db = get_reagent_db()
    # ``or []`` also covers an explicit JSON null, which ``.get(key, [])``
    # would pass through as None.
    reactant_indices = set(descriptor.get("reactant_indices") or [])
    product_indices = set(descriptor.get("product_indices") or [])
    conditions_above = descriptor.get("conditions_above") or []
    conditions_below = descriptor.get("conditions_below") or []

    species_list: List[SpeciesDescriptor] = []
    warnings: List[str] = []
    sp_idx = 0

    for entry in structures:
        # A structure whose SMILES is missing or null is reported, not fatal.
        smiles = (entry.get("smiles") or "").strip()
        idx = entry.get("index", 0)
        if not smiles:
            warnings.append(f"Structure at index {idx}: no SMILES")
            continue

        # Canonicalize when RDKit is available and can parse the SMILES;
        # otherwise keep the string unchanged.
        canon_smiles = smiles
        if Chem is not None:
            mol = Chem.MolFromSmiles(smiles)
            if mol:
                canon_smiles = Chem.MolToSmiles(mol)

        # Role comes from the descriptor indices; the lowest reactant index
        # is flagged as the starting material.
        if idx in reactant_indices:
            role = "atom_contributing"
            is_sm = (idx == min(reactant_indices))
        elif idx in product_indices:
            role = "product"
            is_sm = False
        else:
            role = "non_contributing"
            is_sm = False

        # The only product, or the one with the lowest index, is flagged.
        is_dp = (role == "product" and
                 (len(product_indices) == 1 or idx == min(product_indices)))

        # Display name and role detail are looked up in the reagent DB by
        # SMILES; the SMILES itself is the fallback name.
        display = db.display_for_smiles(canon_smiles) if canon_smiles else None
        name = display or canon_smiles
        role_detail = db.role_for_smiles(canon_smiles) if canon_smiles else None

        # Keep only the whitelisted atom/bond fields of the extracted geometry.
        atoms = entry.get("atoms") or []
        bonds = entry.get("bonds") or []
        original_geometry = None
        if atoms:
            original_geometry = {
                "atoms": [
                    {k: v for k, v in a.items() if k in atom_keys}
                    for a in atoms
                ],
                "bonds": [
                    {k: v for k, v in b.items() if k in bond_keys}
                    for b in bonds
                ],
            }

        species_list.append(SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            smiles=canon_smiles,
            name=name,
            role=role,
            role_detail=role_detail,
            classification_method="image_descriptor",
            is_sm=is_sm,
            is_dp=is_dp,
            source="image",
            display_text=name,
            original_geometry=original_geometry,
        ))
        sp_idx += 1

    condition_strings: List[str] = []
    for cond_text in conditions_above + conditions_below:
        cond_text = cond_text.strip()
        if not cond_text:
            continue
        # NOTE(review): _is_non_chemistry_text is not among the local imports
        # above; it is assumed to be available at module scope -- confirm.
        if _is_non_chemistry_text(cond_text):
            condition_strings.append(cond_text)
            continue
        smi = _resolve_text_label(cond_text, use_network=use_network)
        if not smi:
            # Could not resolve -- keep as plain condition text.
            condition_strings.append(cond_text)
            continue
        display = db.display_for_name(cond_text) or cond_text
        species_list.append(SpeciesDescriptor(
            id=f"sp_{sp_idx}",
            smiles=smi,
            name=display,
            role="non_contributing",
            role_detail=db.role_for_name(cond_text),
            classification_method="name_resolution",
            source="text_label",
            display_text=display,
        ))
        sp_idx += 1

    _compute_all_masses(species_list)

    # Reaction SMILES is only emitted when both sides are non-empty.
    r_smis = [sp.smiles for sp in species_list
              if sp.role == "atom_contributing" and sp.smiles]
    p_smis = [sp.smiles for sp in species_list
              if sp.role == "product" and sp.smiles]
    reaction_smiles = None
    if r_smis and p_smis:
        reaction_smiles = ".".join(r_smis) + ">>" + ".".join(p_smis)

    return ReactionDescriptor(
        version="1.3",
        experiment="",
        input_files={"structures_json": os.path.abspath(source_path)},
        reaction_smiles=reaction_smiles,
        species=species_list,
        warnings=warnings,
        metadata={
            "parser_version": "reaction_from_image 1.0",
            "timestamp": datetime.datetime.now().isoformat(),
            "source": "image",
        },
        conditions=condition_strings,
    )
1879
+
1880
+
1881
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser for this tool.

    Returns:
        An ``argparse.ArgumentParser`` whose epilog is the module docstring.
        ``--descriptor`` is the only option marked required; the rule that
        one of ``--image`` / ``--structures-json`` must be given is not
        expressed here and has to be checked by the caller.
    """
    p = argparse.ArgumentParser(
        description=(
            "Build a ChemDraw reaction scheme (CDXML) from a screenshot image. "
            "Requires a JSON descriptor specifying which structures are reactants/products "
            "and what conditions text to include."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # --- Inputs and output ---
    p.add_argument(
        "--image", "-i",
        default=None,
        help="Input image file (PNG/JPG/PDF). Required unless --structures-json is used.",
    )
    p.add_argument(
        "--descriptor", "-d",
        required=True,
        help="JSON descriptor file (or '-' for stdin)",
    )
    p.add_argument(
        "--output", "-o",
        default=None,
        # The default name depends on --format, and the stem falls back to
        # the --structures-json file when no image is given.
        help=(
            "Output file (default: <stem>_scheme.cdxml, or "
            "<stem>_reaction.json with --format json; <stem> is taken from "
            "--image, else from --structures-json)"
        ),
    )

    # --- Extraction options ---
    p.add_argument(
        "--page",
        type=int,
        default=0,
        help="PDF page number, 0-indexed (default: 0)",
    )
    p.add_argument(
        "--no-segment",
        action="store_true",
        help="Don't segment — treat each region as one structure",
    )
    p.add_argument(
        "--hand-drawn",
        action="store_true",
        help="Use DECIMER hand-drawn model",
    )
    p.add_argument(
        "--gap",
        type=int,
        default=None,
        help="Merge gap in pixels for segmentation (default: adaptive)",
    )

    # --- Scheme-building options ---
    p.add_argument(
        "--cleanup",
        action="store_true",
        help="Run ChemScript cleanup on extracted structures",
    )
    p.add_argument(
        "--expand",
        action="store_true",
        help=(
            "Expand conditions to molecular structures where possible. "
            "Uses ChemScript name resolution and PubChem lookup. "
            "Falls back to text labels for unresolvable conditions."
        ),
    )
    p.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print progress to stderr",
    )
    p.add_argument(
        "--structures-json",
        default=None,
        help=(
            "Path to a pre-extracted structures JSON file "
            "(from structure_from_image.py). Skips DECIMER extraction."
        ),
    )
    p.add_argument(
        "--format",
        choices=["cdxml", "json"],
        default="cdxml",
        help=(
            "Output format (default: cdxml). 'json' produces a "
            "ReactionDescriptor JSON file (same format as cdxml-parse)."
        ),
    )
    return p
1965
+
1966
+
1967
def main(argv: Optional[List[str]] = None) -> int:
    """Run the command-line interface.

    Loads the JSON descriptor, then writes either a ReactionDescriptor JSON
    file (``--format json``) or a CDXML reaction scheme (default).  When
    ``--structures-json`` is given, the pre-extracted structures are used for
    both formats and no extraction from the image is performed.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        ``0`` on success.  Invalid argument combinations end the process via
        ``parser.error``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Validate: --image is required unless --structures-json is used
    if args.image is None and args.structures_json is None:
        parser.error("--image is required unless --structures-json is provided")

    # Load descriptor ('-' means stdin)
    if args.descriptor == "-":
        descriptor = json.load(sys.stdin)
    else:
        with open(args.descriptor, encoding="utf-8") as f:
            descriptor = json.load(f)

    want_json = args.format == "json"

    # Default output name: stem of the image if given, else of the structures
    # file, plus a format-specific suffix.
    if args.output is None:
        stem_source = args.image if args.image else args.structures_json
        stem = os.path.splitext(os.path.basename(stem_source))[0]
        args.output = stem + ("_reaction.json" if want_json else "_scheme.cdxml")

    # Pre-extracted structures take precedence over the image for BOTH
    # output formats, matching the "Skips DECIMER extraction" help text.
    use_preextracted = bool(args.structures_json)
    structures = None
    if use_preextracted:
        with open(args.structures_json, encoding="utf-8") as f:
            structures = json.load(f)

    # --- JSON output path ---
    if want_json:
        if use_preextracted:
            desc = _structures_json_to_descriptor(
                structures, descriptor, args.structures_json, args.verbose,
            )
            desc.to_json(args.output)
        else:
            # Called with output_path, so no separate write is done here.
            reaction_from_image_to_json(
                image_path=args.image,
                descriptor=descriptor,
                output_path=args.output,
                page=args.page,
                segment=not args.no_segment,
                hand_drawn=args.hand_drawn,
                verbose=args.verbose,
                merge_gap=args.gap,
            )

        print(f"Written reaction JSON to {args.output}", file=sys.stderr)
        return 0

    # --- CDXML output path ---
    if use_preextracted:
        if args.verbose:
            print(f"[reaction_from_image] Loaded {len(structures)} structures "
                  f"from {args.structures_json}", file=sys.stderr)
            for i, s in enumerate(structures):
                print(f"  [{i}] SMILES={s.get('smiles', '?')}, "
                      f"atoms={len(s.get('atoms', []))}",
                      file=sys.stderr)

        reactant_indices = descriptor.get("reactant_indices", [])
        product_indices = descriptor.get("product_indices", [])
        conditions_above = descriptor.get("conditions_above", [])
        conditions_below = descriptor.get("conditions_below", [])

        # Resolve conditions to structures if --expand
        expanded_above = None
        expanded_below = None
        if args.expand:
            cs_bridge = _open_chemscript_bridge(args.verbose)
            try:
                if args.verbose:
                    print("[reaction_from_image] Resolving conditions to structures...",
                          file=sys.stderr)
                expanded_above = _resolve_all_conditions(
                    conditions_above, cs_bridge, args.verbose
                )
                expanded_below = _resolve_all_conditions(
                    conditions_below, cs_bridge, args.verbose
                )
            finally:
                # NOTE(review): guards against _open_chemscript_bridge
                # returning None when ChemScript is unavailable -- confirm
                # whether it can; the guard is a no-op otherwise.
                if cs_bridge is not None:
                    cs_bridge.close()

        # Arguments shared by both scheme builders.
        scheme_kwargs = dict(
            structures=structures,
            reactant_indices=reactant_indices,
            product_indices=product_indices,
            conditions_above=conditions_above,
            conditions_below=conditions_below,
            verbose=args.verbose,
            expanded_above=expanded_above,
            expanded_below=expanded_below,
        )
        if args.cleanup:
            if args.verbose:
                print("[reaction_from_image] Running ChemScript cleanup...",
                      file=sys.stderr)
            cs_fragments = _chemscript_fragment_xmls(structures, verbose=args.verbose)
            cdxml = build_reaction_scheme_chemscript(
                cs_fragments=cs_fragments, **scheme_kwargs
            )
        else:
            cdxml = build_reaction_scheme(**scheme_kwargs)
    else:
        cdxml = reaction_from_image(
            image_path=args.image,
            descriptor=descriptor,
            page=args.page,
            segment=not args.no_segment,
            hand_drawn=args.hand_drawn,
            verbose=args.verbose,
            merge_gap=args.gap,
            cleanup=args.cleanup,
            expand=args.expand,
        )

    with open(args.output, "w", encoding="utf-8") as f:
        f.write(cdxml)

    print(f"Written reaction scheme to {args.output}", file=sys.stderr)
    return 0
2100
+
2101
+
2102
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())