cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1002 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ reaction_cleanup.py — Clean up a CDXML reaction scheme layout (pure Python).
4
+
5
+ Replaces ChemDraw COM "Clean Up Reaction" with algorithmic layout.
6
+ Offers multiple approaches that can be compared side-by-side.
7
+
8
+ Usage
9
+ -----
10
+ python reaction_cleanup.py input.cdxml # default approach
11
+ python reaction_cleanup.py input.cdxml -o out.cdxml # explicit output
12
+ python reaction_cleanup.py input.cdxml --approach bbox_center # pick approach
13
+ python reaction_cleanup.py input.cdxml --all # run all 6 approaches
14
+ python reaction_cleanup.py input.cdxml --all --render # run all + PNG
15
+
16
+ Approaches
17
+ ----------
18
+ 1. bbox_center — Bounding-box centroid alignment + uniform gaps
19
+ 2. arrow_driven — Arrow length drives layout; molecules placed relative to arrow ends
20
+ 3. proportional — Gap sizes proportional to molecule widths
21
+ 4. compact — Minimal gaps; tight layout for slides/posters
22
+ 5. golden_ratio — Arrow length and gaps use golden ratio proportions
23
+ 6. chemdraw_mimic — Closest emulation of ChemDraw's own cleanup heuristics
24
+ """
25
+
26
+ import argparse
27
+ import copy
28
+ import json
29
+ import math
30
+ import os
31
+ import sys
32
+ import xml.etree.ElementTree as ET
33
+ from typing import Dict, List, Optional, Tuple
34
+
35
+ from ..constants import (
36
+ ACS_BOND_LENGTH,
37
+ LAYOUT_ABOVE_GAP,
38
+ LAYOUT_BELOW_GAP,
39
+ LAYOUT_FRAG_GAP_BONDS,
40
+ LAYOUT_HANGING_LABEL_GAP,
41
+ LAYOUT_INTER_FRAGMENT_GAP,
42
+ LAYOUT_INTER_GAP_BONDS,
43
+ )
44
+ from ..cdxml_utils import (
45
+ fragment_bbox,
46
+ fragment_bottom_has_hanging_label,
47
+ parse_cdxml,
48
+ recompute_text_bbox,
49
+ write_cdxml,
50
+ )
51
+
52
+ # Backward-compat alias (imported by eln_enrichment.py)
53
+ _recompute_text_bbox = recompute_text_bbox
54
+
55
+ # Below-arrow fragment padding (not in shared constants)
56
+ LAYOUT_BELOW_FRAG_PAD = 2.0
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # CDXML geometry helpers
61
+ # ---------------------------------------------------------------------------
62
+
63
+
64
def _get_page(root: ET.Element) -> Optional[ET.Element]:
    """Return the first ``<page>`` child of *root*, or ``None`` if there is none."""
    first_page = root.find("page")
    return first_page
66
+
67
+
68
def _build_id_map(page: ET.Element) -> Dict[str, ET.Element]:
    """Index the direct children of *page* by their ``id`` attribute.

    Children with a missing or empty ``id`` are left out.  If two children
    share an id, the later child replaces the earlier one.
    """
    return {child.get("id"): child for child in page if child.get("id", "")}
76
+
77
+
78
def _get_step(page: ET.Element) -> Optional[ET.Element]:
    """Return the first ``<step>`` of the first ``<scheme>`` child of *page*.

    Returns ``None`` when the page has no ``<scheme>`` child, or when that
    scheme has no ``<step>`` child.
    """
    scheme = page.find("scheme")
    return scheme.find("step") if scheme is not None else None
84
+
85
+
86
def _get_arrow(page: ET.Element, step: ET.Element,
               id_map: Dict[str, ET.Element]) -> Optional[ET.Element]:
    """Resolve the arrow element referenced by the step's ``ReactionStepArrows``.

    For each listed id, in order:

    1. If *id_map* resolves it to an ``<arrow>``, return that element.
    2. If *id_map* resolves it to a ``<graphic>`` whose ``SupersededBy`` id is
       present in *id_map*, return the superseding element.
    3. Otherwise scan the direct children of *page* for a ``<graphic>`` with
       that id and return the first page child whose id equals its
       ``SupersededBy`` value.

    Returns ``None`` when no listed id resolves.
    """
    for arrow_id in step.get("ReactionStepArrows", "").split():
        candidate = id_map.get(arrow_id)
        if candidate is not None:
            if candidate.tag == "arrow":
                return candidate
            if candidate.tag == "graphic":
                # A graphic may have been superseded by a real arrow object.
                superseder_id = candidate.get("SupersededBy", "")
                if superseder_id:
                    superseder = id_map.get(superseder_id)
                    if superseder is not None:
                        return superseder

        # Fall back to walking the page for the graphic -> superseder chain.
        for graphic in page:
            if graphic.tag != "graphic" or graphic.get("id") != arrow_id:
                continue
            superseder_id = graphic.get("SupersededBy", "")
            if not superseder_id:
                continue
            match = next(
                (other for other in page if other.get("id") == superseder_id),
                None,
            )
            if match is not None:
                return match
    return None
110
+
111
+
112
def _arrow_endpoints(arrow: ET.Element) -> Tuple[float, float, float, float]:
    """Return ``(tail_x, tail_y, head_x, head_y)`` for *arrow*.

    Thin wrapper that delegates to ``cdxml_utils.arrow_endpoints``; the
    import is done at call time rather than at module load.
    """
    from ..cdxml_utils import arrow_endpoints as _shared_arrow_endpoints
    return _shared_arrow_endpoints(arrow)
116
+
117
+
118
+
119
+ # fragment_bbox and fragment_bottom_has_hanging_label now live in cdxml_utils (imported above, without the leading underscore)
120
+
121
+
122
def _text_bbox(t_el: ET.Element) -> Tuple[float, float, float, float]:
    """Return the bounding box ``(x1, y1, x2, y2)`` of a ``<t>`` element.

    Resolution order:

    1. A ``BoundingBox`` attribute with at least four numbers is used as-is
       (first four values).
    2. Otherwise the box is estimated from the ``p`` anchor and the text in
       the ``<s>`` runs: 5.8 pt per character of the *widest line*, 12 pt per
       line, centred horizontally on ``p`` and extending upwards from it.
    3. If neither attribute is usable, ``(0, 0, 0, 0)`` is returned.

    The width estimate is per line (not per total character count), so
    multi-line labels are not reported as wider than they are, and a ``p``
    attribute with fewer than two values falls through to the zero box
    instead of raising ``IndexError``.
    """
    bb = t_el.get("BoundingBox", "")
    if bb:
        vals = [float(v) for v in bb.split()]
        if len(vals) >= 4:
            return vals[0], vals[1], vals[2], vals[3]

    p = t_el.get("p", "")
    if p:
        parts = [float(v) for v in p.split()]
        if len(parts) >= 2:
            text_content = "".join(s.text or "" for s in t_el.iter("s"))
            lines = text_content.split("\n")
            # Width follows the longest line; newlines themselves take no room.
            w = max(len(line) for line in lines) * 5.8
            h = 12.0 * len(lines)
            return parts[0] - w / 2, parts[1] - h, parts[0] + w / 2, parts[1]
    return 0, 0, 0, 0
138
+
139
+
140
+
141
+ # _recompute_text_bbox is now imported from cdxml_utils (alias at top of file)
142
+
143
+
144
def _estimate_text_width(t_el: ET.Element) -> float:
    """Estimate the rendered width of a ``<t>`` element from its text alone.

    Concatenates the text of all ``<s>`` runs, splits on newlines and
    returns the longest line's character count times 5.8 (the per-character
    width this module assumes for Arial 10 pt).  The element is not modified
    and its ``BoundingBox`` attribute is never consulted, so stale stored
    boxes cannot influence the result.
    """
    joined = "".join(run.text or "" for run in t_el.iter("s"))
    longest = 0
    for line in joined.split("\n"):
        longest = max(longest, len(line))
    return longest * 5.8
155
+
156
+
157
def _element_bbox(el: ET.Element) -> Tuple[float, float, float, float]:
    """Return a bounding box for *el*, whatever kind of element it is.

    * ``<fragment>`` – result of ``fragment_bbox``; a ``None`` result becomes
      ``(0, 0, 0, 0)``.
    * ``<t>`` – result of ``_text_bbox``.
    * anything else – the first four numbers of its ``BoundingBox``
      attribute, or ``(0, 0, 0, 0)`` if that is missing or too short.
    """
    tag = el.tag
    if tag == "fragment":
        frag_box = fragment_bbox(el)
        if frag_box is None:
            return (0, 0, 0, 0)
        return frag_box
    if tag == "t":
        return _text_bbox(el)

    raw = el.get("BoundingBox", "")
    if raw:
        numbers = [float(tok) for tok in raw.split()]
        if len(numbers) >= 4:
            return tuple(numbers[:4])
    return (0, 0, 0, 0)
170
+
171
+
172
def _bbox_center(bb: Tuple[float, float, float, float]) -> Tuple[float, float]:
    """Return the ``(x, y)`` midpoint of a ``(x1, y1, x2, y2)`` box."""
    mid_x = (bb[0] + bb[2]) / 2.0
    mid_y = (bb[1] + bb[3]) / 2.0
    return mid_x, mid_y
174
+
175
+
176
def _bbox_width(bb: Tuple[float, float, float, float]) -> float:
    """Return the horizontal extent (``x2 - x1``) of a box."""
    left, right = bb[0], bb[2]
    return right - left
178
+
179
+
180
def _bbox_height(bb: Tuple[float, float, float, float]) -> float:
    """Return the vertical extent (``y2 - y1``) of a box."""
    top, bottom = bb[1], bb[3]
    return bottom - top
182
+
183
+
184
+ # ---------------------------------------------------------------------------
185
+ # Element shifting / positioning
186
+ # ---------------------------------------------------------------------------
187
+
188
def _shift_element(el: ET.Element, dx: float, dy: float):
    """Translate a ``<fragment>`` or ``<t>`` element by ``(dx, dy)`` in place.

    For a fragment, every descendant ``<n>`` node position, every descendant
    ``<t>`` (position and bounding box) and the ``BoundingBox`` of the
    fragment and of all nested ``<fragment>`` elements are shifted; the code
    treats all of these coordinates as living in one shared space.  A ``<t>``
    is shifted via ``_shift_text_element``.  Any other tag is left untouched.
    """
    if el.tag == "t":
        _shift_text_element(el, dx, dy)
        return
    if el.tag != "fragment":
        return

    # Atom/node positions, rewritten with two decimals.
    for node in el.iter("n"):
        pos = node.get("p")
        if not pos:
            continue
        coords = pos.split()
        if len(coords) < 2:
            continue
        new_x = float(coords[0]) + dx
        new_y = float(coords[1]) + dy
        node.set("p", f"{new_x:.2f} {new_y:.2f}")

    # Labels anywhere inside the fragment.
    for label in el.iter("t"):
        _shift_text_element(label, dx, dy)

    # iter("fragment") yields the fragment itself first, then nested ones,
    # so each BoundingBox is shifted exactly once.
    for frag in el.iter("fragment"):
        _shift_bbox_attr(frag, dx, dy)
216
+
217
+
218
def _shift_text_element(t: ET.Element, dx: float, dy: float):
    """Translate a ``<t>`` element's ``p`` anchor and ``BoundingBox`` by ``(dx, dy)``.

    The anchor is only rewritten (two decimals, first two coordinates) when
    ``p`` holds at least two values; the bounding box is always passed to
    ``_shift_bbox_attr``.
    """
    anchor = t.get("p")
    coords = anchor.split() if anchor else []
    if len(coords) >= 2:
        moved_x = float(coords[0]) + dx
        moved_y = float(coords[1]) + dy
        t.set("p", f"{moved_x:.2f} {moved_y:.2f}")
    _shift_bbox_attr(t, dx, dy)
228
+
229
+
230
def _shift_bbox_attr(el: ET.Element, dx: float, dy: float):
    """Translate *el*'s ``BoundingBox`` attribute by ``(dx, dy)`` in place.

    Nothing happens if the attribute is missing, empty, or has fewer than
    four numbers.  Otherwise the first four values are offset and every
    value (including any beyond the fourth) is rewritten with two decimals.
    """
    raw = el.get("BoundingBox")
    if not raw:
        return
    numbers = [float(tok) for tok in raw.split()]
    if len(numbers) < 4:
        return
    # x1, y1, x2, y2 receive dx, dy, dx, dy respectively.
    for idx, delta in enumerate((dx, dy, dx, dy)):
        numbers[idx] += delta
    el.set("BoundingBox", " ".join(f"{value:.2f}" for value in numbers))
241
+
242
+
243
def _set_arrow(arrow: ET.Element, tail_x: float, tail_y: float,
               head_x: float, head_y: float):
    """Set an arrow's endpoints and refresh its derived geometry attributes.

    Writes ``Tail3D``/``Head3D`` from the given coordinates (z fixed at 0),
    then ``Center3D``, ``MajorAxisEnd3D`` and ``MinorAxisEnd3D`` from the
    midpoint and half the horizontal length, and finally a ``BoundingBox``
    that spans both endpoints with 2 pt of vertical padding.

    The bounding box covers the y-range of *both* endpoints, so a sloped
    arrow's head is inside its box; for a horizontal arrow this is the same
    box as ``tail_y ± 2``.

    Any ``<graphic>`` superseded by this arrow is not touched here; callers
    update it separately.
    """
    arrow.set("Tail3D", f"{tail_x:.2f} {tail_y:.2f} 0")
    arrow.set("Head3D", f"{head_x:.2f} {head_y:.2f} 0")

    # Elliptical-arc attributes.  The +280 / +130 offsets are fixed values
    # kept from the existing implementation (described there as ChemDraw
    # internal geometry); NOTE(review): their origin is not documented here.
    cx = (tail_x + head_x) / 2.0
    cy = (tail_y + head_y) / 2.0
    half_len = abs(head_x - tail_x) / 2.0
    arrow.set("Center3D", f"{cx + 280:.2f} {cy + 130:.2f} 0")
    arrow.set("MajorAxisEnd3D", f"{cx + 280 + half_len:.2f} {cy + 130:.2f} 0")
    arrow.set("MinorAxisEnd3D", f"{cx + 280:.2f} {cy + 130 + half_len:.2f} 0")

    # BoundingBox: enclose both endpoints, padded vertically.
    pad = 2.0
    bb_x1 = min(tail_x, head_x)
    bb_x2 = max(tail_x, head_x)
    bb_y1 = min(tail_y, head_y) - pad
    bb_y2 = max(tail_y, head_y) + pad
    arrow.set("BoundingBox",
              f"{bb_x1:.2f} {bb_y1:.2f} {bb_x2:.2f} {bb_y2:.2f}")
263
+
264
+
265
def _update_graphic_for_arrow(page: ET.Element, arrow: ET.Element,
                              tail_x: float, head_x: float, arrow_y: float):
    """Sync the ``<graphic>`` that *arrow* supersedes with the arrow's new span.

    Finds the first direct child of *page* that is a ``<graphic>`` whose
    ``SupersededBy`` equals the arrow's id and rewrites its ``BoundingBox``
    as ``head_x arrow_y tail_x arrow_y`` (head first).  Does nothing if no
    such graphic exists.
    """
    target_id = arrow.get("id", "")
    graphic = next(
        (child for child in page
         if child.tag == "graphic" and child.get("SupersededBy") == target_id),
        None,
    )
    if graphic is not None:
        graphic.set(
            "BoundingBox",
            f"{head_x:.2f} {arrow_y:.2f} {tail_x:.2f} {arrow_y:.2f}",
        )
274
+
275
+
276
def _center_element_x(el: ET.Element, target_cx: float):
    """Shift *el* horizontally so its bounding-box centre lands on *target_cx*."""
    box = _element_bbox(el)
    offset_x = target_cx - (box[0] + box[2]) / 2.0
    _shift_element(el, offset_x, 0)
282
+
283
+
284
def _center_element_y(el: ET.Element, target_cy: float):
    """Shift *el* vertically so its bounding-box centre lands on *target_cy*."""
    box = _element_bbox(el)
    offset_y = target_cy - (box[1] + box[3]) / 2.0
    _shift_element(el, 0, offset_y)
290
+
291
+
292
def _move_element_to(el: ET.Element, target_cx: float, target_cy: float):
    """Shift *el* so its bounding-box centre lands on ``(target_cx, target_cy)``."""
    box = _element_bbox(el)
    current_cx = (box[0] + box[2]) / 2.0
    current_cy = (box[1] + box[3]) / 2.0
    offset_x = target_cx - current_cx
    offset_y = target_cy - current_cy
    _shift_element(el, offset_x, offset_y)
298
+
299
+
300
+ # ---------------------------------------------------------------------------
301
+ # Reaction parsing — extract roles from <step>
302
+ # ---------------------------------------------------------------------------
303
+
304
def _parse_reaction(page: ET.Element, step: ET.Element,
                    id_map: Dict[str, ET.Element]):
    """Resolve the role lists of a reaction ``<step>``.

    Reads the whitespace-separated ids in ``ReactionStepReactants``,
    ``ReactionStepProducts``, ``ReactionStepObjectsAboveArrow`` and
    ``ReactionStepObjectsBelowArrow`` and looks each up in *id_map*; ids not
    present in the map are dropped.  *page* is not consulted.

    Returns:
        ``(reactants, products, above, below)`` — four lists in attribute
        order.
    """
    role_attrs = (
        "ReactionStepReactants",
        "ReactionStepProducts",
        "ReactionStepObjectsAboveArrow",
        "ReactionStepObjectsBelowArrow",
    )
    groups = []
    for attr in role_attrs:
        members = []
        for ref in step.get(attr, "").split():
            if ref in id_map:
                members.append(id_map[ref])
        groups.append(members)
    reactants, products, above, below = groups
    return reactants, products, above, below
316
+
317
+
318
+ # ---------------------------------------------------------------------------
319
+ # Approach 1: bbox_center — Bounding-box centroid alignment + uniform gaps
320
+ # ---------------------------------------------------------------------------
321
+
322
def approach_bbox_center(page, step, id_map, arrow, verbose=False):
    """Lay out one reaction step as a single centred row with uniform gaps.

    Reactants, the arrow and products are placed left to right with a fixed
    15 pt gap between neighbours.  Every molecule is centred vertically on
    the arrow line, which sits at the mean y of the molecules' current
    bounding-box centres; the whole row is centred horizontally on the mean
    x of those centres.  The arrow length comes from
    ``_compute_arrow_len_from_content`` (default minimum), and the objects
    listed above/below the arrow are handed to ``_stack_above_below``.

    Returns without changes if the step resolves to no reactants or no
    products.  ``verbose`` is part of the common approach signature and is
    not used in this function.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    spacing = 15.0  # approach-specific gap between neighbours and the arrow

    reactant_boxes = [_element_bbox(mol) for mol in reactants]
    product_boxes = [_element_bbox(mol) for mol in products]
    molecule_boxes = reactant_boxes + product_boxes

    reactant_span = (sum(_bbox_width(box) for box in reactant_boxes)
                     + spacing * max(0, len(reactants) - 1))
    product_span = (sum(_bbox_width(box) for box in product_boxes)
                    + spacing * max(0, len(products) - 1))

    # Arrow is sized to its above/below content.
    arrow_len = _compute_arrow_len_from_content(above, below)

    # Arrow line and row centre are the means of the molecule centres.
    count = len(molecule_boxes)
    arrow_y = sum(_bbox_center(box)[1] for box in molecule_boxes) / count
    centroid_x = sum(_bbox_center(box)[0] for box in molecule_boxes) / count

    # Row: reactants | gap | arrow | gap | products
    row_width = reactant_span + spacing + arrow_len + spacing + product_span

    def _lay_out(molecules, left_edge):
        """Place *molecules* left to right; return the x after the last gap."""
        for mol in molecules:
            width = _bbox_width(_element_bbox(mol))
            _move_element_to(mol, left_edge + width / 2.0, arrow_y)
            left_edge += width + spacing
        return left_edge

    tail_x = _lay_out(reactants, centroid_x - row_width / 2.0)
    head_x = tail_x + arrow_len
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)

    _lay_out(products, head_x + spacing)

    # Conditions / reagents around the arrow midpoint.
    arrow_mid_x = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_mid_x, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
380
+
381
+
382
+ # ---------------------------------------------------------------------------
383
+ # Approach 2: arrow_driven — Arrow length drives layout
384
+ # ---------------------------------------------------------------------------
385
+
386
def approach_arrow_driven(page, step, id_map, arrow, verbose=False):
    """Lay out one reaction step around a fixed-position arrow.

    The arrow is placed first: its length is at least 70 pt (more if the
    above/below content is wider), it is centred on the mean x of the
    molecules' current bounding-box centres and drawn at the mean y of those
    centres.  Reactants are then packed right-to-left from the tail and
    products left-to-right from the head, each group starting 12 pt from the
    arrow tip and using ``LAYOUT_INTER_FRAGMENT_GAP`` between molecules.
    Above/below objects are handed to ``_stack_above_below``.

    Returns without changes if the step resolves to no reactants or no
    products.  ``verbose`` is part of the common approach signature and is
    not used in this function.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    tip_gap = 12.0  # fragment edge to arrow tip (approach-specific)
    molecule_gap = LAYOUT_INTER_FRAGMENT_GAP

    arrow_len = _compute_arrow_len_from_content(above, below, min_len=70.0)

    # Arrow line = mean of the molecules' vertical centres.
    boxes = [_element_bbox(mol) for mol in reactants]
    boxes += [_element_bbox(mol) for mol in products]
    arrow_y = sum(_bbox_center(box)[1] for box in boxes) / len(boxes)
    mid_x = sum(_bbox_center(box)[0] for box in boxes) / len(boxes)

    # Arrow straddles the current horizontal midpoint.
    tail_x = mid_x - arrow_len / 2.0
    head_x = mid_x + arrow_len / 2.0
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)

    # Reactants: last-listed sits nearest the tail, then work leftwards.
    right_edge = tail_x - tip_gap
    for mol in reactants[::-1]:
        width = _bbox_width(_element_bbox(mol))
        _move_element_to(mol, right_edge - width / 2.0, arrow_y)
        right_edge -= width + molecule_gap

    # Products: first-listed sits nearest the head, then work rightwards.
    left_edge = head_x + tip_gap
    for mol in products:
        width = _bbox_width(_element_bbox(mol))
        _move_element_to(mol, left_edge + width / 2.0, arrow_y)
        left_edge += width + molecule_gap

    # Conditions / reagents around the arrow midpoint.
    arrow_mid_x = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_mid_x, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
436
+
437
+
438
+ # ---------------------------------------------------------------------------
439
+ # Approach 3: proportional — Gaps proportional to molecule widths
440
+ # ---------------------------------------------------------------------------
441
+
442
def approach_proportional(page, step, id_map, arrow, verbose=False):
    """Lay out one reaction step with gaps that scale with molecule width.

    * Arrow length is the larger of the content-driven length (minimum
      45 pt) and 0.6 x the average molecule width capped at 100 pt.
    * The gap after a molecule inside a group is 25 % of that molecule's
      width; the gap between a group and the arrow is 25 % of the adjacent
      molecule's width plus 8 pt.
    * The row is centred on the mean x of the molecules' current centres and
      every molecule is centred vertically on the mean y of those centres.

    Above/below objects are handed to ``_stack_above_below``.  Returns
    without changes if the step resolves to no reactants or no products.
    ``verbose`` is part of the common approach signature and is not used in
    this function.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    gap_ratio = 0.25  # gap = 25% of the adjacent molecule's width

    reactant_boxes = [_element_bbox(mol) for mol in reactants]
    product_boxes = [_element_bbox(mol) for mol in products]
    boxes = reactant_boxes + product_boxes

    mean_width = sum(_bbox_width(box) for box in boxes) / len(boxes)
    content_len = _compute_arrow_len_from_content(above, below, min_len=45.0)
    arrow_len = max(content_len, min(100.0, mean_width * 0.6))

    arrow_y = sum(_bbox_center(box)[1] for box in boxes) / len(boxes)
    mid_x = sum(_bbox_center(box)[0] for box in boxes) / len(boxes)

    reactant_widths = [_bbox_width(box) for box in reactant_boxes]
    product_widths = [_bbox_width(box) for box in product_boxes]

    # Both lists are non-empty here (guarded above), so the group spans and
    # the arrow-side gaps can be computed unconditionally.
    reactant_span = (sum(reactant_widths)
                     + sum(w * gap_ratio for w in reactant_widths[:-1]))
    product_span = (sum(product_widths)
                    + sum(w * gap_ratio for w in product_widths[:-1]))
    tail_gap = reactant_widths[-1] * gap_ratio + 8.0
    head_gap = product_widths[0] * gap_ratio + 8.0

    row_width = reactant_span + tail_gap + arrow_len + head_gap + product_span
    cursor_x = mid_x - row_width / 2.0

    # Reactants, left to right; no trailing proportional gap after the last.
    last_reactant = len(reactants) - 1
    for idx, (mol, width) in enumerate(zip(reactants, reactant_widths)):
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        trailing = width * gap_ratio if idx < last_reactant else 0
        cursor_x += width + trailing

    cursor_x += tail_gap

    # Arrow
    tail_x = cursor_x
    head_x = cursor_x + arrow_len
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    cursor_x = head_x + head_gap

    # Products, left to right.
    last_product = len(products) - 1
    for idx, (mol, width) in enumerate(zip(products, product_widths)):
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        trailing = width * gap_ratio if idx < last_product else 0
        cursor_x += width + trailing

    arrow_mid_x = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_mid_x, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
507
+
508
+
509
+ # ---------------------------------------------------------------------------
510
+ # Approach 4: compact — Minimal gaps for slides/posters
511
+ # ---------------------------------------------------------------------------
512
+
513
def approach_compact(page, step, id_map, arrow, verbose=False):
    """Lay out one reaction step as tightly as possible.

    Uses a 5 pt gap between neighbours, an arrow of at least 45 pt (longer
    only if the above/below content needs it) and a 5 pt above-arrow gap
    passed to ``_stack_above_below``; the below-arrow gap is the shared
    ``LAYOUT_BELOW_GAP``.  The row is centred on the mean x of the
    molecules' current centres and every molecule is centred vertically on
    the mean y of those centres.

    Returns without changes if the step resolves to no reactants or no
    products.  ``verbose`` is part of the common approach signature and is
    not used in this function.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    tight_gap = 5.0
    tight_above_gap = 5.0  # approach-specific (tighter than the shared one)
    arrow_len = _compute_arrow_len_from_content(above, below, min_len=45.0)

    reactant_boxes = [_element_bbox(mol) for mol in reactants]
    product_boxes = [_element_bbox(mol) for mol in products]
    boxes = reactant_boxes + product_boxes

    arrow_y = sum(_bbox_center(box)[1] for box in boxes) / len(boxes)
    mid_x = sum(_bbox_center(box)[0] for box in boxes) / len(boxes)

    reactant_span = (sum(_bbox_width(box) for box in reactant_boxes)
                     + tight_gap * max(0, len(reactant_boxes) - 1))
    product_span = (sum(_bbox_width(box) for box in product_boxes)
                    + tight_gap * max(0, len(product_boxes) - 1))

    row_width = reactant_span + tight_gap + arrow_len + tight_gap + product_span
    cursor_x = mid_x - row_width / 2.0

    # Reactants
    for mol, box in zip(reactants, reactant_boxes):
        width = _bbox_width(box)
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + tight_gap

    # Arrow
    tail_x = cursor_x
    head_x = cursor_x + arrow_len
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    cursor_x = head_x + tight_gap

    # Products
    for mol, box in zip(products, product_boxes):
        width = _bbox_width(box)
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + tight_gap

    arrow_mid_x = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_mid_x, arrow_y,
                       tight_above_gap, LAYOUT_BELOW_GAP)
561
+
562
+
563
+ # ---------------------------------------------------------------------------
564
+ # Approach 5: golden_ratio — Arrow + gaps use golden ratio proportions
565
+ # ---------------------------------------------------------------------------
566
+
567
def approach_golden_ratio(page, step, id_map, arrow, verbose=False):
    """Lay out one reaction step using golden-ratio proportions.

    With ``phi = 1.618`` and ``avg`` the mean molecule width:

    * arrow length = max(content-driven length with a 50 pt minimum,
      min(110 pt, ``phi * avg``));
    * horizontal gap = max(``LAYOUT_ABOVE_GAP``, ``avg / phi``), used both
      between molecules and next to the arrow.

    The row is centred on the mean x of the molecules' current centres and
    every molecule is centred vertically on the mean y of those centres.
    Above/below objects are handed to ``_stack_above_below``.

    Returns without changes if the step resolves to no reactants or no
    products.  ``verbose`` is part of the common approach signature and is
    not used in this function.
    """
    phi = 1.618

    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    reactant_boxes = [_element_bbox(mol) for mol in reactants]
    product_boxes = [_element_bbox(mol) for mol in products]
    boxes = reactant_boxes + product_boxes

    mean_width = sum(_bbox_width(box) for box in boxes) / len(boxes)
    content_len = _compute_arrow_len_from_content(above, below, min_len=50.0)
    arrow_len = max(content_len, min(110.0, mean_width * phi))
    gap = max(LAYOUT_ABOVE_GAP, mean_width / phi)

    arrow_y = sum(_bbox_center(box)[1] for box in boxes) / len(boxes)
    mid_x = sum(_bbox_center(box)[0] for box in boxes) / len(boxes)

    reactant_span = (sum(_bbox_width(box) for box in reactant_boxes)
                     + gap * max(0, len(reactant_boxes) - 1))
    product_span = (sum(_bbox_width(box) for box in product_boxes)
                    + gap * max(0, len(product_boxes) - 1))

    row_width = reactant_span + gap + arrow_len + gap + product_span
    cursor_x = mid_x - row_width / 2.0

    # Reactants
    for mol, box in zip(reactants, reactant_boxes):
        width = _bbox_width(box)
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + gap

    # Arrow
    tail_x = cursor_x
    head_x = cursor_x + arrow_len
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    cursor_x = head_x + gap

    # Products
    for mol, box in zip(products, product_boxes):
        width = _bbox_width(box)
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + gap

    arrow_mid_x = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_mid_x, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
618
+
619
+
620
+ # ---------------------------------------------------------------------------
621
+ # Approach 6: chemdraw_mimic — Closest emulation of ChemDraw heuristics
622
+ # ---------------------------------------------------------------------------
623
+
624
def approach_chemdraw_mimic(page, step, id_map, arrow, verbose=False):
    """Lay out one reaction step in the manner of ChemDraw's "Clean Up Reaction".

    All distances are expressed in bond lengths, using the fixed
    ``ACS_BOND_LENGTH`` constant (the document's own ``BondLength`` is *not*
    read: only the page element is available here and ElementTree offers no
    way to walk up to the document root).

    * Arrow length: content-driven, with a minimum of 5 bond lengths.
    * Gap between a molecule group and the arrow tip:
      ``ACS_BOND_LENGTH * LAYOUT_FRAG_GAP_BONDS``.
    * Gap between molecules in a group:
      ``ACS_BOND_LENGTH * LAYOUT_INTER_GAP_BONDS``.
    * Arrow line: mean vertical centre of the *reactants* only; every
      molecule is centred on it.
    * Horizontal position: the row is centred on the mean x of all molecule
      centres.

    Above/below objects are handed to ``_stack_above_below``.  Returns
    without changes if the step resolves to no reactants or no products.
    ``verbose`` is part of the common approach signature and is not used in
    this function.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not reactants or not products:
        return

    # ACS Document 1996 bond length (shared constant).
    bond_len = ACS_BOND_LENGTH

    ARROW_LEN = _compute_arrow_len_from_content(
        above, below, min_len=bond_len * 5.0)
    FRAG_GAP = bond_len * LAYOUT_FRAG_GAP_BONDS
    INTER_GAP = bond_len * LAYOUT_INTER_GAP_BONDS

    r_bbs = [_element_bbox(r) for r in reactants]
    p_bbs = [_element_bbox(p) for p in products]
    all_bbs = r_bbs + p_bbs

    # Arrow y = vertical centre of the reactants only.
    arrow_y = sum(_bbox_center(b)[1] for b in r_bbs) / len(r_bbs)

    # Row is centred on the mean x of all molecules.
    all_cx = sum(_bbox_center(b)[0] for b in all_bbs) / len(all_bbs)

    r_widths = [_bbox_width(b) for b in r_bbs]
    p_widths = [_bbox_width(b) for b in p_bbs]

    r_block_w = sum(r_widths) + INTER_GAP * max(0, len(r_widths) - 1)
    p_block_w = sum(p_widths) + INTER_GAP * max(0, len(p_widths) - 1)

    total_w = r_block_w + FRAG_GAP + ARROW_LEN + FRAG_GAP + p_block_w
    start_x = all_cx - total_w / 2.0

    # Place reactants
    cursor_x = start_x
    for r, w in zip(reactants, r_widths):
        _move_element_to(r, cursor_x + w / 2.0, arrow_y)
        cursor_x += w + INTER_GAP
    # Replace the trailing inter-molecule gap with the arrow-side gap.
    cursor_x = cursor_x - INTER_GAP + FRAG_GAP

    # Arrow
    tail_x = cursor_x
    head_x = cursor_x + ARROW_LEN
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    cursor_x = head_x + FRAG_GAP

    # Products
    for p, w in zip(products, p_widths):
        _move_element_to(p, cursor_x + w / 2.0, arrow_y)
        cursor_x += w + INTER_GAP

    # Conditions — shared stacking around the arrow midpoint.
    arrow_cx = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_cx, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
697
+
698
+
699
+ # ---------------------------------------------------------------------------
700
+ # Shared: stack above/below arrow
701
+ # ---------------------------------------------------------------------------
702
+
703
+ def _compute_arrow_len_from_content(above: List[ET.Element],
704
+ below: List[ET.Element],
705
+ min_len: float = 50.0) -> float:
706
+ """Compute arrow length so it's at least as wide as the widest
707
+ above- or below-arrow object.
708
+
709
+ above/below are the raw element lists from the step metadata.
710
+ Text from above is redirected below, so we check both groups.
711
+ """
712
+ above_frags = [e for e in above if e.tag != "t"]
713
+ above_texts = [e for e in above if e.tag == "t"]
714
+ below_texts = [e for e in below if e.tag == "t"]
715
+ below_frags = [e for e in below if e.tag != "t"]
716
+
717
+ all_above = above_frags
718
+ all_below = above_texts + below_texts + below_frags
719
+
720
+ max_w = 0.0
721
+ for el in all_above + all_below:
722
+ if el.tag == "t":
723
+ # Use content-based width estimate instead of stored BoundingBox.
724
+ # Stored BoundingBox may be stale (e.g. from ELN exports with
725
+ # non-ACS scaling where bond normalization resized fragments but
726
+ # left page-level text BoundingBoxes untouched).
727
+ w = _estimate_text_width(el)
728
+ else:
729
+ bb = _element_bbox(el)
730
+ w = _bbox_width(bb)
731
+ if w > max_w:
732
+ max_w = w
733
+
734
+ # Arrow should be wider than the widest object, with some padding
735
+ return max(min_len, max_w + 10.0)
736
+
737
+
738
+ def _stack_above_below(above: List[ET.Element], below: List[ET.Element],
739
+ arrow_cx: float, arrow_y: float,
740
+ above_gap: float, below_gap: float):
741
+ """Place above/below-arrow objects with text always below the arrow.
742
+
743
+ Text (<t>) elements — even if listed as "above arrow" in the step
744
+ metadata — are always placed below the arrow line. Only non-text
745
+ elements (fragments / structures) go above.
746
+
747
+ For above-arrow fragments, uses atom-only bounding boxes (no text
748
+ labels) since ChemDraw's XML label BoundingBox values are unreliable.
749
+
750
+ The above_gap parameter is the *base* gap (typically 8pt). If the
751
+ bottommost atom of a fragment is N or P with only 2 explicit bonds
752
+ (i.e. it will have a vertically-stacked H label like NH or PH),
753
+ the gap is increased to 16pt to avoid the hanging label clashing
754
+ with the arrow.
755
+
756
+ below_gap is the distance from the arrow y-line to the top edge
757
+ of the highest below-arrow object (typically 4pt).
758
+ """
759
+ # Collect texts from both lists — they all go below
760
+ above_texts = [e for e in above if e.tag == "t"]
761
+ above_frags = [e for e in above if e.tag != "t"]
762
+ below_texts = [e for e in below if e.tag == "t"]
763
+ below_frags = [e for e in below if e.tag != "t"]
764
+
765
+ # --- Above arrow: only non-text elements (fragments) ---
766
+ # Use atom-only bbox; adjust gap for hanging labels (NH, PH)
767
+ for el in above_frags:
768
+ bb = _element_bbox(el)
769
+ h = _bbox_height(bb)
770
+ cx = (bb[0] + bb[2]) / 2.0
771
+ cy = (bb[1] + bb[3]) / 2.0
772
+
773
+ # Determine gap for this fragment
774
+ if el.tag == "fragment" and fragment_bottom_has_hanging_label(el):
775
+ gap = LAYOUT_HANGING_LABEL_GAP
776
+ else:
777
+ gap = above_gap
778
+
779
+ # Place so bottom edge of atom-only bbox is at arrow_y - gap
780
+ target_bottom = arrow_y - gap
781
+ target_cy = target_bottom - h / 2.0
782
+ _shift_element(el, arrow_cx - cx, target_cy - cy)
783
+
784
+ # --- Below arrow: all text (from above + below lists), then fragments ---
785
+ # Text elements use consistent baseline-to-baseline spacing (like
786
+ # ChemDraw's multi-line text rendering). This avoids dependence on
787
+ # stale BoundingBox values from upstream processing.
788
+ all_below_text = above_texts + below_texts
789
+ BASELINE_OFFSET = 10.0 # baseline below top-of-text-line (cap height)
790
+ TEXT_LINE_SPACING = 13.0 # baseline-to-baseline (Arial 10pt with leading)
791
+
792
+ prev_baseline = None
793
+ y_cursor = arrow_y + below_gap
794
+ for el in all_below_text:
795
+ if prev_baseline is None:
796
+ baseline_y = y_cursor + BASELINE_OFFSET
797
+ else:
798
+ baseline_y = prev_baseline + TEXT_LINE_SPACING
799
+ el.set("p", f"{arrow_cx:.2f} {baseline_y:.2f}")
800
+ el.set("CaptionJustification", "Center")
801
+ el.set("Justification", "Center")
802
+ recompute_text_bbox(el)
803
+ prev_baseline = baseline_y
804
+ # Update y_cursor to bottom of this text element for any
805
+ # subsequent non-text elements
806
+ bb = _element_bbox(el)
807
+ y_cursor = bb[3]
808
+
809
+ # Non-text elements (fragments) below arrow, after all text
810
+ for el in below_frags:
811
+ bb = _element_bbox(el)
812
+ h = _bbox_height(bb)
813
+ _move_element_to(el, arrow_cx, y_cursor + LAYOUT_BELOW_FRAG_PAD + h / 2.0)
814
+ y_cursor += LAYOUT_BELOW_FRAG_PAD + h
815
+
816
+
817
+ # ---------------------------------------------------------------------------
818
+ # Update document-level BoundingBox
819
+ # ---------------------------------------------------------------------------
820
+
821
+ def _update_doc_bbox(root: ET.Element):
822
+ """Recompute the document-level BoundingBox from page contents."""
823
+ page = root.find("page")
824
+ if page is None:
825
+ return
826
+ min_x = min_y = float("inf")
827
+ max_x = max_y = float("-inf")
828
+ for el in page:
829
+ if el.tag in ("fragment", "t", "arrow", "graphic"):
830
+ bb = _element_bbox(el)
831
+ if bb != (0, 0, 0, 0):
832
+ min_x = min(min_x, bb[0])
833
+ min_y = min(min_y, bb[1])
834
+ max_x = max(max_x, bb[2])
835
+ max_y = max(max_y, bb[3])
836
+ if min_x < float("inf"):
837
+ root.set("BoundingBox",
838
+ f"{min_x:.2f} {min_y:.2f} {max_x:.2f} {max_y:.2f}")
839
+
840
+
841
+ # ---------------------------------------------------------------------------
842
+ # Approach registry
843
+ # ---------------------------------------------------------------------------
844
+
845
# Registry of layout strategies: maps the name accepted by ``--approach``
# (and by ``run_cleanup``) to the function that performs that layout.
# Insertion order is the order used by ``--all``.
APPROACHES = dict(
    bbox_center=approach_bbox_center,
    arrow_driven=approach_arrow_driven,
    proportional=approach_proportional,
    compact=approach_compact,
    golden_ratio=approach_golden_ratio,
    chemdraw_mimic=approach_chemdraw_mimic,
)
853
+
854
# One-line, human-readable summary per approach name; rendered as the
# epilog of the command-line help.
APPROACH_DESCRIPTIONS = dict(
    bbox_center="Bounding-box centroid alignment + uniform gaps",
    arrow_driven="Arrow length drives layout; molecules placed relative to ends",
    proportional="Gap sizes proportional to molecule widths",
    compact="Minimal gaps; tight layout for slides/posters",
    golden_ratio="Arrow + gaps use golden ratio proportions",
    chemdraw_mimic="Closest emulation of ChemDraw's cleanup heuristics",
)
862
+
863
+
864
def run_cleanup(input_path: str, output_path: str, approach: str = "chemdraw_mimic",
                verbose: bool = False) -> dict:
    """Apply a single layout approach to a CDXML file and write the result.

    Args:
        input_path: CDXML file to read.
        output_path: Where the cleaned CDXML is written.
        approach: Key into ``APPROACHES`` selecting the layout function.
        verbose: Forwarded to the layout function.

    Returns:
        A dict with keys ``output``, ``approach``, ``num_reactants`` and
        ``num_products``.

    Raises:
        ValueError: If the document has no page, no scheme/step, no arrow,
            or ``approach`` is not a key of ``APPROACHES``.
    """
    document = parse_cdxml(input_path)
    doc_root = document.getroot()

    page = _get_page(doc_root)
    if page is None:
        raise ValueError("No <page> found in CDXML")

    elements_by_id = _build_id_map(page)

    step = _get_step(page)
    if step is None:
        raise ValueError("No <scheme>/<step> found — not a reaction CDXML")

    arrow = _get_arrow(page, step, elements_by_id)
    if arrow is None:
        raise ValueError("No arrow element found in reaction")

    layout = APPROACHES.get(approach)
    if layout is None:
        raise ValueError(f"Unknown approach: {approach}. "
                         f"Choose from: {', '.join(APPROACHES)}")

    # Count the participants before the layout function runs.
    reactants, products, _, _ = _parse_reaction(page, step, elements_by_id)
    summary = {
        "output": output_path,
        "approach": approach,
        "num_reactants": len(reactants),
        "num_products": len(products),
    }

    layout(page, step, elements_by_id, arrow, verbose=verbose)
    _update_doc_bbox(doc_root)
    write_cdxml(document, output_path)
    return summary
903
+
904
+
905
+ # ---------------------------------------------------------------------------
906
+ # CLI
907
+ # ---------------------------------------------------------------------------
908
+
909
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the reaction-scheme cleanup tool.

    Parses ``argv`` (argparse falls back to ``sys.argv[1:]`` when it is
    ``None``) and runs either the single approach chosen with
    ``--approach`` or, with ``--all``, every entry of ``APPROACHES``.

    With ``--json`` the JSON result is the only thing this function writes
    to stdout: status lines and the message printed by ``_render`` are sent
    to stderr so that stdout stays machine-parseable.

    Args:
        argv: Argument list without the program name, or ``None``.

    Returns:
        0 on success; 1 if the input file does not exist or the single
        requested cleanup raises.  In ``--all`` mode individual failures
        are reported but do not change the exit status.
    """
    # Function-scope import: only needed to keep stdout clean under --json.
    import contextlib

    parser = argparse.ArgumentParser(
        description="Clean up a CDXML reaction scheme layout (pure Python).",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="\n".join(f" {k:18s} {v}" for k, v in APPROACH_DESCRIPTIONS.items()),
    )
    parser.add_argument("input", help="Input CDXML file with a reaction scheme")
    parser.add_argument("-o", "--output", help="Output CDXML path (default: input-cleaned.cdxml)")
    parser.add_argument("--approach", choices=list(APPROACHES.keys()),
                        default="chemdraw_mimic",
                        help="Layout approach (default: chemdraw_mimic)")
    parser.add_argument("--all", action="store_true",
                        help="Run all 6 approaches, producing one output each")
    parser.add_argument("--render", action="store_true",
                        help="Render each output to PNG via cdxml_to_image.py")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("--json", action="store_true",
                        help="Output result as JSON to stdout")

    args = parser.parse_args(argv)

    if not os.path.isfile(args.input):
        print(f"Error: file not found: {args.input}", file=sys.stderr)
        return 1

    base, ext = os.path.splitext(args.input)

    # When --json, redirect status prints to stderr
    _print = print
    if args.json:
        def _print(*a, **kw):
            kw.setdefault("file", sys.stderr)
            print(*a, **kw)

    def _render_output(path: str) -> None:
        # _render() reports success with a plain print(); under --json that
        # line would land in the middle of (or after) the JSON document on
        # stdout, so divert it to stderr.  This only affects Python-level
        # writes to sys.stdout.
        if args.json:
            with contextlib.redirect_stdout(sys.stderr):
                _render(path)
        else:
            _render(path)

    def _json_record(info: dict) -> dict:
        # One JSON result entry; paths are made absolute for consumers.
        return {
            "input": os.path.abspath(args.input),
            "output": os.path.abspath(info["output"]),
            "approach": info["approach"],
            "num_reactants": info["num_reactants"],
            "num_products": info["num_products"],
        }

    if args.all:
        all_results = []
        for name in APPROACHES:
            out_path = f"{base}-cleanup-{name}{ext}"
            _print(f"[{name}] -> {out_path}")
            try:
                info = run_cleanup(args.input, out_path, approach=name, verbose=args.verbose)
                _print(" OK")
                all_results.append(info)
                if args.render:
                    _render_output(out_path)
            except Exception as e:
                # Best effort: one failing approach must not stop the others.
                _print(f" FAILED: {e}", file=sys.stderr)
        if args.json:
            print(json.dumps([_json_record(info) for info in all_results], indent=2))
    else:
        out_path = args.output or f"{base}-cleaned{ext}"
        try:
            info = run_cleanup(args.input, out_path, approach=args.approach, verbose=args.verbose)
            if args.json:
                print(json.dumps(_json_record(info), indent=2))
            else:
                _print(f"Output: {out_path}")
            if args.render:
                _render_output(out_path)
        except Exception as e:
            _print(f"Error: {e}", file=sys.stderr)
            return 1

    return 0
989
+
990
+
991
+ def _render(cdxml_path: str):
992
+ """Render a CDXML to PNG using cdxml_to_image.py."""
993
+ try:
994
+ from ..chemdraw.cdxml_to_image import cdxml_to_image
995
+ png_path = cdxml_to_image(cdxml_path)
996
+ print(f" Rendered: {png_path}")
997
+ except Exception as e:
998
+ print(f" Render failed: {e}", file=sys.stderr)
999
+
1000
+
1001
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())