cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1002 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
reaction_cleanup.py — Clean up a CDXML reaction scheme layout (pure Python).
|
|
4
|
+
|
|
5
|
+
Replaces ChemDraw COM "Clean Up Reaction" with algorithmic layout.
|
|
6
|
+
Offers multiple approaches that can be compared side-by-side.
|
|
7
|
+
|
|
8
|
+
Usage
|
|
9
|
+
-----
|
|
10
|
+
python reaction_cleanup.py input.cdxml # default approach
|
|
11
|
+
python reaction_cleanup.py input.cdxml -o out.cdxml # explicit output
|
|
12
|
+
python reaction_cleanup.py input.cdxml --approach bbox_center # pick approach
|
|
13
|
+
python reaction_cleanup.py input.cdxml --all # run all 6 approaches
|
|
14
|
+
python reaction_cleanup.py input.cdxml --all --render # run all + PNG
|
|
15
|
+
|
|
16
|
+
Approaches
|
|
17
|
+
----------
|
|
18
|
+
1. bbox_center — Bounding-box centroid alignment + uniform gaps
|
|
19
|
+
2. arrow_driven — Arrow length drives layout; molecules placed relative to arrow ends
|
|
20
|
+
3. proportional — Gap sizes proportional to molecule widths
|
|
21
|
+
4. compact — Minimal gaps; tight layout for slides/posters
|
|
22
|
+
5. golden_ratio — Arrow length and gaps use golden ratio proportions
|
|
23
|
+
6. chemdraw_mimic — Closest emulation of ChemDraw's own cleanup heuristics
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import argparse
|
|
27
|
+
import copy
|
|
28
|
+
import json
|
|
29
|
+
import math
|
|
30
|
+
import os
|
|
31
|
+
import sys
|
|
32
|
+
import xml.etree.ElementTree as ET
|
|
33
|
+
from typing import Dict, List, Optional, Tuple
|
|
34
|
+
|
|
35
|
+
from ..constants import (
|
|
36
|
+
ACS_BOND_LENGTH,
|
|
37
|
+
LAYOUT_ABOVE_GAP,
|
|
38
|
+
LAYOUT_BELOW_GAP,
|
|
39
|
+
LAYOUT_FRAG_GAP_BONDS,
|
|
40
|
+
LAYOUT_HANGING_LABEL_GAP,
|
|
41
|
+
LAYOUT_INTER_FRAGMENT_GAP,
|
|
42
|
+
LAYOUT_INTER_GAP_BONDS,
|
|
43
|
+
)
|
|
44
|
+
from ..cdxml_utils import (
|
|
45
|
+
fragment_bbox,
|
|
46
|
+
fragment_bottom_has_hanging_label,
|
|
47
|
+
parse_cdxml,
|
|
48
|
+
recompute_text_bbox,
|
|
49
|
+
write_cdxml,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Backward-compat alias (imported by eln_enrichment.py)
|
|
53
|
+
_recompute_text_bbox = recompute_text_bbox
|
|
54
|
+
|
|
55
|
+
# Below-arrow fragment padding (not in shared constants)
|
|
56
|
+
LAYOUT_BELOW_FRAG_PAD = 2.0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# CDXML geometry helpers
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _get_page(root: ET.Element) -> Optional[ET.Element]:
|
|
65
|
+
return root.find("page")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _build_id_map(page: ET.Element) -> Dict[str, ET.Element]:
|
|
69
|
+
"""Map element id → element for all direct children of page."""
|
|
70
|
+
m: Dict[str, ET.Element] = {}
|
|
71
|
+
for el in page:
|
|
72
|
+
eid = el.get("id", "")
|
|
73
|
+
if eid:
|
|
74
|
+
m[eid] = el
|
|
75
|
+
return m
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _get_step(page: ET.Element) -> Optional[ET.Element]:
|
|
79
|
+
"""Find the first <step> inside a <scheme> on the page."""
|
|
80
|
+
scheme = page.find("scheme")
|
|
81
|
+
if scheme is None:
|
|
82
|
+
return None
|
|
83
|
+
return scheme.find("step")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _get_arrow(page: ET.Element, step: ET.Element,
|
|
87
|
+
id_map: Dict[str, ET.Element]) -> Optional[ET.Element]:
|
|
88
|
+
"""Resolve the arrow element from step metadata."""
|
|
89
|
+
arrow_ids = step.get("ReactionStepArrows", "").split()
|
|
90
|
+
for aid in arrow_ids:
|
|
91
|
+
el = id_map.get(aid)
|
|
92
|
+
if el is not None and el.tag == "arrow":
|
|
93
|
+
return el
|
|
94
|
+
# Check for graphic superseded by arrow
|
|
95
|
+
if el is not None and el.tag == "graphic":
|
|
96
|
+
sup_id = el.get("SupersededBy", "")
|
|
97
|
+
if sup_id:
|
|
98
|
+
arrow_el = id_map.get(sup_id)
|
|
99
|
+
if arrow_el is not None:
|
|
100
|
+
return arrow_el
|
|
101
|
+
# Also search all page children for graphic → arrow chain
|
|
102
|
+
for child in page:
|
|
103
|
+
if child.tag == "graphic" and child.get("id") == aid:
|
|
104
|
+
sup_id = child.get("SupersededBy", "")
|
|
105
|
+
if sup_id:
|
|
106
|
+
for child2 in page:
|
|
107
|
+
if child2.get("id") == sup_id:
|
|
108
|
+
return child2
|
|
109
|
+
return None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _arrow_endpoints(arrow: ET.Element) -> Tuple[float, float, float, float]:
    """Return ``(tail_x, tail_y, head_x, head_y)`` for *arrow*.

    Thin wrapper: the work is done by ``cdxml_utils.arrow_endpoints``,
    which is imported at call time rather than at module load.
    """
    from ..cdxml_utils import arrow_endpoints as _shared_arrow_endpoints

    endpoints = _shared_arrow_endpoints(arrow)
    return endpoints
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# _fragment_bbox and _fragment_bottom_has_hanging_label are now in cdxml_utils
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _text_bbox(t_el: ET.Element) -> Tuple[float, float, float, float]:
|
|
123
|
+
"""Bounding box of a text element."""
|
|
124
|
+
bb = t_el.get("BoundingBox", "")
|
|
125
|
+
if bb:
|
|
126
|
+
vals = [float(v) for v in bb.split()]
|
|
127
|
+
if len(vals) >= 4:
|
|
128
|
+
return vals[0], vals[1], vals[2], vals[3]
|
|
129
|
+
p = t_el.get("p", "")
|
|
130
|
+
if p:
|
|
131
|
+
parts = [float(v) for v in p.split()]
|
|
132
|
+
# Estimate text size
|
|
133
|
+
text_content = "".join(s.text or "" for s in t_el.iter("s"))
|
|
134
|
+
w = len(text_content) * 5.8
|
|
135
|
+
h = 12.0 * max(1, text_content.count("\n") + 1)
|
|
136
|
+
return parts[0] - w/2, parts[1] - h, parts[0] + w/2, parts[1]
|
|
137
|
+
return 0, 0, 0, 0
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# _recompute_text_bbox is now imported from cdxml_utils (alias at top of file)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _estimate_text_width(t_el: ET.Element) -> float:
|
|
145
|
+
"""Estimate text width from content (5.8 pt/char for Arial 10pt).
|
|
146
|
+
|
|
147
|
+
Uses the same character-width estimate as _recompute_text_bbox but
|
|
148
|
+
without modifying the element. Immune to stale BoundingBox values
|
|
149
|
+
from upstream processing (e.g. ELN exports with non-ACS scaling).
|
|
150
|
+
"""
|
|
151
|
+
text_content = "".join(s.text or "" for s in t_el.iter("s"))
|
|
152
|
+
lines = text_content.split("\n") if "\n" in text_content else [text_content]
|
|
153
|
+
max_line_len = max((len(l) for l in lines), default=0)
|
|
154
|
+
return max_line_len * 5.8
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _element_bbox(el: ET.Element) -> Tuple[float, float, float, float]:
    """Return ``(x1, y1, x2, y2)`` for *el*, dispatching on its tag.

    * ``fragment`` -- result of ``fragment_bbox``; ``(0, 0, 0, 0)`` if that
      helper returns ``None``.
    * ``t`` -- result of ``_text_bbox``.
    * anything else -- the first four numbers of the ``BoundingBox``
      attribute, or ``(0, 0, 0, 0)`` when it is missing or too short.
    """
    tag = el.tag
    if tag == "fragment":
        frag_box = fragment_bbox(el)
        if frag_box is None:
            return (0, 0, 0, 0)
        return frag_box
    if tag == "t":
        return _text_bbox(el)

    raw = el.get("BoundingBox", "")
    numbers = [float(tok) for tok in raw.split()] if raw else []
    if len(numbers) >= 4:
        return numbers[0], numbers[1], numbers[2], numbers[3]
    return 0, 0, 0, 0
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _bbox_center(bb: Tuple[float, float, float, float]) -> Tuple[float, float]:
|
|
173
|
+
return (bb[0] + bb[2]) / 2.0, (bb[1] + bb[3]) / 2.0
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _bbox_width(bb: Tuple[float, float, float, float]) -> float:
|
|
177
|
+
return bb[2] - bb[0]
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _bbox_height(bb: Tuple[float, float, float, float]) -> float:
|
|
181
|
+
return bb[3] - bb[1]
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# ---------------------------------------------------------------------------
|
|
185
|
+
# Element shifting / positioning
|
|
186
|
+
# ---------------------------------------------------------------------------
|
|
187
|
+
|
|
188
|
+
def _shift_element(el: ET.Element, dx: float, dy: float):
    """Translate a ``<fragment>`` or ``<t>`` element by ``(dx, dy)`` in place.

    For a fragment, every descendant ``<n>`` has its ``p`` coordinates
    shifted, every descendant ``<t>`` is shifted via ``_shift_text_element``,
    and the ``BoundingBox`` of the fragment and of every nested
    ``<fragment>`` is shifted.  Nested structures are handled with the same
    offset because all coordinates share one space.  Node positions are
    rewritten with two decimals and only the first two numbers of ``p``
    are kept.

    Elements with any other tag are left untouched.
    """
    if el.tag == "t":
        _shift_text_element(el, dx, dy)
        return
    if el.tag != "fragment":
        return

    # Atom/node positions, including those of nested sub-fragments.
    for node in el.iter("n"):
        coords = (node.get("p") or "").split()
        if len(coords) >= 2:
            node.set(
                "p",
                f"{float(coords[0]) + dx:.2f} {float(coords[1]) + dy:.2f}",
            )

    # Labels attached anywhere below the fragment.
    for label in el.iter("t"):
        _shift_text_element(label, dx, dy)

    # Bounding boxes: the fragment itself first, then nested fragments.
    _shift_bbox_attr(el, dx, dy)
    for nested in el.iter("fragment"):
        if nested is not el:
            _shift_bbox_attr(nested, dx, dy)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _shift_text_element(t: ET.Element, dx: float, dy: float):
    """Translate a ``<t>`` element by ``(dx, dy)`` in place.

    The ``p`` anchor is rewritten with two decimals when it holds at least
    two numbers; the ``BoundingBox`` attribute is then shifted via
    ``_shift_bbox_attr`` whether or not ``p`` was present.
    """
    anchor = t.get("p")
    coords = anchor.split() if anchor else []
    if len(coords) >= 2:
        new_x = float(coords[0]) + dx
        new_y = float(coords[1]) + dy
        t.set("p", f"{new_x:.2f} {new_y:.2f}")
    _shift_bbox_attr(t, dx, dy)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _shift_bbox_attr(el: ET.Element, dx: float, dy: float):
|
|
231
|
+
"""Shift BoundingBox attribute by (dx, dy)."""
|
|
232
|
+
bb = el.get("BoundingBox")
|
|
233
|
+
if bb:
|
|
234
|
+
vals = [float(v) for v in bb.split()]
|
|
235
|
+
if len(vals) >= 4:
|
|
236
|
+
vals[0] += dx
|
|
237
|
+
vals[1] += dy
|
|
238
|
+
vals[2] += dx
|
|
239
|
+
vals[3] += dy
|
|
240
|
+
el.set("BoundingBox", " ".join(f"{v:.2f}" for v in vals))
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _set_arrow(arrow: ET.Element, tail_x: float, tail_y: float,
|
|
244
|
+
head_x: float, head_y: float):
|
|
245
|
+
"""Set arrow endpoints and update its bounding box."""
|
|
246
|
+
arrow.set("Tail3D", f"{tail_x:.2f} {tail_y:.2f} 0")
|
|
247
|
+
arrow.set("Head3D", f"{head_x:.2f} {head_y:.2f} 0")
|
|
248
|
+
# Update Center3D and axis ends (elliptical arc geometry — ChemDraw internal)
|
|
249
|
+
cx = (tail_x + head_x) / 2.0
|
|
250
|
+
cy = (tail_y + head_y) / 2.0
|
|
251
|
+
half_len = abs(head_x - tail_x) / 2.0
|
|
252
|
+
arrow.set("Center3D", f"{cx + 280:.2f} {cy + 130:.2f} 0")
|
|
253
|
+
arrow.set("MajorAxisEnd3D", f"{cx + 280 + half_len:.2f} {cy + 130:.2f} 0")
|
|
254
|
+
arrow.set("MinorAxisEnd3D", f"{cx + 280:.2f} {cy + 130 + half_len:.2f} 0")
|
|
255
|
+
# BoundingBox
|
|
256
|
+
pad = 2.0
|
|
257
|
+
bb_x1 = min(tail_x, head_x)
|
|
258
|
+
bb_x2 = max(tail_x, head_x)
|
|
259
|
+
arrow.set("BoundingBox",
|
|
260
|
+
f"{bb_x1:.2f} {tail_y - pad:.2f} {bb_x2:.2f} {tail_y + pad:.2f}")
|
|
261
|
+
# Also update the superseding graphic if present
|
|
262
|
+
# (handled at page level in the caller)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _update_graphic_for_arrow(page: ET.Element, arrow: ET.Element,
|
|
266
|
+
tail_x: float, head_x: float, arrow_y: float):
|
|
267
|
+
"""Update the <graphic> that the arrow supersedes."""
|
|
268
|
+
arrow_id = arrow.get("id", "")
|
|
269
|
+
for el in page:
|
|
270
|
+
if el.tag == "graphic" and el.get("SupersededBy") == arrow_id:
|
|
271
|
+
el.set("BoundingBox",
|
|
272
|
+
f"{head_x:.2f} {arrow_y:.2f} {tail_x:.2f} {arrow_y:.2f}")
|
|
273
|
+
break
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _center_element_x(el: ET.Element, target_cx: float):
    """Shift *el* horizontally so its bounding-box centre lands on *target_cx*.

    The vertical position is left unchanged.
    """
    box = _element_bbox(el)
    offset_x = target_cx - (box[0] + box[2]) / 2.0
    _shift_element(el, offset_x, 0)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _center_element_y(el: ET.Element, target_cy: float):
    """Shift *el* vertically so its bounding-box centre lands on *target_cy*.

    The horizontal position is left unchanged.
    """
    box = _element_bbox(el)
    offset_y = target_cy - (box[1] + box[3]) / 2.0
    _shift_element(el, 0, offset_y)
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _move_element_to(el: ET.Element, target_cx: float, target_cy: float):
    """Shift *el* so its bounding-box centre lands on ``(target_cx, target_cy)``."""
    box = _element_bbox(el)
    current_cx = (box[0] + box[2]) / 2.0
    current_cy = (box[1] + box[3]) / 2.0
    offset_x = target_cx - current_cx
    offset_y = target_cy - current_cy
    _shift_element(el, offset_x, offset_y)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# ---------------------------------------------------------------------------
|
|
301
|
+
# Reaction parsing — extract roles from <step>
|
|
302
|
+
# ---------------------------------------------------------------------------
|
|
303
|
+
|
|
304
|
+
def _parse_reaction(page: ET.Element, step: ET.Element,
|
|
305
|
+
id_map: Dict[str, ET.Element]):
|
|
306
|
+
"""Extract reactants, products, above-arrow, below-arrow element lists."""
|
|
307
|
+
def _resolve(attr):
|
|
308
|
+
ids = step.get(attr, "").split()
|
|
309
|
+
return [id_map[i] for i in ids if i in id_map]
|
|
310
|
+
|
|
311
|
+
reactants = _resolve("ReactionStepReactants")
|
|
312
|
+
products = _resolve("ReactionStepProducts")
|
|
313
|
+
above = _resolve("ReactionStepObjectsAboveArrow")
|
|
314
|
+
below = _resolve("ReactionStepObjectsBelowArrow")
|
|
315
|
+
return reactants, products, above, below
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ---------------------------------------------------------------------------
|
|
319
|
+
# Approach 1: bbox_center — Bounding-box centroid alignment + uniform gaps
|
|
320
|
+
# ---------------------------------------------------------------------------
|
|
321
|
+
|
|
322
|
+
def approach_bbox_center(page, step, id_map, arrow, verbose=False):
    """Lay out a reaction left-to-right with uniform 15 pt gaps.

    Layout produced (all edits are in place on the XML elements):

    - every reactant and product is centred vertically on a common arrow y,
      which is the mean of their current bounding-box centre y values;
    - the row ``reactants | gap | arrow | gap | products`` is centred
      horizontally on the mean of the current bounding-box centre x values;
    - the arrow length comes from ``_compute_arrow_len_from_content``;
    - above/below-arrow objects are handed to ``_stack_above_below`` with
      the arrow midpoint and the shared above/below gap constants.

    Does nothing when the step resolves to no reactants or no products.
    *verbose* is accepted but not used.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    GAP = 15.0  # spacing between neighbours and around the arrow

    # Group widths, measured before anything is moved.
    r_bboxes = [_element_bbox(mol) for mol in reactants]
    p_bboxes = [_element_bbox(mol) for mol in products]
    r_total_w = (sum(_bbox_width(bb) for bb in r_bboxes)
                 + GAP * max(0, len(reactants) - 1))
    p_total_w = (sum(_bbox_width(bb) for bb in p_bboxes)
                 + GAP * max(0, len(products) - 1))

    arrow_len = _compute_arrow_len_from_content(above, below)

    # Common baseline and horizontal anchor: mean of the molecule centres.
    all_bbs = r_bboxes + p_bboxes
    centers = [_bbox_center(bb) for bb in all_bbs]
    arrow_y = sum(cy for _, cy in centers) / len(centers)
    all_cx = sum(cx for cx, _ in centers) / len(centers)

    total_w = r_total_w + GAP + arrow_len + GAP + p_total_w
    start_x = all_cx - total_w / 2.0

    def _place_row(elements, cursor):
        """Place *elements* left-to-right from *cursor*; return new cursor."""
        for mol in elements:
            width = _bbox_width(_element_bbox(mol))
            _move_element_to(mol, cursor + width / 2.0, arrow_y)
            cursor += width + GAP
        return cursor

    # Reactants, then the arrow, then products.
    tail_x = _place_row(reactants, start_x)
    head_x = tail_x + arrow_len
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    _place_row(products, head_x + GAP)

    # Conditions / reagents above and below the arrow.
    arrow_cx = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_cx, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
# ---------------------------------------------------------------------------
|
|
383
|
+
# Approach 2: arrow_driven — Arrow length drives layout
|
|
384
|
+
# ---------------------------------------------------------------------------
|
|
385
|
+
|
|
386
|
+
def approach_arrow_driven(page, step, id_map, arrow, verbose=False):
    """Lay out a reaction around an arrow placed first.

    Layout produced (all edits are in place on the XML elements):

    - arrow length comes from ``_compute_arrow_len_from_content`` with a
      requested minimum of 70;
    - the arrow is centred on the mean bounding-box centre (x and y) of all
      reactants and products;
    - reactants are packed right-to-left, starting 12 pt left of the tail;
    - products are packed left-to-right, starting 12 pt right of the head;
    - neighbouring molecules are separated by ``LAYOUT_INTER_FRAGMENT_GAP``;
    - above/below-arrow objects are handed to ``_stack_above_below``.

    Does nothing when the step resolves to no reactants or no products.
    *verbose* is accepted but not used.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    FRAG_GAP = 12.0  # molecule edge <-> arrow tip
    INTER_GAP = LAYOUT_INTER_FRAGMENT_GAP
    ARROW_LEN = _compute_arrow_len_from_content(above, below, min_len=70.0)

    # Arrow position: mean of all molecule centres.
    all_bbs = [_element_bbox(mol) for mol in reactants]
    all_bbs += [_element_bbox(mol) for mol in products]
    centers = [_bbox_center(bb) for bb in all_bbs]
    arrow_y = sum(cy for _, cy in centers) / len(centers)
    all_cx = sum(cx for cx, _ in centers) / len(centers)

    tail_x = all_cx - ARROW_LEN / 2.0
    head_x = all_cx + ARROW_LEN / 2.0
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)

    # Reactants: walk outward (leftward) from the tail.
    right_edge = tail_x - FRAG_GAP
    for mol in reversed(reactants):
        width = _bbox_width(_element_bbox(mol))
        _move_element_to(mol, right_edge - width / 2.0, arrow_y)
        right_edge -= width + INTER_GAP

    # Products: walk outward (rightward) from the head.
    left_edge = head_x + FRAG_GAP
    for mol in products:
        width = _bbox_width(_element_bbox(mol))
        _move_element_to(mol, left_edge + width / 2.0, arrow_y)
        left_edge += width + INTER_GAP

    # Conditions / reagents above and below the arrow.
    arrow_cx = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_cx, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
# Approach 3: proportional — Gaps proportional to molecule widths
|
|
440
|
+
# ---------------------------------------------------------------------------
|
|
441
|
+
|
|
442
|
+
def approach_proportional(page, step, id_map, arrow, verbose=False):
    """Lay out a reaction with gaps that scale with molecule width.

    Layout produced (all edits are in place on the XML elements):

    - arrow length is the larger of the content-driven length (requested
      minimum 45) and ``0.6 x`` the average molecule width capped at 100;
    - the gap after a molecule is 25% of that molecule's width;
    - the gap next to the arrow is 25% of the adjacent molecule's width
      plus 8;
    - the whole row is centred on the mean bounding-box centre of all
      reactants and products, and shares one arrow y;
    - above/below-arrow objects are handed to ``_stack_above_below``.

    Does nothing when the step resolves to no reactants or no products.
    *verbose* is accepted but not used.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    r_bbs = [_element_bbox(mol) for mol in reactants]
    p_bbs = [_element_bbox(mol) for mol in products]
    all_bbs = r_bbs + p_bbs

    avg_w = sum(_bbox_width(bb) for bb in all_bbs) / len(all_bbs)

    content_len = _compute_arrow_len_from_content(above, below, min_len=45.0)
    ARROW_LEN = max(content_len, min(100.0, avg_w * 0.6))
    GAP_RATIO = 0.25  # gap = 25% of the adjacent molecule's width

    centers = [_bbox_center(bb) for bb in all_bbs]
    arrow_y = sum(cy for _, cy in centers) / len(centers)
    all_cx = sum(cx for cx, _ in centers) / len(centers)

    r_widths = [_bbox_width(bb) for bb in r_bbs]
    p_widths = [_bbox_width(bb) for bb in p_bbs]

    # Both lists are non-empty here (guard above), so no empty fallback
    # is needed for the totals or the arrow-side gaps.
    r_total = sum(r_widths) + sum(w * GAP_RATIO for w in r_widths[:-1])
    p_total = sum(p_widths) + sum(w * GAP_RATIO for w in p_widths[:-1])
    r_arrow_gap = r_widths[-1] * GAP_RATIO + 8.0  # last reactant -> tail
    p_arrow_gap = p_widths[0] * GAP_RATIO + 8.0   # head -> first product

    total_w = r_total + r_arrow_gap + ARROW_LEN + p_arrow_gap + p_total
    cursor_x = all_cx - total_w / 2.0

    # Reactants: proportional gap after every molecule except the last.
    last_r = len(reactants) - 1
    for idx, (mol, width) in enumerate(zip(reactants, r_widths)):
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + (width * GAP_RATIO if idx < last_r else 0)

    cursor_x += r_arrow_gap

    # Arrow.
    tail_x = cursor_x
    head_x = cursor_x + ARROW_LEN
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    cursor_x = head_x + p_arrow_gap

    # Products: same spacing rule as the reactants.
    last_p = len(products) - 1
    for idx, (mol, width) in enumerate(zip(products, p_widths)):
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + (width * GAP_RATIO if idx < last_p else 0)

    # Conditions / reagents above and below the arrow.
    arrow_cx = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_cx, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
# ---------------------------------------------------------------------------
|
|
510
|
+
# Approach 4: compact — Minimal gaps for slides/posters
|
|
511
|
+
# ---------------------------------------------------------------------------
|
|
512
|
+
|
|
513
|
+
def approach_compact(page, step, id_map, arrow, verbose=False):
    """Lay out a reaction tightly for space-constrained output.

    Layout produced (all edits are in place on the XML elements):

    - 5 pt gaps between neighbours and on both sides of the arrow;
    - arrow length from ``_compute_arrow_len_from_content`` with a
      requested minimum of 45;
    - the row is centred on the mean bounding-box centre of all reactants
      and products, and shares one arrow y;
    - above/below-arrow objects are handed to ``_stack_above_below`` with
      a 5 pt above-gap and the shared ``LAYOUT_BELOW_GAP``.

    Does nothing when the step resolves to no reactants or no products.
    *verbose* is accepted but not used.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    ARROW_LEN = _compute_arrow_len_from_content(above, below, min_len=45.0)
    GAP = 5.0
    ABOVE_GAP = 5.0  # tighter than the shared above-arrow gap

    r_bbs = [_element_bbox(mol) for mol in reactants]
    p_bbs = [_element_bbox(mol) for mol in products]
    all_bbs = r_bbs + p_bbs

    centers = [_bbox_center(bb) for bb in all_bbs]
    arrow_y = sum(cy for _, cy in centers) / len(centers)
    all_cx = sum(cx for cx, _ in centers) / len(centers)

    r_total = (sum(_bbox_width(bb) for bb in r_bbs)
               + GAP * max(0, len(r_bbs) - 1))
    p_total = (sum(_bbox_width(bb) for bb in p_bbs)
               + GAP * max(0, len(p_bbs) - 1))

    total_w = r_total + GAP + ARROW_LEN + GAP + p_total
    cursor_x = all_cx - total_w / 2.0

    # Reactants, using the widths measured before any move.
    for mol, bb in zip(reactants, r_bbs):
        width = _bbox_width(bb)
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + GAP

    # Arrow.
    tail_x = cursor_x
    head_x = cursor_x + ARROW_LEN
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    cursor_x = head_x + GAP

    # Products.
    for mol, bb in zip(products, p_bbs):
        width = _bbox_width(bb)
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + GAP

    # Conditions / reagents above and below the arrow.
    arrow_cx = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_cx, arrow_y,
                       ABOVE_GAP, LAYOUT_BELOW_GAP)
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
# ---------------------------------------------------------------------------
|
|
564
|
+
# Approach 5: golden_ratio — Arrow + gaps use golden ratio proportions
|
|
565
|
+
# ---------------------------------------------------------------------------
|
|
566
|
+
|
|
567
|
+
def approach_golden_ratio(page, step, id_map, arrow, verbose=False):
    """Lay out a reaction using golden-ratio proportions.

    Layout produced (all edits are in place on the XML elements):

    - arrow length is the larger of the content-driven length (requested
      minimum 50) and ``PHI x`` the average molecule width capped at 110,
      with ``PHI = 1.618``;
    - the gap between neighbours and around the arrow is the larger of
      ``LAYOUT_ABOVE_GAP`` and the average molecule width divided by PHI;
    - the row is centred on the mean bounding-box centre of all reactants
      and products, and shares one arrow y;
    - above/below-arrow objects are handed to ``_stack_above_below``.

    Does nothing when the step resolves to no reactants or no products.
    *verbose* is accepted but not used.
    """
    PHI = 1.618

    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not (reactants and products):
        return

    r_bbs = [_element_bbox(mol) for mol in reactants]
    p_bbs = [_element_bbox(mol) for mol in products]
    all_bbs = r_bbs + p_bbs

    avg_w = sum(_bbox_width(bb) for bb in all_bbs) / len(all_bbs)
    content_len = _compute_arrow_len_from_content(above, below, min_len=50.0)
    ARROW_LEN = max(content_len, min(110.0, avg_w * PHI))
    GAP = max(LAYOUT_ABOVE_GAP, avg_w / PHI)

    centers = [_bbox_center(bb) for bb in all_bbs]
    arrow_y = sum(cy for _, cy in centers) / len(centers)
    all_cx = sum(cx for cx, _ in centers) / len(centers)

    r_total = (sum(_bbox_width(bb) for bb in r_bbs)
               + GAP * max(0, len(r_bbs) - 1))
    p_total = (sum(_bbox_width(bb) for bb in p_bbs)
               + GAP * max(0, len(p_bbs) - 1))

    total_w = r_total + GAP + ARROW_LEN + GAP + p_total
    cursor_x = all_cx - total_w / 2.0

    # Reactants, using the widths measured before any move.
    for mol, bb in zip(reactants, r_bbs):
        width = _bbox_width(bb)
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + GAP

    # Arrow.
    tail_x = cursor_x
    head_x = cursor_x + ARROW_LEN
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    cursor_x = head_x + GAP

    # Products.
    for mol, bb in zip(products, p_bbs):
        width = _bbox_width(bb)
        _move_element_to(mol, cursor_x + width / 2.0, arrow_y)
        cursor_x += width + GAP

    # Conditions / reagents above and below the arrow.
    arrow_cx = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_cx, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
# ---------------------------------------------------------------------------
|
|
621
|
+
# Approach 6: chemdraw_mimic — Closest emulation of ChemDraw heuristics
|
|
622
|
+
# ---------------------------------------------------------------------------
|
|
623
|
+
|
|
624
|
+
def approach_chemdraw_mimic(page, step, id_map, arrow, verbose=False):
    """Lay out one reaction step in the style of ChemDraw's "Clean Up Reaction".

    What this function actually does:

    - Uses the fixed ``ACS_BOND_LENGTH`` as the bond length (the document's
      own BondLength attribute is not consulted).
    - Takes the arrow length from ``_compute_arrow_len_from_content`` with a
      floor of five bond lengths, so the arrow spans its widest condition.
    - Places reactants left to right, then the arrow, then products, all
      centred vertically on the mean bounding-box centre y of the reactants.
    - Separates neighbouring molecules by ``LAYOUT_INTER_GAP_BONDS`` bond
      lengths and the arrow from each molecule block by
      ``LAYOUT_FRAG_GAP_BONDS`` bond lengths.
    - Centres the whole row horizontally on the mean bounding-box centre x
      of all reactants and products.
    - Delegates above/below-arrow objects to ``_stack_above_below``.

    Args:
        page: the ``<page>`` element holding the reaction.
        step: the reaction ``<step>`` element.
        id_map: lookup built by ``_build_id_map`` for the page.
        arrow: the arrow element to reposition.
        verbose: accepted so every approach shares one call signature;
            not used by this approach.

    Returns:
        None.  Elements are modified in place.  Nothing is changed when the
        step has no reactants or no products.
    """
    reactants, products, above, below = _parse_reaction(page, step, id_map)
    if not reactants or not products:
        return

    # ACS Document 1996 bond length (module constant).
    bond_len = ACS_BOND_LENGTH

    arrow_len = _compute_arrow_len_from_content(
        above, below, min_len=bond_len * 5.0)
    frag_gap = bond_len * LAYOUT_FRAG_GAP_BONDS
    inter_gap = bond_len * LAYOUT_INTER_GAP_BONDS

    # Measure everything up front, before any element is moved.
    r_bbs = [_element_bbox(r) for r in reactants]
    p_bbs = [_element_bbox(p) for p in products]
    all_bbs = r_bbs + p_bbs

    # Arrow y = vertical centre of the reactants only.
    arrow_y = sum(_bbox_center(b)[1] for b in r_bbs) / len(r_bbs)

    # Horizontal anchor = mean centre x of all molecules.
    all_cx = sum(_bbox_center(b)[0] for b in all_bbs) / len(all_bbs)

    r_widths = [_bbox_width(b) for b in r_bbs]
    p_widths = [_bbox_width(b) for b in p_bbs]

    r_block_w = sum(r_widths) + inter_gap * max(0, len(r_widths) - 1)
    p_block_w = sum(p_widths) + inter_gap * max(0, len(p_widths) - 1)

    total_w = r_block_w + frag_gap + arrow_len + frag_gap + p_block_w
    start_x = all_cx - total_w / 2.0

    # Reactants, left to right.
    cursor_x = start_x
    for r, w in zip(reactants, r_widths):
        _move_element_to(r, cursor_x + w / 2.0, arrow_y)
        cursor_x += w + inter_gap
    # The loop added one inter-gap too many; swap it for the arrow gap.
    cursor_x = cursor_x - inter_gap + frag_gap

    # Arrow.
    tail_x = cursor_x
    head_x = cursor_x + arrow_len
    _set_arrow(arrow, tail_x, arrow_y, head_x, arrow_y)
    _update_graphic_for_arrow(page, arrow, tail_x, head_x, arrow_y)
    cursor_x = head_x + frag_gap

    # Products, left to right.
    for p, w in zip(products, p_widths):
        _move_element_to(p, cursor_x + w / 2.0, arrow_y)
        cursor_x += w + inter_gap

    # Conditions: shared stacking around the arrow midpoint.
    arrow_cx = (tail_x + head_x) / 2.0
    _stack_above_below(above, below, arrow_cx, arrow_y,
                       LAYOUT_ABOVE_GAP, LAYOUT_BELOW_GAP)
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
# ---------------------------------------------------------------------------
|
|
700
|
+
# Shared: stack above/below arrow
|
|
701
|
+
# ---------------------------------------------------------------------------
|
|
702
|
+
|
|
703
|
+
def _compute_arrow_len_from_content(above: List[ET.Element],
                                    below: List[ET.Element],
                                    min_len: float = 50.0) -> float:
    """Return an arrow length that spans the widest above/below object.

    ``above`` and ``below`` are the raw element lists from the step
    metadata.  Every element of both lists is measured.  The result is the
    widest measurement plus 10.0 of padding, but never less than
    ``min_len``.

    ``<t>`` elements are measured with ``_estimate_text_width`` rather than
    their stored BoundingBox, because that box may be stale (for example
    after bond normalization resized fragments but left page-level text
    boxes untouched).  All other elements are measured from
    ``_element_bbox``.
    """
    def _is_text(el):
        return el.tag == "t"

    def _measure(el):
        if _is_text(el):
            return _estimate_text_width(el)
        return _bbox_width(_element_bbox(el))

    # Visit order: fragments above, then all text (above first), then
    # fragments below.
    ordered = (
        [el for el in above if not _is_text(el)]
        + [el for el in above if _is_text(el)]
        + [el for el in below if _is_text(el)]
        + [el for el in below if not _is_text(el)]
    )

    # 0.0 seeds the maximum so an empty step falls back to min_len.
    widest = max([0.0] + [_measure(el) for el in ordered])

    return max(min_len, widest + 10.0)
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
def _stack_above_below(above: List[ET.Element], below: List[ET.Element],
                       arrow_cx: float, arrow_y: float,
                       above_gap: float, below_gap: float):
    """Position the objects that sit over and under the reaction arrow.

    All ``<t>`` elements go *below* the arrow, whichever list they arrived
    in; only non-text elements from ``above`` are placed over the arrow.

    Above the arrow, each element is shifted so that its ``_element_bbox``
    is centred on ``arrow_cx`` with its bottom edge ``above_gap`` above
    ``arrow_y``.  For ``fragment`` elements where
    ``fragment_bottom_has_hanging_label`` is true, the clearance is
    ``LAYOUT_HANGING_LABEL_GAP`` instead, so a hanging label does not
    collide with the arrow.
    NOTE(review): earlier notes describe the box used here as atom-only;
    confirm against ``_element_bbox``.
    NOTE(review): each above-arrow element is aligned to the same target
    independently, so several of them would land on the same spot.

    Below the arrow, text comes first (texts from ``above``, then texts
    from ``below``): the first baseline is ``below_gap`` plus a fixed
    offset under ``arrow_y`` and later ones follow at a fixed line spacing.
    Each text is centre-justified and its box refreshed with
    ``recompute_text_bbox``.  Non-text elements from ``below`` are then
    stacked downwards from the bottom of the last text, each preceded by
    ``LAYOUT_BELOW_FRAG_PAD``.
    """
    BASELINE_OFFSET = 10.0  # baseline below top-of-text-line (cap height)
    TEXT_LINE_SPACING = 13.0  # baseline-to-baseline (Arial 10pt with leading)

    def _partition(elements):
        """Split *elements* into (texts, non_texts), keeping order."""
        texts = [e for e in elements if e.tag == "t"]
        others = [e for e in elements if e.tag != "t"]
        return texts, others

    texts_from_above, frags_above = _partition(above)
    texts_from_below, frags_below = _partition(below)

    # --- Over the arrow: non-text elements only ---
    for frag in frags_above:
        box = _element_bbox(frag)
        height = _bbox_height(box)
        box_cx = (box[0] + box[2]) / 2.0
        box_cy = (box[1] + box[3]) / 2.0

        hanging = (frag.tag == "fragment"
                   and fragment_bottom_has_hanging_label(frag))
        clearance = LAYOUT_HANGING_LABEL_GAP if hanging else above_gap

        # Bottom edge of the box should end up at arrow_y - clearance.
        wanted_bottom = arrow_y - clearance
        wanted_cy = wanted_bottom - height / 2.0
        _shift_element(frag, arrow_cx - box_cx, wanted_cy - box_cy)

    # --- Under the arrow: every text first, on evenly spaced baselines ---
    y_cursor = arrow_y + below_gap
    next_baseline = y_cursor + BASELINE_OFFSET
    for label in texts_from_above + texts_from_below:
        label.set("p", f"{arrow_cx:.2f} {next_baseline:.2f}")
        label.set("CaptionJustification", "Center")
        label.set("Justification", "Center")
        recompute_text_bbox(label)
        # Remember where this text ends so fragments can start beneath it.
        y_cursor = _element_bbox(label)[3]
        next_baseline += TEXT_LINE_SPACING

    # --- Under the arrow: remaining non-text elements, after all text ---
    for frag in frags_below:
        height = _bbox_height(_element_bbox(frag))
        _move_element_to(frag, arrow_cx,
                         y_cursor + LAYOUT_BELOW_FRAG_PAD + height / 2.0)
        y_cursor += LAYOUT_BELOW_FRAG_PAD + height
|
|
815
|
+
|
|
816
|
+
|
|
817
|
+
# ---------------------------------------------------------------------------
|
|
818
|
+
# Update document-level BoundingBox
|
|
819
|
+
# ---------------------------------------------------------------------------
|
|
820
|
+
|
|
821
|
+
def _update_doc_bbox(root: ET.Element):
    """Refresh ``root``'s BoundingBox attribute from its first ``<page>``.

    The new box is the union of ``_element_bbox`` over the page's direct
    ``fragment``, ``t``, ``arrow`` and ``graphic`` children, ignoring any
    box equal to ``(0, 0, 0, 0)``.  The attribute is left untouched when
    there is no page or when nothing contributes a usable box.
    """
    page = root.find("page")
    if page is None:
        return

    tracked_tags = ("fragment", "t", "arrow", "graphic")
    boxes = []
    for child in page:
        if child.tag not in tracked_tags:
            continue
        box = _element_bbox(child)
        if box != (0, 0, 0, 0):
            boxes.append(box)

    # Seed each extreme with +/- infinity so an empty list is harmless.
    inf = float("inf")
    neg_inf = float("-inf")
    left = min([inf] + [b[0] for b in boxes])
    top = min([inf] + [b[1] for b in boxes])
    right = max([neg_inf] + [b[2] for b in boxes])
    bottom = max([neg_inf] + [b[3] for b in boxes])

    if left < inf:
        root.set("BoundingBox",
                 f"{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}")
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
# ---------------------------------------------------------------------------
|
|
842
|
+
# Approach registry
|
|
843
|
+
# ---------------------------------------------------------------------------
|
|
844
|
+
|
|
845
|
+
# Layout strategies selectable by name.  Insertion order matters: it is the
# order in which the CLI's --all mode runs them and lists the choices.
APPROACHES = dict(
    bbox_center=approach_bbox_center,
    arrow_driven=approach_arrow_driven,
    proportional=approach_proportional,
    compact=approach_compact,
    golden_ratio=approach_golden_ratio,
    chemdraw_mimic=approach_chemdraw_mimic,
)
|
|
853
|
+
|
|
854
|
+
# One-line summary per approach name, shown in the CLI help epilog.
APPROACH_DESCRIPTIONS = dict([
    ("bbox_center", "Bounding-box centroid alignment + uniform gaps"),
    ("arrow_driven",
     "Arrow length drives layout; molecules placed relative to ends"),
    ("proportional", "Gap sizes proportional to molecule widths"),
    ("compact", "Minimal gaps; tight layout for slides/posters"),
    ("golden_ratio", "Arrow + gaps use golden ratio proportions"),
    ("chemdraw_mimic", "Closest emulation of ChemDraw's cleanup heuristics"),
])
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def run_cleanup(input_path: str, output_path: str, approach: str = "chemdraw_mimic",
                verbose: bool = False) -> dict:
    """Apply a single layout approach to a reaction CDXML and save the result.

    The file at ``input_path`` is parsed, the approach registered under
    ``approach`` in ``APPROACHES`` is run on its reaction step, the
    document-level BoundingBox is refreshed, and the tree is written to
    ``output_path``.

    Returns:
        A dict with keys ``output``, ``approach``, ``num_reactants`` and
        ``num_products``.  The counts are taken before the layout runs.

    Raises:
        ValueError: if the document has no page, no reaction step, or no
            arrow, or if ``approach`` is not a registered name.
    """
    tree = parse_cdxml(input_path)
    root = tree.getroot()

    page = _get_page(root)
    if page is None:
        raise ValueError("No <page> found in CDXML")

    id_map = _build_id_map(page)

    step = _get_step(page)
    if step is None:
        raise ValueError("No <scheme>/<step> found — not a reaction CDXML")

    arrow = _get_arrow(page, step, id_map)
    if arrow is None:
        raise ValueError("No arrow element found in reaction")

    layout = APPROACHES.get(approach)
    if layout is None:
        raise ValueError(f"Unknown approach: {approach}. "
                         f"Choose from: {', '.join(APPROACHES)}")

    # Count participants up front for the summary handed back to the caller.
    reactants, products, _above, _below = _parse_reaction(page, step, id_map)
    summary = {
        "output": output_path,
        "approach": approach,
        "num_reactants": len(reactants),
        "num_products": len(products),
    }

    layout(page, step, id_map, arrow, verbose=verbose)
    _update_doc_bbox(root)
    write_cdxml(tree, output_path)
    return summary
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
# ---------------------------------------------------------------------------
|
|
906
|
+
# CLI
|
|
907
|
+
# ---------------------------------------------------------------------------
|
|
908
|
+
|
|
909
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the reaction layout cleanup.

    Args:
        argv: argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        Process exit status: 1 if the input file does not exist or a
        single-approach run fails, otherwise 0.  In ``--all`` mode a failing
        approach is reported and skipped without changing the exit status.

    With ``--json`` the only thing written to stdout is the JSON result;
    status lines and render messages go to stderr.
    """
    import contextlib

    parser = argparse.ArgumentParser(
        description="Clean up a CDXML reaction scheme layout (pure Python).",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="\n".join(f" {k:18s} {v}" for k, v in APPROACH_DESCRIPTIONS.items()),
    )
    parser.add_argument("input", help="Input CDXML file with a reaction scheme")
    parser.add_argument("-o", "--output", help="Output CDXML path (default: input-cleaned.cdxml)")
    parser.add_argument("--approach", choices=list(APPROACHES.keys()),
                        default="chemdraw_mimic",
                        help="Layout approach (default: chemdraw_mimic)")
    parser.add_argument("--all", action="store_true",
                        help="Run all 6 approaches, producing one output each")
    parser.add_argument("--render", action="store_true",
                        help="Render each output to PNG via cdxml_to_image.py")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("--json", action="store_true",
                        help="Output result as JSON to stdout")

    args = parser.parse_args(argv)

    if not os.path.isfile(args.input):
        print(f"Error: file not found: {args.input}", file=sys.stderr)
        return 1

    base, ext = os.path.splitext(args.input)

    # When --json, redirect status prints to stderr
    _print = print
    if args.json:
        def _print(*a, **kw):
            kw.setdefault("file", sys.stderr)
            print(*a, **kw)

    def _render_output(path):
        # _render reports success with a plain print(); in --json mode that
        # line would mix into the JSON document on stdout, so route anything
        # it writes to stdout onto stderr instead.
        if args.json:
            with contextlib.redirect_stdout(sys.stderr):
                _render(path)
        else:
            _render(path)

    if args.all:
        all_results = []
        for name in APPROACHES:
            out_path = f"{base}-cleanup-{name}{ext}"
            _print(f"[{name}] -> {out_path}")
            try:
                info = run_cleanup(args.input, out_path, approach=name, verbose=args.verbose)
                _print(" OK")
                all_results.append(info)
                if args.render:
                    _render_output(out_path)
            except Exception as e:
                # Best effort: report and carry on with the next approach.
                _print(f" FAILED: {e}", file=sys.stderr)
        if args.json:
            json_results = [
                {
                    "input": os.path.abspath(args.input),
                    "output": os.path.abspath(info["output"]),
                    "approach": info["approach"],
                    "num_reactants": info["num_reactants"],
                    "num_products": info["num_products"],
                }
                for info in all_results
            ]
            print(json.dumps(json_results, indent=2))
    else:
        out_path = args.output or f"{base}-cleaned{ext}"
        try:
            info = run_cleanup(args.input, out_path, approach=args.approach, verbose=args.verbose)
            if args.json:
                result = {
                    "input": os.path.abspath(args.input),
                    "output": os.path.abspath(out_path),
                    "approach": info["approach"],
                    "num_reactants": info["num_reactants"],
                    "num_products": info["num_products"],
                }
                print(json.dumps(result, indent=2))
            else:
                _print(f"Output: {out_path}")
            if args.render:
                _render_output(out_path)
        except Exception as e:
            _print(f"Error: {e}", file=sys.stderr)
            return 1

    return 0
|
|
989
|
+
|
|
990
|
+
|
|
991
|
+
def _render(cdxml_path: str):
    """Best-effort PNG rendering of *cdxml_path* using cdxml_to_image.

    The renderer is imported lazily.  On success the resulting path is
    printed; any exception (including a failed import) is reported on
    stderr and swallowed, so a rendering problem does not propagate to the
    caller.
    """
    try:
        from ..chemdraw.cdxml_to_image import cdxml_to_image as render_png
        rendered = render_png(cdxml_path)
        print(f" Rendered: {rendered}")
    except Exception as exc:
        print(f" Render failed: {exc}", file=sys.stderr)
|
|
999
|
+
|
|
1000
|
+
|
|
1001
|
+
if __name__ == "__main__":
    # Run the CLI and hand its integer status back to the interpreter.
    raise SystemExit(main())
|