cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1340 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
scheme_polisher_v2.py — Experimental COM-free scheme polishing pipeline.
|
|
4
|
+
|
|
5
|
+
Takes a CDX/CDXML reaction scheme and produces a presentation-ready CDXML
|
|
6
|
+
without any ChemDraw COM dependency (except CDX→CDXML conversion if needed,
|
|
7
|
+
which falls through to whatever backend cdx_converter.py has available).
|
|
8
|
+
|
|
9
|
+
Pipeline:
|
|
10
|
+
1. Convert CDX → CDXML if needed (via cdx_converter.py)
|
|
11
|
+
2. Normalize bond lengths per-fragment to ACS Document 1996 (14.40 pt)
|
|
12
|
+
3. Apply ACS Document 1996 document-level settings
|
|
13
|
+
4. Normalize caption/label fonts to Arial 10pt Bold
|
|
14
|
+
5. Run scheme_polisher logic (reagent classification, structure↔text swaps,
|
|
15
|
+
orientation alignment, subscript formatting, deduplication)
|
|
16
|
+
6. Merge conditions into single centered text block (default on)
|
|
17
|
+
7. Compact above/below-arrow objects toward arrow
|
|
18
|
+
8. Run reaction_cleanup for final spatial layout
|
|
19
|
+
|
|
20
|
+
Defaults differ from scheme_polisher.py:
|
|
21
|
+
- --merge-conditions is ON by default (use --no-merge-conditions to disable)
|
|
22
|
+
- ChemDraw COM cleanup is NEVER used
|
|
23
|
+
|
|
24
|
+
Usage:
|
|
25
|
+
python scheme_polisher_v2.py input.cdx [-o output.cdxml] [-v]
|
|
26
|
+
python scheme_polisher_v2.py input.cdxml [-o output.cdxml] [-v]
|
|
27
|
+
python scheme_polisher_v2.py input.cdx --no-merge-conditions
|
|
28
|
+
python scheme_polisher_v2.py input.cdxml --approach compact -v
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import argparse
|
|
32
|
+
import copy
|
|
33
|
+
import json
|
|
34
|
+
import math
|
|
35
|
+
import os
|
|
36
|
+
import subprocess
|
|
37
|
+
import sys
|
|
38
|
+
import tempfile
|
|
39
|
+
import xml.etree.ElementTree as ET
|
|
40
|
+
from typing import Dict, List, Optional, Tuple
|
|
41
|
+
|
|
42
|
+
from ...constants import (
|
|
43
|
+
ACS_BOND_LENGTH as TARGET_BOND_LENGTH,
|
|
44
|
+
ACS_STYLE as ACS_SETTINGS,
|
|
45
|
+
CDXML_MINIMAL_HEADER,
|
|
46
|
+
CDXML_FOOTER,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Bond length measurement and per-fragment normalization
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
def _measure_bond_lengths(frag: ET.Element) -> List[float]:
    """Return the lengths of the bonds drawn directly inside *frag*.

    Only ``<n>`` and ``<b>`` elements that are immediate children of
    *frag* are looked at, so nodes and bonds nested in inner fragments
    (abbreviation groups) do not contribute.

    A bond is measured when both its ``B`` and ``E`` attributes name a
    direct-child node that has a non-empty ``id`` and a ``p`` attribute
    with at least two values.  Distances of 0.1 or less are discarded.
    """
    # id -> (x, y) for every usable direct-child node
    coords: Dict[str, Tuple[float, float]] = {}
    for node in frag.findall("n"):
        node_id = node.get("id", "")
        pos = node.get("p", "")
        if not (node_id and pos):
            continue
        xy = pos.split()
        if len(xy) < 2:
            continue
        coords[node_id] = (float(xy[0]), float(xy[1]))

    result = []
    for bond in frag.findall("b"):
        # A missing B/E attribute yields "", which is never a key in coords.
        start = coords.get(bond.get("B", ""))
        end = coords.get(bond.get("E", ""))
        if start is None or end is None:
            continue
        dist = math.sqrt((start[0] - end[0]) ** 2 + (start[1] - end[1]) ** 2)
        if dist > 0.1:
            result.append(dist)

    return result
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _median(values: List[float]) -> float:
    """Return the median of *values*, or ``0.0`` when the list is empty.

    The input list is left untouched (a sorted copy is used).  With an
    even number of items the mean of the two middle items is returned.
    """
    ordered = sorted(values)
    if not ordered:
        return 0.0
    mid, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2.0
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _scale_fragment(frag: ET.Element, factor: float, cx: float, cy: float):
    """Scale the coordinates stored in *frag* about the point (cx, cy).

    The following attributes are rewritten in place, formatted with two
    decimals:

    * ``p`` of every descendant ``<n>`` and ``<t>`` (nested inner
      fragments included); only the first two values are kept,
    * ``BoundingBox`` of every descendant ``<t>``,
    * ``BoundingBox`` of *frag* itself and of every nested ``<fragment>``.

    A ``p`` with fewer than two values, or a ``BoundingBox`` with fewer
    than four, is left as it is.
    """
    def _scaled(x: float, y: float) -> Tuple[float, float]:
        return cx + (x - cx) * factor, cy + (y - cy) * factor

    def _rescale_point(el: ET.Element) -> None:
        raw = el.get("p")
        if not raw:
            return
        xy = raw.split()
        if len(xy) >= 2:
            sx, sy = _scaled(float(xy[0]), float(xy[1]))
            el.set("p", f"{sx:.2f} {sy:.2f}")

    def _rescale_box(el: ET.Element) -> None:
        raw = el.get("BoundingBox")
        if not raw:
            return
        nums = [float(v) for v in raw.split()]
        if len(nums) >= 4:
            left, top = _scaled(nums[0], nums[1])
            right, bottom = _scaled(nums[2], nums[3])
            el.set("BoundingBox",
                   f"{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}")

    # Atom positions: iter() walks all descendants, not just direct children.
    for node in frag.iter("n"):
        _rescale_point(node)

    # Text labels carry both an anchor point and a bounding box.
    for label in frag.iter("t"):
        _rescale_point(label)
        _rescale_box(label)

    # The outer fragment box first, then the boxes of nested fragments.
    _rescale_box(frag)
    for nested in frag.iter("fragment"):
        if nested is not frag:
            _rescale_box(nested)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _fragment_centroid(frag: ET.Element) -> Tuple[float, float]:
    """Return the mean (x, y) of the direct-child ``<n>`` positions of *frag*.

    Nodes without a ``p`` attribute, or with fewer than two values in it,
    are ignored.  ``(0.0, 0.0)`` is returned when no node contributes.

    NOTE(review): further down this file ``fragment_centroid`` from the
    layout alignment module is imported under this same private name.  If
    that import sits at module level it rebinds the name, and this
    definition is then never the one callers reach — confirm.
    """
    points = []
    for node in frag.findall("n"):
        raw = node.get("p")
        if not raw:
            continue
        xy = raw.split()
        if len(xy) >= 2:
            points.append((float(xy[0]), float(xy[1])))

    if not points:
        return 0.0, 0.0

    total = len(points)
    mean_x = sum(x for x, _ in points) / total
    mean_y = sum(y for _, y in points) / total
    return mean_x, mean_y
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def normalize_bond_lengths(root: ET.Element, target: float = TARGET_BOND_LENGTH,
                           verbose: bool = False) -> int:
    """Rescale each page-level fragment so its median bond length is *target*.

    Every ``<fragment>`` that is a direct child of the ``<page>`` is
    handled on its own: its bond lengths are measured, the median is
    taken, and the fragment is scaled about its own centroid by
    ``target / median``.  Fragments drawn at different scales therefore
    all end up with the same bond length.

    A fragment is left alone when it has no measurable bonds, when its
    median bond length is below 1.0, or when the scale factor is within
    0.02 of 1.0.

    With *verbose* set, one line per examined fragment is written to
    stderr.

    Returns the number of fragments that were actually scaled (0 when
    *root* has no ``<page>``).
    """
    page = root.find("page")
    if page is None:
        return 0

    n_scaled = 0
    for fragment in page.findall("fragment"):
        bond_lengths = _measure_bond_lengths(fragment)
        if not bond_lengths:
            continue

        median_len = _median(bond_lengths)
        if median_len < 1.0:
            continue

        ratio = target / median_len
        label = fragment.get("id", "?")

        # Close enough to the target: report it, but do not touch the fragment.
        if abs(ratio - 1.0) < 0.02:
            if verbose:
                print(f" Fragment {label}: median {median_len:.2f} pt, "
                      f"already at target ({ratio:.3f}x)", file=sys.stderr)
            continue

        mid_x, mid_y = _fragment_centroid(fragment)
        _scale_fragment(fragment, ratio, mid_x, mid_y)
        n_scaled += 1

        if verbose:
            print(f" Fragment {label}: median {median_len:.2f} pt → "
                  f"scaled {ratio:.3f}x around ({mid_x:.1f}, {mid_y:.1f})",
                  file=sys.stderr)

    return n_scaled
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# ---------------------------------------------------------------------------
|
|
208
|
+
# ACS document settings + font normalization
|
|
209
|
+
# ---------------------------------------------------------------------------
|
|
210
|
+
|
|
211
|
+
def apply_acs_settings(root: ET.Element):
    """Write every ``ACS_SETTINGS`` entry onto *root* as an XML attribute.

    Attributes already present on *root* under the same name are
    overwritten; all other attributes are left alone.
    """
    root.attrib.update(ACS_SETTINGS.items())
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def normalize_fonts(root: ET.Element, verbose: bool = False) -> int:
    """Normalize the style runs of page-level caption text.

    Only ``<t>`` elements that are direct children of ``<page>`` are
    touched (captions/conditions), never atom labels inside fragments.
    For each ``<s>`` run of such a caption:

    * ``font`` is set to ``"3"`` and ``size`` to ``"10"``,
    * ``face`` is set to ``"96"`` unless it is already ``"2"``, ``"32"``
      or ``"96"`` (a missing ``face`` is replaced as well).

    NOTE(review): the module docstring describes the result as
    "Arial 10pt Bold", while the inline comment below calls face 96
    "formula"; which font the id ``3`` maps to is decided by the
    document's font table, which is not visible here — confirm.

    With *verbose* set, a summary line is written to stderr when at
    least one caption changed.

    Returns the number of caption ``<t>`` elements that had at least one
    run modified (0 when *root* has no ``<page>``).
    """
    page = root.find("page")
    if page is None:
        return 0

    # Don't override subscript (32) or italic (2) faces; anything else,
    # e.g. bold (1), becomes formula (96).
    keep_faces = ("2", "32", "96")
    fixed_attrs = (("font", "3"), ("size", "10"))

    touched = 0
    for caption in page.findall("t"):
        dirty = False
        for run in caption.findall("s"):
            for name, wanted in fixed_attrs:
                if run.get(name) != wanted:
                    run.set(name, wanted)
                    dirty = True
            if run.get("face", "") not in keep_faces:
                run.set("face", "96")
                dirty = True
        if dirty:
            touched += 1

    if verbose and touched:
        print(f" Normalized fonts on {touched} text element(s)", file=sys.stderr)
    return touched
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def fix_narrow_text(root: ET.Element, verbose: bool = False) -> int:
    """Collapse degenerate, per-character-wrapped captions to a single line.

    Background (from the original author's notes): some ELN exports
    produce captions whose ``LineStarts`` attribute breaks the text after
    every character, giving a one-character-wide, very tall column whose
    bounding box disturbs downstream layout.

    Only ``<t>`` elements that are direct children of ``<page>`` are
    examined.  A caption is treated as degenerate when the number of
    ``LineStarts`` entries exceeds ``max(2 * word_count, 3)``.  For such
    a caption:

    * the ``LineStarts`` attribute is removed, and
    * if ``p`` holds at least two values, ``BoundingBox`` is replaced by
      a single-line estimate: 6.0 pt per character wide, reaching from
      11 pt above to 3 pt below the anchor point.

    With *verbose* set, one line per fixed caption is written to stderr.

    Returns the number of captions fixed (0 when *root* has no ``<page>``).
    """
    page = root.find("page")
    if page is None:
        return 0

    fixed = 0
    for caption in page.findall("t"):
        raw_starts = caption.get("LineStarts")
        if not raw_starts:
            continue

        content = "".join((run.text or "") for run in caption.findall("s"))
        if not content:
            continue

        line_count = len(raw_starts.strip().split())
        word_count = len(content.split())

        # Ordinary wrapping never needs more than about two lines per word.
        if line_count <= max(word_count * 2, 3):
            continue

        del caption.attrib["LineStarts"]

        anchor = caption.get("p")
        xy = anchor.split() if anchor else []
        if len(xy) >= 2:
            ax, ay = float(xy[0]), float(xy[1])
            width = len(content) * 6.0
            # BoundingBox order: left top right bottom
            caption.set("BoundingBox",
                        f"{ax:.2f} {ay - 11:.2f} "
                        f"{ax + width:.2f} {ay + 3:.2f}")

        fixed += 1
        if verbose:
            print(f" Fixed narrow text: '{content}' "
                  f"({line_count} LineStarts → single line)",
                  file=sys.stderr)

    return fixed
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def resolve_orphan_reagent_text(root: ET.Element, verbose: bool = False) -> int:
    """Resolve orphan text labels to their reagent DB display names.

    ELN exports sometimes place reagent names as free-floating text
    elements that are NOT referenced in the ``<step>`` metadata (e.g.
    "Sodium Bicarbonate" placed next to the substrate).  The polisher
    only processes step-referenced elements, so these labels would
    otherwise never be reformatted.

    This function:
      1. Finds page-level ``<t>`` elements with an integer ``id`` that is
         not referenced by any step attribute.
      2. Looks up the stripped, lower-cased text (at least two
         characters) in the reagent database.
      3. If found and different from the current text, replaces the text
         with the DB display name (e.g. "Sodium Bicarbonate" → "NaHCO3")
         and re-estimates the ``BoundingBox`` from the ``p`` anchor.
         The display name is written into the first ``<s>`` run and any
         further runs are removed, so a caption split over several runs
         does not end up with the name repeated once per run.
      4. Appends the element id to ``ReactionStepObjectsBelowArrow`` of
         the FIRST step found on the page (not the spatially nearest
         one), so the polisher will process it.

    With *verbose* set, progress lines are written to stderr.

    Returns the number of text elements resolved through the database.
    Elements are counted even when the page has no step to attach them
    to.  Returns 0 when *root* has no ``<page>``.
    """
    from ...resolve.reagent_db import get_reagent_db

    page = root.find("page")
    if page is None:
        return 0

    db = get_reagent_db()

    step_attrs = ("ReactionStepReactants", "ReactionStepProducts",
                  "ReactionStepArrows",
                  "ReactionStepObjectsAboveArrow",
                  "ReactionStepObjectsBelowArrow")

    # One pass over the steps: collect every referenced id and remember
    # the first step (target for new below-arrow references).
    step_ids: set = set()
    first_step = None
    for scheme in page.findall("scheme"):
        for step in scheme.findall("step"):
            if first_step is None:
                first_step = step
            for attr in step_attrs:
                for tok in step.get(attr, "").split():
                    try:
                        step_ids.add(int(tok))
                    except ValueError:
                        # Non-numeric token: cannot match an element id.
                        pass

    count = 0
    for t_el in page.findall("t"):
        tid_str = t_el.get("id")
        if tid_str is None:
            continue
        try:
            tid = int(tid_str)
        except ValueError:
            continue

        if tid in step_ids:
            continue  # Already referenced in a step — polisher handles it

        runs = t_el.findall("s")
        text = "".join((s.text or "") for s in runs).strip()
        if len(text) < 2:
            continue

        # Try to resolve via reagent DB
        display = db.display_for_name(text.lower())
        if display is None:
            continue

        # When the text already is the display form it is still attached
        # to the step below, just not renamed.
        if display.lower() != text.lower():
            # Put the whole display name into the first run and drop the
            # rest; writing it into every run would duplicate the name.
            runs[0].text = display
            for extra in runs[1:]:
                t_el.remove(extra)

            # Recalculate bounding box from the anchor point
            p = t_el.get("p")
            if p:
                parts = p.split()
                if len(parts) >= 2:
                    px, py = float(parts[0]), float(parts[1])
                    est_width = len(display) * 5.8
                    t_el.set("BoundingBox",
                             f"{px:.2f} {py - 9:.2f} "
                             f"{px + est_width:.2f} {py + 3:.2f}")
            if verbose:
                print(f" Renamed orphan text: '{text}' → '{display}'",
                      file=sys.stderr)

        # Add to step below-arrow references
        if first_step is not None:
            below_str = first_step.get(
                "ReactionStepObjectsBelowArrow", "")
            if str(tid) not in below_str.split():
                first_step.set(
                    "ReactionStepObjectsBelowArrow",
                    f"{below_str} {tid}".strip())
                if verbose:
                    print(f" Added '{display}' (id={tid}) to step "
                          f"below-arrow references",
                          file=sys.stderr)

        count += 1

    return count
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
# ---------------------------------------------------------------------------
|
|
431
|
+
# Alignment imports (from alignment.py)
|
|
432
|
+
# ---------------------------------------------------------------------------
|
|
433
|
+
# Geometry primitives + high-level alignment orchestrators live in
|
|
434
|
+
# alignment.py. We import what's needed here and keep backward-
|
|
435
|
+
# compatible private aliases for internal callers.
|
|
436
|
+
|
|
437
|
+
from ...layout.alignment import (
|
|
438
|
+
fragment_centroid as _fragment_centroid,
|
|
439
|
+
get_visible_carbon_positions as _get_visible_carbon_positions,
|
|
440
|
+
match_and_compute_rotation as _match_and_compute_rotation,
|
|
441
|
+
rotate_fragment_in_place as _rotate_all_coords,
|
|
442
|
+
rdkit_align_to_product,
|
|
443
|
+
kabsch_align_to_product,
|
|
444
|
+
align_product_to_reference,
|
|
445
|
+
rxnmapper_align_to_product,
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def _shift_element_coords(elem: ET.Element, dx: float, dy: float) -> None:
    """Translate the coordinates stored in *elem* by (dx, dy), in place.

    The following attributes are rewritten, formatted with two decimals:

    * ``p`` of every ``<n>`` and ``<t>`` in the tree rooted at *elem*
      (``iter`` includes *elem* itself when its tag matches),
    * ``BoundingBox`` of every such ``<t>``,
    * ``BoundingBox`` of *elem* itself.

    ``BoundingBox`` attributes on other descendants (for example nested
    ``<fragment>`` elements) are NOT touched; callers that need those
    moved must do it themselves.

    A ``p`` with fewer than two values, or a ``BoundingBox`` with fewer
    than four, is left as it is.
    """
    def _move_point(el: ET.Element) -> None:
        raw = el.get("p")
        if not raw:
            return
        xy = raw.split()
        if len(xy) >= 2:
            el.set("p", f"{float(xy[0]) + dx:.2f} "
                        f"{float(xy[1]) + dy:.2f}")

    def _move_box(el: ET.Element) -> None:
        raw = el.get("BoundingBox")
        if not raw:
            return
        vals = [float(v) for v in raw.split()]
        if len(vals) >= 4:
            el.set("BoundingBox",
                   f"{vals[0]+dx:.2f} {vals[1]+dy:.2f} "
                   f"{vals[2]+dx:.2f} {vals[3]+dy:.2f}")

    for node in elem.iter("n"):
        _move_point(node)

    for label in elem.iter("t"):
        _move_point(label)
        _move_box(label)

    # When elem is itself a <t>, the loop above has already moved its
    # BoundingBox; moving it again here would shift it twice.
    if elem.tag != "t":
        _move_box(elem)
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
# Note: RDKit MCS alignment functions have been moved to alignment.py.
|
|
486
|
+
# rdkit_align_to_product and kabsch_align_to_product are imported above.
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
# ---------------------------------------------------------------------------
|
|
490
|
+
# ChemScript per-fragment structure cleanup
|
|
491
|
+
# ---------------------------------------------------------------------------
|
|
492
|
+
|
|
493
|
+
# Dummy element used to replace abbreviation nodes during cleanup.
|
|
494
|
+
# Iodine (53) is a safe choice: ChemScript treats it as a normal atom
|
|
495
|
+
# and won't add hydrogens. In the rare case the molecule already has
|
|
496
|
+
# iodine, dummies are matched back by position proximity.
|
|
497
|
+
_ABBREV_DUMMY_ELEMENT = "53"
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _cleanup_fragments_chemscript(root: ET.Element,
|
|
501
|
+
verbose: bool = False) -> int:
|
|
502
|
+
"""Clean up each fragment's geometry via ChemScript CleanupStructure.
|
|
503
|
+
|
|
504
|
+
Extracts each <fragment> from the page into a standalone CDXML,
|
|
505
|
+
runs ChemScript cleanup on it, then replaces the fragment in-place
|
|
506
|
+
while preserving the original centroid position and element ID.
|
|
507
|
+
|
|
508
|
+
**Abbreviation preservation:** Before cleanup, any abbreviation nodes
|
|
509
|
+
(``NodeType="Fragment"``) are temporarily replaced with dummy atoms
|
|
510
|
+
(Iodine) so ChemScript doesn't expand them. After cleanup, the
|
|
511
|
+
saved abbreviation nodes are restored at the cleaned positions.
|
|
512
|
+
|
|
513
|
+
**Orientation preservation:** Kabsch alignment on visible carbon
|
|
514
|
+
atom positions corrects arbitrary rotations introduced by cleanup.
|
|
515
|
+
|
|
516
|
+
Returns the number of fragments cleaned.
|
|
517
|
+
"""
|
|
518
|
+
page = root.find("page")
|
|
519
|
+
if page is None:
|
|
520
|
+
return 0
|
|
521
|
+
|
|
522
|
+
# Lazy-init ChemScript bridge
|
|
523
|
+
cs_bridge = None
|
|
524
|
+
|
|
525
|
+
def _ensure_cs():
|
|
526
|
+
nonlocal cs_bridge
|
|
527
|
+
if cs_bridge is None:
|
|
528
|
+
from ...chemdraw.chemscript_bridge import ChemScriptBridge
|
|
529
|
+
cs_bridge = ChemScriptBridge()
|
|
530
|
+
return cs_bridge
|
|
531
|
+
|
|
532
|
+
cleaned_count = 0
|
|
533
|
+
|
|
534
|
+
for frag in list(page.findall("fragment")): # list() — we modify page
|
|
535
|
+
frag_id = frag.get("id", "?")
|
|
536
|
+
|
|
537
|
+
# Measure current centroid
|
|
538
|
+
old_cx, old_cy = _fragment_centroid(frag)
|
|
539
|
+
if old_cx == 0.0 and old_cy == 0.0:
|
|
540
|
+
if verbose:
|
|
541
|
+
print(f" Fragment {frag_id}: no atom coords, skipping",
|
|
542
|
+
file=sys.stderr)
|
|
543
|
+
continue
|
|
544
|
+
|
|
545
|
+
# Save visible carbon positions for Kabsch orientation matching
|
|
546
|
+
old_carbons = _get_visible_carbon_positions(frag)
|
|
547
|
+
|
|
548
|
+
# Skip fragments with too few visible carbons — these are likely
|
|
549
|
+
# inorganic salts (Cs2CO3=1C, NaH=0C) or very small molecules
|
|
550
|
+
# that don't benefit from geometry cleanup. ChemScript can also
|
|
551
|
+
# alter their connectivity (e.g. strip counterions from salts).
|
|
552
|
+
if len(old_carbons) < 3:
|
|
553
|
+
if verbose:
|
|
554
|
+
print(f" Fragment {frag_id}: only {len(old_carbons)} "
|
|
555
|
+
f"visible carbon(s), skipping cleanup",
|
|
556
|
+
file=sys.stderr)
|
|
557
|
+
continue
|
|
558
|
+
|
|
559
|
+
# Preserve objecttag children (FM MOLECULE TYPE, etc.)
|
|
560
|
+
saved_objecttags = []
|
|
561
|
+
for ot in frag.findall("objecttag"):
|
|
562
|
+
saved_objecttags.append(copy.deepcopy(ot))
|
|
563
|
+
|
|
564
|
+
# --- Abbreviation preservation: swap with dummy atoms ---
|
|
565
|
+
# Work on a deep copy so the original fragment is untouched
|
|
566
|
+
# in case cleanup fails.
|
|
567
|
+
work_frag = copy.deepcopy(frag)
|
|
568
|
+
saved_abbrevs = [] # list of deep-copied abbreviation <n> elements
|
|
569
|
+
|
|
570
|
+
for n in work_frag.findall("n"):
|
|
571
|
+
if n.get("NodeType") != "Fragment":
|
|
572
|
+
continue
|
|
573
|
+
# Save deep copy of the full abbreviation node
|
|
574
|
+
saved_abbrevs.append(copy.deepcopy(n))
|
|
575
|
+
# Strip to dummy atom: remove inner fragment + label
|
|
576
|
+
for child in list(n):
|
|
577
|
+
n.remove(child)
|
|
578
|
+
for attr in ("NodeType", "LabelDisplay", "NeedsClean",
|
|
579
|
+
"AS", "Warning"):
|
|
580
|
+
if attr in n.attrib:
|
|
581
|
+
del n.attrib[attr]
|
|
582
|
+
n.set("Element", _ABBREV_DUMMY_ELEMENT)
|
|
583
|
+
n.set("NumHydrogens", "0")
|
|
584
|
+
|
|
585
|
+
if saved_abbrevs and verbose:
|
|
586
|
+
labels = []
|
|
587
|
+
for sa in saved_abbrevs:
|
|
588
|
+
t = sa.find("t")
|
|
589
|
+
if t is not None:
|
|
590
|
+
labels.append("".join(
|
|
591
|
+
(s.text or "") for s in t.findall("s")))
|
|
592
|
+
print(f" Fragment {frag_id}: {len(saved_abbrevs)} abbreviation(s) "
|
|
593
|
+
f"swapped with dummies ({', '.join(labels)})",
|
|
594
|
+
file=sys.stderr)
|
|
595
|
+
|
|
596
|
+
# Wrap the modified copy in minimal CDXML
|
|
597
|
+
frag_xml = ET.tostring(work_frag, encoding="unicode")
|
|
598
|
+
wrapper_cdxml = (
|
|
599
|
+
f'{CDXML_MINIMAL_HEADER}\n'
|
|
600
|
+
'<page id="1">\n'
|
|
601
|
+
f'{frag_xml}\n'
|
|
602
|
+
'</page>\n'
|
|
603
|
+
f'{CDXML_FOOTER}'
|
|
604
|
+
)
|
|
605
|
+
|
|
606
|
+
tmp_in = tmp_out = None
|
|
607
|
+
try:
|
|
608
|
+
_ensure_cs()
|
|
609
|
+
|
|
610
|
+
with tempfile.NamedTemporaryFile(
|
|
611
|
+
suffix=".cdxml", mode="w", delete=False, encoding="utf-8"
|
|
612
|
+
) as f:
|
|
613
|
+
f.write(wrapper_cdxml)
|
|
614
|
+
tmp_in = f.name
|
|
615
|
+
|
|
616
|
+
tmp_out = tmp_in.replace(".cdxml", "-clean.cdxml")
|
|
617
|
+
cs_bridge.cleanup(tmp_in, output=tmp_out)
|
|
618
|
+
|
|
619
|
+
# Parse cleaned output
|
|
620
|
+
clean_tree = ET.parse(tmp_out)
|
|
621
|
+
clean_root = clean_tree.getroot()
|
|
622
|
+
clean_page = clean_root.find("page")
|
|
623
|
+
if clean_page is None:
|
|
624
|
+
continue
|
|
625
|
+
clean_frag = clean_page.find("fragment")
|
|
626
|
+
if clean_frag is None:
|
|
627
|
+
continue
|
|
628
|
+
|
|
629
|
+
# --- Preserve original orientation via Kabsch alignment ---
|
|
630
|
+
# ChemScript cleanup can arbitrarily rotate the structure.
|
|
631
|
+
# Use visible carbon positions (same count before/after since
|
|
632
|
+
# abbreviations were replaced with dummies, not carbons).
|
|
633
|
+
new_carbons = _get_visible_carbon_positions(clean_frag)
|
|
634
|
+
|
|
635
|
+
if (len(old_carbons) >= 3
|
|
636
|
+
and len(new_carbons) == len(old_carbons)):
|
|
637
|
+
cos_a, sin_a, angle_deg = _match_and_compute_rotation(
|
|
638
|
+
new_carbons, old_carbons)
|
|
639
|
+
if abs(angle_deg) >= 1.0:
|
|
640
|
+
rot_cx, rot_cy = _fragment_centroid(clean_frag)
|
|
641
|
+
_rotate_all_coords(
|
|
642
|
+
clean_frag, cos_a, sin_a, rot_cx, rot_cy)
|
|
643
|
+
if verbose:
|
|
644
|
+
print(f" Fragment {frag_id}: re-aligned "
|
|
645
|
+
f"{angle_deg:.1f}\u00b0 to original "
|
|
646
|
+
f"orientation", file=sys.stderr)
|
|
647
|
+
elif verbose and old_carbons:
|
|
648
|
+
print(f" Fragment {frag_id}: Kabsch skipped "
|
|
649
|
+
f"(old={len(old_carbons)}, "
|
|
650
|
+
f"new={len(new_carbons)} visible carbons)",
|
|
651
|
+
file=sys.stderr)
|
|
652
|
+
|
|
653
|
+
# Compute new centroid and shift to old position
|
|
654
|
+
new_cx, new_cy = _fragment_centroid(clean_frag)
|
|
655
|
+
if new_cx == 0.0 and new_cy == 0.0:
|
|
656
|
+
continue
|
|
657
|
+
|
|
658
|
+
dx = old_cx - new_cx
|
|
659
|
+
dy = old_cy - new_cy
|
|
660
|
+
|
|
661
|
+
# Shift all coordinates in the cleaned fragment
|
|
662
|
+
_shift_element_coords(clean_frag, dx, dy)
|
|
663
|
+
# Also shift inner fragment BoundingBoxes (not covered by
|
|
664
|
+
# _shift_element_coords since it uses .iter on the element
|
|
665
|
+
# itself, but inner <fragment> BB is on a non-n/non-t tag)
|
|
666
|
+
for inner in clean_frag.iter("fragment"):
|
|
667
|
+
if inner is not clean_frag:
|
|
668
|
+
ib = inner.get("BoundingBox")
|
|
669
|
+
if ib:
|
|
670
|
+
vals = [float(v) for v in ib.split()]
|
|
671
|
+
if len(vals) >= 4:
|
|
672
|
+
inner.set("BoundingBox",
|
|
673
|
+
f"{vals[0]+dx:.2f} {vals[1]+dy:.2f} "
|
|
674
|
+
f"{vals[2]+dx:.2f} {vals[3]+dy:.2f}")
|
|
675
|
+
|
|
676
|
+
# --- Restore abbreviation nodes ---
|
|
677
|
+
if saved_abbrevs:
|
|
678
|
+
# Find dummy atoms in the cleaned fragment
|
|
679
|
+
dummies = [n for n in clean_frag.findall("n")
|
|
680
|
+
if n.get("Element") == _ABBREV_DUMMY_ELEMENT]
|
|
681
|
+
|
|
682
|
+
# Match dummies to saved abbreviations by position proximity
|
|
683
|
+
used_saved = set()
|
|
684
|
+
for dummy in dummies:
|
|
685
|
+
dp = dummy.get("p", "").split()
|
|
686
|
+
if len(dp) < 2:
|
|
687
|
+
continue
|
|
688
|
+
d_x, d_y = float(dp[0]), float(dp[1])
|
|
689
|
+
|
|
690
|
+
# Find closest saved abbreviation
|
|
691
|
+
best_si = -1
|
|
692
|
+
best_d2 = float("inf")
|
|
693
|
+
for si, saved in enumerate(saved_abbrevs):
|
|
694
|
+
if si in used_saved:
|
|
695
|
+
continue
|
|
696
|
+
sp = saved.get("p", "").split()
|
|
697
|
+
if len(sp) < 2:
|
|
698
|
+
continue
|
|
699
|
+
s_x, s_y = float(sp[0]), float(sp[1])
|
|
700
|
+
d2 = (d_x - s_x) ** 2 + (d_y - s_y) ** 2
|
|
701
|
+
if d2 < best_d2:
|
|
702
|
+
best_d2 = d2
|
|
703
|
+
best_si = si
|
|
704
|
+
|
|
705
|
+
if best_si < 0:
|
|
706
|
+
continue
|
|
707
|
+
used_saved.add(best_si)
|
|
708
|
+
saved_node = saved_abbrevs[best_si]
|
|
709
|
+
|
|
710
|
+
# Compute offset from old to new abbreviation position
|
|
711
|
+
old_sp = saved_node.get("p", "").split()
|
|
712
|
+
if len(old_sp) < 2:
|
|
713
|
+
continue
|
|
714
|
+
abbr_dx = d_x - float(old_sp[0])
|
|
715
|
+
abbr_dy = d_y - float(old_sp[1])
|
|
716
|
+
|
|
717
|
+
# Update abbreviation node position + ID
|
|
718
|
+
saved_node.set("p", dummy.get("p"))
|
|
719
|
+
saved_node.set("id", dummy.get("id"))
|
|
720
|
+
|
|
721
|
+
# Shift inner fragment coordinates by the same offset
|
|
722
|
+
inner_frag = saved_node.find("fragment")
|
|
723
|
+
if inner_frag is not None:
|
|
724
|
+
_shift_element_coords(inner_frag, abbr_dx, abbr_dy)
|
|
725
|
+
|
|
726
|
+
# Replace dummy with abbreviation in the fragment
|
|
727
|
+
children = list(clean_frag)
|
|
728
|
+
idx = children.index(dummy)
|
|
729
|
+
clean_frag.remove(dummy)
|
|
730
|
+
clean_frag.insert(idx, saved_node)
|
|
731
|
+
|
|
732
|
+
if verbose:
|
|
733
|
+
lbl = ""
|
|
734
|
+
t = saved_node.find("t")
|
|
735
|
+
if t is not None:
|
|
736
|
+
lbl = "".join(
|
|
737
|
+
(s.text or "") for s in t.findall("s"))
|
|
738
|
+
print(f" Fragment {frag_id}: restored "
|
|
739
|
+
f"abbreviation '{lbl}' at "
|
|
740
|
+
f"({d_x:.1f}, {d_y:.1f})",
|
|
741
|
+
file=sys.stderr)
|
|
742
|
+
|
|
743
|
+
# Preserve original fragment ID
|
|
744
|
+
clean_frag.set("id", frag_id)
|
|
745
|
+
|
|
746
|
+
# Restore objecttags (ChemScript strips custom metadata)
|
|
747
|
+
for ot in saved_objecttags:
|
|
748
|
+
clean_frag.append(ot)
|
|
749
|
+
|
|
750
|
+
# Replace fragment in page
|
|
751
|
+
page_children = list(page)
|
|
752
|
+
frag_index = page_children.index(frag)
|
|
753
|
+
page.remove(frag)
|
|
754
|
+
page.insert(frag_index, clean_frag)
|
|
755
|
+
|
|
756
|
+
cleaned_count += 1
|
|
757
|
+
if verbose:
|
|
758
|
+
print(f" Fragment {frag_id}: cleaned "
|
|
759
|
+
f"(shift dx={dx:.1f}, dy={dy:.1f})",
|
|
760
|
+
file=sys.stderr)
|
|
761
|
+
|
|
762
|
+
except Exception as exc:
|
|
763
|
+
if verbose:
|
|
764
|
+
print(f" Fragment {frag_id}: cleanup failed: {exc}",
|
|
765
|
+
file=sys.stderr)
|
|
766
|
+
finally:
|
|
767
|
+
for tmp in (tmp_in, tmp_out):
|
|
768
|
+
if tmp and os.path.exists(tmp):
|
|
769
|
+
try:
|
|
770
|
+
os.unlink(tmp)
|
|
771
|
+
except OSError:
|
|
772
|
+
pass
|
|
773
|
+
|
|
774
|
+
# Close ChemScript bridge
|
|
775
|
+
if cs_bridge is not None:
|
|
776
|
+
try:
|
|
777
|
+
cs_bridge.close()
|
|
778
|
+
except Exception:
|
|
779
|
+
pass
|
|
780
|
+
|
|
781
|
+
return cleaned_count
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
def _cleanup_fragments_rdkit(root: ET.Element,
                             verbose: bool = False) -> int:
    """Run the RDKit geometry cleanup on every top-level fragment.

    Each ``<fragment>`` that is a direct child of the first ``<page>`` under
    *root* is handed to ``rdkit_utils.cleanup_fragment_rdkit()`` (RDKit 2D
    layout followed by Kabsch orientation restoration; abbreviation groups
    are carried along as dummy atoms so their bonds get proper lengths too).

    A fragment whose cleanup raises is skipped; the failure is reported on
    stderr only when *verbose* is set.

    Returns the number of fragments for which the helper reported success.
    """
    from ...rdkit_utils import cleanup_fragment_rdkit

    first_page = root.find("page")
    if first_page is None:
        return 0

    n_ok = 0
    for fragment in first_page.findall("fragment"):
        try:
            succeeded = cleanup_fragment_rdkit(fragment, verbose)
        except Exception as err:
            # Best effort: one bad fragment must not abort the others.
            if verbose:
                frag_id = fragment.get("id", "?")
                print(f" [warn] RDKit cleanup skipped fragment {frag_id}: {err}",
                      file=sys.stderr)
            continue
        if succeeded:
            n_ok += 1
    return n_ok
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
# ---------------------------------------------------------------------------
|
|
814
|
+
# CDXML I/O helpers
|
|
815
|
+
# ---------------------------------------------------------------------------
|
|
816
|
+
|
|
817
|
+
def _parse_cdxml(path: str) -> ET.ElementTree:
|
|
818
|
+
return ET.parse(path)
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
def _write_cdxml(tree: ET.ElementTree, path: str):
|
|
822
|
+
"""Write CDXML, re-inserting DOCTYPE."""
|
|
823
|
+
tree.write(path, xml_declaration=True, encoding="UTF-8")
|
|
824
|
+
with open(path, "r", encoding="utf-8") as f:
|
|
825
|
+
content = f.read()
|
|
826
|
+
if "<!DOCTYPE" not in content:
|
|
827
|
+
content = content.replace(
|
|
828
|
+
"?>",
|
|
829
|
+
'?>\n<!DOCTYPE CDXML SYSTEM '
|
|
830
|
+
'"http://www.cambridgesoft.com/xml/cdxml.dtd" >',
|
|
831
|
+
1,
|
|
832
|
+
)
|
|
833
|
+
content = content.replace("ns0:", "").replace(":ns0", "")
|
|
834
|
+
with open(path, "w", encoding="utf-8") as f:
|
|
835
|
+
f.write(content)
|
|
836
|
+
|
|
837
|
+
|
|
838
|
+
def _convert_cdx_to_cdxml(cdx_path: str, verbose: bool = False) -> str:
|
|
839
|
+
"""Convert CDX to CDXML using cdx_converter.py.
|
|
840
|
+
|
|
841
|
+
Returns path to the generated CDXML file.
|
|
842
|
+
"""
|
|
843
|
+
cdxml_path = os.path.splitext(cdx_path)[0] + ".cdxml"
|
|
844
|
+
cmd = [sys.executable, "-m", "cdxml_toolkit.cdx_converter",
|
|
845
|
+
cdx_path, "-o", cdxml_path]
|
|
846
|
+
if verbose:
|
|
847
|
+
print(f" Converting CDX → CDXML: {os.path.basename(cdx_path)}",
|
|
848
|
+
file=sys.stderr)
|
|
849
|
+
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
850
|
+
if result.returncode != 0:
|
|
851
|
+
raise RuntimeError(f"CDX conversion failed: {result.stderr.strip()}")
|
|
852
|
+
if verbose:
|
|
853
|
+
print(f" {result.stdout.strip()}", file=sys.stderr)
|
|
854
|
+
return cdxml_path
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
# ---------------------------------------------------------------------------
|
|
858
|
+
# Main pipeline
|
|
859
|
+
# ---------------------------------------------------------------------------
|
|
860
|
+
|
|
861
|
+
def run_pipeline(
    input_path: str,
    output_path: str,
    merge_conditions: bool = True,
    approach: str = "chemdraw_mimic",
    chemscript_cleanup: bool = True,
    align_mode: str = "rdkit",
    eln_csv: Optional[str] = None,
    ref_cdxml: Optional[str] = None,
    verbose: bool = False,
) -> str:
    """Run the full COM-free polishing pipeline.

    Parameters
    ----------
    input_path : str
        Path to input .cdx or .cdxml file.
    output_path : str
        Path for final output .cdxml file.
    merge_conditions : bool
        Merge all condition text into one centered block (default True).
    approach : str
        Layout approach for reaction_cleanup (default "chemdraw_mimic").
    chemscript_cleanup : bool
        Run per-fragment geometry cleanup before bond normalization
        (default True).  RDKit is tried first; ChemScript is used only when
        RDKit raises or cleans zero fragments.  If both fail the pipeline
        continues without cleanup.
    align_mode : str
        How to align reactant/reagent orientations to the product.
        "rdkit" (default): RDKit MCS + GenerateDepictionMatching2DStructure.
            Can rotate individual bonds, not just the whole molecule.
            Falls back to Kabsch alignment if the RDKit step raises.
        "rxnmapper": ML transformer atom mapping via RXNMapper.
            Falls back to RDKit MCS if the RXNMapper step raises.
        "kabsch": rigid-rotation Kabsch alignment
            (legacy mode — only rotates the entire fragment).
        Any other value skips alignment (a warning is logged when verbose).
    eln_csv : str or None
        Path to Findmolecule ELN CSV file for enrichment (equivalents,
        run arrow with SM mass and product yield).
    ref_cdxml : str or None
        Path to a reference CDXML file containing known-good structures
        drawn with the desired orientation (e.g. from a group meeting
        slide).  The product is aligned to the best-matching reference
        structure via MCS, then reactants are aligned to the product.
    verbose : bool
        Print progress to stderr.

    Returns
    -------
    str
        Path to the output file.

    Raises
    ------
    ValueError
        If *input_path* has an extension other than .cdx or .cdxml.
    RuntimeError
        If CDX → CDXML conversion fails.
    """
    def log(msg: str):
        if verbose:
            print(f"[v2] {msg}", file=sys.stderr)

    input_path = os.path.abspath(input_path)
    output_path = os.path.abspath(output_path)
    ext = os.path.splitext(input_path)[1].lower()

    # --- CDX → CDXML conversion if needed ---
    # The converted file is left next to the input; the user may want it.
    if ext == ".cdx":
        cdxml_path = _convert_cdx_to_cdxml(input_path, verbose)
    elif ext == ".cdxml":
        cdxml_path = input_path
    else:
        raise ValueError(f"Unsupported file format: {ext}")

    # --- Parse CDXML ---
    tree = _parse_cdxml(cdxml_path)
    root = tree.getroot()

    # --- Step 0: optional per-fragment geometry cleanup ---
    if chemscript_cleanup:
        log("Step 0: Running fragment geometry cleanup...")
        cleanup_done = False
        # RDKit is the default cleanup path (works without ChemScript)
        try:
            n_cleaned = _cleanup_fragments_rdkit(root, verbose)
            if n_cleaned > 0:
                log(f" Cleaned {n_cleaned} fragment(s) via RDKit")
                cleanup_done = True
            else:
                log(" RDKit cleanup returned 0 fragments, trying ChemScript...")
        except Exception as exc:
            log(f" RDKit cleanup failed ({exc}), trying ChemScript...")
        # ChemScript fallback (if available and RDKit didn't clean anything)
        if not cleanup_done:
            try:
                n_cleaned = _cleanup_fragments_chemscript(root, verbose)
                if n_cleaned > 0:
                    log(f" Cleaned {n_cleaned} fragment(s) via ChemScript")
                else:
                    log(" No fragments cleaned by either backend")
            except Exception as exc2:
                log(f" ChemScript also unavailable ({exc2}), "
                    f"continuing without cleanup...")

    # --- Step 1: Normalize bond lengths per-fragment ---
    log("Step 1: Normalizing bond lengths to ACS 14.40 pt...")
    n_scaled = normalize_bond_lengths(root, TARGET_BOND_LENGTH, verbose)
    log(f" Scaled {n_scaled} fragment(s)")

    # --- Step 2: Apply ACS document settings ---
    log("Step 2: Applying ACS Document 1996 settings...")
    apply_acs_settings(root)

    # --- Step 3: Normalize fonts ---
    log("Step 3: Normalizing fonts to Arial 10pt...")
    normalize_fonts(root, verbose)

    # --- Step 3b: Fix narrow vertical text from ELN exports ---
    n_fixed_text = fix_narrow_text(root, verbose)
    if n_fixed_text:
        log(f" Fixed {n_fixed_text} narrow text element(s)")

    # --- Step 3c: Resolve orphan reagent text labels ---
    n_resolved = resolve_orphan_reagent_text(root, verbose)
    if n_resolved:
        log(f" Resolved {n_resolved} orphan reagent text label(s)")

    # Intermediate files live in a private temp dir that is always removed.
    tmpdir = tempfile.mkdtemp(prefix="spv2_")
    try:
        # --- Write intermediate CDXML ---
        # Done inside the try so the temp dir is cleaned up even if the
        # write itself fails.
        normalized_path = os.path.join(tmpdir, "normalized.cdxml")
        _write_cdxml(tree, normalized_path)
        log(" Wrote normalized CDXML to temp")

        # --- Step 4: Run scheme_polisher logic ---
        # Always skip alignment inside polish_scheme — alignment is handled
        # as an explicit Step 4e below (rdkit, rxnmapper or kabsch).
        log("Step 4: Running scheme_polisher (classification + swaps + "
            "formatting, alignment deferred to Step 4e)...")
        from .scheme_polisher import polish_scheme, _compact_toward_arrow

        polished_path = os.path.join(tmpdir, "polished.cdxml")
        result = polish_scheme(
            normalized_path, polished_path,
            verbose=verbose,
            merge_conditions=merge_conditions,
            skip_alignment=True,
        )

        n_replaced = len(result["replacements"])
        n_promoted = len(result["promotions"])
        n_aligned = len(result.get("alignments", []))
        n_reformatted = len(result["reformatted"])
        n_deduped = len(result["dedup_removed"])
        log(f" {n_replaced} structure->text, {n_promoted} text->structure, "
            f"{n_aligned} aligned (Kabsch), {n_reformatted} reformatted, "
            f"{n_deduped} deduped"
            + (", conditions merged" if result.get("merged_conditions") else ""))

        # --- Step 4d: Align product to reference orientation ---
        if ref_cdxml:
            log("Step 4d: Aligning product to reference structure...")
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            try:
                success = align_product_to_reference(
                    polished_root, ref_cdxml, verbose=verbose)
                if success:
                    _write_cdxml(polished_tree, polished_path)
                    log(" Product aligned to reference orientation")
                else:
                    log(" No matching reference found — product keeps "
                        "current orientation")
            except Exception as exc:
                log(f" WARNING: Reference alignment failed ({exc})")

        # --- Step 4e: Alignment to product orientation ---
        # Every attempt re-parses the file from disk so that a failed
        # attempt cannot leave a half-modified tree behind for its fallback.
        if align_mode == "rdkit":
            log("Step 4e: RDKit MCS alignment to product orientation...")
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            try:
                n_rdkit_aligned = rdkit_align_to_product(
                    polished_root, verbose=verbose)
                if n_rdkit_aligned > 0:
                    _write_cdxml(polished_tree, polished_path)
                    log(f" Aligned {n_rdkit_aligned} fragment(s) via "
                        f"RDKit MCS + GenerateDepictionMatching2DStructure")
                else:
                    log(" No fragments aligned via RDKit "
                        "(MCS too small or RDKit unavailable)")
            except Exception as exc:
                log(f" WARNING: RDKit alignment failed ({exc}), "
                    f"falling back to Kabsch...")
                # Fall back to Kabsch if RDKit fails
                polished_tree = _parse_cdxml(polished_path)
                polished_root = polished_tree.getroot()
                try:
                    aligned_ids = kabsch_align_to_product(
                        polished_root, verbose=verbose)
                    if aligned_ids:
                        _write_cdxml(polished_tree, polished_path)
                        log(f" Kabsch fallback aligned {len(aligned_ids)} "
                            f"fragment(s)")
                except Exception as exc2:
                    log(f" WARNING: Kabsch fallback also failed ({exc2})")
        elif align_mode == "rxnmapper":
            log("Step 4e: RXNMapper alignment to product orientation...")
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            try:
                n_rxnm_aligned = rxnmapper_align_to_product(
                    polished_root, verbose=verbose)
                if n_rxnm_aligned > 0:
                    _write_cdxml(polished_tree, polished_path)
                    log(f" Aligned {n_rxnm_aligned} fragment(s) via "
                        f"RXNMapper atom maps")
                else:
                    log(" No fragments aligned via RXNMapper")
            except Exception as exc:
                log(f" WARNING: RXNMapper alignment failed ({exc}), "
                    f"falling back to RDKit MCS...")
                polished_tree = _parse_cdxml(polished_path)
                polished_root = polished_tree.getroot()
                try:
                    n_rdkit_aligned = rdkit_align_to_product(
                        polished_root, verbose=verbose)
                    if n_rdkit_aligned > 0:
                        _write_cdxml(polished_tree, polished_path)
                        log(f" MCS fallback aligned {n_rdkit_aligned} "
                            f"fragment(s)")
                except Exception as exc2:
                    log(f" WARNING: MCS fallback also failed ({exc2})")
        elif align_mode == "kabsch":
            log("Step 4e: Kabsch alignment to product orientation...")
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            try:
                aligned_ids = kabsch_align_to_product(
                    polished_root, verbose=verbose)
                if aligned_ids:
                    _write_cdxml(polished_tree, polished_path)
                    log(f" Aligned {len(aligned_ids)} fragment(s) via Kabsch")
                else:
                    log(" No fragments aligned via Kabsch")
            except Exception as exc:
                log(f" WARNING: Kabsch alignment failed ({exc})")
        else:
            # Previously an unrecognised mode was skipped without a trace.
            log(f" WARNING: unknown align_mode {align_mode!r}, "
                f"skipping alignment")

        # --- Step 4.5: Reposition non-substrate reactant above arrow ---
        if eln_csv:
            from .eln_enrichment import reposition_reactant_above_arrow

            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            if reposition_reactant_above_arrow(
                    polished_root, eln_csv, verbose=verbose):
                _write_cdxml(polished_tree, polished_path)
                log("Step 4.5: Repositioned non-substrate reactant above arrow")

        # --- Step 5.5: Phase A — ELN enrichment (equiv into text, before layout) ---
        enrichment_data = None
        if eln_csv:
            log("Step 5.5: Phase A — Injecting equivalents into text...")
            from .eln_enrichment import match_csv_to_scheme, enrich_phase_a

            # Re-parse the polished CDXML to inject equivs
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()

            enrichment_data = match_csv_to_scheme(
                polished_root, eln_csv, verbose=verbose)
            log(f" Matched {len(enrichment_data.matches)} CSV reagents "
                f"to scheme elements")

            merged_text_id = result.get("merged_text_id")
            enrich_phase_a(
                polished_root, enrichment_data,
                merged_text_id=str(merged_text_id) if merged_text_id else None,
                verbose=verbose,
            )

            # Write back
            _write_cdxml(polished_tree, polished_path)

        # --- Step 5: Compact toward arrow ---
        log("Step 5: Compacting objects toward arrow...")
        _compact_toward_arrow(polished_path, verbose)

        # --- Step 6: Run reaction_cleanup ---
        log(f"Step 6: Running reaction_cleanup (approach={approach})...")
        from ...layout.reaction_cleanup import run_cleanup

        run_cleanup(polished_path, output_path, approach=approach, verbose=verbose)
        log(" Final layout complete")

        # --- Step 7.5: Phase B — ELN enrichment (run arrow + eq labels, after layout) ---
        if eln_csv and enrichment_data:
            log("Step 7.5: Phase B — Adding run arrow + eq labels...")
            from .eln_enrichment import enrich_phase_b

            final_tree = _parse_cdxml(output_path)
            final_root = final_tree.getroot()

            enrich_phase_b(final_root, enrichment_data, verbose=verbose)

            _write_cdxml(final_tree, output_path)
            log(" Enrichment complete")

    finally:
        import shutil
        # Best-effort removal; a leftover temp dir must not mask the result
        # or the original exception.
        shutil.rmtree(tmpdir, ignore_errors=True)

    log(f"Output: {output_path}")
    return output_path
|
|
1173
|
+
|
|
1174
|
+
|
|
1175
|
+
# ---------------------------------------------------------------------------
|
|
1176
|
+
# CLI
|
|
1177
|
+
# ---------------------------------------------------------------------------
|
|
1178
|
+
|
|
1179
|
+
def _classify_error(exc: Exception) -> str:
|
|
1180
|
+
"""Map an exception to a machine-readable error code."""
|
|
1181
|
+
msg = str(exc).lower()
|
|
1182
|
+
name = type(exc).__name__
|
|
1183
|
+
|
|
1184
|
+
if name == "FileNotFoundError" or "not found" in msg:
|
|
1185
|
+
return "file_not_found"
|
|
1186
|
+
if "parse" in msg or "xml" in msg.lower() or name == "ParseError":
|
|
1187
|
+
return "cdxml_parse_failed"
|
|
1188
|
+
if "rdkit" in msg or "smiles" in msg:
|
|
1189
|
+
return "smiles_parse_failed"
|
|
1190
|
+
if "chemscript" in msg:
|
|
1191
|
+
return "chemscript_error"
|
|
1192
|
+
if "alignment" in msg or "mcs" in msg:
|
|
1193
|
+
return "alignment_failed"
|
|
1194
|
+
if "enrichment" in msg or "csv" in msg:
|
|
1195
|
+
return "enrichment_failed"
|
|
1196
|
+
if "layout" in msg or "cleanup" in msg:
|
|
1197
|
+
return "layout_failed"
|
|
1198
|
+
if name in ("KeyError", "IndexError", "ValueError", "TypeError"):
|
|
1199
|
+
return "internal_error"
|
|
1200
|
+
return "pipeline_failed"
|
|
1201
|
+
|
|
1202
|
+
|
|
1203
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the polishing pipeline.

    Parameters
    ----------
    argv : list of str or None
        Argument vector to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        0 on success (also when only the optional ``--render`` step fails),
        1 when the input file does not exist or the pipeline raises.
    """
    from ...layout.reaction_cleanup import APPROACHES

    parser = argparse.ArgumentParser(
        description=(
            "COM-free scheme polishing pipeline: normalize bond lengths, "
            "classify reagents, swap structures/text, align orientations, "
            "format subscripts, merge conditions, and clean up layout."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "input",
        help="Input .cdx or .cdxml file",
    )
    parser.add_argument(
        "-o", "--output", default=None,
        help="Output CDXML file (default: <input_stem>-v2.cdxml)",
    )
    parser.add_argument(
        "--no-merge-conditions", action="store_true",
        help="Keep condition text as separate labels (default: merge into one block)",
    )
    parser.add_argument(
        "--approach", choices=list(APPROACHES.keys()),
        default="chemdraw_mimic",
        help="Layout approach for reaction_cleanup (default: chemdraw_mimic)",
    )
    parser.add_argument(
        "--no-chemscript-cleanup", action="store_true",
        help="Skip ChemScript CleanupStructure per fragment "
             "(default: cleanup is enabled to fix bond angles)",
    )
    parser.add_argument(
        "--align-mode", choices=["rdkit", "rxnmapper", "kabsch"],
        default="rdkit",
        help="Orientation alignment method (default: rdkit). "
             "'rdkit' uses MCS + GenerateDepictionMatching2DStructure "
             "(can rotate individual bonds for better alignment). "
             "'rxnmapper' uses ML transformer atom mapping to align "
             "reactants to product orientation (falls back to MCS). "
             "'kabsch' uses rigid-body rotation only (legacy backup).",
    )
    parser.add_argument(
        "--eln-csv", default=None,
        help="Findmolecule ELN CSV file for enrichment (adds equivalents, "
             "run arrow with SM mass and product yield)",
    )
    parser.add_argument(
        "--ref-cdxml", default=None,
        help="Reference CDXML file with known-good structure(s) for "
             "product orientation. The product is aligned to the best-"
             "matching reference via MCS, then reactants align to the "
             "product.",
    )
    parser.add_argument(
        "--render", action="store_true",
        help="Render output to PNG via cdxml_to_image.py",
    )
    parser.add_argument(
        "--json-errors", action="store_true",
        help="Output structured JSON error objects to stderr on failure "
             "(for agent orchestration)",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true",
        help="Print progress to stderr",
    )

    args = parser.parse_args(argv)

    def _emit_json_error(error_code: str, detail: str,
                         file: Optional[str] = None) -> None:
        """Write a structured JSON error to stderr if --json-errors."""
        if not args.json_errors:
            return
        obj = {"error": error_code, "detail": detail}
        if file:
            obj["file"] = file
        print(json.dumps(obj), file=sys.stderr)

    input_path = os.path.abspath(args.input)
    if not os.path.exists(input_path):
        msg = f"file not found: {input_path}"
        _emit_json_error("file_not_found", msg, os.path.basename(input_path))
        if not args.json_errors:
            print(f"ERROR: {msg}", file=sys.stderr)
        return 1

    if args.output is None:
        stem = os.path.splitext(input_path)[0]
        output_path = stem + "-v2.cdxml"
    else:
        output_path = os.path.abspath(args.output)

    try:
        run_pipeline(
            input_path,
            output_path,
            merge_conditions=not args.no_merge_conditions,
            approach=args.approach,
            chemscript_cleanup=not args.no_chemscript_cleanup,
            align_mode=args.align_mode,
            eln_csv=args.eln_csv,
            ref_cdxml=args.ref_cdxml,
            verbose=args.verbose,
        )
    except Exception as e:
        # Top-level boundary: report the failure in the requested format
        # (JSON object or plain text) and signal it through the exit code.
        error_code = _classify_error(e)
        _emit_json_error(error_code, str(e),
                         os.path.basename(input_path))
        if not args.json_errors:
            print(f"ERROR: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 1

    print(f"Output: {output_path}")

    if args.render:
        # Rendering is optional: a failure is reported but does not change
        # the exit status, since the CDXML output was already produced.
        try:
            from ...chemdraw.cdxml_to_image import cdxml_to_image
            png_path = cdxml_to_image(output_path)
            print(f"Rendered: {png_path}")
        except Exception as e:
            _emit_json_error("render_failed", str(e),
                             os.path.basename(output_path))
            if not args.json_errors:
                print(f"Render failed: {e}", file=sys.stderr)

    return 0
|
|
1337
|
+
|
|
1338
|
+
|
|
1339
|
+
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|