cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1337 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
scheme_polisher.py — Polish a CDXML reaction scheme for presentation.
|
|
4
|
+
|
|
5
|
+
Takes a CDXML reaction scheme (typically from eln_cdx_cleanup.py) and:
|
|
6
|
+
1. Classifies reagents as atom-contributing or non-contributing
|
|
7
|
+
(using reactant_heuristic.py)
|
|
8
|
+
2. Replaces non-contributing reagent structures with text abbreviations
|
|
9
|
+
(e.g. Cs₂CO₃ structure → "Cs2CO3" text, n-BuLi → "n-BuLi")
|
|
10
|
+
3. Promotes atom-contributing text labels to drawn structures
|
|
11
|
+
(e.g. "Morpholine" → morpholine structure via ChemScript name resolution)
|
|
12
|
+
4. Aligns atom-contributing reagents to match product orientation:
|
|
13
|
+
a. Finds atom correspondence via RDKit substructure match or MCS
|
|
14
|
+
b. Maps RDKit (MOL) atom indices to CDXML node indices by coordinate
|
|
15
|
+
matching (handles ChemScript atom reordering on export)
|
|
16
|
+
c. Computes optimal rigid rotation via Kabsch algorithm on matched
|
|
17
|
+
CDXML coordinates (3× heteroatom weighting for symmetric rings)
|
|
18
|
+
d. Applies rotation in-place around reagent centroid
|
|
19
|
+
5. Reformats text labels (subscripts for numbers, italic for prefixes)
|
|
20
|
+
6. Deduplicates identical reagents/conditions (e.g. duplicate "THF")
|
|
21
|
+
7. Optionally merges all condition text into a single centered block
|
|
22
|
+
below the arrow (--merge-conditions)
|
|
23
|
+
8. Compacts above/below-arrow objects toward the arrow
|
|
24
|
+
9. Optionally runs ChemDraw COM "Clean Up Reaction" for final spacing
|
|
25
|
+
|
|
26
|
+
Post-processing modes (default: compact + ChemDraw cleanup):
|
|
27
|
+
--no-chemdraw-cleanup Compact only, skip ChemDraw COM pass
|
|
28
|
+
--no-compact Skip compaction and ChemDraw COM (raw polished output)
|
|
29
|
+
|
|
30
|
+
Usage:
|
|
31
|
+
python scheme_polisher.py -i scheme.cdxml [-o polished.cdxml] [-v]
|
|
32
|
+
python scheme_polisher.py -i scheme.cdxml --merge-conditions -v
|
|
33
|
+
python scheme_polisher.py -i scheme.cdxml --no-chemdraw-cleanup
|
|
34
|
+
python scheme_polisher.py -i scheme.cdxml --no-compact
|
|
35
|
+
|
|
36
|
+
Dependencies:
|
|
37
|
+
- reactant_heuristic.py (reagent classification)
|
|
38
|
+
- chemscript_bridge.py (text→structure promotion, MOL export)
|
|
39
|
+
- rdkit (MCS, substructure matching)
|
|
40
|
+
- reagent_abbreviations.json (curated name→display mapping)
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
import argparse
|
|
44
|
+
import json
|
|
45
|
+
import os
|
|
46
|
+
import re
|
|
47
|
+
import sys
|
|
48
|
+
import tempfile
|
|
49
|
+
import time
|
|
50
|
+
from typing import Dict, List, Optional, Set, Tuple
|
|
51
|
+
from xml.etree import ElementTree as ET
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Shared reagent database
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
from ...resolve.reagent_db import get_reagent_db
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Text formatting: subscripts + italic prefixes (from text_formatting.py)
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
from ...text_formatting import (
|
|
66
|
+
build_formatted_s_xml as _build_formatted_s_xml, # Re-exported for eln_enrichment.py backward compat
|
|
67
|
+
needs_subscript as _needs_subscript,
|
|
68
|
+
split_italic_prefix as _split_italic_prefix,
|
|
69
|
+
SUBSCRIPT_RE as _SUBSCRIPT_RE,
|
|
70
|
+
ITALIC_PREFIXES as _ITALIC_PREFIXES,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Keep the old name as an alias used by _build_replacement_text_element
|
|
74
|
+
_build_subscripted_s_xml = _build_formatted_s_xml
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# CDXML Helpers
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
def _get_text_content(el: ET.Element) -> str:
|
|
82
|
+
"""Extract concatenated text from all <s> children of a <t> element.
|
|
83
|
+
Joins without spaces — chemical formulae like Cs2CO3 are split across
|
|
84
|
+
multiple <s> elements (Cs + 2 + CO + 3) and must not get spaces."""
|
|
85
|
+
parts = []
|
|
86
|
+
for s in el.iter("s"):
|
|
87
|
+
if s.text:
|
|
88
|
+
parts.append(s.text)
|
|
89
|
+
return "".join(parts).strip()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _get_fm_molecule_type(el: ET.Element) -> Optional[int]:
|
|
93
|
+
"""Read the Findmolecule MOLECULE TYPE objecttag.
|
|
94
|
+
Values: 0=molecule, 1=solvent, 2=condition text, 3=product."""
|
|
95
|
+
for ot in el.iter("objecttag"):
|
|
96
|
+
if ot.get("Name") == "FM MOLECULE TYPE":
|
|
97
|
+
try:
|
|
98
|
+
return int(ot.get("Value", ""))
|
|
99
|
+
except ValueError:
|
|
100
|
+
return None
|
|
101
|
+
return None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _fragment_bbox_center(frag: ET.Element) -> Tuple[float, float]:
    """Return an (x, y) center point for *frag*.

    The computation is delegated to ``cdxml_utils.fragment_centroid()``.
    When that helper returns ``None`` the fixed fallback point
    (500.0, 250.0) is returned instead, so callers always get a position.
    """
    from ...cdxml_utils import fragment_centroid

    center = fragment_centroid(frag)
    # Fallback center used when the helper reports no centroid.
    return (500.0, 250.0) if center is None else center
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _element_to_xml_string(el: ET.Element) -> str:
|
|
117
|
+
"""Serialize an element to a raw XML string."""
|
|
118
|
+
return ET.tostring(el, encoding="unicode")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
# Alignment imports (from alignment.py)
|
|
123
|
+
# ---------------------------------------------------------------------------
|
|
124
|
+
# All alignment primitives + high-level orchestrators live in alignment.py.
|
|
125
|
+
# We import the public names here and keep private aliases so any internal
|
|
126
|
+
# callers that used the old names still work.
|
|
127
|
+
|
|
128
|
+
from ...layout.alignment import (
|
|
129
|
+
sp_fragment_to_cdxml,
|
|
130
|
+
filtered_atom_nodes,
|
|
131
|
+
compute_rigid_rotation_2d,
|
|
132
|
+
rotate_fragment_in_place,
|
|
133
|
+
make_abbrev_dummy_copy,
|
|
134
|
+
kabsch_align_fragment_to_product,
|
|
135
|
+
kabsch_align_to_product,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
# Backward-compatible private aliases
|
|
139
|
+
_sp_fragment_to_cdxml = sp_fragment_to_cdxml
|
|
140
|
+
_filtered_atom_nodes = filtered_atom_nodes
|
|
141
|
+
_compute_rigid_rotation = compute_rigid_rotation_2d
|
|
142
|
+
_rotate_fragment_in_place = rotate_fragment_in_place
|
|
143
|
+
_make_abbrev_dummy_copy = make_abbrev_dummy_copy
|
|
144
|
+
_align_reagent_to_product = kabsch_align_fragment_to_product
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
# Display name resolution for non-contributing fragments
|
|
149
|
+
# ---------------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
def _resolve_display_name(
    smiles: Optional[str],
    name: Optional[str],
    role: Optional[str],
) -> Optional[str]:
    """Pick the text abbreviation shown for a non-contributing reagent.

    Lookups are tried in order and the first truthy answer wins:

      1a. reagent DB display name for the SMILES (exact canonical match)
      1b. reagent DB display name for the SMILES, ignoring stereochemistry
          (e.g. OPSIN omits E/Z on DEAD)
      2a. reagent DB display name for the name/alias (exact)
      2b. reagent DB display name via Levenshtein similarity on the name
          (typos and variants such as EDC.HCl, nBuLi, i-Pr2NEt)
      3.  the reagent name itself
      4.  ``None`` — the caller keeps the drawn structure

    *role* is accepted for interface compatibility; this function does not
    consult it.
    """
    db = get_reagent_db()

    # Build the chain lazily so later (more expensive) lookups only run
    # when the earlier ones came back empty.
    lookups = []
    if smiles:
        lookups.append(lambda: db.display_for_smiles(smiles))
        lookups.append(lambda: _match_smiles_no_stereo(smiles, db))
    if name:
        lookups.append(lambda: db.display_for_name(name))
        lookups.append(lambda: _match_name_levenshtein(name, db))

    for lookup in lookups:
        found = lookup()
        if found:
            return found

    # Nothing in the DB: fall back to the raw name, if there is one.
    return name if name else None
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# Levenshtein similarity threshold for name matching (0.0 - 1.0).
|
|
201
|
+
# 0.80 catches "EDC.HCl"→"edc" (0.86), "nBuLi"→"n-buli" (0.80),
|
|
202
|
+
# "DIEA"→"dipea" (0.80) while rejecting spurious matches.
|
|
203
|
+
_LEVENSHTEIN_THRESHOLD = 0.80
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _levenshtein_distance(s: str, t: str) -> int:
|
|
207
|
+
"""Compute Levenshtein edit distance between two strings."""
|
|
208
|
+
if len(s) < len(t):
|
|
209
|
+
return _levenshtein_distance(t, s)
|
|
210
|
+
if not t:
|
|
211
|
+
return len(s)
|
|
212
|
+
prev = list(range(len(t) + 1))
|
|
213
|
+
for i, sc in enumerate(s):
|
|
214
|
+
curr = [i + 1]
|
|
215
|
+
for j, tc in enumerate(t):
|
|
216
|
+
cost = 0 if sc == tc else 1
|
|
217
|
+
curr.append(min(curr[j] + 1, prev[j + 1] + 1, prev[j] + cost))
|
|
218
|
+
prev = curr
|
|
219
|
+
return prev[-1]
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _match_name_levenshtein(
    name: str, db: 'ReagentDB', threshold: float = _LEVENSHTEIN_THRESHOLD,
) -> Optional[str]:
    """Find the best reagent-DB entry for *name* via Levenshtein similarity.

    The query is the stripped, lower-cased *name*.  In addition, the part
    before the first '.', '\u00b7', '\u2022' or space is tried, so that
    suffixes which do not change identity are ignored
    (e.g. "EDC.HCl" -> "edc", "Pd(OAc)2\u00b7xH2O" -> "pd(oac)2").

    Similarity is ``1 - distance / max(len(candidate), len(key))``.  Keys
    are scanned in sorted order and the first key reaching the highest
    similarity wins.

    Args:
        name: Reagent name as written in the scheme.
        db: Reagent database; its ``_by_name`` mapping supplies the keys
            and ``display_for_name()`` the display text.
        threshold: Minimum similarity (0.0 - 1.0) for a match to count.

    Returns:
        The display name of the best key if its similarity is
        ``>= threshold`` and the DB has a display name for it, else ``None``.
    """
    query = name.strip().lower()
    candidates = [query]
    for sep in ['.', '\u00b7', '\u2022', ' ']:
        if sep in query:
            candidates.append(query.split(sep)[0])
    # Several separators can yield the same prefix; scoring it once is
    # enough.  Empty candidates can never match anything useful.
    candidates = [c for c in dict.fromkeys(candidates) if c]

    all_keys = sorted(db._by_name)
    best_score = 0.0
    best_key = None

    for candidate in candidates:
        cand_len = len(candidate)
        for key in all_keys:
            key_len = len(key)
            max_len = max(cand_len, key_len)  # > 0: candidate is non-empty
            # The edit distance is never smaller than the length
            # difference, so this is an upper bound on the similarity.
            # Skip the (pure-Python, quadratic) distance computation when
            # the key can neither reach the threshold nor beat the
            # current best; this does not change which key is selected.
            ceiling = 1.0 - abs(cand_len - key_len) / max_len
            if ceiling < threshold or ceiling <= best_score:
                continue
            dist = _levenshtein_distance(candidate, key)
            similarity = 1.0 - dist / max_len
            if similarity > best_score:
                best_score = similarity
                best_key = key

    if best_score >= threshold and best_key:
        display = db.display_for_name(best_key)
        if display:
            print(f" Levenshtein: '{name}' -> '{best_key}' "
                  f"(similarity={best_score:.2f})", file=sys.stderr)
            return display
    return None
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _match_smiles_no_stereo(smiles: str, db: 'ReagentDB') -> Optional[str]:
|
|
264
|
+
"""Match SMILES against DB after stripping stereochemistry.
|
|
265
|
+
|
|
266
|
+
Catches cases like DEAD where the input SMILES has no E/Z
|
|
267
|
+
but the DB entry has explicit /N=N/ stereo.
|
|
268
|
+
"""
|
|
269
|
+
try:
|
|
270
|
+
from rdkit import Chem
|
|
271
|
+
mol = Chem.MolFromSmiles(smiles)
|
|
272
|
+
if mol is None:
|
|
273
|
+
return None
|
|
274
|
+
Chem.RemoveStereochemistry(mol)
|
|
275
|
+
flat_smi = Chem.MolToSmiles(mol)
|
|
276
|
+
|
|
277
|
+
# Compare against all DB SMILES (also stripped)
|
|
278
|
+
for smi_key, entry in db._by_smiles.items():
|
|
279
|
+
mol2 = Chem.MolFromSmiles(smi_key)
|
|
280
|
+
if mol2 is None:
|
|
281
|
+
continue
|
|
282
|
+
Chem.RemoveStereochemistry(mol2)
|
|
283
|
+
flat2 = Chem.MolToSmiles(mol2)
|
|
284
|
+
if flat_smi == flat2:
|
|
285
|
+
return entry.get("display")
|
|
286
|
+
except ImportError:
|
|
287
|
+
pass
|
|
288
|
+
except Exception:
|
|
289
|
+
pass
|
|
290
|
+
return None
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
# ---------------------------------------------------------------------------
|
|
294
|
+
# Build replacement <t> element for a non-contributing fragment
|
|
295
|
+
# ---------------------------------------------------------------------------
|
|
296
|
+
|
|
297
|
+
def _build_replacement_text_element(
    display_name: str,
    element_id: str,
    cx: float,
    cy: float,
    z_value: str,
) -> ET.Element:
    """Build a <t> element to replace a non-contributing fragment.

    The text is positioned at (cx, cy), which was the center of the
    original fragment's bounding box.  Subscript formatting is applied for
    chemical formulae by the shared text-formatting helper.

    Args:
        display_name: Text to show in place of the structure.
        element_id: Value for the ``id`` attribute (the replaced fragment's
            id, so existing references to it stay valid).
        cx: X coordinate of the center the text should occupy.
        cy: Y coordinate of the center the text should occupy.
        z_value: Value for the ``Z`` attribute.

    Returns:
        A new ``<t>`` element with ``id``, ``p``, ``BoundingBox``, ``Z``,
        ``InterpretChemically`` and ``LineHeight`` attributes set and the
        formatted ``<s>`` runs as children.
    """
    # Estimate the bounding box from heuristic per-character metrics.
    char_w = len(display_name) * 5.8
    ascender = 8.0
    descender = 3.0

    # <t> p="x baseline_y": shift down slightly from the center so the
    # baseline, not the top of the glyphs, is aligned.
    baseline_y = cy + 3.5
    bx1 = cx - char_w / 2.0
    by1 = baseline_y - ascender
    bx2 = cx + char_w / 2.0
    by2 = baseline_y + descender

    s_xml = _build_subscripted_s_xml(display_name)

    # Only the helper-generated <s> markup is parsed from a string.  The
    # caller-supplied id and Z values are set through the ElementTree API,
    # so a quote, '&' or '<' in them cannot produce malformed XML.
    # Attributes are set in the same order as before, which keeps the
    # serialized output unchanged for ordinary values.
    t_el = ET.fromstring(f'<t>{s_xml}</t>')
    t_el.set("id", str(element_id))
    t_el.set("p", f"{cx:.2f} {baseline_y:.2f}")
    t_el.set("BoundingBox", f"{bx1:.2f} {by1:.2f} {bx2:.2f} {by2:.2f}")
    t_el.set("Z", str(z_value))
    t_el.set("InterpretChemically", "no")
    t_el.set("LineHeight", "auto")
    return t_el
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# ---------------------------------------------------------------------------
|
|
340
|
+
# Resolve reagent name to CDXML fragment (for text → structure promotion)
|
|
341
|
+
# ---------------------------------------------------------------------------
|
|
342
|
+
|
|
343
|
+
def _resolve_name_to_fragment(
    name: str,
    smiles: Optional[str],
    cs_bridge,
    verbose: bool = False,
) -> Optional[Tuple[str, float, float, float, float]]:
    """Resolve a reagent name (or SMILES) to a CDXML fragment.

    The name is first mapped to its canonical display form through the
    reagent DB.  The following strategies are then tried in order, each
    one independently guarded so that a failure only moves on to the next:

      1. ChemScript ``name_to_cdxml`` on the canonical name
      2. ChemScript ``smiles_to_cdxml`` on *smiles* (only if given)
      3. PubChem name -> SMILES, then ChemScript ``smiles_to_cdxml``

    Progress is logged to stderr when *verbose* is true.

    Returns:
        Whatever ``_extract_fragment_from_cdxml`` yields for the first
        strategy that succeeds — documented as
        ``(frag_xml, xmin, ymin, xmax, ymax)`` — or ``None`` if every
        strategy fails.
    """
    # NOTE: _measure_fragment_xml is not used below; the import is kept
    # exactly as before.
    from ...image.reaction_from_image import (
        _extract_fragment_from_cdxml, _measure_fragment_xml,
    )

    def log(msg: str):
        if verbose:
            print(f"[scheme_polisher] {msg}", file=sys.stderr)

    # Resolve canonical display name from reagent DB
    canonical = get_reagent_db().resolve_display(name)

    # Marker meaning "this strategy has nothing to convert".
    nothing_to_convert = object()

    def via_chemscript_name():
        return cs_bridge.name_to_cdxml(canonical)

    def via_given_smiles():
        return cs_bridge.smiles_to_cdxml(smiles)

    def via_pubchem():
        from ...resolve.cas_resolver import resolve_name_to_smiles
        pub_smiles = resolve_name_to_smiles(canonical)
        if not pub_smiles:
            return nothing_to_convert
        log(f" '{canonical}' → PubChem SMILES: {pub_smiles[:60]}")
        return cs_bridge.smiles_to_cdxml(pub_smiles)

    # (label logged on success, label logged on failure, CDXML producer)
    strategies = [("ChemScript name", "ChemScript name", via_chemscript_name)]
    if smiles:
        strategies.append(
            ("ChemScript SMILES", "ChemScript SMILES", via_given_smiles))
    strategies.append(("PubChem+ChemScript", "PubChem fallback", via_pubchem))

    for ok_label, fail_label, produce_cdxml in strategies:
        try:
            cdxml_str = produce_cdxml()
            if cdxml_str is nothing_to_convert:
                continue
            result = _extract_fragment_from_cdxml(cdxml_str)
            if result is not None:
                log(f" '{canonical}' → {ok_label} OK")
                return result
        except Exception as exc:
            log(f" '{canonical}' → {fail_label} failed: {exc}")

    return None
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
# ---------------------------------------------------------------------------
|
|
408
|
+
# Core polishing logic
|
|
409
|
+
# ---------------------------------------------------------------------------
|
|
410
|
+
|
|
411
|
+
def polish_scheme(
|
|
412
|
+
cdxml_path: str,
|
|
413
|
+
output_path: str,
|
|
414
|
+
verbose: bool = False,
|
|
415
|
+
merge_conditions: bool = False,
|
|
416
|
+
skip_alignment: bool = False,
|
|
417
|
+
use_rxnmapper: bool = False,
|
|
418
|
+
) -> Dict:
|
|
419
|
+
"""Polish a CDXML reaction scheme in-place.
|
|
420
|
+
|
|
421
|
+
If merge_conditions=True, all text labels above the arrow are merged
|
|
422
|
+
into a single centered multi-line text block, and likewise below.
|
|
423
|
+
|
|
424
|
+
If skip_alignment=True, Step 4d (Kabsch orientation alignment) is
|
|
425
|
+
skipped. Useful when the caller will run its own alignment
|
|
426
|
+
afterwards (e.g. scheme_polisher_v2's RDKit MCS alignment).
|
|
427
|
+
|
|
428
|
+
use_rxnmapper is deprecated and ignored. Classification now uses
|
|
429
|
+
Schneider FP scoring (context-aware, no ML dependency).
|
|
430
|
+
|
|
431
|
+
Returns a dict describing changes made.
|
|
432
|
+
"""
|
|
433
|
+
def log(msg: str):
|
|
434
|
+
if verbose:
|
|
435
|
+
print(f"[scheme_polisher] {msg}", file=sys.stderr)
|
|
436
|
+
|
|
437
|
+
# --- Step 1: Run reactant_heuristic classification ---
|
|
438
|
+
log("Running reactant_heuristic classification...")
|
|
439
|
+
from ...perception.reactant_heuristic import classify_from_cdxml
|
|
440
|
+
|
|
441
|
+
classification = classify_from_cdxml(cdxml_path,
|
|
442
|
+
use_rxnmapper=use_rxnmapper)
|
|
443
|
+
reagents = classification["reagents"]
|
|
444
|
+
|
|
445
|
+
log(f"Classified {len(reagents)} reagent(s):")
|
|
446
|
+
for r in reagents:
|
|
447
|
+
log(f" id={r['source_id']} type={r['source_type']} "
|
|
448
|
+
f"class={r['classification']} "
|
|
449
|
+
f"role={r.get('role', '-')} name={r.get('name', '-')}")
|
|
450
|
+
|
|
451
|
+
# --- Step 2: Parse CDXML ---
|
|
452
|
+
tree = ET.parse(cdxml_path)
|
|
453
|
+
root = tree.getroot()
|
|
454
|
+
page = root.find("page")
|
|
455
|
+
if page is None:
|
|
456
|
+
raise SystemExit("ERROR: no <page> element in CDXML")
|
|
457
|
+
|
|
458
|
+
# Build id → element map and id → parent map
|
|
459
|
+
id_to_el: Dict[str, ET.Element] = {}
|
|
460
|
+
id_to_parent: Dict[str, ET.Element] = {}
|
|
461
|
+
for parent in page:
|
|
462
|
+
eid = parent.get("id", "")
|
|
463
|
+
if eid:
|
|
464
|
+
id_to_el[eid] = parent
|
|
465
|
+
id_to_parent[eid] = page
|
|
466
|
+
|
|
467
|
+
# --- Step 3: Parse <step> metadata ---
|
|
468
|
+
scheme = page.find("scheme")
|
|
469
|
+
step = scheme.find("step") if scheme is not None else None
|
|
470
|
+
if step is None:
|
|
471
|
+
raise SystemExit("ERROR: no <scheme><step> found in CDXML")
|
|
472
|
+
|
|
473
|
+
reactant_ids = set(step.get("ReactionStepReactants", "").split())
|
|
474
|
+
product_ids = set(step.get("ReactionStepProducts", "").split())
|
|
475
|
+
above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()
|
|
476
|
+
below_ids = step.get("ReactionStepObjectsBelowArrow", "").split()
|
|
477
|
+
|
|
478
|
+
# --- Step 4: Process non-contributing fragments → replace with text ---
|
|
479
|
+
replacements = [] # (old_id, display_name)
|
|
480
|
+
ids_to_remove = [] # fragment IDs to remove from page
|
|
481
|
+
|
|
482
|
+
for r in reagents:
|
|
483
|
+
if r["classification"] != "non_contributing":
|
|
484
|
+
continue
|
|
485
|
+
if r["source_type"] != "fragment":
|
|
486
|
+
continue
|
|
487
|
+
|
|
488
|
+
src_id = r["source_id"]
|
|
489
|
+
el = id_to_el.get(src_id)
|
|
490
|
+
if el is None or el.tag != "fragment":
|
|
491
|
+
continue
|
|
492
|
+
|
|
493
|
+
# Skip products (shouldn't happen but be safe)
|
|
494
|
+
if src_id in product_ids:
|
|
495
|
+
continue
|
|
496
|
+
|
|
497
|
+
# Determine display name
|
|
498
|
+
display_name = _resolve_display_name(
|
|
499
|
+
r.get("smiles"), r.get("name"), r.get("role")
|
|
500
|
+
)
|
|
501
|
+
if display_name is None:
|
|
502
|
+
log(f" WARNING: no display name for fragment {src_id}, keeping structure")
|
|
503
|
+
continue
|
|
504
|
+
|
|
505
|
+
log(f" Replacing fragment {src_id} with text '{display_name}'")
|
|
506
|
+
|
|
507
|
+
# Get position from fragment center
|
|
508
|
+
cx, cy = _fragment_bbox_center(el)
|
|
509
|
+
z_value = el.get("Z", "1")
|
|
510
|
+
|
|
511
|
+
# Build replacement text element (same ID to preserve step refs)
|
|
512
|
+
new_t = _build_replacement_text_element(
|
|
513
|
+
display_name, src_id, cx, cy, z_value
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
# Replace in page: remove old fragment, insert new text
|
|
517
|
+
page.remove(el)
|
|
518
|
+
# Insert before the scheme element to keep document order sensible
|
|
519
|
+
scheme_idx = list(page).index(scheme) if scheme in page else len(list(page))
|
|
520
|
+
page.insert(scheme_idx, new_t)
|
|
521
|
+
|
|
522
|
+
replacements.append((src_id, display_name))
|
|
523
|
+
|
|
524
|
+
# Move replaced IDs from ReactionStepReactants → above-arrow so they
|
|
525
|
+
# are treated as conditions text (and eligible for merge-conditions).
|
|
526
|
+
if replacements and step is not None:
|
|
527
|
+
replaced_ids = {r[0] for r in replacements}
|
|
528
|
+
# Remove from reactants
|
|
529
|
+
current_reactants = step.get("ReactionStepReactants", "").split()
|
|
530
|
+
new_reactants = [rid for rid in current_reactants
|
|
531
|
+
if rid not in replaced_ids]
|
|
532
|
+
step.set("ReactionStepReactants", " ".join(new_reactants))
|
|
533
|
+
# Add to above-arrow
|
|
534
|
+
current_above = step.get("ReactionStepObjectsAboveArrow", "").split()
|
|
535
|
+
current_above = [a for a in current_above if a] # filter empty
|
|
536
|
+
for rid, _ in replacements:
|
|
537
|
+
if rid not in current_above:
|
|
538
|
+
current_above.append(rid)
|
|
539
|
+
step.set("ReactionStepObjectsAboveArrow", " ".join(current_above))
|
|
540
|
+
# Update local tracking sets
|
|
541
|
+
reactant_ids -= replaced_ids
|
|
542
|
+
above_ids = current_above
|
|
543
|
+
|
|
544
|
+
log(f"Replaced {len(replacements)} non-contributing fragment(s) with text")
|
|
545
|
+
|
|
546
|
+
# --- Lazy-init ChemScript bridge (shared by Step 4b and 4d) ---
|
|
547
|
+
cs_bridge = None
|
|
548
|
+
|
|
549
|
+
def _ensure_cs_bridge():
|
|
550
|
+
nonlocal cs_bridge
|
|
551
|
+
if cs_bridge is None:
|
|
552
|
+
from ...chemdraw.chemscript_bridge import ChemScriptBridge
|
|
553
|
+
cs_bridge = ChemScriptBridge()
|
|
554
|
+
return cs_bridge
|
|
555
|
+
|
|
556
|
+
# --- Step 4b: Promote atom-contributing text labels to structures ---
|
|
557
|
+
promotions = [] # (old_id, name)
|
|
558
|
+
|
|
559
|
+
for r in reagents:
|
|
560
|
+
if r["classification"] != "atom_contributing":
|
|
561
|
+
continue
|
|
562
|
+
if r["source_type"] != "text":
|
|
563
|
+
continue
|
|
564
|
+
|
|
565
|
+
src_id = r["source_id"]
|
|
566
|
+
el = id_to_el.get(src_id)
|
|
567
|
+
if el is None or el.tag != "t":
|
|
568
|
+
continue
|
|
569
|
+
|
|
570
|
+
name = r.get("name", "")
|
|
571
|
+
if not name:
|
|
572
|
+
continue
|
|
573
|
+
|
|
574
|
+
log(f" Promoting text '{name}' (id={src_id}) to structure...")
|
|
575
|
+
|
|
576
|
+
# Lazy-init ChemScript bridge
|
|
577
|
+
try:
|
|
578
|
+
_ensure_cs_bridge()
|
|
579
|
+
except Exception as exc:
|
|
580
|
+
log(f" WARNING: ChemScript unavailable ({exc}), "
|
|
581
|
+
f"cannot promote text to structures")
|
|
582
|
+
break
|
|
583
|
+
|
|
584
|
+
# Resolve name → CDXML fragment
|
|
585
|
+
frag_info = _resolve_name_to_fragment(name, r.get("smiles"), cs_bridge,
|
|
586
|
+
verbose)
|
|
587
|
+
if frag_info is None:
|
|
588
|
+
log(f" WARNING: could not resolve '{name}' to structure, keeping text")
|
|
589
|
+
continue
|
|
590
|
+
|
|
591
|
+
frag_xml, xmin, ymin, xmax, ymax = frag_info
|
|
592
|
+
|
|
593
|
+
# Position the new fragment at the old text element's location
|
|
594
|
+
from ...image.reaction_from_image import _translate_fragment_xml
|
|
595
|
+
bb = el.get("BoundingBox", "")
|
|
596
|
+
if bb:
|
|
597
|
+
vals = [float(v) for v in bb.split()]
|
|
598
|
+
tcx = (vals[0] + vals[2]) / 2.0
|
|
599
|
+
tcy = (vals[1] + vals[3]) / 2.0
|
|
600
|
+
else:
|
|
601
|
+
p = el.get("p", "")
|
|
602
|
+
if p:
|
|
603
|
+
pp = p.split()
|
|
604
|
+
tcx, tcy = float(pp[0]), float(pp[1])
|
|
605
|
+
else:
|
|
606
|
+
tcx, tcy = 500.0, 250.0
|
|
607
|
+
|
|
608
|
+
frag_cx = (xmin + xmax) / 2.0
|
|
609
|
+
frag_cy = (ymin + ymax) / 2.0
|
|
610
|
+
dx = tcx - frag_cx
|
|
611
|
+
dy = tcy - frag_cy
|
|
612
|
+
translated = _translate_fragment_xml(frag_xml, dx, dy)
|
|
613
|
+
|
|
614
|
+
# Parse the translated fragment XML and assign the old element's ID
|
|
615
|
+
new_frag = ET.fromstring(translated)
|
|
616
|
+
new_frag.set("id", src_id)
|
|
617
|
+
|
|
618
|
+
# Replace in page
|
|
619
|
+
page.remove(el)
|
|
620
|
+
scheme_idx = list(page).index(scheme) if scheme in page else len(list(page))
|
|
621
|
+
page.insert(scheme_idx, new_frag)
|
|
622
|
+
|
|
623
|
+
# Update id_to_el
|
|
624
|
+
id_to_el[src_id] = new_frag
|
|
625
|
+
|
|
626
|
+
promotions.append((src_id, name))
|
|
627
|
+
log(f" Promoted '{name}' to structure (id={src_id})")
|
|
628
|
+
|
|
629
|
+
log(f"Promoted {len(promotions)} text label(s) to structures")
|
|
630
|
+
|
|
631
|
+
# --- Step 4d: Align atom-contributing reagents to product orientation ---
|
|
632
|
+
alignments = [] # list of aligned fragment IDs
|
|
633
|
+
|
|
634
|
+
if skip_alignment:
|
|
635
|
+
log("Step 4d: Skipped (skip_alignment=True)")
|
|
636
|
+
else:
|
|
637
|
+
# Rebuild id_to_el after promotions
|
|
638
|
+
id_to_el.clear()
|
|
639
|
+
for el in page:
|
|
640
|
+
eid = el.get("id", "")
|
|
641
|
+
if eid:
|
|
642
|
+
id_to_el[eid] = el
|
|
643
|
+
|
|
644
|
+
# Collect atom-contributing fragment IDs (excluding product)
|
|
645
|
+
contributing_frag_ids = set()
|
|
646
|
+
for r in reagents:
|
|
647
|
+
if r["classification"] != "atom_contributing":
|
|
648
|
+
continue
|
|
649
|
+
src_id = r["source_id"]
|
|
650
|
+
el = id_to_el.get(src_id)
|
|
651
|
+
if el is not None and el.tag == "fragment":
|
|
652
|
+
if src_id not in product_ids:
|
|
653
|
+
contributing_frag_ids.add(src_id)
|
|
654
|
+
|
|
655
|
+
if contributing_frag_ids:
|
|
656
|
+
log(f"Step 4d: Aligning {len(contributing_frag_ids)} atom-contributing "
|
|
657
|
+
f"fragment(s) to product orientation...")
|
|
658
|
+
aligned_ids = kabsch_align_to_product(
|
|
659
|
+
root, cs_bridge=cs_bridge, verbose=verbose,
|
|
660
|
+
frag_ids=contributing_frag_ids)
|
|
661
|
+
alignments = [(fid, "aligned") for fid in aligned_ids]
|
|
662
|
+
else:
|
|
663
|
+
log("Step 4d: No atom-contributing fragments to align")
|
|
664
|
+
|
|
665
|
+
# Close ChemScript bridge (shared by 4b and 4d)
|
|
666
|
+
if cs_bridge is not None:
|
|
667
|
+
try:
|
|
668
|
+
cs_bridge.close()
|
|
669
|
+
except Exception:
|
|
670
|
+
pass
|
|
671
|
+
|
|
672
|
+
log(f"Aligned {len(alignments)} fragment(s) to product orientation")
|
|
673
|
+
|
|
674
|
+
# --- Step 4c: Reformat existing text labels (subscripts + italic) ---
|
|
675
|
+
# Rebuild id_to_el before reformatting
|
|
676
|
+
id_to_el.clear()
|
|
677
|
+
for el in page:
|
|
678
|
+
eid = el.get("id", "")
|
|
679
|
+
if eid:
|
|
680
|
+
id_to_el[eid] = el
|
|
681
|
+
|
|
682
|
+
above_ids_reformat = step.get("ReactionStepObjectsAboveArrow", "").split()
|
|
683
|
+
below_ids_reformat = step.get("ReactionStepObjectsBelowArrow", "").split()
|
|
684
|
+
all_condition_ids = set(above_ids_reformat) | set(below_ids_reformat)
|
|
685
|
+
# Skip IDs that were just created in step 4 (already correctly formatted)
|
|
686
|
+
newly_created_ids = {r[0] for r in replacements}
|
|
687
|
+
reformatted = []
|
|
688
|
+
|
|
689
|
+
for eid in all_condition_ids:
|
|
690
|
+
if eid in newly_created_ids:
|
|
691
|
+
continue
|
|
692
|
+
el = id_to_el.get(eid)
|
|
693
|
+
if el is None or el.tag != "t":
|
|
694
|
+
continue
|
|
695
|
+
|
|
696
|
+
# Get current plain text
|
|
697
|
+
old_text = _get_text_content(el)
|
|
698
|
+
if not old_text:
|
|
699
|
+
continue
|
|
700
|
+
|
|
701
|
+
# Look up canonical display form from reagent DB
|
|
702
|
+
canonical = get_reagent_db().resolve_display(old_text)
|
|
703
|
+
|
|
704
|
+
# Build new formatted <s> elements
|
|
705
|
+
new_s_xml = _build_formatted_s_xml(canonical)
|
|
706
|
+
|
|
707
|
+
# Check if reformatting would actually change anything
|
|
708
|
+
old_s_xml = "".join(ET.tostring(s, encoding="unicode") for s in el.findall("s"))
|
|
709
|
+
if old_s_xml == new_s_xml:
|
|
710
|
+
continue
|
|
711
|
+
|
|
712
|
+
# Remove old <s> children, keep objecttags and other children
|
|
713
|
+
old_children = list(el)
|
|
714
|
+
for child in old_children:
|
|
715
|
+
if child.tag == "s":
|
|
716
|
+
el.remove(child)
|
|
717
|
+
|
|
718
|
+
# Parse the new <s> elements and insert at the front
|
|
719
|
+
# Wrap in a dummy element for parsing
|
|
720
|
+
wrapper = ET.fromstring(f"<dummy>{new_s_xml}</dummy>")
|
|
721
|
+
insert_pos = 0
|
|
722
|
+
for new_s in wrapper:
|
|
723
|
+
el.insert(insert_pos, new_s)
|
|
724
|
+
insert_pos += 1
|
|
725
|
+
|
|
726
|
+
reformatted.append((eid, old_text, canonical))
|
|
727
|
+
log(f" Reformatted text id={eid}: '{old_text}' → '{canonical}' "
|
|
728
|
+
f"(subscript/italic)")
|
|
729
|
+
|
|
730
|
+
log(f"Reformatted {len(reformatted)} text label(s)")
|
|
731
|
+
|
|
732
|
+
# --- Step 5: Deduplicate text elements ---
|
|
733
|
+
# Rebuild id_to_el after replacements
|
|
734
|
+
id_to_el.clear()
|
|
735
|
+
for el in page:
|
|
736
|
+
eid = el.get("id", "")
|
|
737
|
+
if eid:
|
|
738
|
+
id_to_el[eid] = el
|
|
739
|
+
|
|
740
|
+
# Collect all text content for above/below arrow elements
|
|
741
|
+
def _normalize_text(text: str) -> str:
|
|
742
|
+
return text.strip().lower()
|
|
743
|
+
|
|
744
|
+
above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()
|
|
745
|
+
below_ids = step.get("ReactionStepObjectsBelowArrow", "").split()
|
|
746
|
+
|
|
747
|
+
dedup_removed = []
|
|
748
|
+
|
|
749
|
+
for position_name, id_list_attr in [
|
|
750
|
+
("above", "ReactionStepObjectsAboveArrow"),
|
|
751
|
+
("below", "ReactionStepObjectsBelowArrow"),
|
|
752
|
+
]:
|
|
753
|
+
id_list = step.get(id_list_attr, "").split()
|
|
754
|
+
seen_texts: Dict[str, str] = {} # normalized_text → first_id
|
|
755
|
+
new_id_list = []
|
|
756
|
+
for eid in id_list:
|
|
757
|
+
el = id_to_el.get(eid)
|
|
758
|
+
if el is None:
|
|
759
|
+
continue
|
|
760
|
+
|
|
761
|
+
# Only deduplicate text elements
|
|
762
|
+
if el.tag == "t":
|
|
763
|
+
text = _get_text_content(el)
|
|
764
|
+
norm = _normalize_text(text)
|
|
765
|
+
if norm in seen_texts:
|
|
766
|
+
# Duplicate — remove element and skip ID
|
|
767
|
+
log(f" Dedup: removing duplicate '{text}' (id={eid}) "
|
|
768
|
+
f"from {position_name} (keeping id={seen_texts[norm]})")
|
|
769
|
+
page.remove(el)
|
|
770
|
+
dedup_removed.append((eid, text, position_name))
|
|
771
|
+
continue
|
|
772
|
+
seen_texts[norm] = eid
|
|
773
|
+
|
|
774
|
+
new_id_list.append(eid)
|
|
775
|
+
|
|
776
|
+
step.set(id_list_attr, " ".join(new_id_list))
|
|
777
|
+
|
|
778
|
+
log(f"Removed {len(dedup_removed)} duplicate(s)")
|
|
779
|
+
|
|
780
|
+
# --- Step 6: Merge all text labels into one centered block (optional) ---
|
|
781
|
+
merged_conditions = False
|
|
782
|
+
merged_text_id = None
|
|
783
|
+
|
|
784
|
+
if merge_conditions:
|
|
785
|
+
# Rebuild id_to_el
|
|
786
|
+
id_to_el.clear()
|
|
787
|
+
for el in page:
|
|
788
|
+
eid = el.get("id", "")
|
|
789
|
+
if eid:
|
|
790
|
+
id_to_el[eid] = el
|
|
791
|
+
|
|
792
|
+
# Find arrow midpoint for rough centering
|
|
793
|
+
arrow_cx, arrow_cy = 500.0, 250.0 # fallback
|
|
794
|
+
arrow_id = step.get("ReactionStepArrows", "").split()
|
|
795
|
+
for aid in arrow_id:
|
|
796
|
+
a_el = id_to_el.get(aid)
|
|
797
|
+
if a_el is None:
|
|
798
|
+
# Try the superseding arrow (graphic → arrow pattern)
|
|
799
|
+
for child in page:
|
|
800
|
+
if child.tag == "arrow":
|
|
801
|
+
a_el = child
|
|
802
|
+
break
|
|
803
|
+
if a_el is None:
|
|
804
|
+
for child in page:
|
|
805
|
+
if child.tag == "graphic" and child.get("id") == aid:
|
|
806
|
+
sup_id = child.get("SupersededBy", "")
|
|
807
|
+
if sup_id:
|
|
808
|
+
a_el = id_to_el.get(sup_id)
|
|
809
|
+
break
|
|
810
|
+
if a_el is not None:
|
|
811
|
+
head = a_el.get("Head3D", "")
|
|
812
|
+
tail = a_el.get("Tail3D", "")
|
|
813
|
+
if head and tail:
|
|
814
|
+
hx, hy = float(head.split()[0]), float(head.split()[1])
|
|
815
|
+
tx, ty = float(tail.split()[0]), float(tail.split()[1])
|
|
816
|
+
arrow_cx = (hx + tx) / 2.0
|
|
817
|
+
arrow_cy = (hy + ty) / 2.0
|
|
818
|
+
else:
|
|
819
|
+
bb = a_el.get("BoundingBox", "")
|
|
820
|
+
if bb:
|
|
821
|
+
vals = [float(v) for v in bb.split()]
|
|
822
|
+
arrow_cx = (vals[0] + vals[2]) / 2.0
|
|
823
|
+
arrow_cy = (vals[1] + vals[3]) / 2.0
|
|
824
|
+
break
|
|
825
|
+
|
|
826
|
+
# Collect ALL text labels from above + below into one ordered list
|
|
827
|
+
all_text_ids = []
|
|
828
|
+
all_text_lines = []
|
|
829
|
+
non_text_above = []
|
|
830
|
+
non_text_below = []
|
|
831
|
+
|
|
832
|
+
for position_name, id_list_attr in [
|
|
833
|
+
("above", "ReactionStepObjectsAboveArrow"),
|
|
834
|
+
("below", "ReactionStepObjectsBelowArrow"),
|
|
835
|
+
]:
|
|
836
|
+
id_list = step.get(id_list_attr, "").split()
|
|
837
|
+
for eid in id_list:
|
|
838
|
+
el = id_to_el.get(eid)
|
|
839
|
+
if el is None:
|
|
840
|
+
continue
|
|
841
|
+
if el.tag == "t":
|
|
842
|
+
text = _get_text_content(el)
|
|
843
|
+
if text:
|
|
844
|
+
all_text_ids.append(eid)
|
|
845
|
+
all_text_lines.append(text)
|
|
846
|
+
else:
|
|
847
|
+
if position_name == "above":
|
|
848
|
+
non_text_above.append(eid)
|
|
849
|
+
else:
|
|
850
|
+
non_text_below.append(eid)
|
|
851
|
+
|
|
852
|
+
if len(all_text_ids) >= 2:
|
|
853
|
+
log(f" Merging {len(all_text_ids)} text labels into one block: "
|
|
854
|
+
f"{all_text_lines}")
|
|
855
|
+
|
|
856
|
+
# Build merged <s> content with \n between lines
|
|
857
|
+
s_parts = []
|
|
858
|
+
for i, text in enumerate(all_text_lines):
|
|
859
|
+
if i > 0:
|
|
860
|
+
s_parts.append(
|
|
861
|
+
'<s font="3" size="10" color="0" face="96">\n</s>'
|
|
862
|
+
)
|
|
863
|
+
canonical = get_reagent_db().resolve_display(text)
|
|
864
|
+
s_parts.append(_build_formatted_s_xml(canonical))
|
|
865
|
+
s_xml = "".join(s_parts)
|
|
866
|
+
|
|
867
|
+
# Keep the first text element, remove all others
|
|
868
|
+
keep_id = all_text_ids[0]
|
|
869
|
+
keep_el = id_to_el[keep_id]
|
|
870
|
+
keep_z = keep_el.get("Z", "1")
|
|
871
|
+
|
|
872
|
+
for eid in all_text_ids[1:]:
|
|
873
|
+
el = id_to_el.get(eid)
|
|
874
|
+
if el is not None:
|
|
875
|
+
page.remove(el)
|
|
876
|
+
|
|
877
|
+
# Position just below arrow — ChemDraw cleanup will refine
|
|
878
|
+
line_height = 12.5
|
|
879
|
+
n_lines = len(all_text_lines)
|
|
880
|
+
max_text_len = max(len(t) for t in all_text_lines)
|
|
881
|
+
total_w = max_text_len * 5.8
|
|
882
|
+
total_h = n_lines * line_height
|
|
883
|
+
|
|
884
|
+
mcx = arrow_cx
|
|
885
|
+
by1 = arrow_cy + 4.0 # just below arrow
|
|
886
|
+
by2 = by1 + total_h
|
|
887
|
+
bx1 = mcx - total_w / 2.0
|
|
888
|
+
bx2 = mcx + total_w / 2.0
|
|
889
|
+
first_baseline_y = by1 + 10.0 # first line baseline
|
|
890
|
+
|
|
891
|
+
# Rebuild the kept element
|
|
892
|
+
for child in list(keep_el):
|
|
893
|
+
keep_el.remove(child)
|
|
894
|
+
|
|
895
|
+
keep_el.set("p", f"{mcx:.2f} {first_baseline_y:.2f}")
|
|
896
|
+
keep_el.set("BoundingBox",
|
|
897
|
+
f"{bx1:.2f} {by1:.2f} {bx2:.2f} {by2:.2f}")
|
|
898
|
+
keep_el.set("Z", keep_z)
|
|
899
|
+
keep_el.set("InterpretChemically", "no")
|
|
900
|
+
keep_el.set("LineHeight", "auto")
|
|
901
|
+
keep_el.set("CaptionJustification", "Center")
|
|
902
|
+
keep_el.set("Justification", "Center")
|
|
903
|
+
|
|
904
|
+
# Parse and insert new <s> children
|
|
905
|
+
wrapper = ET.fromstring(f"<dummy>{s_xml}</dummy>")
|
|
906
|
+
for child in wrapper:
|
|
907
|
+
keep_el.append(child)
|
|
908
|
+
|
|
909
|
+
# Update step refs: merged text block goes above arrow
|
|
910
|
+
# (ChemDraw Clean Up Reaction expects objects in above/below)
|
|
911
|
+
step.set("ReactionStepObjectsAboveArrow",
|
|
912
|
+
" ".join(non_text_above + [keep_id]))
|
|
913
|
+
step.set("ReactionStepObjectsBelowArrow",
|
|
914
|
+
" ".join(non_text_below))
|
|
915
|
+
|
|
916
|
+
merged_conditions = True
|
|
917
|
+
merged_text_id = keep_id
|
|
918
|
+
log(f" Merged into single text block (id={keep_id})")
|
|
919
|
+
|
|
920
|
+
# --- Step 7: Write output CDXML ---
|
|
921
|
+
tree.write(output_path, xml_declaration=True, encoding="UTF-8")
|
|
922
|
+
|
|
923
|
+
# Post-process: fix XML declaration and DOCTYPE
|
|
924
|
+
_fixup_cdxml_output(output_path)
|
|
925
|
+
|
|
926
|
+
log(f"Written polished scheme to {output_path}")
|
|
927
|
+
|
|
928
|
+
return {
|
|
929
|
+
"replacements": replacements,
|
|
930
|
+
"promotions": promotions,
|
|
931
|
+
"alignments": alignments,
|
|
932
|
+
"reformatted": reformatted,
|
|
933
|
+
"dedup_removed": dedup_removed,
|
|
934
|
+
"merged_conditions": merged_conditions,
|
|
935
|
+
"merged_text_id": merged_text_id,
|
|
936
|
+
"total_reagents": len(reagents),
|
|
937
|
+
"product_smiles": classification.get("product_smiles"),
|
|
938
|
+
"classification": classification,
|
|
939
|
+
}
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def _fixup_cdxml_output(path: str):
    """Patch a CDXML file written by ElementTree, in place.

    The file at *path* is read as UTF-8, an XML declaration is prepended
    when the text does not already start with ``<?xml``, and a CDXML
    DOCTYPE line is inserted in front of the first ``<CDXML`` when no
    ``<!DOCTYPE CDXML`` is present.  The result is always written back to
    the same path.
    """
    xml_declaration = '<?xml version="1.0" encoding="UTF-8" ?>\n'
    doctype_line = (
        '<!DOCTYPE CDXML SYSTEM "http://www.cambridgesoft.com/xml/cdxml.dtd" >\n'
    )

    with open(path, "r", encoding="utf-8") as src:
        text = src.read()

    # Prepend the declaration only when the file lacks one
    if not text.startswith("<?xml"):
        text = xml_declaration + text

    # Insert the DOCTYPE immediately before the first root tag, if missing
    if "<!DOCTYPE CDXML" not in text:
        before, root_tag, after = text.partition("<CDXML")
        if root_tag:
            text = before + doctype_line + root_tag + after

    with open(path, "w", encoding="utf-8") as dst:
        dst.write(text)
|
|
965
|
+
|
|
966
|
+
|
|
967
|
+
# ---------------------------------------------------------------------------
|
|
968
|
+
# ChemDraw COM cleanup pass
|
|
969
|
+
# ---------------------------------------------------------------------------
|
|
970
|
+
|
|
971
|
+
def _find_arrow_center(page: ET.Element, step: ET.Element,
                       id_to_el: Dict[str, ET.Element],
                       ) -> Tuple[float, float]:
    """Find the arrow midpoint from step metadata.

    The IDs listed in the step's ``ReactionStepArrows`` attribute are tried
    in order.  An ID is resolved through *id_to_el*; when it is missing
    there, the direct children of *page* are searched for a ``graphic``
    with that ID and its ``SupersededBy`` target is looked up instead.

    For the first element that resolves, the midpoint of ``Head3D`` and
    ``Tail3D`` is used; when either is missing or cannot be parsed, the
    centre of ``BoundingBox`` is used instead.

    Args:
        page: The ``page`` element whose children are searched for graphics.
        step: The ``step`` element carrying ``ReactionStepArrows``.
        id_to_el: Mapping from element ID to element.

    Returns:
        ``(x, y)`` of the arrow centre, or ``(500.0, 250.0)`` when no arrow
        geometry could be determined.
    """
    def _leading_floats(raw: str, count: int):
        # First `count` numbers of a whitespace-separated attribute, or
        # None when the attribute is too short or not numeric.
        fields = raw.split()
        if len(fields) < count:
            return None
        try:
            return [float(v) for v in fields[:count]]
        except ValueError:
            return None

    arrow_cx, arrow_cy = 500.0, 250.0  # fallback when nothing resolves

    for aid in step.get("ReactionStepArrows", "").split():
        a_el = id_to_el.get(aid)
        if a_el is None:
            # The step may reference a graphic that an arrow supersedes
            for child in page:
                if child.tag == "graphic" and child.get("id") == aid:
                    sup_id = child.get("SupersededBy", "")
                    if sup_id:
                        a_el = id_to_el.get(sup_id)
                    break
        if a_el is None:
            continue

        head = _leading_floats(a_el.get("Head3D", ""), 2)
        tail = _leading_floats(a_el.get("Tail3D", ""), 2)
        if head is not None and tail is not None:
            arrow_cx = (head[0] + tail[0]) / 2.0
            arrow_cy = (head[1] + tail[1]) / 2.0
        else:
            box = _leading_floats(a_el.get("BoundingBox", ""), 4)
            if box is not None:
                arrow_cx = (box[0] + box[2]) / 2.0
                arrow_cy = (box[1] + box[3]) / 2.0
        # Only the first resolvable arrow is considered
        break

    return arrow_cx, arrow_cy
|
|
1002
|
+
|
|
1003
|
+
|
|
1004
|
+
def _compact_toward_arrow(cdxml_path: str, verbose: bool = False):
    """Move above/below-arrow objects closer to the arrow line.

    ChemDraw's "Clean Up Reaction" only recognises reaction components
    that are reasonably close together.  After merging conditions into
    one large text block the vertical spread can exceed this threshold.
    This helper nudges every above-arrow element downward and every
    below-arrow element upward so that all objects sit within a tight
    band around the arrow y-coordinate.

    The file at *cdxml_path* is parsed, modified and rewritten in place.
    Nothing is written when the document has no ``page/scheme/step``.
    Only ``fragment`` and ``t`` elements are moved, and an element is only
    ever moved toward the arrow, never away from it.

    Args:
        cdxml_path: Path of the CDXML file to modify in place.
        verbose: When true, progress messages are printed to stderr.
    """
    def log(msg: str):
        if verbose:
            print(f"[scheme_polisher] {msg}", file=sys.stderr)

    tree = ET.parse(cdxml_path)
    root = tree.getroot()
    page = root.find("page")
    scheme = page.find("scheme") if page is not None else None
    step = scheme.find("step") if scheme is not None else None
    if step is None:
        return

    # Index the direct children of the page by their id attribute
    id_to_el: Dict[str, ET.Element] = {}
    for el in page:
        eid = el.get("id", "")
        if eid:
            id_to_el[eid] = el

    arrow_cx, arrow_cy = _find_arrow_center(page, step, id_to_el)
    log(f"  Compacting: arrow center = ({arrow_cx:.1f}, {arrow_cy:.1f})")

    # Target: the vertical CENTRE of each above-arrow object sits at
    # arrow_cy - (GAP + 15) and of each below-arrow object at
    # arrow_cy + (GAP + 15).
    GAP = 5.0

    for attr, direction in [
        ("ReactionStepObjectsAboveArrow", "above"),
        ("ReactionStepObjectsBelowArrow", "below"),
    ]:
        ids = step.get(attr, "").split()
        for eid in ids:
            el = id_to_el.get(eid)
            if el is None:
                continue

            # Compute current bounding box center-y
            if el.tag == "fragment":
                _, cy = _fragment_bbox_center(el)
            elif el.tag == "t":
                bb = el.get("BoundingBox", "")
                if not bb:
                    continue
                vals = [float(v) for v in bb.split()]
                if len(vals) < 4:
                    # Malformed box: cannot locate the label, leave it alone
                    continue
                cy = (vals[1] + vals[3]) / 2.0
            else:
                continue

            # How far to shift toward the arrow (y-axis points down)
            if direction == "above":
                target_cy = arrow_cy - GAP - 15  # keep a small gap above
                dy = target_cy - cy
                # dy > 0 means object is above target → move down toward arrow
                # dy <= 0 means object is already at/below target → skip
                if dy <= 0:
                    continue
            else:
                target_cy = arrow_cy + GAP + 15
                dy = target_cy - cy
                # dy < 0 means object is below target → move up toward arrow
                # dy >= 0 means object is already at/above target → skip
                if dy >= 0:
                    continue

            log(f"  Compacting {el.tag} id={eid} {direction}: "
                f"dy={dy:+.1f}")
            _shift_element_y(el, dy)

    tree.write(cdxml_path, xml_declaration=True, encoding="UTF-8")
    _fixup_cdxml_output(cdxml_path)
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
def _shift_p_attr_y(node: ET.Element, dy: float) -> None:
    """Add *dy* to the y component of *node*'s ``p`` attribute, if present."""
    p = node.get("p")
    if not p:
        return
    parts = p.split()
    if len(parts) >= 2:
        new_y = float(parts[1]) + dy
        # x is written back verbatim; only y is reformatted
        node.set("p", f"{parts[0]} {new_y:.2f}")


def _shift_bbox_attr_y(node: ET.Element, dy: float) -> None:
    """Add *dy* to both y values of *node*'s ``BoundingBox``, if present."""
    bb = node.get("BoundingBox")
    if not bb:
        return
    vals = [float(v) for v in bb.split()]
    if len(vals) >= 4:
        vals[1] += dy
        vals[3] += dy
        node.set("BoundingBox", " ".join(f"{v:.2f}" for v in vals))


def _shift_element_y(el: ET.Element, dy: float):
    """Shift an element (fragment or text) vertically by dy points.

    For a ``fragment`` the ``p`` of every descendant ``n``, the ``p`` and
    ``BoundingBox`` of every descendant ``t``, and the fragment's own
    ``BoundingBox`` are shifted.  For a ``t`` element its own ``p`` and
    ``BoundingBox`` are shifted.  Elements with any other tag are left
    untouched.

    Args:
        el: The element to move; modified in place.
        dy: Offset added to every y coordinate.
    """
    if el.tag == "fragment":
        # Shift all node positions
        for n in el.iter("n"):
            _shift_p_attr_y(n, dy)
        # Shift nested text label positions
        for t in el.iter("t"):
            _shift_p_attr_y(t, dy)
            _shift_bbox_attr_y(t, dy)
        # Shift fragment BoundingBox
        _shift_bbox_attr_y(el, dy)
    elif el.tag == "t":
        _shift_p_attr_y(el, dy)
        _shift_bbox_attr_y(el, dy)
|
|
1138
|
+
|
|
1139
|
+
|
|
1140
|
+
def _chemdraw_cleanup_reaction(cdxml_path: str, output_path: str,
                               verbose: bool = False):
    """Run ChemDraw COM "Clean Up Reaction" on the CDXML file.

    Reuses the same COM automation pattern as eln_cdx_cleanup.py.
    Expects the file to already be compacted (see _compact_toward_arrow).

    The document is closed, and a ChemDraw instance started by this call
    is quit, even when one of the COM calls raises; the exception still
    propagates to the caller.

    Args:
        cdxml_path: CDXML file to open in ChemDraw.
        output_path: Path the cleaned-up document is saved to.
        verbose: When true, progress messages are printed to stderr.
    """
    # Not referenced below; kept from the original, presumably so a missing
    # pywin32 fails here before ChemDraw is touched — TODO confirm.
    import win32com.client  # noqa: F401

    def log(msg: str):
        if verbose:
            print(f"[scheme_polisher] {msg}", file=sys.stderr)

    log("Running ChemDraw COM cleanup...")

    # Import COM helpers from eln_cdx_cleanup
    from .eln_cdx_cleanup import (
        _get_chemdraw, _chemdraw_open,
        _restore_chemdraw_window,
    )

    cdApp, launched = _get_chemdraw()
    try:
        doc = _chemdraw_open(cdApp, os.path.abspath(cdxml_path))
        try:
            # Select all, then Clean Up Reaction (Structure menu → item 7)
            # Run 3 times — arrow lengths and spacing may not fully converge
            # on the first pass.
            for _ in range(3):
                doc.Objects.Select()
                time.sleep(1)
                cdApp.MenuBars(1).Menus(5).MenuItems(7).Execute()
                time.sleep(1)

            # Save to output
            doc.SaveAs(os.path.abspath(output_path))
            time.sleep(0.5)
        finally:
            # Never leave the document open, even after a failed step
            doc.Close(False)
    finally:
        # Only shut down ChemDraw when this call was the one that started it
        if launched:
            _restore_chemdraw_window()
            cdApp.Quit()

    log(f"ChemDraw cleanup saved to {output_path}")
|
|
1183
|
+
|
|
1184
|
+
|
|
1185
|
+
# ---------------------------------------------------------------------------
|
|
1186
|
+
# CLI
|
|
1187
|
+
# ---------------------------------------------------------------------------
|
|
1188
|
+
|
|
1189
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the scheme polisher.

    Parses *argv*, runs ``polish_scheme`` and, depending on the flags,
    the compaction step and the ChemDraw COM clean-up.  A one-line summary
    and the output path are printed to stderr; with ``--json`` a JSON
    report is printed to stdout as well.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when the input file does not exist.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Polish a CDXML reaction scheme: replace non-contributing "
            "reagent structures with text abbreviations, deduplicate, "
            "and optionally run ChemDraw Clean Up Reaction."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "-i", "--input", required=True,
        help="Input CDXML file",
    )
    parser.add_argument(
        "-o", "--output", default=None,
        help="Output CDXML file (default: <input_stem>-polished.cdxml)",
    )
    parser.add_argument(
        "--no-chemdraw-cleanup", action="store_true",
        help="Skip the ChemDraw COM 'Clean Up Reaction' pass (still compacts)",
    )
    parser.add_argument(
        "--no-compact", action="store_true",
        help="Skip the compaction step (implies --no-chemdraw-cleanup)",
    )
    parser.add_argument(
        "--merge-conditions", action="store_true",
        help=(
            "Merge all text labels above/below the arrow into a single "
            "centered multi-line text block"
        ),
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true",
        help="Print progress to stderr",
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Output result as JSON to stdout",
    )

    args = parser.parse_args(argv)

    input_path = os.path.abspath(args.input)
    if not os.path.exists(input_path):
        print(f"ERROR: file not found: {input_path}", file=sys.stderr)
        return 1

    # Default output path
    if args.output is None:
        output_path = os.path.splitext(input_path)[0] + "-polished.cdxml"
    else:
        output_path = os.path.abspath(args.output)

    # --no-compact implies --no-chemdraw-cleanup
    do_compact = not args.no_compact
    do_chemdraw = do_compact and not args.no_chemdraw_cleanup

    if do_chemdraw:
        mode = "chemdraw"
    elif do_compact:
        mode = "compact"
    else:
        mode = "raw"

    if do_chemdraw:
        # Compact + ChemDraw cleanup — write to temp, compact, cleanup
        import shutil

        tmpdir = tempfile.mkdtemp(prefix="scheme_polish_")
        tmp_path = os.path.join(tmpdir, "pre_cleanup.cdxml")
        try:
            result = polish_scheme(input_path, tmp_path,
                                   verbose=args.verbose,
                                   merge_conditions=args.merge_conditions)
            _compact_toward_arrow(tmp_path, args.verbose)
            _chemdraw_cleanup_reaction(tmp_path, output_path,
                                       verbose=args.verbose)
        finally:
            # Best-effort removal; a leftover temp dir must not mask errors
            shutil.rmtree(tmpdir, ignore_errors=True)
    else:
        # Write directly to the output, then optionally compact in place
        result = polish_scheme(input_path, output_path,
                               verbose=args.verbose,
                               merge_conditions=args.merge_conditions)
        if do_compact:
            _compact_toward_arrow(output_path, args.verbose)

    # --- Report ---
    n_replaced = len(result["replacements"])
    n_promoted = len(result["promotions"])
    n_aligned = len(result.get("alignments", []))
    n_reformatted = len(result["reformatted"])
    n_deduped = len(result["dedup_removed"])
    parts = [
        f"{n_replaced} structure(s) → text",
        f"{n_promoted} text → structure",
        f"{n_aligned} fragment(s) aligned to product",
        f"{n_reformatted} text reformatted",
        f"{n_deduped} duplicate(s) removed",
    ]
    if result.get("merged_conditions"):
        parts.append("conditions merged into single block")
    print(f"Polished: {', '.join(parts)}", file=sys.stderr)
    print(f"Output: {output_path}", file=sys.stderr)

    if args.json:
        # Only steps that actually did something are listed
        steps_applied = []
        if n_replaced:
            steps_applied.append(f"{n_replaced} structure(s) replaced with text")
        if n_promoted:
            steps_applied.append(f"{n_promoted} text promoted to structure")
        if n_aligned:
            steps_applied.append(f"{n_aligned} fragment(s) aligned to product")
        if n_reformatted:
            steps_applied.append(f"{n_reformatted} text reformatted")
        if n_deduped:
            steps_applied.append(f"{n_deduped} duplicate(s) removed")
        if result.get("merged_conditions"):
            steps_applied.append("conditions merged")
        if do_compact:
            steps_applied.append("compacted toward arrow")
        if do_chemdraw:
            steps_applied.append("ChemDraw COM cleanup")

        warnings = []
        json_result = {
            "input": str(input_path),
            "output": str(output_path),
            "mode": mode,
            "steps_applied": steps_applied,
            "warnings": warnings,
        }
        print(json.dumps(json_result, indent=2))

    return 0
|
|
1334
|
+
|
|
1335
|
+
|
|
1336
|
+
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|