cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1337 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ scheme_polisher.py — Polish a CDXML reaction scheme for presentation.
4
+
5
+ Takes a CDXML reaction scheme (typically from eln_cdx_cleanup.py) and:
6
+ 1. Classifies reagents as atom-contributing or non-contributing
7
+ (using reactant_heuristic.py)
8
+ 2. Replaces non-contributing reagent structures with text abbreviations
9
+ (e.g. Cs₂CO₃ structure → "Cs2CO3" text, n-BuLi → "n-BuLi")
10
+ 3. Promotes atom-contributing text labels to drawn structures
11
+ (e.g. "Morpholine" → morpholine structure via ChemScript name resolution)
12
+ 4. Aligns atom-contributing reagents to match product orientation:
13
+ a. Finds atom correspondence via RDKit substructure match or MCS
14
+ b. Maps RDKit (MOL) atom indices to CDXML node indices by coordinate
15
+ matching (handles ChemScript atom reordering on export)
16
+ c. Computes optimal rigid rotation via Kabsch algorithm on matched
17
+ CDXML coordinates (3× heteroatom weighting for symmetric rings)
18
+ d. Applies rotation in-place around reagent centroid
19
+ 5. Reformats text labels (subscripts for numbers, italic for prefixes)
20
+ 6. Deduplicates identical reagents/conditions (e.g. duplicate "THF")
21
+ 7. Optionally merges all condition text into a single centered block
22
+ below the arrow (--merge-conditions)
23
+ 8. Compacts above/below-arrow objects toward the arrow
24
+ 9. Optionally runs ChemDraw COM "Clean Up Reaction" for final spacing
25
+
26
+ Post-processing modes (default: compact + ChemDraw cleanup):
27
+ --no-chemdraw-cleanup Compact only, skip ChemDraw COM pass
28
+ --no-compact Skip compaction and ChemDraw COM (raw polished output)
29
+
30
+ Usage:
31
+ python scheme_polisher.py -i scheme.cdxml [-o polished.cdxml] [-v]
32
+ python scheme_polisher.py -i scheme.cdxml --merge-conditions -v
33
+ python scheme_polisher.py -i scheme.cdxml --no-chemdraw-cleanup
34
+ python scheme_polisher.py -i scheme.cdxml --no-compact
35
+
36
+ Dependencies:
37
+ - reactant_heuristic.py (reagent classification)
38
+ - chemscript_bridge.py (text→structure promotion, MOL export)
39
+ - rdkit (MCS, substructure matching)
40
+ - reagent_abbreviations.json (curated name→display mapping)
41
+ """
42
+
43
+ import argparse
44
+ import json
45
+ import os
46
+ import re
47
+ import sys
48
+ import tempfile
49
+ import time
50
+ from typing import Dict, List, Optional, Set, Tuple
51
+ from xml.etree import ElementTree as ET
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Shared reagent database
56
+ # ---------------------------------------------------------------------------
57
+
58
+ from ...resolve.reagent_db import get_reagent_db
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Text formatting: subscripts + italic prefixes (from text_formatting.py)
63
+ # ---------------------------------------------------------------------------
64
+
65
+ from ...text_formatting import (
66
+ build_formatted_s_xml as _build_formatted_s_xml, # Re-exported for eln_enrichment.py backward compat
67
+ needs_subscript as _needs_subscript,
68
+ split_italic_prefix as _split_italic_prefix,
69
+ SUBSCRIPT_RE as _SUBSCRIPT_RE,
70
+ ITALIC_PREFIXES as _ITALIC_PREFIXES,
71
+ )
72
+
73
# Keep the old name as an alias of the imported formatter: it is the name
# _build_replacement_text_element calls to produce its <s> markup.
_build_subscripted_s_xml = _build_formatted_s_xml
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # CDXML Helpers
79
+ # ---------------------------------------------------------------------------
80
+
81
+ def _get_text_content(el: ET.Element) -> str:
82
+ """Extract concatenated text from all <s> children of a <t> element.
83
+ Joins without spaces — chemical formulae like Cs2CO3 are split across
84
+ multiple <s> elements (Cs + 2 + CO + 3) and must not get spaces."""
85
+ parts = []
86
+ for s in el.iter("s"):
87
+ if s.text:
88
+ parts.append(s.text)
89
+ return "".join(parts).strip()
90
+
91
+
92
+ def _get_fm_molecule_type(el: ET.Element) -> Optional[int]:
93
+ """Read the Findmolecule MOLECULE TYPE objecttag.
94
+ Values: 0=molecule, 1=solvent, 2=condition text, 3=product."""
95
+ for ot in el.iter("objecttag"):
96
+ if ot.get("Name") == "FM MOLECULE TYPE":
97
+ try:
98
+ return int(ot.get("Value", ""))
99
+ except ValueError:
100
+ return None
101
+ return None
102
+
103
+
104
def _fragment_bbox_center(frag: ET.Element) -> Tuple[float, float]:
    """Return the (x, y) centre used to position a fragment's replacement.

    The work is delegated to cdxml_utils.fragment_centroid(); when that
    helper returns None the fixed fallback (500, 250) is used instead.
    """
    from ...cdxml_utils import fragment_centroid

    centroid = fragment_centroid(frag)
    # Fallback centre for fragments the helper cannot locate.
    return (500.0, 250.0) if centroid is None else centroid
114
+
115
+
116
+ def _element_to_xml_string(el: ET.Element) -> str:
117
+ """Serialize an element to a raw XML string."""
118
+ return ET.tostring(el, encoding="unicode")
119
+
120
+
121
+ # ---------------------------------------------------------------------------
122
+ # Alignment imports (from alignment.py)
123
+ # ---------------------------------------------------------------------------
124
+ # All alignment primitives + high-level orchestrators live in alignment.py.
125
+ # We import the public names here and keep private aliases so any internal
126
+ # callers that used the old names still work.
127
+
128
+ from ...layout.alignment import (
129
+ sp_fragment_to_cdxml,
130
+ filtered_atom_nodes,
131
+ compute_rigid_rotation_2d,
132
+ rotate_fragment_in_place,
133
+ make_abbrev_dummy_copy,
134
+ kabsch_align_fragment_to_product,
135
+ kabsch_align_to_product,
136
+ )
137
+
138
# Backward-compatible private aliases.
# Each underscore-prefixed name is bound to the public helper imported from
# layout.alignment above, so callers that still use the old private names
# keep working.
_sp_fragment_to_cdxml = sp_fragment_to_cdxml
_filtered_atom_nodes = filtered_atom_nodes
_compute_rigid_rotation = compute_rigid_rotation_2d
_rotate_fragment_in_place = rotate_fragment_in_place
_make_abbrev_dummy_copy = make_abbrev_dummy_copy
_align_reagent_to_product = kabsch_align_fragment_to_product
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # Display name resolution for non-contributing fragments
149
+ # ---------------------------------------------------------------------------
150
+
151
def _resolve_display_name(
    smiles: Optional[str],
    name: Optional[str],
    role: Optional[str],
) -> Optional[str]:
    """Pick the text abbreviation shown in place of a non-contributing reagent.

    Lookups are tried in this order and the first truthy result wins:
      1a. Reagent DB display name for the SMILES (exact canonical match)
      1b. Reagent DB display name for the SMILES with stereo ignored
      2a. Reagent DB display name for the exact name/alias
      2b. Reagent DB display name via Levenshtein similarity on the name
          (typos and variants such as EDC.HCl, nBuLi, i-Pr2NEt)
      3.  The supplied name itself
      4.  None, meaning the caller should keep the drawn structure

    *role* is accepted for interface compatibility but is not consulted.
    """
    reagent_db = get_reagent_db()

    # Steps 1a / 1b: structure-based lookups (only when a SMILES is given).
    if smiles:
        by_structure = (
            reagent_db.display_for_smiles(smiles)
            or _match_smiles_no_stereo(smiles, reagent_db)
        )
        if by_structure:
            return by_structure

    # Without a name there is nothing left to try (step 4).
    if not name:
        return None

    # Steps 2a / 2b: name-based lookups, then step 3: the raw name.
    by_name = (
        reagent_db.display_for_name(name)
        or _match_name_levenshtein(name, reagent_db)
    )
    return by_name or name
198
+
199
+
200
# Levenshtein similarity threshold for name matching (0.0 - 1.0).
# Similarity is 1 - edit_distance / max(len(query), len(key)), so a single
# edit against a 6-character key scores ~0.83 and against a 5-character key
# scores 0.80.
# 0.80 catches "EDC.HCl"→"edc" (0.86), "nBuLi"→"n-buli" (0.83),
# "DIEA"→"dipea" (0.80) while rejecting spurious matches.
# NOTE(review): the quoted example scores depend on the exact DB keys and
# candidate forms compared — confirm against the reagent database.
_LEVENSHTEIN_THRESHOLD = 0.80
204
+
205
+
206
+ def _levenshtein_distance(s: str, t: str) -> int:
207
+ """Compute Levenshtein edit distance between two strings."""
208
+ if len(s) < len(t):
209
+ return _levenshtein_distance(t, s)
210
+ if not t:
211
+ return len(s)
212
+ prev = list(range(len(t) + 1))
213
+ for i, sc in enumerate(s):
214
+ curr = [i + 1]
215
+ for j, tc in enumerate(t):
216
+ cost = 0 if sc == tc else 1
217
+ curr.append(min(curr[j] + 1, prev[j + 1] + 1, prev[j] + cost))
218
+ prev = curr
219
+ return prev[-1]
220
+
221
+
222
def _match_name_levenshtein(
    name: str, db: 'ReagentDB', threshold: float = _LEVENSHTEIN_THRESHOLD,
) -> Optional[str]:
    """Fuzzy-match *name* against the reagent DB names by Levenshtein similarity.

    The query is the stripped, lower-cased name plus, for each separator it
    contains ('.', middle dot, bullet, space), the part before the first
    occurrence (e.g. "EDC.HCl" also tries "edc").  Every candidate is scored
    against every DB name key as 1 - distance / max(len); the first
    highest-scoring key (keys visited in sorted order) wins.

    Returns the DB display name for that key when the best score is at least
    *threshold* and the DB yields a display name for it, otherwise None.
    A successful match is reported on stderr.
    """
    query = name.strip().lower()
    # Full query first, then the prefix before each separator that occurs.
    separators = ('.', '\u00b7', '\u2022', ' ')
    candidates = [query] + [
        query.split(sep)[0] for sep in separators if sep in query
    ]

    known_names = sorted(db._by_name.keys())
    top_score = 0.0
    top_key = None

    # filter(None, ...) drops empty candidates (e.g. a name starting with '.').
    for candidate in filter(None, candidates):
        for key in known_names:
            distance = _levenshtein_distance(candidate, key)
            longest = max(len(candidate), len(key))
            if longest == 0:
                continue
            score = 1.0 - distance / longest
            # Strict '>' keeps the earliest best match on ties.
            if score > top_score:
                top_score, top_key = score, key

    if top_score < threshold or not top_key:
        return None

    display = db.display_for_name(top_key)
    if not display:
        return None

    print(f" Levenshtein: '{name}' -> '{top_key}' "
          f"(similarity={top_score:.2f})", file=sys.stderr)
    return display
261
+
262
+
263
+ def _match_smiles_no_stereo(smiles: str, db: 'ReagentDB') -> Optional[str]:
264
+ """Match SMILES against DB after stripping stereochemistry.
265
+
266
+ Catches cases like DEAD where the input SMILES has no E/Z
267
+ but the DB entry has explicit /N=N/ stereo.
268
+ """
269
+ try:
270
+ from rdkit import Chem
271
+ mol = Chem.MolFromSmiles(smiles)
272
+ if mol is None:
273
+ return None
274
+ Chem.RemoveStereochemistry(mol)
275
+ flat_smi = Chem.MolToSmiles(mol)
276
+
277
+ # Compare against all DB SMILES (also stripped)
278
+ for smi_key, entry in db._by_smiles.items():
279
+ mol2 = Chem.MolFromSmiles(smi_key)
280
+ if mol2 is None:
281
+ continue
282
+ Chem.RemoveStereochemistry(mol2)
283
+ flat2 = Chem.MolToSmiles(mol2)
284
+ if flat_smi == flat2:
285
+ return entry.get("display")
286
+ except ImportError:
287
+ pass
288
+ except Exception:
289
+ pass
290
+ return None
291
+
292
+
293
+ # ---------------------------------------------------------------------------
294
+ # Build replacement <t> element for a non-contributing fragment
295
+ # ---------------------------------------------------------------------------
296
+
297
def _build_replacement_text_element(
    display_name: str,
    element_id: str,
    cx: float,
    cy: float,
    z_value: str,
) -> ET.Element:
    """Build a <t> element to replace a non-contributing fragment.

    The text is positioned at (cx, cy), which was the center of the original
    fragment's bounding box.  Subscript formatting is applied for chemical
    formulae via _build_subscripted_s_xml.

    Args:
        display_name: Text to show; its length drives the estimated width.
        element_id: Value for the element's ``id`` attribute.
        cx, cy: Centre of the fragment being replaced.
        z_value: Value for the element's ``Z`` attribute.

    Returns:
        The new <t> element with id, p, BoundingBox, Z, InterpretChemically
        and LineHeight attributes and the formatted <s> children.

    The attributes are set through the ElementTree API rather than being
    interpolated into an XML string, so ids or Z values containing XML
    metacharacters (quotes, '&', '<') cannot break parsing.
    """
    # Estimate bounding box from fixed per-character metrics.
    char_w = len(display_name) * 5.8
    ascender = 8.0
    descender = 3.0

    # <t> p="x baseline_y" — shift down slightly from the centre so the
    # baseline, not the top of the glyphs, is aligned with the old centre.
    baseline_y = cy + 3.5
    bx1 = cx - char_w / 2.0
    by1 = baseline_y - ascender
    bx2 = cx + char_w / 2.0
    by2 = baseline_y + descender

    s_xml = _build_subscripted_s_xml(display_name)

    # Only the helper-generated <s> markup is parsed from a string; the
    # caller-supplied values go in as attributes and are escaped on output.
    t_el = ET.fromstring(f"<t>{s_xml}</t>")
    t_el.set("id", str(element_id))
    t_el.set("p", f"{cx:.2f} {baseline_y:.2f}")
    t_el.set("BoundingBox", f"{bx1:.2f} {by1:.2f} {bx2:.2f} {by2:.2f}")
    t_el.set("Z", str(z_value))
    t_el.set("InterpretChemically", "no")
    t_el.set("LineHeight", "auto")
    return t_el
337
+
338
+
339
+ # ---------------------------------------------------------------------------
340
+ # Resolve reagent name to CDXML fragment (for text → structure promotion)
341
+ # ---------------------------------------------------------------------------
342
+
343
def _resolve_name_to_fragment(
    name: str,
    smiles: Optional[str],
    cs_bridge,
    verbose: bool = False,
) -> Optional[Tuple[str, float, float, float, float]]:
    """Turn a reagent name (or SMILES) into a CDXML fragment.

    The name is first mapped to its canonical display form via the reagent
    DB, then these strategies are tried in order until one yields a fragment:
      1. ChemScript name_to_cdxml on the canonical name
      2. ChemScript smiles_to_cdxml on *smiles* (only if one was supplied)
      3. PubChem name → SMILES, then ChemScript smiles_to_cdxml

    A strategy that raises is logged (when *verbose*) and skipped; it never
    aborts the chain.

    Returns (frag_xml, xmin, ymin, xmax, ymax) or None if every strategy
    failed or produced nothing.
    """
    from ...image.reaction_from_image import (
        _extract_fragment_from_cdxml, _measure_fragment_xml,
    )

    def log(msg: str):
        if verbose:
            print(f"[scheme_polisher] {msg}", file=sys.stderr)

    # Resolve canonical display name from reagent DB
    canonical = get_reagent_db().resolve_display(name)

    def _via_chemscript_name():
        return _extract_fragment_from_cdxml(cs_bridge.name_to_cdxml(canonical))

    def _via_given_smiles():
        return _extract_fragment_from_cdxml(cs_bridge.smiles_to_cdxml(smiles))

    def _via_pubchem():
        # Imported lazily so a missing resolver only disables this strategy.
        from ...resolve.cas_resolver import resolve_name_to_smiles
        pub_smiles = resolve_name_to_smiles(canonical)
        if not pub_smiles:
            return None
        log(f" '{canonical}' → PubChem SMILES: {pub_smiles[:60]}")
        return _extract_fragment_from_cdxml(
            cs_bridge.smiles_to_cdxml(pub_smiles))

    # (label logged on success, label logged on failure, strategy)
    strategies = [
        ("ChemScript name", "ChemScript name", _via_chemscript_name),
    ]
    if smiles:
        strategies.append(
            ("ChemScript SMILES", "ChemScript SMILES", _via_given_smiles))
    strategies.append(
        ("PubChem+ChemScript", "PubChem fallback", _via_pubchem))

    for ok_label, fail_label, strategy in strategies:
        try:
            fragment = strategy()
            if fragment is not None:
                log(f" '{canonical}' → {ok_label} OK")
                return fragment
        except Exception as exc:
            log(f" '{canonical}' → {fail_label} failed: {exc}")

    return None
405
+
406
+
407
+ # ---------------------------------------------------------------------------
408
+ # Core polishing logic
409
+ # ---------------------------------------------------------------------------
410
+
411
+ def polish_scheme(
412
+ cdxml_path: str,
413
+ output_path: str,
414
+ verbose: bool = False,
415
+ merge_conditions: bool = False,
416
+ skip_alignment: bool = False,
417
+ use_rxnmapper: bool = False,
418
+ ) -> Dict:
419
+ """Polish a CDXML reaction scheme in-place.
420
+
421
+ If merge_conditions=True, all text labels above the arrow are merged
422
+ into a single centered multi-line text block, and likewise below.
423
+
424
+ If skip_alignment=True, Step 4d (Kabsch orientation alignment) is
425
+ skipped. Useful when the caller will run its own alignment
426
+ afterwards (e.g. scheme_polisher_v2's RDKit MCS alignment).
427
+
428
+ use_rxnmapper is deprecated and ignored. Classification now uses
429
+ Schneider FP scoring (context-aware, no ML dependency).
430
+
431
+ Returns a dict describing changes made.
432
+ """
433
+ def log(msg: str):
434
+ if verbose:
435
+ print(f"[scheme_polisher] {msg}", file=sys.stderr)
436
+
437
+ # --- Step 1: Run reactant_heuristic classification ---
438
+ log("Running reactant_heuristic classification...")
439
+ from ...perception.reactant_heuristic import classify_from_cdxml
440
+
441
+ classification = classify_from_cdxml(cdxml_path,
442
+ use_rxnmapper=use_rxnmapper)
443
+ reagents = classification["reagents"]
444
+
445
+ log(f"Classified {len(reagents)} reagent(s):")
446
+ for r in reagents:
447
+ log(f" id={r['source_id']} type={r['source_type']} "
448
+ f"class={r['classification']} "
449
+ f"role={r.get('role', '-')} name={r.get('name', '-')}")
450
+
451
+ # --- Step 2: Parse CDXML ---
452
+ tree = ET.parse(cdxml_path)
453
+ root = tree.getroot()
454
+ page = root.find("page")
455
+ if page is None:
456
+ raise SystemExit("ERROR: no <page> element in CDXML")
457
+
458
+ # Build id → element map and id → parent map
459
+ id_to_el: Dict[str, ET.Element] = {}
460
+ id_to_parent: Dict[str, ET.Element] = {}
461
+ for parent in page:
462
+ eid = parent.get("id", "")
463
+ if eid:
464
+ id_to_el[eid] = parent
465
+ id_to_parent[eid] = page
466
+
467
+ # --- Step 3: Parse <step> metadata ---
468
+ scheme = page.find("scheme")
469
+ step = scheme.find("step") if scheme is not None else None
470
+ if step is None:
471
+ raise SystemExit("ERROR: no <scheme><step> found in CDXML")
472
+
473
+ reactant_ids = set(step.get("ReactionStepReactants", "").split())
474
+ product_ids = set(step.get("ReactionStepProducts", "").split())
475
+ above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()
476
+ below_ids = step.get("ReactionStepObjectsBelowArrow", "").split()
477
+
478
+ # --- Step 4: Process non-contributing fragments → replace with text ---
479
+ replacements = [] # (old_id, display_name)
480
+ ids_to_remove = [] # fragment IDs to remove from page
481
+
482
+ for r in reagents:
483
+ if r["classification"] != "non_contributing":
484
+ continue
485
+ if r["source_type"] != "fragment":
486
+ continue
487
+
488
+ src_id = r["source_id"]
489
+ el = id_to_el.get(src_id)
490
+ if el is None or el.tag != "fragment":
491
+ continue
492
+
493
+ # Skip products (shouldn't happen but be safe)
494
+ if src_id in product_ids:
495
+ continue
496
+
497
+ # Determine display name
498
+ display_name = _resolve_display_name(
499
+ r.get("smiles"), r.get("name"), r.get("role")
500
+ )
501
+ if display_name is None:
502
+ log(f" WARNING: no display name for fragment {src_id}, keeping structure")
503
+ continue
504
+
505
+ log(f" Replacing fragment {src_id} with text '{display_name}'")
506
+
507
+ # Get position from fragment center
508
+ cx, cy = _fragment_bbox_center(el)
509
+ z_value = el.get("Z", "1")
510
+
511
+ # Build replacement text element (same ID to preserve step refs)
512
+ new_t = _build_replacement_text_element(
513
+ display_name, src_id, cx, cy, z_value
514
+ )
515
+
516
+ # Replace in page: remove old fragment, insert new text
517
+ page.remove(el)
518
+ # Insert before the scheme element to keep document order sensible
519
+ scheme_idx = list(page).index(scheme) if scheme in page else len(list(page))
520
+ page.insert(scheme_idx, new_t)
521
+
522
+ replacements.append((src_id, display_name))
523
+
524
+ # Move replaced IDs from ReactionStepReactants → above-arrow so they
525
+ # are treated as conditions text (and eligible for merge-conditions).
526
+ if replacements and step is not None:
527
+ replaced_ids = {r[0] for r in replacements}
528
+ # Remove from reactants
529
+ current_reactants = step.get("ReactionStepReactants", "").split()
530
+ new_reactants = [rid for rid in current_reactants
531
+ if rid not in replaced_ids]
532
+ step.set("ReactionStepReactants", " ".join(new_reactants))
533
+ # Add to above-arrow
534
+ current_above = step.get("ReactionStepObjectsAboveArrow", "").split()
535
+ current_above = [a for a in current_above if a] # filter empty
536
+ for rid, _ in replacements:
537
+ if rid not in current_above:
538
+ current_above.append(rid)
539
+ step.set("ReactionStepObjectsAboveArrow", " ".join(current_above))
540
+ # Update local tracking sets
541
+ reactant_ids -= replaced_ids
542
+ above_ids = current_above
543
+
544
+ log(f"Replaced {len(replacements)} non-contributing fragment(s) with text")
545
+
546
+ # --- Lazy-init ChemScript bridge (shared by Step 4b and 4d) ---
547
+ cs_bridge = None
548
+
549
+ def _ensure_cs_bridge():
550
+ nonlocal cs_bridge
551
+ if cs_bridge is None:
552
+ from ...chemdraw.chemscript_bridge import ChemScriptBridge
553
+ cs_bridge = ChemScriptBridge()
554
+ return cs_bridge
555
+
556
+ # --- Step 4b: Promote atom-contributing text labels to structures ---
557
+ promotions = [] # (old_id, name)
558
+
559
+ for r in reagents:
560
+ if r["classification"] != "atom_contributing":
561
+ continue
562
+ if r["source_type"] != "text":
563
+ continue
564
+
565
+ src_id = r["source_id"]
566
+ el = id_to_el.get(src_id)
567
+ if el is None or el.tag != "t":
568
+ continue
569
+
570
+ name = r.get("name", "")
571
+ if not name:
572
+ continue
573
+
574
+ log(f" Promoting text '{name}' (id={src_id}) to structure...")
575
+
576
+ # Lazy-init ChemScript bridge
577
+ try:
578
+ _ensure_cs_bridge()
579
+ except Exception as exc:
580
+ log(f" WARNING: ChemScript unavailable ({exc}), "
581
+ f"cannot promote text to structures")
582
+ break
583
+
584
+ # Resolve name → CDXML fragment
585
+ frag_info = _resolve_name_to_fragment(name, r.get("smiles"), cs_bridge,
586
+ verbose)
587
+ if frag_info is None:
588
+ log(f" WARNING: could not resolve '{name}' to structure, keeping text")
589
+ continue
590
+
591
+ frag_xml, xmin, ymin, xmax, ymax = frag_info
592
+
593
+ # Position the new fragment at the old text element's location
594
+ from ...image.reaction_from_image import _translate_fragment_xml
595
+ bb = el.get("BoundingBox", "")
596
+ if bb:
597
+ vals = [float(v) for v in bb.split()]
598
+ tcx = (vals[0] + vals[2]) / 2.0
599
+ tcy = (vals[1] + vals[3]) / 2.0
600
+ else:
601
+ p = el.get("p", "")
602
+ if p:
603
+ pp = p.split()
604
+ tcx, tcy = float(pp[0]), float(pp[1])
605
+ else:
606
+ tcx, tcy = 500.0, 250.0
607
+
608
+ frag_cx = (xmin + xmax) / 2.0
609
+ frag_cy = (ymin + ymax) / 2.0
610
+ dx = tcx - frag_cx
611
+ dy = tcy - frag_cy
612
+ translated = _translate_fragment_xml(frag_xml, dx, dy)
613
+
614
+ # Parse the translated fragment XML and assign the old element's ID
615
+ new_frag = ET.fromstring(translated)
616
+ new_frag.set("id", src_id)
617
+
618
+ # Replace in page
619
+ page.remove(el)
620
+ scheme_idx = list(page).index(scheme) if scheme in page else len(list(page))
621
+ page.insert(scheme_idx, new_frag)
622
+
623
+ # Update id_to_el
624
+ id_to_el[src_id] = new_frag
625
+
626
+ promotions.append((src_id, name))
627
+ log(f" Promoted '{name}' to structure (id={src_id})")
628
+
629
+ log(f"Promoted {len(promotions)} text label(s) to structures")
630
+
631
+ # --- Step 4d: Align atom-contributing reagents to product orientation ---
632
+ alignments = [] # list of aligned fragment IDs
633
+
634
+ if skip_alignment:
635
+ log("Step 4d: Skipped (skip_alignment=True)")
636
+ else:
637
+ # Rebuild id_to_el after promotions
638
+ id_to_el.clear()
639
+ for el in page:
640
+ eid = el.get("id", "")
641
+ if eid:
642
+ id_to_el[eid] = el
643
+
644
+ # Collect atom-contributing fragment IDs (excluding product)
645
+ contributing_frag_ids = set()
646
+ for r in reagents:
647
+ if r["classification"] != "atom_contributing":
648
+ continue
649
+ src_id = r["source_id"]
650
+ el = id_to_el.get(src_id)
651
+ if el is not None and el.tag == "fragment":
652
+ if src_id not in product_ids:
653
+ contributing_frag_ids.add(src_id)
654
+
655
+ if contributing_frag_ids:
656
+ log(f"Step 4d: Aligning {len(contributing_frag_ids)} atom-contributing "
657
+ f"fragment(s) to product orientation...")
658
+ aligned_ids = kabsch_align_to_product(
659
+ root, cs_bridge=cs_bridge, verbose=verbose,
660
+ frag_ids=contributing_frag_ids)
661
+ alignments = [(fid, "aligned") for fid in aligned_ids]
662
+ else:
663
+ log("Step 4d: No atom-contributing fragments to align")
664
+
665
+ # Close ChemScript bridge (shared by 4b and 4d)
666
+ if cs_bridge is not None:
667
+ try:
668
+ cs_bridge.close()
669
+ except Exception:
670
+ pass
671
+
672
+ log(f"Aligned {len(alignments)} fragment(s) to product orientation")
673
+
674
+ # --- Step 4c: Reformat existing text labels (subscripts + italic) ---
675
+ # Rebuild id_to_el before reformatting
676
+ id_to_el.clear()
677
+ for el in page:
678
+ eid = el.get("id", "")
679
+ if eid:
680
+ id_to_el[eid] = el
681
+
682
+ above_ids_reformat = step.get("ReactionStepObjectsAboveArrow", "").split()
683
+ below_ids_reformat = step.get("ReactionStepObjectsBelowArrow", "").split()
684
+ all_condition_ids = set(above_ids_reformat) | set(below_ids_reformat)
685
+ # Skip IDs that were just created in step 4 (already correctly formatted)
686
+ newly_created_ids = {r[0] for r in replacements}
687
+ reformatted = []
688
+
689
+ for eid in all_condition_ids:
690
+ if eid in newly_created_ids:
691
+ continue
692
+ el = id_to_el.get(eid)
693
+ if el is None or el.tag != "t":
694
+ continue
695
+
696
+ # Get current plain text
697
+ old_text = _get_text_content(el)
698
+ if not old_text:
699
+ continue
700
+
701
+ # Look up canonical display form from reagent DB
702
+ canonical = get_reagent_db().resolve_display(old_text)
703
+
704
+ # Build new formatted <s> elements
705
+ new_s_xml = _build_formatted_s_xml(canonical)
706
+
707
+ # Check if reformatting would actually change anything
708
+ old_s_xml = "".join(ET.tostring(s, encoding="unicode") for s in el.findall("s"))
709
+ if old_s_xml == new_s_xml:
710
+ continue
711
+
712
+ # Remove old <s> children, keep objecttags and other children
713
+ old_children = list(el)
714
+ for child in old_children:
715
+ if child.tag == "s":
716
+ el.remove(child)
717
+
718
+ # Parse the new <s> elements and insert at the front
719
+ # Wrap in a dummy element for parsing
720
+ wrapper = ET.fromstring(f"<dummy>{new_s_xml}</dummy>")
721
+ insert_pos = 0
722
+ for new_s in wrapper:
723
+ el.insert(insert_pos, new_s)
724
+ insert_pos += 1
725
+
726
+ reformatted.append((eid, old_text, canonical))
727
+ log(f" Reformatted text id={eid}: '{old_text}' → '{canonical}' "
728
+ f"(subscript/italic)")
729
+
730
+ log(f"Reformatted {len(reformatted)} text label(s)")
731
+
732
+ # --- Step 5: Deduplicate text elements ---
733
+ # Rebuild id_to_el after replacements
734
+ id_to_el.clear()
735
+ for el in page:
736
+ eid = el.get("id", "")
737
+ if eid:
738
+ id_to_el[eid] = el
739
+
740
+ # Collect all text content for above/below arrow elements
741
+ def _normalize_text(text: str) -> str:
742
+ return text.strip().lower()
743
+
744
+ above_ids = step.get("ReactionStepObjectsAboveArrow", "").split()
745
+ below_ids = step.get("ReactionStepObjectsBelowArrow", "").split()
746
+
747
+ dedup_removed = []
748
+
749
+ for position_name, id_list_attr in [
750
+ ("above", "ReactionStepObjectsAboveArrow"),
751
+ ("below", "ReactionStepObjectsBelowArrow"),
752
+ ]:
753
+ id_list = step.get(id_list_attr, "").split()
754
+ seen_texts: Dict[str, str] = {} # normalized_text → first_id
755
+ new_id_list = []
756
+ for eid in id_list:
757
+ el = id_to_el.get(eid)
758
+ if el is None:
759
+ continue
760
+
761
+ # Only deduplicate text elements
762
+ if el.tag == "t":
763
+ text = _get_text_content(el)
764
+ norm = _normalize_text(text)
765
+ if norm in seen_texts:
766
+ # Duplicate — remove element and skip ID
767
+ log(f" Dedup: removing duplicate '{text}' (id={eid}) "
768
+ f"from {position_name} (keeping id={seen_texts[norm]})")
769
+ page.remove(el)
770
+ dedup_removed.append((eid, text, position_name))
771
+ continue
772
+ seen_texts[norm] = eid
773
+
774
+ new_id_list.append(eid)
775
+
776
+ step.set(id_list_attr, " ".join(new_id_list))
777
+
778
+ log(f"Removed {len(dedup_removed)} duplicate(s)")
779
+
780
+ # --- Step 6: Merge all text labels into one centered block (optional) ---
781
+ merged_conditions = False
782
+ merged_text_id = None
783
+
784
+ if merge_conditions:
785
+ # Rebuild id_to_el
786
+ id_to_el.clear()
787
+ for el in page:
788
+ eid = el.get("id", "")
789
+ if eid:
790
+ id_to_el[eid] = el
791
+
792
+ # Find arrow midpoint for rough centering
793
+ arrow_cx, arrow_cy = 500.0, 250.0 # fallback
794
+ arrow_id = step.get("ReactionStepArrows", "").split()
795
+ for aid in arrow_id:
796
+ a_el = id_to_el.get(aid)
797
+ if a_el is None:
798
+ # Try the superseding arrow (graphic → arrow pattern)
799
+ for child in page:
800
+ if child.tag == "arrow":
801
+ a_el = child
802
+ break
803
+ if a_el is None:
804
+ for child in page:
805
+ if child.tag == "graphic" and child.get("id") == aid:
806
+ sup_id = child.get("SupersededBy", "")
807
+ if sup_id:
808
+ a_el = id_to_el.get(sup_id)
809
+ break
810
+ if a_el is not None:
811
+ head = a_el.get("Head3D", "")
812
+ tail = a_el.get("Tail3D", "")
813
+ if head and tail:
814
+ hx, hy = float(head.split()[0]), float(head.split()[1])
815
+ tx, ty = float(tail.split()[0]), float(tail.split()[1])
816
+ arrow_cx = (hx + tx) / 2.0
817
+ arrow_cy = (hy + ty) / 2.0
818
+ else:
819
+ bb = a_el.get("BoundingBox", "")
820
+ if bb:
821
+ vals = [float(v) for v in bb.split()]
822
+ arrow_cx = (vals[0] + vals[2]) / 2.0
823
+ arrow_cy = (vals[1] + vals[3]) / 2.0
824
+ break
825
+
826
+ # Collect ALL text labels from above + below into one ordered list
827
+ all_text_ids = []
828
+ all_text_lines = []
829
+ non_text_above = []
830
+ non_text_below = []
831
+
832
+ for position_name, id_list_attr in [
833
+ ("above", "ReactionStepObjectsAboveArrow"),
834
+ ("below", "ReactionStepObjectsBelowArrow"),
835
+ ]:
836
+ id_list = step.get(id_list_attr, "").split()
837
+ for eid in id_list:
838
+ el = id_to_el.get(eid)
839
+ if el is None:
840
+ continue
841
+ if el.tag == "t":
842
+ text = _get_text_content(el)
843
+ if text:
844
+ all_text_ids.append(eid)
845
+ all_text_lines.append(text)
846
+ else:
847
+ if position_name == "above":
848
+ non_text_above.append(eid)
849
+ else:
850
+ non_text_below.append(eid)
851
+
852
+ if len(all_text_ids) >= 2:
853
+ log(f" Merging {len(all_text_ids)} text labels into one block: "
854
+ f"{all_text_lines}")
855
+
856
+ # Build merged <s> content with \n between lines
857
+ s_parts = []
858
+ for i, text in enumerate(all_text_lines):
859
+ if i > 0:
860
+ s_parts.append(
861
+ '<s font="3" size="10" color="0" face="96">\n</s>'
862
+ )
863
+ canonical = get_reagent_db().resolve_display(text)
864
+ s_parts.append(_build_formatted_s_xml(canonical))
865
+ s_xml = "".join(s_parts)
866
+
867
+ # Keep the first text element, remove all others
868
+ keep_id = all_text_ids[0]
869
+ keep_el = id_to_el[keep_id]
870
+ keep_z = keep_el.get("Z", "1")
871
+
872
+ for eid in all_text_ids[1:]:
873
+ el = id_to_el.get(eid)
874
+ if el is not None:
875
+ page.remove(el)
876
+
877
+ # Position just below arrow — ChemDraw cleanup will refine
878
+ line_height = 12.5
879
+ n_lines = len(all_text_lines)
880
+ max_text_len = max(len(t) for t in all_text_lines)
881
+ total_w = max_text_len * 5.8
882
+ total_h = n_lines * line_height
883
+
884
+ mcx = arrow_cx
885
+ by1 = arrow_cy + 4.0 # just below arrow
886
+ by2 = by1 + total_h
887
+ bx1 = mcx - total_w / 2.0
888
+ bx2 = mcx + total_w / 2.0
889
+ first_baseline_y = by1 + 10.0 # first line baseline
890
+
891
+ # Rebuild the kept element
892
+ for child in list(keep_el):
893
+ keep_el.remove(child)
894
+
895
+ keep_el.set("p", f"{mcx:.2f} {first_baseline_y:.2f}")
896
+ keep_el.set("BoundingBox",
897
+ f"{bx1:.2f} {by1:.2f} {bx2:.2f} {by2:.2f}")
898
+ keep_el.set("Z", keep_z)
899
+ keep_el.set("InterpretChemically", "no")
900
+ keep_el.set("LineHeight", "auto")
901
+ keep_el.set("CaptionJustification", "Center")
902
+ keep_el.set("Justification", "Center")
903
+
904
+ # Parse and insert new <s> children
905
+ wrapper = ET.fromstring(f"<dummy>{s_xml}</dummy>")
906
+ for child in wrapper:
907
+ keep_el.append(child)
908
+
909
+ # Update step refs: merged text block goes above arrow
910
+ # (ChemDraw Clean Up Reaction expects objects in above/below)
911
+ step.set("ReactionStepObjectsAboveArrow",
912
+ " ".join(non_text_above + [keep_id]))
913
+ step.set("ReactionStepObjectsBelowArrow",
914
+ " ".join(non_text_below))
915
+
916
+ merged_conditions = True
917
+ merged_text_id = keep_id
918
+ log(f" Merged into single text block (id={keep_id})")
919
+
920
+ # --- Step 7: Write output CDXML ---
921
+ tree.write(output_path, xml_declaration=True, encoding="UTF-8")
922
+
923
+ # Post-process: fix XML declaration and DOCTYPE
924
+ _fixup_cdxml_output(output_path)
925
+
926
+ log(f"Written polished scheme to {output_path}")
927
+
928
+ return {
929
+ "replacements": replacements,
930
+ "promotions": promotions,
931
+ "alignments": alignments,
932
+ "reformatted": reformatted,
933
+ "dedup_removed": dedup_removed,
934
+ "merged_conditions": merged_conditions,
935
+ "merged_text_id": merged_text_id,
936
+ "total_reagents": len(reagents),
937
+ "product_smiles": classification.get("product_smiles"),
938
+ "classification": classification,
939
+ }
940
+
941
+
942
+ def _fixup_cdxml_output(path: str):
943
+ """Fix up the CDXML output from ElementTree.
944
+
945
+ ElementTree's write() doesn't include DOCTYPE and may mangle some
946
+ attributes. This does a minimal fix-up pass.
947
+ """
948
+ with open(path, "r", encoding="utf-8") as f:
949
+ content = f.read()
950
+
951
+ # Ensure proper XML declaration
952
+ if not content.startswith("<?xml"):
953
+ content = '<?xml version="1.0" encoding="UTF-8" ?>\n' + content
954
+
955
+ # Add DOCTYPE if missing
956
+ if "<!DOCTYPE CDXML" not in content:
957
+ content = content.replace(
958
+ "<CDXML",
959
+ '<!DOCTYPE CDXML SYSTEM "http://www.cambridgesoft.com/xml/cdxml.dtd" >\n<CDXML',
960
+ 1,
961
+ )
962
+
963
+ with open(path, "w", encoding="utf-8") as f:
964
+ f.write(content)
965
+
966
+
967
+ # ---------------------------------------------------------------------------
968
+ # ChemDraw COM cleanup pass
969
+ # ---------------------------------------------------------------------------
970
+
971
+ def _find_arrow_center(page: ET.Element, step: ET.Element,
972
+ id_to_el: Dict[str, ET.Element],
973
+ ) -> Tuple[float, float]:
974
+ """Find the arrow midpoint from step metadata."""
975
+ arrow_cx, arrow_cy = 500.0, 250.0
976
+ arrow_ids = step.get("ReactionStepArrows", "").split()
977
+ for aid in arrow_ids:
978
+ a_el = id_to_el.get(aid)
979
+ if a_el is None:
980
+ for child in page:
981
+ if child.tag == "graphic" and child.get("id") == aid:
982
+ sup_id = child.get("SupersededBy", "")
983
+ if sup_id:
984
+ a_el = id_to_el.get(sup_id)
985
+ break
986
+ if a_el is not None:
987
+ head = a_el.get("Head3D", "")
988
+ tail = a_el.get("Tail3D", "")
989
+ if head and tail:
990
+ hx, hy = float(head.split()[0]), float(head.split()[1])
991
+ tx, ty = float(tail.split()[0]), float(tail.split()[1])
992
+ arrow_cx = (hx + tx) / 2.0
993
+ arrow_cy = (hy + ty) / 2.0
994
+ else:
995
+ bb = a_el.get("BoundingBox", "")
996
+ if bb:
997
+ vals = [float(v) for v in bb.split()]
998
+ arrow_cx = (vals[0] + vals[2]) / 2.0
999
+ arrow_cy = (vals[1] + vals[3]) / 2.0
1000
+ break
1001
+ return arrow_cx, arrow_cy
1002
+
1003
+
1004
+ def _compact_toward_arrow(cdxml_path: str, verbose: bool = False):
1005
+ """Move above/below-arrow objects closer to the arrow line.
1006
+
1007
+ ChemDraw's "Clean Up Reaction" only recognises reaction components
1008
+ that are reasonably close together. After merging conditions into
1009
+ one large text block the vertical spread can exceed this threshold.
1010
+ This helper nudges every above-arrow element downward and every
1011
+ below-arrow element upward so that all objects sit within a tight
1012
+ band around the arrow y-coordinate.
1013
+ """
1014
+ def log(msg: str):
1015
+ if verbose:
1016
+ print(f"[scheme_polisher] {msg}", file=sys.stderr)
1017
+
1018
+ tree = ET.parse(cdxml_path)
1019
+ root = tree.getroot()
1020
+ page = root.find("page")
1021
+ scheme = page.find("scheme") if page is not None else None
1022
+ step = scheme.find("step") if scheme is not None else None
1023
+ if step is None:
1024
+ return
1025
+
1026
+ id_to_el: Dict[str, ET.Element] = {}
1027
+ for el in page:
1028
+ eid = el.get("id", "")
1029
+ if eid:
1030
+ id_to_el[eid] = el
1031
+
1032
+ arrow_cx, arrow_cy = _find_arrow_center(page, step, id_to_el)
1033
+ log(f" Compacting: arrow center = ({arrow_cx:.1f}, {arrow_cy:.1f})")
1034
+
1035
+ # Target: above-arrow objects sit with their bottom edge at arrow_cy - 5
1036
+ # Target: below-arrow objects sit with their top edge at arrow_cy + 5
1037
+ GAP = 5.0
1038
+
1039
+ for attr, direction in [
1040
+ ("ReactionStepObjectsAboveArrow", "above"),
1041
+ ("ReactionStepObjectsBelowArrow", "below"),
1042
+ ]:
1043
+ ids = step.get(attr, "").split()
1044
+ for eid in ids:
1045
+ el = id_to_el.get(eid)
1046
+ if el is None:
1047
+ continue
1048
+
1049
+ # Compute current bounding box center-y
1050
+ if el.tag == "fragment":
1051
+ _, cy = _fragment_bbox_center(el)
1052
+ elif el.tag == "t":
1053
+ bb = el.get("BoundingBox", "")
1054
+ if bb:
1055
+ vals = [float(v) for v in bb.split()]
1056
+ cy = (vals[1] + vals[3]) / 2.0
1057
+ else:
1058
+ continue
1059
+ else:
1060
+ continue
1061
+
1062
+ # How far to shift toward the arrow (y-axis points down)
1063
+ if direction == "above":
1064
+ target_cy = arrow_cy - GAP - 15 # keep a small gap above
1065
+ dy = target_cy - cy
1066
+ # dy > 0 means object is above target → move down toward arrow
1067
+ # dy < 0 means object is already below target → skip
1068
+ if dy <= 0:
1069
+ continue
1070
+ else:
1071
+ target_cy = arrow_cy + GAP + 15
1072
+ dy = target_cy - cy
1073
+ # dy < 0 means object is below target → move up toward arrow
1074
+ # dy > 0 means object is already above target → skip
1075
+ if dy >= 0:
1076
+ continue
1077
+
1078
+ log(f" Compacting {el.tag} id={eid} {direction}: "
1079
+ f"dy={dy:+.1f}")
1080
+ _shift_element_y(el, dy)
1081
+
1082
+ tree.write(cdxml_path, xml_declaration=True, encoding="UTF-8")
1083
+ _fixup_cdxml_output(cdxml_path)
1084
+
1085
+
1086
+ def _shift_element_y(el: ET.Element, dy: float):
1087
+ """Shift an element (fragment or text) vertically by dy points."""
1088
+ if el.tag == "fragment":
1089
+ # Shift all node positions
1090
+ for n in el.iter("n"):
1091
+ p = n.get("p")
1092
+ if p:
1093
+ parts = p.split()
1094
+ if len(parts) >= 2:
1095
+ new_y = float(parts[1]) + dy
1096
+ n.set("p", f"{parts[0]} {new_y:.2f}")
1097
+ # Shift nested text label positions
1098
+ for t in el.iter("t"):
1099
+ p = t.get("p")
1100
+ if p:
1101
+ parts = p.split()
1102
+ if len(parts) >= 2:
1103
+ new_y = float(parts[1]) + dy
1104
+ t.set("p", f"{parts[0]} {new_y:.2f}")
1105
+ bb = t.get("BoundingBox")
1106
+ if bb:
1107
+ vals = [float(v) for v in bb.split()]
1108
+ if len(vals) >= 4:
1109
+ vals[1] += dy
1110
+ vals[3] += dy
1111
+ t.set("BoundingBox",
1112
+ " ".join(f"{v:.2f}" for v in vals))
1113
+ # Shift fragment BoundingBox
1114
+ bb = el.get("BoundingBox")
1115
+ if bb:
1116
+ vals = [float(v) for v in bb.split()]
1117
+ if len(vals) >= 4:
1118
+ vals[1] += dy
1119
+ vals[3] += dy
1120
+ el.set("BoundingBox",
1121
+ " ".join(f"{v:.2f}" for v in vals))
1122
+
1123
+ elif el.tag == "t":
1124
+ p = el.get("p")
1125
+ if p:
1126
+ parts = p.split()
1127
+ if len(parts) >= 2:
1128
+ new_y = float(parts[1]) + dy
1129
+ el.set("p", f"{parts[0]} {new_y:.2f}")
1130
+ bb = el.get("BoundingBox")
1131
+ if bb:
1132
+ vals = [float(v) for v in bb.split()]
1133
+ if len(vals) >= 4:
1134
+ vals[1] += dy
1135
+ vals[3] += dy
1136
+ el.set("BoundingBox",
1137
+ " ".join(f"{v:.2f}" for v in vals))
1138
+
1139
+
1140
def _chemdraw_cleanup_reaction(cdxml_path: str, output_path: str,
                               verbose: bool = False):
    """Run ChemDraw COM "Clean Up Reaction" on the CDXML file.

    Reuses the same COM automation pattern as eln_cdx_cleanup.py.
    Expects the file to already be compacted (see _compact_toward_arrow).

    The document is always closed, and a ChemDraw instance that was launched
    by this call is always quit, even when a COM call fails part-way; the
    original exception then propagates to the caller.

    Args:
        cdxml_path: CDXML file to open in ChemDraw.
        output_path: Destination passed to ChemDraw's ``SaveAs``.
        verbose: When true, progress messages are printed to stderr.
    """
    # Not referenced below; presumably imported so a missing pywin32 fails
    # here, before ChemDraw is started — TODO confirm.
    import win32com.client  # noqa: F401

    def log(msg: str):
        if verbose:
            print(f"[scheme_polisher] {msg}", file=sys.stderr)

    log("Running ChemDraw COM cleanup...")

    # Import COM helpers from eln_cdx_cleanup
    from .eln_cdx_cleanup import (
        _get_chemdraw, _chemdraw_open,
        _restore_chemdraw_window,
    )

    cdApp, launched = _get_chemdraw()
    doc = None
    try:
        doc = _chemdraw_open(cdApp, os.path.abspath(cdxml_path))

        # Select all, then Clean Up Reaction (Structure menu → item 7)
        # Run 3 times — arrow lengths and spacing may not fully converge
        # on the first pass.
        for _ in range(3):
            doc.Objects.Select()
            time.sleep(1)
            cdApp.MenuBars(1).Menus(5).MenuItems(7).Execute()
            time.sleep(1)

        # Save to output
        doc.SaveAs(os.path.abspath(output_path))
        time.sleep(0.5)
    finally:
        # Release COM resources in the same order as the success path:
        # close the document first, then shut down an instance we started.
        try:
            if doc is not None:
                doc.Close(False)
        finally:
            if launched:
                _restore_chemdraw_window()
                cdApp.Quit()

    log(f"ChemDraw cleanup saved to {output_path}")
1183
+
1184
+
1185
+ # ---------------------------------------------------------------------------
1186
+ # CLI
1187
+ # ---------------------------------------------------------------------------
1188
+
1189
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the scheme polisher.

    Parses *argv* (``sys.argv[1:]`` when ``None``), polishes the input CDXML
    and, depending on the flags, compacts the result toward the arrow and/or
    runs ChemDraw's "Clean Up Reaction" on it.  A one-line summary is printed
    to stderr; with ``--json`` a machine-readable report goes to stdout.

    Post-processing modes:

    * ``raw``      - ``--no-compact``: polish only.
    * ``compact``  - ``--no-chemdraw-cleanup``: polish, then compact in place.
    * ``chemdraw`` - default: polish into a temporary file, compact it, then
      let ChemDraw write the final output.

    Returns:
        ``0`` on success, ``1`` when the input path does not exist or is not
        a regular file.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Polish a CDXML reaction scheme: replace non-contributing "
            "reagent structures with text abbreviations, deduplicate, "
            "and optionally run ChemDraw Clean Up Reaction."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "-i", "--input", required=True,
        help="Input CDXML file",
    )
    parser.add_argument(
        "-o", "--output", default=None,
        help="Output CDXML file (default: <input_stem>-polished.cdxml)",
    )
    parser.add_argument(
        "--no-chemdraw-cleanup", action="store_true",
        help="Skip the ChemDraw COM 'Clean Up Reaction' pass (still compacts)",
    )
    parser.add_argument(
        "--no-compact", action="store_true",
        help="Skip the compaction step (implies --no-chemdraw-cleanup)",
    )
    parser.add_argument(
        "--merge-conditions", action="store_true",
        help=(
            "Merge all text labels above/below the arrow into a single "
            "centered multi-line text block"
        ),
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true",
        help="Print progress to stderr",
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Output result as JSON to stdout",
    )

    args = parser.parse_args(argv)

    input_path = os.path.abspath(args.input)
    if not os.path.exists(input_path):
        print(f"ERROR: file not found: {input_path}", file=sys.stderr)
        return 1
    if not os.path.isfile(input_path):
        # A directory would otherwise fail later with a traceback from the
        # XML parser.
        print(f"ERROR: not a file: {input_path}", file=sys.stderr)
        return 1

    # Default output path
    if args.output is None:
        stem = os.path.splitext(input_path)[0]
        output_path = stem + "-polished.cdxml"
    else:
        output_path = os.path.abspath(args.output)

    # --no-compact implies --no-chemdraw-cleanup
    do_compact = not args.no_compact
    do_chemdraw = do_compact and not args.no_chemdraw_cleanup

    if do_chemdraw:
        mode = "chemdraw"
    elif do_compact:
        mode = "compact"
    else:
        mode = "raw"

    if do_chemdraw:
        # Compact + ChemDraw cleanup — write to temp, compact, cleanup
        import shutil

        tmpdir = tempfile.mkdtemp(prefix="scheme_polish_")
        tmp_path = os.path.join(tmpdir, "pre_cleanup.cdxml")
        try:
            result = polish_scheme(input_path, tmp_path,
                                   verbose=args.verbose,
                                   merge_conditions=args.merge_conditions)
            _compact_toward_arrow(tmp_path, args.verbose)
            _chemdraw_cleanup_reaction(tmp_path, output_path,
                                       verbose=args.verbose)
        finally:
            # Best-effort removal: a leftover temp dir must not mask the
            # real outcome of the run.
            shutil.rmtree(tmpdir, ignore_errors=True)
    else:
        # Write directly to the output; optionally compact it in place.
        result = polish_scheme(input_path, output_path,
                               verbose=args.verbose,
                               merge_conditions=args.merge_conditions)
        if do_compact:
            _compact_toward_arrow(output_path, args.verbose)

    # --- Report ---
    n_replaced = len(result["replacements"])
    n_promoted = len(result["promotions"])
    n_aligned = len(result.get("alignments", []))
    n_reformatted = len(result["reformatted"])
    n_deduped = len(result["dedup_removed"])
    merged = bool(result.get("merged_conditions"))

    parts = [
        f"{n_replaced} structure(s) → text",
        f"{n_promoted} text → structure",
        f"{n_aligned} fragment(s) aligned to product",
        f"{n_reformatted} text reformatted",
        f"{n_deduped} duplicate(s) removed",
    ]
    if merged:
        parts.append("conditions merged into single block")
    print(f"Polished: {', '.join(parts)}", file=sys.stderr)
    print(f"Output: {output_path}", file=sys.stderr)

    if args.json:
        # Only steps that actually did something are listed.
        steps_applied = []
        if n_replaced:
            steps_applied.append(f"{n_replaced} structure(s) replaced with text")
        if n_promoted:
            steps_applied.append(f"{n_promoted} text promoted to structure")
        if n_aligned:
            steps_applied.append(f"{n_aligned} fragment(s) aligned to product")
        if n_reformatted:
            steps_applied.append(f"{n_reformatted} text reformatted")
        if n_deduped:
            steps_applied.append(f"{n_deduped} duplicate(s) removed")
        if merged:
            steps_applied.append("conditions merged")
        if do_compact:
            steps_applied.append("compacted toward arrow")
        if do_chemdraw:
            steps_applied.append("ChemDraw COM cleanup")

        json_result = {
            "input": str(input_path),
            "output": str(output_path),
            "mode": mode,
            "steps_applied": steps_applied,
            "warnings": [],
        }
        print(json.dumps(json_result, indent=2))

    return 0
1334
+
1335
+
1336
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())