cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1340 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ scheme_polisher_v2.py — Experimental COM-free scheme polishing pipeline.
4
+
5
+ Takes a CDX/CDXML reaction scheme and produces a presentation-ready CDXML
6
+ without any ChemDraw COM dependency (except CDX→CDXML conversion if needed,
7
+ which falls through to whatever backend cdx_converter.py has available).
8
+
9
+ Pipeline:
10
+ 1. Convert CDX → CDXML if needed (via cdx_converter.py)
11
+ 2. Normalize bond lengths per-fragment to ACS Document 1996 (14.40 pt)
12
+ 3. Apply ACS Document 1996 document-level settings
13
+ 4. Normalize caption/label fonts to Arial 10pt Bold
14
+ 5. Run scheme_polisher logic (reagent classification, structure↔text swaps,
15
+ orientation alignment, subscript formatting, deduplication)
16
+ 6. Merge conditions into single centered text block (default on)
17
+ 7. Compact above/below-arrow objects toward arrow
18
+ 8. Run reaction_cleanup for final spatial layout
19
+
20
+ Defaults differ from scheme_polisher.py:
21
+ - --merge-conditions is ON by default (use --no-merge-conditions to disable)
22
+ - ChemDraw COM cleanup is NEVER used
23
+
24
+ Usage:
25
+ python scheme_polisher_v2.py input.cdx [-o output.cdxml] [-v]
26
+ python scheme_polisher_v2.py input.cdxml [-o output.cdxml] [-v]
27
+ python scheme_polisher_v2.py input.cdx --no-merge-conditions
28
+ python scheme_polisher_v2.py input.cdxml --approach compact -v
29
+ """
30
+
31
+ import argparse
32
+ import copy
33
+ import json
34
+ import math
35
+ import os
36
+ import subprocess
37
+ import sys
38
+ import tempfile
39
+ import xml.etree.ElementTree as ET
40
+ from typing import Dict, List, Optional, Tuple
41
+
42
+ from ...constants import (
43
+ ACS_BOND_LENGTH as TARGET_BOND_LENGTH,
44
+ ACS_STYLE as ACS_SETTINGS,
45
+ CDXML_MINIMAL_HEADER,
46
+ CDXML_FOOTER,
47
+ )
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Bond length measurement and per-fragment normalization
52
+ # ---------------------------------------------------------------------------
53
+
54
def _measure_bond_lengths(frag: ET.Element) -> List[float]:
    """Return the length of every measurable bond directly under *frag*.

    Only direct-child ``<n>`` and ``<b>`` elements are considered, so atoms
    nested inside an abbreviation node's inner fragment are not measured.
    A bond is measured when both its ``B`` and ``E`` ids refer to direct-child
    nodes that carry an ``id`` and a two-component ``p`` attribute.  Lengths
    of 0.1 or less are dropped.
    """
    # Atom id -> (x, y), taken from direct children only.
    positions: Dict[str, Tuple[float, float]] = {}
    for atom in frag.findall("n"):
        atom_id = atom.get("id", "")
        coords = atom.get("p", "").split()
        if not atom_id or len(coords) < 2:
            continue
        positions[atom_id] = (float(coords[0]), float(coords[1]))

    measured: List[float] = []
    for bond in frag.findall("b"):
        start = positions.get(bond.get("B", ""))
        end = positions.get(bond.get("E", ""))
        if start is None or end is None:
            continue
        span = math.sqrt((start[0] - end[0]) ** 2 + (start[1] - end[1]) ** 2)
        if span > 0.1:
            measured.append(span)

    return measured
82
+
83
+
84
def _median(values: List[float]) -> float:
    """Return the median of *values*, or 0.0 when the list is empty.

    For an even number of entries the two middle values are averaged.
    """
    ordered = sorted(values)
    if not ordered:
        return 0.0
    middle, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2.0
93
+
94
+
95
def _scale_fragment(frag: ET.Element, factor: float, cx: float, cy: float):
    """Scale the coordinates stored in *frag* by *factor* about (cx, cy).

    The ``p`` attribute of every descendant ``<n>`` and ``<t>`` is rewritten,
    as is the ``BoundingBox`` of every descendant ``<t>``, of *frag* itself
    and of every nested ``<fragment>``.  Values are written back with two
    decimals.  Attributes that are missing or have too few components are
    left alone.
    """
    def _about_center(x: float, y: float) -> Tuple[float, float]:
        return cx + (x - cx) * factor, cy + (y - cy) * factor

    def _rescale_point(elem: ET.Element) -> None:
        raw = elem.get("p")
        if not raw:
            return
        tokens = raw.split()
        if len(tokens) < 2:
            return
        sx, sy = _about_center(float(tokens[0]), float(tokens[1]))
        elem.set("p", f"{sx:.2f} {sy:.2f}")

    def _rescale_box(elem: ET.Element) -> None:
        raw = elem.get("BoundingBox")
        if not raw:
            return
        numbers = [float(tok) for tok in raw.split()]
        if len(numbers) < 4:
            return
        left, top = _about_center(numbers[0], numbers[1])
        right, bottom = _about_center(numbers[2], numbers[3])
        elem.set("BoundingBox",
                 f"{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}")

    # Atom positions, at every nesting depth.
    for atom in frag.iter("n"):
        _rescale_point(atom)

    # Text labels: anchor point plus bounding box.
    for label in frag.iter("t"):
        _rescale_point(label)
        _rescale_box(label)

    # The fragment's own box, then the boxes of nested fragments.
    _rescale_box(frag)
    for nested in frag.iter("fragment"):
        if nested is frag:
            continue
        _rescale_box(nested)
145
+
146
+
147
def _fragment_centroid(frag: ET.Element) -> Tuple[float, float]:
    """Return the mean (x, y) of the direct-child ``<n>`` positions of *frag*.

    Nodes without a usable ``p`` attribute are ignored; ``(0.0, 0.0)`` is
    returned when no node contributes a position.

    Note: further down this module an import from the layout alignment
    module is bound to the same name ``_fragment_centroid``, so code that
    runs after module import resolves the name to that imported function.
    """
    points: List[Tuple[float, float]] = []
    for atom in frag.findall("n"):
        tokens = (atom.get("p") or "").split()
        if len(tokens) >= 2:
            points.append((float(tokens[0]), float(tokens[1])))

    if not points:
        return 0.0, 0.0

    total = len(points)
    mean_x = sum(x for x, _ in points) / total
    mean_y = sum(y for _, y in points) / total
    return mean_x, mean_y
160
+
161
+
162
def normalize_bond_lengths(root: ET.Element, target: float = TARGET_BOND_LENGTH,
                           verbose: bool = False) -> int:
    """Rescale each page-level fragment so its median bond length is *target*.

    Every ``<fragment>`` that is a direct child of ``<page>`` is handled on
    its own: its bond lengths are measured, the median is compared with
    *target*, and the fragment is scaled about its own centroid.  Fragments
    with no measurable bonds, with a median below 1.0, or already within 2 %
    of the target are left untouched.

    Returns the number of fragments that were actually scaled (0 when the
    document has no ``<page>``).
    """
    page = root.find("page")
    if page is None:
        return 0

    rescaled = 0
    for fragment in page.findall("fragment"):
        bond_lengths = _measure_bond_lengths(fragment)
        if not bond_lengths:
            continue

        typical = _median(bond_lengths)
        if typical < 1.0:
            continue

        ratio = target / typical
        if abs(ratio - 1.0) < 0.02:
            # Close enough to the target: report (if asked) and move on.
            if verbose:
                label = fragment.get("id", "?")
                print(f" Fragment {label}: median {typical:.2f} pt, "
                      f"already at target ({ratio:.3f}x)", file=sys.stderr)
            continue

        pivot_x, pivot_y = _fragment_centroid(fragment)
        _scale_fragment(fragment, ratio, pivot_x, pivot_y)
        rescaled += 1

        if verbose:
            label = fragment.get("id", "?")
            print(f" Fragment {label}: median {typical:.2f} pt → "
                  f"scaled {ratio:.3f}x around ({pivot_x:.1f}, {pivot_y:.1f})",
                  file=sys.stderr)

    return rescaled
205
+
206
+
207
+ # ---------------------------------------------------------------------------
208
+ # ACS document settings + font normalization
209
+ # ---------------------------------------------------------------------------
210
+
211
def apply_acs_settings(root: ET.Element):
    """Copy every entry of ``ACS_SETTINGS`` onto *root* as an XML attribute.

    Existing attributes with the same name are overwritten.
    """
    for name in ACS_SETTINGS:
        root.set(name, ACS_SETTINGS[name])
215
+
216
+
217
def normalize_fonts(root: ET.Element, verbose: bool = False) -> int:
    """Give every page-level caption run a uniform font, size and face.

    Only ``<t>`` elements that are direct children of ``<page>`` are
    touched (captions/conditions); atom labels inside fragments are not.
    For each ``<s>`` run of such a caption:

    * ``font`` is set to ``"3"`` and ``size`` to ``"10"``;
    * ``face`` is set to ``"96"`` unless it is already ``"2"``, ``"32"`` or
      ``"96"`` (this file's comments describe those as the italic,
      subscript and formula faces, which are deliberately preserved).

    Which typeface font id 3 denotes depends on the document's font table,
    which is not visible here.

    Returns the number of ``<t>`` elements in which anything changed.
    """
    page = root.find("page")
    if page is None:
        return 0

    preserved_faces = ("2", "32", "96")
    uniform_attrs = (("font", "3"), ("size", "10"))

    touched = 0
    for caption in page.findall("t"):
        dirty = False
        for run in caption.findall("s"):
            for attr_name, wanted in uniform_attrs:
                if run.get(attr_name) != wanted:
                    run.set(attr_name, wanted)
                    dirty = True
            if run.get("face", "") not in preserved_faces:
                run.set("face", "96")
                dirty = True
        if dirty:
            touched += 1

    if verbose and touched:
        print(f" Normalized fonts on {touched} text element(s)", file=sys.stderr)
    return touched
253
+
254
+
255
def fix_narrow_text(root: ET.Element, verbose: bool = False) -> int:
    """Collapse page-level text labels that were wrapped almost per character.

    Some ELN exports store a label with a ``LineStarts`` entry for nearly
    every character, producing a one-character-wide, very tall column whose
    bounding box upsets later layout steps.

    A page-level ``<t>`` is treated as degenerate when its ``LineStarts``
    has more entries than ``max(2 * word_count, 3)``.  For such labels the
    ``LineStarts`` attribute is removed and, if the label has a usable ``p``
    anchor, ``BoundingBox`` is replaced by a single-line estimate of
    6.0 pt per character, reaching 11 pt above and 3 pt below the anchor.

    Returns the number of labels fixed.
    """
    page = root.find("page")
    if page is None:
        return 0

    fixed = 0
    for label in page.findall("t"):
        raw_starts = label.get("LineStarts")
        if not raw_starts:
            continue

        text = "".join((run.text or "") for run in label.findall("s"))
        if not text:
            continue

        n_lines = len(raw_starts.strip().split())
        word_count = len(text.split())

        # Ordinary multi-line captions have at most a couple of line
        # starts per word; anything beyond that is per-character wrapping.
        if n_lines <= max(word_count * 2, 3):
            continue

        del label.attrib["LineStarts"]

        anchor = label.get("p")
        if anchor:
            coords = anchor.split()
            if len(coords) >= 2:
                px, py = float(coords[0]), float(coords[1])
                est_width = len(text) * 6.0
                # BoundingBox order: left top right bottom
                label.set("BoundingBox",
                          f"{px:.2f} {py - 11:.2f} "
                          f"{px + est_width:.2f} {py + 3:.2f}")

        fixed += 1
        if verbose:
            print(f" Fixed narrow text: '{text}' "
                  f"({n_lines} LineStarts → single line)",
                  file=sys.stderr)

    return fixed
316
+
317
+
318
def resolve_orphan_reagent_text(root: ET.Element, verbose: bool = False) -> int:
    """Resolve orphan text labels to their reagent DB display names.

    ELN exports sometimes place reagent names as free-floating text
    elements that are NOT referenced in the ``<step>`` metadata (e.g.
    "Sodium Bicarbonate" placed next to the substrate).  Later processing
    only looks at step-referenced elements, so these labels would never be
    reformatted.

    This function:

    1. Collects every id referenced by any ``<step>`` of any page-level
       ``<scheme>``.
    2. For each page-level ``<t>`` with an integer id that is not in that
       set, looks its stripped, lower-cased text up in the reagent database.
    3. If the database returns a display name that differs (ignoring case)
       from the current text, rewrites the label to that display name and
       re-estimates its ``BoundingBox``.
    4. Appends the label's id to ``ReactionStepObjectsBelowArrow`` of the
       FIRST step found in the document (not the spatially nearest one).

    When a label is renamed, the display name goes into the first ``<s>``
    run and any further runs are removed.  Writing it into every run would
    repeat the name once per run (e.g. "NaHCO3NaHCO3").

    Returns the number of text elements resolved.
    """
    from ...resolve.reagent_db import get_reagent_db

    page = root.find("page")
    if page is None:
        return 0

    db = get_reagent_db()

    step_ref_attrs = ("ReactionStepReactants", "ReactionStepProducts",
                      "ReactionStepArrows",
                      "ReactionStepObjectsAboveArrow",
                      "ReactionStepObjectsBelowArrow")

    # One pass over the steps: gather referenced ids and remember the first
    # step, which receives the new below-arrow references.
    step_ids: set = set()
    first_step = None
    for scheme in page.findall("scheme"):
        for step in scheme.findall("step"):
            if first_step is None:
                first_step = step
            for attr in step_ref_attrs:
                for tok in step.get(attr, "").split():
                    try:
                        step_ids.add(int(tok))
                    except ValueError:
                        pass  # non-numeric token: not an element id

    count = 0
    for t_el in page.findall("t"):
        tid_str = t_el.get("id")
        if tid_str is None:
            continue
        try:
            tid = int(tid_str)
        except ValueError:
            continue

        if tid in step_ids:
            continue  # already referenced in a step

        runs = t_el.findall("s")
        text = "".join((s.text or "") for s in runs).strip()
        if len(text) < 2:
            continue

        display = db.display_for_name(text.lower())
        if display is None:
            continue

        # If the text already is the display form it is still added to the
        # step below, just not rewritten.
        if display.lower() != text.lower():
            # `runs` is non-empty here because `text` is non-empty.
            runs[0].text = display
            for extra in runs[1:]:
                t_el.remove(extra)

            # Re-estimate the bounding box for the new, single-line text.
            p = t_el.get("p")
            if p:
                parts = p.split()
                if len(parts) >= 2:
                    px, py = float(parts[0]), float(parts[1])
                    est_width = len(display) * 5.8
                    t_el.set("BoundingBox",
                             f"{px:.2f} {py - 9:.2f} "
                             f"{px + est_width:.2f} {py + 3:.2f}")
            if verbose:
                print(f" Renamed orphan text: '{text}' → '{display}'",
                      file=sys.stderr)

        # Register the label with the first step's below-arrow objects.
        if first_step is not None:
            below_str = first_step.get("ReactionStepObjectsBelowArrow", "")
            if str(tid) not in below_str.split():
                first_step.set("ReactionStepObjectsBelowArrow",
                               f"{below_str} {tid}".strip())
                if verbose:
                    print(f" Added '{display}' (id={tid}) to step "
                          f"below-arrow references",
                          file=sys.stderr)

        count += 1

    return count
428
+
429
+
430
+ # ---------------------------------------------------------------------------
431
+ # Alignment imports (from alignment.py)
432
+ # ---------------------------------------------------------------------------
433
+ # Geometry primitives + high-level alignment orchestrators live in
434
+ # alignment.py. We import what's needed here and keep backward-
435
+ # compatible private aliases for internal callers.
436
+
437
+ from ...layout.alignment import (
438
+ fragment_centroid as _fragment_centroid,
439
+ get_visible_carbon_positions as _get_visible_carbon_positions,
440
+ match_and_compute_rotation as _match_and_compute_rotation,
441
+ rotate_fragment_in_place as _rotate_all_coords,
442
+ rdkit_align_to_product,
443
+ kabsch_align_to_product,
444
+ align_product_to_reference,
445
+ rxnmapper_align_to_product,
446
+ )
447
+
448
+
449
def _shift_element_coords(elem: ET.Element, dx: float, dy: float) -> None:
    """Translate the coordinates stored in *elem*'s subtree by (dx, dy).

    Updated attributes (all rewritten with two decimals):

    * ``p`` of every ``<n>`` and ``<t>`` in the subtree (*elem* included);
    * ``BoundingBox`` of every ``<t>`` in the subtree;
    * ``BoundingBox`` of *elem* itself.

    Bounding boxes of other descendants (for example nested ``<fragment>``
    elements) are NOT touched; callers that need them moved do so
    themselves.

    When *elem* is itself a ``<t>`` its bounding box is shifted exactly
    once.  Previously it was shifted both by the ``<t>`` pass and by the
    element-level update, ending up displaced by twice the offset.
    """
    def _shift_point(node: ET.Element) -> None:
        raw = node.get("p")
        if not raw:
            return
        parts = raw.split()
        if len(parts) >= 2:
            node.set("p", f"{float(parts[0]) + dx:.2f} "
                          f"{float(parts[1]) + dy:.2f}")

    def _shift_box(node: ET.Element) -> None:
        raw = node.get("BoundingBox")
        if not raw:
            return
        vals = [float(v) for v in raw.split()]
        if len(vals) >= 4:
            node.set("BoundingBox",
                     f"{vals[0] + dx:.2f} {vals[1] + dy:.2f} "
                     f"{vals[2] + dx:.2f} {vals[3] + dy:.2f}")

    for n in elem.iter("n"):
        _shift_point(n)

    elem_handled_as_text = False
    for t in elem.iter("t"):  # iter() also yields elem when it is a <t>
        _shift_point(t)
        _shift_box(t)
        if t is elem:
            elem_handled_as_text = True

    if not elem_handled_as_text:
        _shift_box(elem)
483
+
484
+
485
+ # Note: RDKit MCS alignment functions have been moved to alignment.py.
486
+ # rdkit_align_to_product and kabsch_align_to_product are imported above.
487
+
488
+
489
+ # ---------------------------------------------------------------------------
490
+ # ChemScript per-fragment structure cleanup
491
+ # ---------------------------------------------------------------------------
492
+
493
+ # Dummy element used to replace abbreviation nodes during cleanup.
494
+ # Iodine (53) is a safe choice: ChemScript treats it as a normal atom
495
+ # and won't add hydrogens. In the rare case the molecule already has
496
+ # iodine, dummies are matched back by position proximity.
497
+ _ABBREV_DUMMY_ELEMENT = "53"
498
+
499
+
500
def _swap_abbreviations_for_dummies(work_frag: ET.Element) -> List[ET.Element]:
    """Turn each abbreviation node of *work_frag* into a bare dummy atom.

    Every direct-child ``<n NodeType="Fragment">`` is deep-copied (the copy
    is returned so it can be restored later), stripped of its children and
    abbreviation-specific attributes, and given ``Element`` set to
    ``_ABBREV_DUMMY_ELEMENT`` with ``NumHydrogens="0"``.
    """
    saved: List[ET.Element] = []
    for n in work_frag.findall("n"):
        if n.get("NodeType") != "Fragment":
            continue
        saved.append(copy.deepcopy(n))
        for child in list(n):
            n.remove(child)
        for attr in ("NodeType", "LabelDisplay", "NeedsClean",
                     "AS", "Warning"):
            n.attrib.pop(attr, None)
        n.set("Element", _ABBREV_DUMMY_ELEMENT)
        n.set("NumHydrogens", "0")
    return saved


def _pair_dummies_with_abbreviations(
        dummies: List[ET.Element],
        saved_abbrevs: List[ET.Element],
) -> List[Tuple[ET.Element, ET.Element, Tuple[float, float], Tuple[float, float]]]:
    """Pair dummy atoms with saved abbreviation nodes, closest pairs first.

    All (dummy, abbreviation) distances are ranked and pairs are accepted in
    ascending order, each element being used at most once.  Unlike a greedy
    walk in dummy order, an extra candidate atom (e.g. a genuine atom of the
    dummy element elsewhere in the molecule) cannot claim an abbreviation
    ahead of the dummy that actually sits at the abbreviation's position.

    Returns tuples ``(dummy, saved_node, dummy_xy, saved_xy)``.
    """
    def _xy(node: ET.Element) -> Optional[Tuple[float, float]]:
        parts = node.get("p", "").split()
        if len(parts) < 2:
            return None
        return float(parts[0]), float(parts[1])

    dummy_xy = [_xy(d) for d in dummies]
    saved_xy = [_xy(s) for s in saved_abbrevs]

    ranked = []
    for di, dpos in enumerate(dummy_xy):
        if dpos is None:
            continue
        for si, spos in enumerate(saved_xy):
            if spos is None:
                continue
            d2 = (dpos[0] - spos[0]) ** 2 + (dpos[1] - spos[1]) ** 2
            ranked.append((d2, di, si))
    ranked.sort()

    used_dummies: set = set()
    used_saved: set = set()
    pairs = []
    for _, di, si in ranked:
        if di in used_dummies or si in used_saved:
            continue
        used_dummies.add(di)
        used_saved.add(si)
        pairs.append((dummies[di], saved_abbrevs[si],
                      dummy_xy[di], saved_xy[si]))
    return pairs


def _cleanup_fragments_chemscript(root: ET.Element,
                                  verbose: bool = False) -> int:
    """Clean up each fragment's geometry via ChemScript CleanupStructure.

    Each page-level ``<fragment>`` is copied into a standalone CDXML file,
    cleaned by the ChemScript bridge, and swapped back into the page at the
    same child index, keeping the original centroid and fragment id.
    Fragments with no atom coordinates, or with fewer than three visible
    carbons, are skipped.

    **Abbreviation preservation:** before cleanup, abbreviation nodes
    (``NodeType="Fragment"``) in the working copy are replaced by dummy
    atoms so ChemScript does not expand them; afterwards the saved nodes
    are put back at the cleaned dummy positions.  Dummies and saved nodes
    are matched globally by distance.  If any abbreviation cannot be
    restored, the cleaned result is discarded and the original fragment is
    kept, so a dummy atom is never left in the document.

    **Orientation preservation:** when the visible-carbon count is unchanged
    by cleanup, the cleaned fragment is rotated back to the original
    orientation (rotations below 1 degree are ignored).

    Failures for an individual fragment are caught, reported only when
    *verbose* is true, and leave that fragment unchanged.

    Returns the number of fragments cleaned.
    """
    page = root.find("page")
    if page is None:
        return 0

    # Lazy-init ChemScript bridge: only created once a fragment needs it.
    cs_bridge = None

    def _ensure_cs():
        nonlocal cs_bridge
        if cs_bridge is None:
            from ...chemdraw.chemscript_bridge import ChemScriptBridge
            cs_bridge = ChemScriptBridge()
        return cs_bridge

    cleaned_count = 0

    for frag in list(page.findall("fragment")):  # list() — we modify page
        frag_id = frag.get("id", "?")

        # Measure current centroid; (0, 0) is the helper's "no coordinates"
        # result as used by this file.
        old_cx, old_cy = _fragment_centroid(frag)
        if old_cx == 0.0 and old_cy == 0.0:
            if verbose:
                print(f" Fragment {frag_id}: no atom coords, skipping",
                      file=sys.stderr)
            continue

        # Save visible carbon positions for orientation matching
        old_carbons = _get_visible_carbon_positions(frag)

        # Skip fragments with too few visible carbons — these are likely
        # inorganic salts or very small molecules that don't benefit from
        # geometry cleanup, and whose connectivity cleanup may alter.
        if len(old_carbons) < 3:
            if verbose:
                print(f" Fragment {frag_id}: only {len(old_carbons)} "
                      f"visible carbon(s), skipping cleanup",
                      file=sys.stderr)
            continue

        # Preserve objecttag children so they can be re-attached afterwards.
        saved_objecttags = [copy.deepcopy(ot)
                            for ot in frag.findall("objecttag")]

        # Work on a deep copy so the original fragment is untouched in case
        # cleanup fails.
        work_frag = copy.deepcopy(frag)
        saved_abbrevs = _swap_abbreviations_for_dummies(work_frag)

        if saved_abbrevs and verbose:
            labels = []
            for sa in saved_abbrevs:
                t = sa.find("t")
                if t is not None:
                    labels.append("".join(
                        (s.text or "") for s in t.findall("s")))
            print(f" Fragment {frag_id}: {len(saved_abbrevs)} abbreviation(s) "
                  f"swapped with dummies ({', '.join(labels)})",
                  file=sys.stderr)

        # Wrap the modified copy in minimal CDXML
        frag_xml = ET.tostring(work_frag, encoding="unicode")
        wrapper_cdxml = (
            f'{CDXML_MINIMAL_HEADER}\n'
            '<page id="1">\n'
            f'{frag_xml}\n'
            '</page>\n'
            f'{CDXML_FOOTER}'
        )

        tmp_in = tmp_out = None
        try:
            _ensure_cs()

            with tempfile.NamedTemporaryFile(
                suffix=".cdxml", mode="w", delete=False, encoding="utf-8"
            ) as f:
                # Record the name first so the file is removed in `finally`
                # even if the write fails.
                tmp_in = f.name
                f.write(wrapper_cdxml)

            # Derive the output name from the suffix only; a plain
            # str.replace would also rewrite ".cdxml" inside a directory
            # name.
            tmp_out = os.path.splitext(tmp_in)[0] + "-clean.cdxml"
            cs_bridge.cleanup(tmp_in, output=tmp_out)

            # Parse cleaned output
            clean_root = ET.parse(tmp_out).getroot()
            clean_page = clean_root.find("page")
            if clean_page is None:
                continue
            clean_frag = clean_page.find("fragment")
            if clean_frag is None:
                continue

            # --- Preserve original orientation ---
            # Cleanup can arbitrarily rotate the structure.  Visible carbon
            # counts should agree before/after because abbreviations were
            # replaced with dummies, not carbons.
            new_carbons = _get_visible_carbon_positions(clean_frag)

            if (len(old_carbons) >= 3
                    and len(new_carbons) == len(old_carbons)):
                cos_a, sin_a, angle_deg = _match_and_compute_rotation(
                    new_carbons, old_carbons)
                if abs(angle_deg) >= 1.0:
                    rot_cx, rot_cy = _fragment_centroid(clean_frag)
                    _rotate_all_coords(
                        clean_frag, cos_a, sin_a, rot_cx, rot_cy)
                    if verbose:
                        print(f" Fragment {frag_id}: re-aligned "
                              f"{angle_deg:.1f}\u00b0 to original "
                              f"orientation", file=sys.stderr)
            elif verbose and old_carbons:
                print(f" Fragment {frag_id}: Kabsch skipped "
                      f"(old={len(old_carbons)}, "
                      f"new={len(new_carbons)} visible carbons)",
                      file=sys.stderr)

            # Compute new centroid and shift to old position
            new_cx, new_cy = _fragment_centroid(clean_frag)
            if new_cx == 0.0 and new_cy == 0.0:
                continue

            dx = old_cx - new_cx
            dy = old_cy - new_cy

            # Shift all coordinates in the cleaned fragment
            _shift_element_coords(clean_frag, dx, dy)
            # Inner <fragment> bounding boxes are not covered by
            # _shift_element_coords, so move them here.
            for inner in clean_frag.iter("fragment"):
                if inner is clean_frag:
                    continue
                ib = inner.get("BoundingBox")
                if ib:
                    vals = [float(v) for v in ib.split()]
                    if len(vals) >= 4:
                        inner.set("BoundingBox",
                                  f"{vals[0]+dx:.2f} {vals[1]+dy:.2f} "
                                  f"{vals[2]+dx:.2f} {vals[3]+dy:.2f}")

            # --- Restore abbreviation nodes ---
            if saved_abbrevs:
                dummies = [n for n in clean_frag.findall("n")
                           if n.get("Element") == _ABBREV_DUMMY_ELEMENT]
                pairs = _pair_dummies_with_abbreviations(
                    dummies, saved_abbrevs)

                if len(pairs) < len(saved_abbrevs):
                    # Swapping the fragment in now would leave a dummy atom
                    # (or a missing group) in the document; keep the
                    # original fragment instead.
                    if verbose:
                        print(f" Fragment {frag_id}: cleanup failed: "
                              f"restored {len(pairs)} of "
                              f"{len(saved_abbrevs)} abbreviation(s)",
                              file=sys.stderr)
                    continue

                for dummy, saved_node, (d_x, d_y), (s_x, s_y) in pairs:
                    # Offset from the old to the new abbreviation position
                    abbr_dx = d_x - s_x
                    abbr_dy = d_y - s_y

                    # Take over the dummy's position and (if present) id so
                    # bonds in the cleaned fragment still resolve.
                    saved_node.set("p", dummy.get("p"))
                    dummy_id = dummy.get("id")
                    if dummy_id is not None:
                        saved_node.set("id", dummy_id)

                    # Move the abbreviation's inner fragment along with it
                    inner_frag = saved_node.find("fragment")
                    if inner_frag is not None:
                        _shift_element_coords(inner_frag, abbr_dx, abbr_dy)

                    # Replace dummy with abbreviation at the same index
                    idx = list(clean_frag).index(dummy)
                    clean_frag.remove(dummy)
                    clean_frag.insert(idx, saved_node)

                    if verbose:
                        lbl = ""
                        t = saved_node.find("t")
                        if t is not None:
                            lbl = "".join(
                                (s.text or "") for s in t.findall("s"))
                        print(f" Fragment {frag_id}: restored "
                              f"abbreviation '{lbl}' at "
                              f"({d_x:.1f}, {d_y:.1f})",
                              file=sys.stderr)

            # Preserve original fragment ID
            clean_frag.set("id", frag_id)

            # Re-attach the objecttags saved from the original fragment
            for ot in saved_objecttags:
                clean_frag.append(ot)

            # Replace fragment in page, keeping its child index
            frag_index = list(page).index(frag)
            page.remove(frag)
            page.insert(frag_index, clean_frag)

            cleaned_count += 1
            if verbose:
                print(f" Fragment {frag_id}: cleaned "
                      f"(shift dx={dx:.1f}, dy={dy:.1f})",
                      file=sys.stderr)

        except Exception as exc:
            # Best effort: a failure leaves this fragment as it was.
            if verbose:
                print(f" Fragment {frag_id}: cleanup failed: {exc}",
                      file=sys.stderr)
        finally:
            for tmp in (tmp_in, tmp_out):
                if tmp and os.path.exists(tmp):
                    try:
                        os.unlink(tmp)
                    except OSError:
                        pass

    # Close ChemScript bridge
    if cs_bridge is not None:
        try:
            cs_bridge.close()
        except Exception:
            pass

    return cleaned_count
782
+
783
+
784
def _cleanup_fragments_rdkit(root: ET.Element,
                             verbose: bool = False) -> int:
    """Run the RDKit-based cleanup on every page-level fragment.

    Each direct-child ``<fragment>`` of ``<page>`` is handed to
    ``cleanup_fragment_rdkit`` from the package's ``rdkit_utils`` module,
    together with the *verbose* flag.  A fragment counts as cleaned when
    that call returns a truthy value.  An exception raised for one fragment
    is swallowed (and reported when *verbose* is true) so the remaining
    fragments are still processed.

    Returns the number of fragments cleaned.
    """
    from ...rdkit_utils import cleanup_fragment_rdkit

    page = root.find("page")
    if page is None:
        return 0

    n_cleaned = 0
    for fragment in page.findall("fragment"):
        try:
            succeeded = cleanup_fragment_rdkit(fragment, verbose)
        except Exception as err:
            if verbose:
                label = fragment.get("id", "?")
                print(f" [warn] RDKit cleanup skipped fragment {label}: {err}",
                      file=sys.stderr)
            continue
        if succeeded:
            n_cleaned += 1
    return n_cleaned
811
+
812
+
813
+ # ---------------------------------------------------------------------------
814
+ # CDXML I/O helpers
815
+ # ---------------------------------------------------------------------------
816
+
817
def _parse_cdxml(path: str) -> ET.ElementTree:
    """Load the CDXML document stored at *path* and return its parsed tree."""
    document = ET.parse(path)
    return document
819
+
820
+
821
def _write_cdxml(tree: ET.ElementTree, path: str):
    """Serialize *tree* to *path* and patch the text up as CDXML.

    The tree is first written with an XML declaration (UTF-8).  The file
    is then read back and rewritten with two textual fixes:

    * if no ``<!DOCTYPE`` is present, the CDXML DOCTYPE line is inserted
      right after the first ``?>`` (the end of the XML declaration);
    * every occurrence of ``ns0:`` and ``:ns0`` is removed.
    """
    doctype = ('<!DOCTYPE CDXML SYSTEM '
               '"http://www.cambridgesoft.com/xml/cdxml.dtd" >')

    tree.write(path, xml_declaration=True, encoding="UTF-8")

    with open(path, "r", encoding="utf-8") as src:
        text = src.read()

    if "<!DOCTYPE" not in text:
        # Only the first "?>" is touched so later processing
        # instructions, if any, stay as they are.
        text = text.replace("?>", "?>\n" + doctype, 1)

    for marker in ("ns0:", ":ns0"):
        text = text.replace(marker, "")

    with open(path, "w", encoding="utf-8") as dst:
        dst.write(text)
836
+
837
+
838
def _convert_cdx_to_cdxml(cdx_path: str, verbose: bool = False) -> str:
    """Convert a CDX file to CDXML by running the cdx_converter module.

    The converter is started as a child process of the current
    interpreter (``python -m cdxml_toolkit.chemdraw.cdx_converter``) and
    is asked, via ``-o``, to write its result next to the input with the
    extension replaced by ``.cdxml``.

    Parameters
    ----------
    cdx_path : str
        Path to the input ``.cdx`` file.
    verbose : bool
        When set, a progress line and the converter's stdout are echoed
        to stderr.

    Returns
    -------
    str
        Path to the generated CDXML file.

    Raises
    ------
    RuntimeError
        If the converter process exits with a non-zero status.
    """
    cdxml_path = os.path.splitext(cdx_path)[0] + ".cdxml"
    # The converter module ships inside the ``chemdraw`` subpackage; the
    # package has no top-level ``cdxml_toolkit.cdx_converter`` module.
    cmd = [sys.executable, "-m", "cdxml_toolkit.chemdraw.cdx_converter",
           cdx_path, "-o", cdxml_path]
    if verbose:
        print(f" Converting CDX → CDXML: {os.path.basename(cdx_path)}",
              file=sys.stderr)
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Prefer stderr, but fall back to stdout so the error is never empty
        # when the converter reports its failure there instead.
        detail = result.stderr.strip() or result.stdout.strip()
        raise RuntimeError(f"CDX conversion failed: {detail}")
    if verbose:
        print(f" {result.stdout.strip()}", file=sys.stderr)
    return cdxml_path
855
+
856
+
857
+ # ---------------------------------------------------------------------------
858
+ # Main pipeline
859
+ # ---------------------------------------------------------------------------
860
+
861
def run_pipeline(
    input_path: str,
    output_path: str,
    merge_conditions: bool = True,
    approach: str = "chemdraw_mimic",
    chemscript_cleanup: bool = True,
    align_mode: str = "rdkit",
    eln_csv: Optional[str] = None,
    ref_cdxml: Optional[str] = None,
    verbose: bool = False,
) -> str:
    """Run the full COM-free polishing pipeline.

    Parameters
    ----------
    input_path : str
        Path to input .cdx or .cdxml file.
    output_path : str
        Path for final output .cdxml file.
    merge_conditions : bool
        Merge all condition text into one centered block (default True).
    approach : str
        Layout approach for reaction_cleanup (default "chemdraw_mimic").
    chemscript_cleanup : bool
        Run per-fragment geometry cleanup before bond normalization
        (default True).  RDKit is tried first; ChemScript is used only
        when RDKit raised or cleaned no fragment.  If both fail the
        pipeline continues without cleanup.
    align_mode : str
        How to align reactant/reagent orientations to the product.
        "rdkit" (default): RDKit MCS + GenerateDepictionMatching2DStructure.
            If that raises, Kabsch alignment is tried instead.
        "rxnmapper": RXNMapper atom mapping.  If that raises, RDKit MCS
            alignment is tried instead.
        "kabsch": rigid-rotation Kabsch alignment (legacy mode).
        Any other value skips the alignment step.
    eln_csv : str or None
        Path to Findmolecule ELN CSV file for enrichment (equivalents,
        run arrow with SM mass and product yield).
    ref_cdxml : str or None
        Path to a reference CDXML file containing known-good structures
        drawn with the desired orientation.  The product is aligned to
        the reference before the reactants are aligned to the product.
    verbose : bool
        Print progress to stderr.

    Returns
    -------
    str
        Absolute path to the output file.

    Raises
    ------
    ValueError
        If *input_path* does not end in ``.cdx`` or ``.cdxml``.
    RuntimeError
        If a ``.cdx`` input cannot be converted to CDXML.
    """
    import shutil

    def log(msg: str):
        if verbose:
            print(f"[v2] {msg}", file=sys.stderr)

    input_path = os.path.abspath(input_path)
    output_path = os.path.abspath(output_path)
    ext = os.path.splitext(input_path)[1].lower()

    # --- CDX -> CDXML conversion if needed ---
    # The converted .cdxml is written next to the input and is kept.
    if ext == ".cdx":
        cdxml_path = _convert_cdx_to_cdxml(input_path, verbose)
    elif ext == ".cdxml":
        cdxml_path = input_path
    else:
        raise ValueError(f"Unsupported file format: {ext}")

    # --- Parse CDXML and optionally run fragment geometry cleanup ---
    tree = _parse_cdxml(cdxml_path)
    root = tree.getroot()

    if chemscript_cleanup:
        log("Step 0: Running fragment geometry cleanup...")
        cleanup_done = False
        # RDKit is the default cleanup path (works without ChemScript)
        try:
            n_cleaned = _cleanup_fragments_rdkit(root, verbose)
            if n_cleaned > 0:
                log(f" Cleaned {n_cleaned} fragment(s) via RDKit")
                cleanup_done = True
            else:
                log(" RDKit cleanup returned 0 fragments, trying ChemScript...")
        except Exception as exc:
            log(f" RDKit cleanup failed ({exc}), trying ChemScript...")
        # ChemScript fallback (if available and RDKit didn't clean anything)
        if not cleanup_done:
            try:
                n_cleaned = _cleanup_fragments_chemscript(root, verbose)
                if n_cleaned > 0:
                    log(f" Cleaned {n_cleaned} fragment(s) via ChemScript")
                else:
                    log(" No fragments cleaned by either backend")
            except Exception as exc2:
                log(f" ChemScript also unavailable ({exc2}), "
                    f"continuing without cleanup...")

    # --- Step 1: Normalize bond lengths per-fragment ---
    log("Step 1: Normalizing bond lengths to ACS 14.40 pt...")
    n_scaled = normalize_bond_lengths(root, TARGET_BOND_LENGTH, verbose)
    log(f" Scaled {n_scaled} fragment(s)")

    # --- Step 2: Apply ACS document settings ---
    log("Step 2: Applying ACS Document 1996 settings...")
    apply_acs_settings(root)

    # --- Step 3: Normalize fonts ---
    log("Step 3: Normalizing fonts to Arial 10pt...")
    normalize_fonts(root, verbose)

    # --- Step 3b: Fix narrow vertical text from ELN exports ---
    n_fixed_text = fix_narrow_text(root, verbose)
    if n_fixed_text:
        log(f" Fixed {n_fixed_text} narrow text element(s)")

    # --- Step 3c: Resolve orphan reagent text labels ---
    n_resolved = resolve_orphan_reagent_text(root, verbose)
    if n_resolved:
        log(f" Resolved {n_resolved} orphan reagent text label(s)")

    # Everything that touches the scratch directory runs inside the
    # try/finally so the directory is removed even if an early write fails.
    tmpdir = tempfile.mkdtemp(prefix="spv2_")
    try:
        # --- Write intermediate CDXML ---
        normalized_path = os.path.join(tmpdir, "normalized.cdxml")
        _write_cdxml(tree, normalized_path)
        log(" Wrote normalized CDXML to temp")

        # --- Step 4: Run scheme_polisher logic ---
        # Always skip alignment inside polish_scheme — alignment is handled
        # as an explicit Step 4e below.
        log("Step 4: Running scheme_polisher (classification + swaps + "
            "formatting, alignment deferred to Step 4e)...")
        from .scheme_polisher import polish_scheme, _compact_toward_arrow

        polished_path = os.path.join(tmpdir, "polished.cdxml")
        result = polish_scheme(
            normalized_path, polished_path,
            verbose=verbose,
            merge_conditions=merge_conditions,
            skip_alignment=True,
        )

        n_replaced = len(result["replacements"])
        n_promoted = len(result["promotions"])
        n_aligned = len(result.get("alignments", []))
        n_reformatted = len(result["reformatted"])
        n_deduped = len(result["dedup_removed"])
        log(f" {n_replaced} structure->text, {n_promoted} text->structure, "
            f"{n_aligned} aligned (Kabsch), {n_reformatted} reformatted, "
            f"{n_deduped} deduped"
            + (", conditions merged" if result.get("merged_conditions") else ""))

        # --- Step 4d: Align product to reference orientation ---
        if ref_cdxml:
            log("Step 4d: Aligning product to reference structure...")
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            try:
                success = align_product_to_reference(
                    polished_root, ref_cdxml, verbose=verbose)
                if success:
                    _write_cdxml(polished_tree, polished_path)
                    log(" Product aligned to reference orientation")
                else:
                    log(" No matching reference found — product keeps "
                        "current orientation")
            except Exception as exc:
                log(f" WARNING: Reference alignment failed ({exc})")

        # --- Step 4e: Alignment to product orientation ---
        if align_mode == "rdkit":
            log("Step 4e: RDKit MCS alignment to product orientation...")
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            try:
                n_rdkit_aligned = rdkit_align_to_product(
                    polished_root, verbose=verbose)
                if n_rdkit_aligned > 0:
                    _write_cdxml(polished_tree, polished_path)
                    log(f" Aligned {n_rdkit_aligned} fragment(s) via "
                        f"RDKit MCS + GenerateDepictionMatching2DStructure")
                else:
                    log(" No fragments aligned via RDKit "
                        "(MCS too small or RDKit unavailable)")
            except Exception as exc:
                log(f" WARNING: RDKit alignment failed ({exc}), "
                    f"falling back to Kabsch...")
                # Re-parse so the fallback starts from the file on disk,
                # not from a tree the failed attempt may have modified.
                polished_tree = _parse_cdxml(polished_path)
                polished_root = polished_tree.getroot()
                try:
                    aligned_ids = kabsch_align_to_product(
                        polished_root, verbose=verbose)
                    if aligned_ids:
                        _write_cdxml(polished_tree, polished_path)
                        log(f" Kabsch fallback aligned {len(aligned_ids)} "
                            f"fragment(s)")
                except Exception as exc2:
                    log(f" WARNING: Kabsch fallback also failed ({exc2})")
        elif align_mode == "rxnmapper":
            log("Step 4e: RXNMapper alignment to product orientation...")
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            try:
                n_rxnm_aligned = rxnmapper_align_to_product(
                    polished_root, verbose=verbose)
                if n_rxnm_aligned > 0:
                    _write_cdxml(polished_tree, polished_path)
                    log(f" Aligned {n_rxnm_aligned} fragment(s) via "
                        f"RXNMapper atom maps")
                else:
                    log(" No fragments aligned via RXNMapper")
            except Exception as exc:
                log(f" WARNING: RXNMapper alignment failed ({exc}), "
                    f"falling back to RDKit MCS...")
                # Same re-parse as above: start the fallback from disk.
                polished_tree = _parse_cdxml(polished_path)
                polished_root = polished_tree.getroot()
                try:
                    n_rdkit_aligned = rdkit_align_to_product(
                        polished_root, verbose=verbose)
                    if n_rdkit_aligned > 0:
                        _write_cdxml(polished_tree, polished_path)
                        log(f" MCS fallback aligned {n_rdkit_aligned} "
                            f"fragment(s)")
                except Exception as exc2:
                    log(f" WARNING: MCS fallback also failed ({exc2})")
        elif align_mode == "kabsch":
            log("Step 4e: Kabsch alignment to product orientation...")
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            try:
                aligned_ids = kabsch_align_to_product(
                    polished_root, verbose=verbose)
                if aligned_ids:
                    _write_cdxml(polished_tree, polished_path)
                    log(f" Aligned {len(aligned_ids)} fragment(s) via Kabsch")
                else:
                    log(" No fragments aligned via Kabsch")
            except Exception as exc:
                log(f" WARNING: Kabsch alignment failed ({exc})")
        else:
            # An unrecognised mode used to be ignored without a trace.
            log(f" WARNING: Unknown align_mode {align_mode!r}, "
                f"skipping alignment")

        # --- Step 4.5: Reposition non-substrate reactant above arrow ---
        if eln_csv:
            from .eln_enrichment import reposition_reactant_above_arrow

            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()
            if reposition_reactant_above_arrow(
                    polished_root, eln_csv, verbose=verbose):
                _write_cdxml(polished_tree, polished_path)
                log("Step 4.5: Repositioned non-substrate reactant above arrow")

        # --- Step 5.5: Phase A — ELN enrichment (equiv into text, before layout) ---
        enrichment_data = None
        if eln_csv:
            log("Step 5.5: Phase A — Injecting equivalents into text...")
            from .eln_enrichment import match_csv_to_scheme, enrich_phase_a

            # Re-parse the polished CDXML to inject equivs
            polished_tree = _parse_cdxml(polished_path)
            polished_root = polished_tree.getroot()

            enrichment_data = match_csv_to_scheme(
                polished_root, eln_csv, verbose=verbose)
            log(f" Matched {len(enrichment_data.matches)} CSV reagents "
                f"to scheme elements")

            merged_text_id = result.get("merged_text_id")
            enrich_phase_a(
                polished_root, enrichment_data,
                merged_text_id=str(merged_text_id) if merged_text_id else None,
                verbose=verbose,
            )

            # Write back
            _write_cdxml(polished_tree, polished_path)

        # --- Step 5: Compact toward arrow ---
        log("Step 5: Compacting objects toward arrow...")
        _compact_toward_arrow(polished_path, verbose)

        # --- Step 6: Run reaction_cleanup ---
        log(f"Step 6: Running reaction_cleanup (approach={approach})...")
        from ...layout.reaction_cleanup import run_cleanup

        run_cleanup(polished_path, output_path, approach=approach, verbose=verbose)
        log(" Final layout complete")

        # --- Step 7.5: Phase B — ELN enrichment (run arrow + eq labels, after layout) ---
        if eln_csv and enrichment_data:
            log("Step 7.5: Phase B — Adding run arrow + eq labels...")
            from .eln_enrichment import enrich_phase_b

            final_tree = _parse_cdxml(output_path)
            final_root = final_tree.getroot()

            enrich_phase_b(final_root, enrichment_data, verbose=verbose)

            _write_cdxml(final_tree, output_path)
            log(" Enrichment complete")

    finally:
        # Best effort: a leftover scratch directory must never mask the
        # pipeline's own result or error.
        shutil.rmtree(tmpdir, ignore_errors=True)

    log(f"Output: {output_path}")
    return output_path
1173
+
1174
+
1175
+ # ---------------------------------------------------------------------------
1176
+ # CLI
1177
+ # ---------------------------------------------------------------------------
1178
+
1179
def _classify_error(exc: Exception) -> str:
    """Translate an exception into a machine-readable error code.

    Classification looks at the exception's class name and at keywords in
    its lower-cased message.  Rules are tried top to bottom and the first
    hit wins; anything that matches no rule is ``"pipeline_failed"``.
    """
    text = str(exc).lower()
    exc_name = type(exc).__name__

    # (error code, class names that trigger it, message keywords that
    # trigger it) — order matters, earlier rows take precedence.
    rules = (
        ("file_not_found", ("FileNotFoundError",), ("not found",)),
        ("cdxml_parse_failed", ("ParseError",), ("parse", "xml")),
        ("smiles_parse_failed", (), ("rdkit", "smiles")),
        ("chemscript_error", (), ("chemscript",)),
        ("alignment_failed", (), ("alignment", "mcs")),
        ("enrichment_failed", (), ("enrichment", "csv")),
        ("layout_failed", (), ("layout", "cleanup")),
        ("internal_error",
         ("KeyError", "IndexError", "ValueError", "TypeError"), ()),
    )
    for code, class_names, keywords in rules:
        if exc_name in class_names:
            return code
        if any(word in text for word in keywords):
            return code
    return "pipeline_failed"
1201
+
1202
+
1203
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the polishing pipeline.

    Parses *argv* (``sys.argv[1:]`` when None), runs ``run_pipeline`` and
    optionally renders the result to an image.

    Returns 0 when the pipeline finished (a failing ``--render`` step is
    reported but does not change the exit status) and 1 when the input
    file does not exist or the pipeline raised.
    """
    from ...layout.reaction_cleanup import APPROACHES

    cli = argparse.ArgumentParser(
        description=(
            "COM-free scheme polishing pipeline: normalize bond lengths, "
            "classify reagents, swap structures/text, align orientations, "
            "format subscripts, merge conditions, and clean up layout."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    cli.add_argument("input", help="Input .cdx or .cdxml file")
    cli.add_argument(
        "-o", "--output", default=None,
        help="Output CDXML file (default: <input_stem>-v2.cdxml)")
    cli.add_argument(
        "--no-merge-conditions", action="store_true",
        help="Keep condition text as separate labels (default: merge into one block)")
    cli.add_argument(
        "--approach", choices=list(APPROACHES.keys()),
        default="chemdraw_mimic",
        help="Layout approach for reaction_cleanup (default: chemdraw_mimic)")
    cli.add_argument(
        "--no-chemscript-cleanup", action="store_true",
        help="Skip ChemScript CleanupStructure per fragment "
             "(default: cleanup is enabled to fix bond angles)")
    cli.add_argument(
        "--align-mode", choices=["rdkit", "rxnmapper", "kabsch"],
        default="rdkit",
        help="Orientation alignment method (default: rdkit). "
             "'rdkit' uses MCS + GenerateDepictionMatching2DStructure "
             "(can rotate individual bonds for better alignment). "
             "'rxnmapper' uses ML transformer atom mapping to align "
             "reactants to product orientation (falls back to MCS). "
             "'kabsch' uses rigid-body rotation only (legacy backup).")
    cli.add_argument(
        "--eln-csv", default=None,
        help="Findmolecule ELN CSV file for enrichment (adds equivalents, "
             "run arrow with SM mass and product yield)")
    cli.add_argument(
        "--ref-cdxml", default=None,
        help="Reference CDXML file with known-good structure(s) for "
             "product orientation. The product is aligned to the best-"
             "matching reference via MCS, then reactants align to the "
             "product.")
    cli.add_argument(
        "--render", action="store_true",
        help="Render output to PNG via cdxml_to_image.py")
    cli.add_argument(
        "--json-errors", action="store_true",
        help="Output structured JSON error objects to stderr on failure "
             "(for agent orchestration)")
    cli.add_argument(
        "-v", "--verbose", action="store_true",
        help="Print progress to stderr")

    args = cli.parse_args(argv)

    def report(code: str, detail: str, file_name: str, plain: str) -> None:
        """Emit one failure to stderr: JSON with --json-errors, else text."""
        if args.json_errors:
            payload = {"error": code, "detail": detail}
            if file_name:
                payload["file"] = file_name
            print(json.dumps(payload), file=sys.stderr)
        else:
            print(plain, file=sys.stderr)

    input_path = os.path.abspath(args.input)
    if not os.path.exists(input_path):
        msg = f"file not found: {input_path}"
        report("file_not_found", msg, os.path.basename(input_path),
               f"ERROR: {msg}")
        return 1

    if args.output is not None:
        output_path = os.path.abspath(args.output)
    else:
        # Default: sibling of the input, "<stem>-v2.cdxml".
        output_path = os.path.splitext(input_path)[0] + "-v2.cdxml"

    try:
        run_pipeline(
            input_path,
            output_path,
            merge_conditions=not args.no_merge_conditions,
            approach=args.approach,
            chemscript_cleanup=not args.no_chemscript_cleanup,
            align_mode=args.align_mode,
            eln_csv=args.eln_csv,
            ref_cdxml=args.ref_cdxml,
            verbose=args.verbose,
        )
    except Exception as e:
        error_code = _classify_error(e)
        report(error_code, str(e), os.path.basename(input_path),
               f"ERROR: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 1

    print(f"Output: {output_path}")

    if args.render:
        # Rendering is optional: a failure is reported, not fatal.
        try:
            from ...chemdraw.cdxml_to_image import cdxml_to_image
            png_path = cdxml_to_image(output_path)
            print(f"Rendered: {png_path}")
        except Exception as e:
            report("render_failed", str(e), os.path.basename(output_path),
                   f"Render failed: {e}")

    return 0
1337
+
1338
+
1339
# Script entry point: exit with main()'s return value as the process status.
if __name__ == "__main__":
    raise SystemExit(main())