cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,920 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ cdxml_builder.py — Build valid ChemDraw 16 CDXML from structured atom/bond data.
4
+
5
+ Produces CDXML that opens correctly in ChemDraw 16 using ACS Document 1996 style:
6
+ BondLength=14.40, ChainAngle=120, Arial 10 pt captions / 9 pt labels.
7
+
8
+ Modes
9
+ -----
10
+ Single molecule
11
+ python cdxml_builder.py --input molecule.json --output molecule.cdxml
12
+
13
+ Reaction scheme
14
+ python cdxml_builder.py --input reaction.json --mode reaction --output scheme.cdxml
15
+
16
+ Input JSON — single molecule
17
+ {
18
+ "atoms": [
19
+ {"index": 1, "symbol": "C", "x": 150.0, "y": 300.0},
20
+ {"index": 2, "symbol": "N", "x": 164.4, "y": 308.2, "num_hydrogens": 0},
21
+ ...
22
+ ],
23
+ "bonds": [
24
+ {"index": 1, "order": 1, "atom1": 1, "atom2": 2},
25
+ {"index": 2, "order": 2, "atom1": 2, "atom2": 3},
26
+ ...
27
+ ]
28
+ }
29
+
30
+ Input JSON — reaction
31
+ {
32
+ "reactants": [ <molecule>, ... ],
33
+ "products": [ <molecule>, ... ],
34
+ "conditions": {
35
+ "above": ["Pd2dba3 (5 mol%)", "BINAP (10 mol%)"],
36
+ "below": ["Cs2CO3 (2 eq.)", "dioxane", "100 °C, 24 h"]
37
+ }
38
+ }
39
+
40
+ Coordinates must already be in CDXML points (run coord_normalizer.py first).
41
+
42
+ Atom dict keys
43
+ index int atom number (1-based, must be unique)
44
+ symbol str element symbol ("C", "N", "Br", …)
45
+ x, y float position in CDXML points
46
+ num_hydrogens int explicit H count (omit or None for C to get implicit)
47
+ cfg int stereo flag (1=wedge up, 6=wedge down, 4=either)
48
+ charge int formal charge (0 = omit)
49
+
50
+ Bond dict keys
51
+ index int bond number (1-based, must be unique)
52
+ order int 1=single, 2=double, 3=triple, 4=aromatic
53
+ atom1, atom2 int atom indices
54
+ cfg int stereo: 1=up, 4=either, 6=down (wedge/dash)
55
+ double_pos str "Right" | "Left" (for double bonds in rings)
56
+ """
57
+
58
+ import argparse
59
+ import json
60
+ import math
61
+ import sys
62
+ from copy import deepcopy
63
+ from typing import Dict, List, Optional, Tuple
64
+ from xml.sax.saxutils import escape as xml_escape
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Constants — ACS Document 1996 (from shared constants.py)
69
+ # ---------------------------------------------------------------------------
70
+
71
+ from .constants import (
72
+ ACS_BOND_LENGTH_STR as ACS_BOND_LENGTH,
73
+ ACS_CHAIN_ANGLE_STR as ACS_CHAIN_ANGLE,
74
+ ACS_LABEL_FONT, ACS_LABEL_SIZE, ACS_LABEL_FACE,
75
+ ACS_CAPTION_SIZE, ACS_CAPTION_FACE,
76
+ ACS_LINE_WIDTH, ACS_BOLD_WIDTH, ACS_BOND_SPACING,
77
+ ACS_HASH_SPACING, ACS_MARGIN_WIDTH,
78
+ CDXML_HEADER as _CDXML_HEADER,
79
+ CDXML_FOOTER as _CDXML_FOOTER,
80
+ )
81
+
82
# Atomic numbers of the elements this builder knows how to tag with an
# ``Element`` attribute; symbols not listed here get no ``Element`` attribute.
ELEMENT_NUMBERS: Dict[str, int] = dict(
    H=1, B=5, C=6, N=7, O=8,
    F=9, Si=14, P=15, S=16, Cl=17,
    Se=34, Br=35, I=53, Cs=55,
)

# Symbols written with two characters; label sizing treats these as wider.
WIDE_SYMBOLS = set("Br Cl Si Se Cs".split())

# Input bond order -> value of the CDXML ``Order`` attribute.
# ``None`` means the attribute is left out (single bonds).
BOND_ORDER_ATTR: Dict[int, Optional[str]] = {
    1: None,
    2: "2",
    3: "3",
    4: "1.5",   # aromatic
}

# Input bond ``cfg`` flag -> value of the CDXML ``Display`` attribute.
BOND_STEREO_ATTR: Dict[int, str] = {
    1: "WedgeBegin",        # solid wedge
    4: "WedgeBegin",        # "either" is deliberately drawn like a solid wedge
    6: "WedgedHashBegin",   # hashed wedge
}
106
+
107
+
108
+ # ---------------------------------------------------------------------------
109
+ # ID counter
110
+ # ---------------------------------------------------------------------------
111
+
112
+ class _IDGen:
113
+ """Simple incrementing integer ID generator."""
114
+ def __init__(self, start: int = 1000):
115
+ self._n = start
116
+
117
+ def next(self) -> int:
118
+ v = self._n
119
+ self._n += 1
120
+ return v
121
+
122
+
123
+ # ---------------------------------------------------------------------------
124
+ # Label position helper
125
+ # ---------------------------------------------------------------------------
126
+
127
def _label_offset(symbol: str) -> Tuple[float, float]:
    """
    Return the ``(dx, dy)`` displacement from an atom position to the
    top-left corner of its ``<t>`` label.

    The horizontal shift is ``0.75`` minus an assumed glyph width (7.0 for
    symbols in ``WIDE_SYMBOLS``, 3.5 otherwise); the vertical shift is a
    fixed ``-7.5``.
    """
    if symbol in WIDE_SYMBOLS:
        glyph_width = 7.0
    else:
        glyph_width = 3.5
    dx = 0.75 - glyph_width
    dy = -7.5
    return dx, dy
136
+
137
+
138
def _label_bbox(x: float, y: float, symbol: str) -> str:
    """Return the ``BoundingBox`` attribute value for an atom label at (x, y).

    The box is horizontally centred on *x* (7.0 wide for symbols in
    ``WIDE_SYMBOLS``, 6.0 otherwise), its bottom edge sits at *y* and its
    top edge 7.52 above it.  Values are formatted as
    ``"left top right bottom"`` with two decimals.
    """
    if symbol in WIDE_SYMBOLS:
        width = 7.0
    else:
        width = 6.0
    left = x - width / 2.0
    right = left + width
    top = y - 7.52
    bottom = y
    return " ".join(f"{edge:.2f}" for edge in (left, top, right, bottom))
147
+
148
+
149
+ # ---------------------------------------------------------------------------
150
+ # Inner fragment builder for abbreviation nodes
151
+ # ---------------------------------------------------------------------------
152
+
153
def _build_abbrev_inner_fragment(
    label_smiles: str,
    anchor_x: float,
    anchor_y: float,
    ids: _IDGen,
) -> str:
    """Render the structure behind an abbreviation label as ``<fragment>`` XML.

    Coordinates are obtained from ``smiles_to_coords(label_smiles)`` and then
    passed through ``normalize_coords`` centred on (*anchor_x*, *anchor_y*)
    with ``flip_y=True``.  Besides the atoms and bonds, the fragment gets one
    ``ExternalConnectionPoint`` node placed 14.4 to the left of the first
    atom and bonded to it.

    Returns the ``<fragment>...</fragment>`` string, or ``""`` when the
    helper modules cannot be imported or yield no atoms.
    """
    try:
        from .image.structure_from_image import smiles_to_coords
        from .coord_normalizer import normalize_coords
    except ImportError:
        # Best effort: without the helpers the caller simply gets no inner fragment.
        return ""

    parsed = smiles_to_coords(label_smiles, offset_index=0)
    if not (parsed and parsed.get("atoms")):
        return ""

    atoms, bonds = normalize_coords(
        parsed["atoms"], parsed["bonds"],
        center_x=anchor_x, center_y=anchor_y,
        flip_y=True,
    )
    if not atoms:
        return ""

    xml: List[str] = [f'<fragment id="{ids.next()}">']

    # Caller-side atom index -> XML id assigned inside this fragment.
    node_ids: Dict[int, int] = {}
    for atom in atoms:
        node_id = ids.next()
        node_ids[atom["index"]] = node_id
        symbol = atom.get("symbol", "C")
        position = f'p="{atom["x"]:.2f} {atom["y"]:.2f}"'
        node_attrs = [f'id="{node_id}"', position, f'Z="{ids.next()}"']
        if symbol != "C":
            atomic_number = ELEMENT_NUMBERS.get(symbol, 0)
            if atomic_number:
                node_attrs.append(f'Element="{atomic_number}"')
            node_attrs.append(f'NumHydrogens="{atom.get("num_hydrogens", 0)}"')
            node_attrs.append('NeedsClean="yes"')
        xml.append("<n " + " ".join(node_attrs) + "/>")

    for bond in bonds:
        bond_id, bond_z = ids.next(), ids.next()
        # Endpoints that are not in node_ids fall back to id 0.
        begin = node_ids.get(bond["atom1"], 0)
        end = node_ids.get(bond["atom2"], 0)
        bond_attrs = [
            f'id="{bond_id}"', f'Z="{bond_z}"', f'B="{begin}"', f'E="{end}"',
        ]
        order_value = BOND_ORDER_ATTR.get(bond.get("order", 1))
        if order_value:
            bond_attrs.append(f'Order="{order_value}"')
        xml.append("<b " + " ".join(bond_attrs) + "/>")

    # External connection point: attached to the first atom of the fragment.
    ecp_id, ecp_z = ids.next(), ids.next()
    anchor_atom = atoms[0]
    ecp_x = anchor_atom["x"] - 14.4
    ecp_y = anchor_atom["y"]
    anchor_node_id = node_ids.get(anchor_atom["index"], 0)
    xml.append(
        f'<n id="{ecp_id}" NodeType="ExternalConnectionPoint" '
        f'p="{ecp_x:.2f} {ecp_y:.2f}" Z="{ecp_z}" '
        f'ExternalConnectionNum="1"/>'
    )
    link_id, link_z = ids.next(), ids.next()
    xml.append(
        f'<b id="{link_id}" Z="{link_z}" '
        f'B="{ecp_id}" E="{anchor_node_id}"/>'
    )

    xml.append('</fragment>')
    return "\n".join(xml)
238
+
239
+
240
+ # ---------------------------------------------------------------------------
241
+ # Fragment (molecule) builder
242
+ # ---------------------------------------------------------------------------
243
+
244
+ def _build_fragment(
245
+ atoms: List[Dict],
246
+ bonds: List[Dict],
247
+ ids: _IDGen,
248
+ atom_id_map: Optional[Dict[int, int]] = None, # out-param: atom index → xml id
249
+ ) -> Tuple[str, Dict[int, int], int]:
250
+ """
251
+ Build a <fragment> XML string.
252
+
253
+ Supports three atom types via optional dict keys:
254
+
255
+ * **Normal atoms** — standard CDXML atoms (carbon or heteroatom with label).
256
+ * **Abbreviation atoms** (``is_abbreviation=True``) — rendered as
257
+ ``NodeType="Fragment"`` with an inner ``<fragment>`` and a text label.
258
+ Requires ``abbrev_label``; ``abbrev_smiles`` used for inner fragment.
259
+ * **Generic group atoms** (``is_generic=True``) — rendered as
260
+ ``NodeType="GenericNickname"`` (or other *node_type*) with a text label.
261
+ Requires ``generic_label``.
262
+
263
+ Returns (xml_string, atom_id_map, fragment_xml_id).
264
+ atom_id_map maps caller's atom index → the XML element id used.
265
+ """
266
+ if atom_id_map is None:
267
+ atom_id_map = {}
268
+
269
+ frag_id = ids.next()
270
+
271
+ # Compute bounding box
272
+ xs = [a["x"] for a in atoms]
273
+ ys = [a["y"] for a in atoms]
274
+ bb_x1, bb_y1 = min(xs), min(ys)
275
+ bb_x2, bb_y2 = max(xs), max(ys)
276
+
277
+ lines: List[str] = []
278
+ lines.append(
279
+ f'<fragment id="{frag_id}" '
280
+ f'BoundingBox="{bb_x1:.2f} {bb_y1:.2f} {bb_x2:.2f} {bb_y2:.2f}" '
281
+ f'Z="{ids.next()}">'
282
+ )
283
+
284
+ # Atoms
285
+ for a in atoms:
286
+ atom_xml_id = ids.next()
287
+ atom_id_map[a["index"]] = atom_xml_id
288
+
289
+ ax, ay = a["x"], a["y"]
290
+ z = ids.next()
291
+
292
+ # ---- Abbreviation group (NodeType="Fragment") ----
293
+ if a.get("is_abbreviation"):
294
+ label = a.get("abbrev_label", "?")
295
+ label_smiles = a.get("abbrev_smiles")
296
+
297
+ lines.append(
298
+ f'<n id="{atom_xml_id}" NodeType="Fragment" '
299
+ f'p="{ax:.2f} {ay:.2f}" Z="{z}" AS="N">'
300
+ )
301
+
302
+ # Inner fragment from SMILES (optional — ChemDraw needs it)
303
+ if label_smiles:
304
+ inner_xml = _build_abbrev_inner_fragment(
305
+ label_smiles, ax, ay, ids)
306
+ if inner_xml:
307
+ lines.append(inner_xml)
308
+
309
+ # Label text
310
+ lx = ax - 3.25
311
+ ly = ay + 3.52
312
+ # Estimate bbox based on label length
313
+ label_w = max(len(label) * 5.5, 6.0)
314
+ lbx1 = ax - label_w / 2.0
315
+ lby1 = ay - 7.52
316
+ lbx2 = ax + label_w / 2.0
317
+ lby2 = ay
318
+ tid = ids.next()
319
+ lines.append(
320
+ f'<t id="{tid}" p="{lx:.2f} {ly:.2f}" '
321
+ f'BoundingBox="{lbx1:.2f} {lby1:.2f} {lbx2:.2f} {lby2:.2f}" '
322
+ f'LabelJustification="Left">'
323
+ )
324
+ lines.append(
325
+ f'<s font="{ACS_LABEL_FONT}" size="{ACS_LABEL_SIZE}" '
326
+ f'color="0" face="{ACS_LABEL_FACE}">'
327
+ f'{xml_escape(label)}</s>'
328
+ )
329
+ lines.append('</t>')
330
+ lines.append('</n>')
331
+ continue
332
+
333
+ # ---- Generic variable group (R, X, Ar, R1, …) ----
334
+ if a.get("is_generic"):
335
+ label = a.get("generic_label", "R")
336
+ node_type = a.get("node_type", "GenericNickname")
337
+
338
+ attrs = [
339
+ f'id="{atom_xml_id}"',
340
+ f'NodeType="{node_type}"',
341
+ f'p="{ax:.2f} {ay:.2f}"',
342
+ f'Z="{z}"',
343
+ f'AS="N"',
344
+ ]
345
+ if node_type == "GenericNickname":
346
+ attrs.append(f'GenericNickname="{xml_escape(label)}"')
347
+
348
+ lines.append(f'<n {" ".join(attrs)}>')
349
+
350
+ lx = ax - 3.25
351
+ ly = ay + 3.52
352
+ label_w = max(len(label) * 5.5, 6.0)
353
+ lbx1 = ax - label_w / 2.0
354
+ lby1 = ay - 7.52
355
+ lbx2 = ax + label_w / 2.0
356
+ lby2 = ay
357
+ tid = ids.next()
358
+ lines.append(
359
+ f'<t id="{tid}" p="{lx:.2f} {ly:.2f}" '
360
+ f'BoundingBox="{lbx1:.2f} {lby1:.2f} {lbx2:.2f} {lby2:.2f}" '
361
+ f'LabelJustification="Left">'
362
+ )
363
+ lines.append(
364
+ f'<s font="{ACS_LABEL_FONT}" size="{ACS_LABEL_SIZE}" '
365
+ f'color="0" face="{ACS_LABEL_FACE}">'
366
+ f'{xml_escape(label)}</s>'
367
+ )
368
+ lines.append('</t>')
369
+ lines.append('</n>')
370
+ continue
371
+
372
+ # ---- Normal atom ----
373
+ sym = a.get("symbol", "C")
374
+
375
+ # Base attributes
376
+ attrs = [f'id="{atom_xml_id}"', f'p="{ax:.2f} {ay:.2f}"', f'Z="{z}"']
377
+
378
+ is_carbon = (sym == "C")
379
+
380
+ # Charge
381
+ charge = a.get("charge", 0)
382
+
383
+ if not is_carbon:
384
+ el_num = ELEMENT_NUMBERS.get(sym, 0)
385
+ if el_num:
386
+ attrs.append(f'Element="{el_num}"')
387
+ nh = a.get("num_hydrogens", 0)
388
+ attrs.append(f'NumHydrogens="{nh}"')
389
+ attrs.append('NeedsClean="yes"')
390
+ attrs.append('AS="N"')
391
+
392
+ if charge:
393
+ attrs.append(f'Charge="{charge}"')
394
+
395
+ # Stereo cfg (atom)
396
+ cfg = a.get("cfg", 0)
397
+ if cfg:
398
+ attrs.append(f'Stereo="{cfg}"')
399
+
400
+ if is_carbon and not charge:
401
+ # Carbon: no Element, no label, no NumHydrogens
402
+ lines.append(f'<n {" ".join(attrs)}/>')
403
+ else:
404
+ # Heteroatom: needs <t> child label
405
+ # Label position: offset from atom centre
406
+ lx = ax - 3.25
407
+ ly = ay + 3.52
408
+ bbox = _label_bbox(ax, ay, sym)
409
+
410
+ # Build label text including hydrogens: "N" → "NH", "O" → "OH"
411
+ nh = a.get("num_hydrogens", 0)
412
+ if nh == 1:
413
+ label_text = xml_escape(sym) + "H"
414
+ elif nh > 1:
415
+ label_text = xml_escape(sym) + "H" + str(nh)
416
+ else:
417
+ label_text = xml_escape(sym)
418
+ label_align = ""
419
+ if sym in WIDE_SYMBOLS:
420
+ label_align = ' LabelAlignment="Left"'
421
+
422
+ lines.append(f'<n {" ".join(attrs)}>')
423
+ lines.append(
424
+ f'<t p="{lx:.2f} {ly:.2f}" BoundingBox="{bbox}" '
425
+ f'LabelJustification="Left">'
426
+ )
427
+ lines.append(
428
+ f'<s font="{ACS_LABEL_FONT}" size="{ACS_LABEL_SIZE}" '
429
+ f'color="0" face="{ACS_LABEL_FACE}">{label_text}</s>'
430
+ )
431
+ lines.append("</t>")
432
+ lines.append("</n>")
433
+
434
+ # Bonds
435
+ for b in bonds:
436
+ bond_xml_id = ids.next()
437
+ z = ids.next()
438
+ a1_xml = atom_id_map.get(b["atom1"], 0)
439
+ a2_xml = atom_id_map.get(b["atom2"], 0)
440
+ order = b.get("order", 1)
441
+ cfg = b.get("cfg", 0)
442
+
443
+ attrs = [
444
+ f'id="{bond_xml_id}"',
445
+ f'Z="{z}"',
446
+ f'B="{a1_xml}"',
447
+ f'E="{a2_xml}"',
448
+ ]
449
+
450
+ order_attr = BOND_ORDER_ATTR.get(order)
451
+ if order_attr:
452
+ attrs.append(f'Order="{order_attr}"')
453
+
454
+ double_pos = b.get("double_pos", "")
455
+ if double_pos:
456
+ attrs.append(f'DoublePosition="{double_pos}"')
457
+
458
+ if cfg and cfg in BOND_STEREO_ATTR:
459
+ attrs.append(f'Display="{BOND_STEREO_ATTR[cfg]}"')
460
+ elif order == 1:
461
+ # Default single bond gets BS="N" (normal, no stereo)
462
+ attrs.append('BS="N"')
463
+
464
+ lines.append(f'<b {" ".join(attrs)}/>')
465
+
466
+ lines.append("</fragment>")
467
+ return "\n".join(lines), atom_id_map, frag_id
468
+
469
+
470
+ # ---------------------------------------------------------------------------
471
+ # Conditions text builder
472
+ # ---------------------------------------------------------------------------
473
+
474
def _build_conditions_text(
    lines: List[str],
    x: float,
    y: float,
    ids: _IDGen,
    justification: str = "Center",
) -> Tuple[str, int]:
    """
    Build one standalone <t> element holding the reaction-condition lines.

    The lines are XML-escaped and joined with newlines inside a single
    ``<s>`` run.  The bounding box is an estimate: 5.8 per character of the
    longest line wide (centred on *x*) and 12.0 per line tall (ending at *y*).

    Returns (xml_string, text_xml_id).
    """
    text_id = ids.next()
    z_order = ids.next()

    # Size estimate; an empty list is treated as one character wide.
    longest = max(map(len, lines), default=1)
    width = longest * 5.8
    height = len(lines) * 12.0
    left, right = x - width / 2.0, x + width / 2.0
    top, bottom = y - height, y

    opening = (
        f'<t id="{text_id}" p="{x:.2f} {y:.2f}" '
        f'BoundingBox="{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}" '
        f'Z="{z_order}" '
        f'CaptionJustification="{justification}" '
        f'Justification="{justification}" '
        f'LineHeight="auto">'
    )
    body = "\n".join(map(xml_escape, lines))
    run = (
        f'<s font="{ACS_LABEL_FONT}" size="{ACS_CAPTION_SIZE}" '
        f'color="0" face="{ACS_CAPTION_FACE}">{body}</s>'
    )
    return "\n".join((opening, run, "</t>")), text_id
514
+
515
+
516
+ # ---------------------------------------------------------------------------
517
+ # Arrow builder
518
+ # ---------------------------------------------------------------------------
519
+
520
def _build_arrow(
    tail_x: float, tail_y: float,
    head_x: float, head_y: float,
    ids: _IDGen,
) -> Tuple[str, int]:
    """
    Build a self-closing <arrow> element running from tail to head.

    The bounding box covers the two end points, padded by 4.0 above and
    below.  Two ids are consumed: the element id, then its Z value.

    Returns (xml_string, arrow_xml_id).
    """
    arrow_id = ids.next()
    z_order = ids.next()

    left, right = min(tail_x, head_x), max(tail_x, head_x)
    top = min(tail_y, head_y) - 4.0
    bottom = max(tail_y, head_y) + 4.0

    # Auxiliary 3D geometry attributes: centre is the horizontal midpoint,
    # 100 below the tail; the axis ends are 80 away from it.
    centre_x = (tail_x + head_x) / 2.0
    centre_y = tail_y + 100.0

    attributes = [
        f'id="{arrow_id}"',
        f'BoundingBox="{left:.2f} {top:.2f} {right:.2f} {bottom:.2f}"',
        f'Z="{z_order}"',
        'FillType="None"',
        'ArrowheadHead="Full"',
        'ArrowheadType="Solid"',
        'HeadSize="1000"',
        'ArrowheadCenterSize="875"',
        'ArrowheadWidth="250"',
        f'Head3D="{head_x:.2f} {head_y:.2f} 0"',
        f'Tail3D="{tail_x:.2f} {tail_y:.2f} 0"',
        f'Center3D="{centre_x:.2f} {centre_y:.2f} 0"',
        f'MajorAxisEnd3D="{centre_x + 80:.2f} {centre_y:.2f} 0"',
        f'MinorAxisEnd3D="{centre_x:.2f} {centre_y + 80:.2f} 0"',
    ]
    return "<arrow " + " ".join(attributes) + "/>", arrow_id
560
+
561
+
562
+ # ---------------------------------------------------------------------------
563
+ # Page templates
564
+ # ---------------------------------------------------------------------------
565
+
566
+ _PAGE_OPEN = (
567
+ '<page id="{page_id}" BoundingBox="0 0 1620 2160" '
568
+ 'HeaderPosition="36" FooterPosition="36" '
569
+ 'PrintTrimMarks="yes" HeightPages="3" WidthPages="3">'
570
+ )
571
+ _PAGE_CLOSE = "</page>"
572
+
573
+
574
def _header(bbox: str) -> str:
    """Fill the shared CDXML header template with *bbox* and the ACS style constants."""
    style_fields = {
        "label_font": ACS_LABEL_FONT,
        "label_size": ACS_LABEL_SIZE,
        "label_face": ACS_LABEL_FACE,
        "caption_size": ACS_CAPTION_SIZE,
        "hash_spacing": ACS_HASH_SPACING,
        "margin_width": ACS_MARGIN_WIDTH,
        "line_width": ACS_LINE_WIDTH,
        "bold_width": ACS_BOLD_WIDTH,
        "bond_length": ACS_BOND_LENGTH,
        "bond_spacing": ACS_BOND_SPACING,
        "chain_angle": ACS_CHAIN_ANGLE,
    }
    return _CDXML_HEADER.format(bbox=bbox, **style_fields)
589
+
590
+
591
+ # ---------------------------------------------------------------------------
592
+ # Public API — single molecule
593
+ # ---------------------------------------------------------------------------
594
+
595
def build_molecule_cdxml(
    atoms: List[Dict],
    bonds: List[Dict],
    start_id: int = 1000,
) -> str:
    """
    Build a complete CDXML document that holds one molecule fragment.

    Parameters
    ----------
    atoms : list of atom dicts (each needs ``x`` and ``y``; coordinates are
            used as given)
    bonds : list of bond dicts
    start_id : first XML element id to use

    Returns
    -------
    CDXML document as a string: header, page open tag, the fragment,
    page close tag and footer, joined with newlines.
    """
    ids = _IDGen(start_id)

    # The fragment is built first so that it receives the lowest ids.
    fragment_xml, _, _ = _build_fragment(atoms, bonds, ids, {})

    # Document bounding box = extent of the atom positions.
    x_values = [atom["x"] for atom in atoms]
    y_values = [atom["y"] for atom in atoms]
    bbox = (
        f"{min(x_values):.2f} {min(y_values):.2f} "
        f"{max(x_values):.2f} {max(y_values):.2f}"
    )

    page_open = _PAGE_OPEN.format(page_id=ids.next())

    return "\n".join(
        (_header(bbox), page_open, fragment_xml, _PAGE_CLOSE, _CDXML_FOOTER)
    )
633
+
634
+
635
+ # ---------------------------------------------------------------------------
636
+ # Public API — reaction scheme
637
+ # ---------------------------------------------------------------------------
638
+
639
def build_reaction_cdxml(
    reactants: List[Dict],
    products: List[Dict],
    conditions: Optional[Dict] = None,
    arrow_y: Optional[float] = None,
    arrow_tail_x: Optional[float] = None,
    arrow_head_x: Optional[float] = None,
    start_id: int = 1000,
) -> str:
    """
    Build a CDXML document for a one-step reaction scheme.

    Every entry of *reactants* / *products* is a molecule dict::

        {
            "atoms": [...],
            "bonds": [...],
        }

    *conditions* may carry two lists of text lines::

        {
            "above": ["Pd2dba3 (5 mol%)", "BINAP (10 mol%)"],
            "below": ["Cs2CO3 (2 eq.)", "dioxane", "100 °C, 24 h"]
        }

    Arrow geometry that is not supplied is derived from the atom positions:
    the shaft sits at the vertical midpoint of all atoms, and runs between
    the right-most reactant atom and the left-most product atom, inset by
    ``max(10.0, 15 % of that gap)`` on each side.

    Parameters
    ----------
    reactants : list of molecule dicts
    products  : list of molecule dicts
    conditions: dict with optional "above" and "below" lists of strings
    arrow_y   : y-coordinate of arrow shaft (auto if None)
    arrow_tail_x, arrow_head_x : x-coords of arrow ends (auto if None)
    start_id  : first XML element id

    Returns
    -------
    CDXML document string

    Raises
    ------
    ValueError
        If no molecule contributes any atom.
    """
    conditions = {} if conditions is None else conditions
    ids = _IDGen(start_id)

    # ---- Fragments (reactants first, then products) ----
    fragment_xml: List[str] = []
    reactant_frag_ids: List[int] = []
    product_frag_ids: List[int] = []
    reactant_xs: List[float] = []
    product_xs: List[float] = []
    all_ys: List[float] = []

    sides = (
        (reactants, reactant_frag_ids, reactant_xs),
        (products, product_frag_ids, product_xs),
    )
    for molecules, frag_ids, side_xs in sides:
        for mol in molecules:
            mol_atoms = mol.get("atoms", [])
            xml, _, frag_id = _build_fragment(
                mol_atoms, mol.get("bonds", []), ids, {}
            )
            fragment_xml.append(xml)
            frag_ids.append(frag_id)
            for atom in mol_atoms:
                side_xs.append(atom["x"])
                all_ys.append(atom["y"])

    all_xs = reactant_xs + product_xs
    if not all_xs:
        raise ValueError("No atoms found in reactants or products")

    # ---- Arrow placement ----
    if arrow_y is None:
        arrow_y = (min(all_ys) + max(all_ys)) / 2.0

    # Fallback x positions apply when one side has no atoms at all.
    reactant_right = max(reactant_xs) if reactant_xs else 100.0
    product_left = min(product_xs) if product_xs else 300.0
    inset = max(10.0, (product_left - reactant_right) * 0.15)

    if arrow_tail_x is None:
        arrow_tail_x = reactant_right + inset
    if arrow_head_x is None:
        arrow_head_x = product_left - inset

    # ---- Condition captions (ids are taken: above, then below) ----
    arrow_mid_x = (arrow_tail_x + arrow_head_x) / 2.0
    above_lines = conditions.get("above", [])
    below_lines = conditions.get("below", [])

    above_ids: List[int] = []
    below_ids: List[int] = []
    above_xml: List[str] = []
    below_xml: List[str] = []

    if above_lines:
        caption, caption_id = _build_conditions_text(
            above_lines, x=arrow_mid_x, y=arrow_y - 8.0, ids=ids,
        )
        above_xml.append(caption)
        above_ids.append(caption_id)

    if below_lines:
        caption, caption_id = _build_conditions_text(
            below_lines, x=arrow_mid_x, y=arrow_y + 20.0, ids=ids,
        )
        below_xml.append(caption)
        below_ids.append(caption_id)

    # ---- Arrow (horizontal: tail and head share arrow_y) ----
    arrow_xml, arrow_id = _build_arrow(
        tail_x=arrow_tail_x, tail_y=arrow_y,
        head_x=arrow_head_x, head_y=arrow_y,
        ids=ids,
    )

    # ---- Scheme / step ----
    scheme_id = ids.next()
    step_id = ids.next()

    def id_list(values: List[int]) -> str:
        # Space-separated id list, as written into the step attributes.
        return " ".join(map(str, values))

    step_attrs = [
        f'id="{step_id}"',
        f'ReactionStepReactants="{id_list(reactant_frag_ids)}"',
        f'ReactionStepProducts="{id_list(product_frag_ids)}"',
        f'ReactionStepArrows="{arrow_id}"',
    ]
    if above_ids:
        step_attrs.append(
            f'ReactionStepObjectsAboveArrow="{id_list(above_ids)}"')
    if below_ids:
        step_attrs.append(
            f'ReactionStepObjectsBelowArrow="{id_list(below_ids)}"')

    scheme_xml = (
        f'<scheme id="{scheme_id}">'
        f'<step {" ".join(step_attrs)}/>'
        f'</scheme>'
    )

    # ---- Document bounding box: atom extent padded on every side ----
    pad = 20.0
    doc_bbox = (
        f"{min(all_xs) - pad:.2f} {min(all_ys) - pad:.2f} "
        f"{max(all_xs) + pad:.2f} {max(all_ys) + pad:.2f}"
    )

    page_id = ids.next()

    # ---- Assemble document ----
    sections = [_header(doc_bbox), _PAGE_OPEN.format(page_id=page_id)]
    sections.extend(fragment_xml)
    sections.extend(above_xml)
    sections.extend(below_xml)
    sections.extend((arrow_xml, scheme_xml, _PAGE_CLOSE, _CDXML_FOOTER))
    return "\n".join(sections)
835
+
836
+
837
+ # ---------------------------------------------------------------------------
838
+ # Helpers for loading from JSON
839
+ # ---------------------------------------------------------------------------
840
+
841
+ def _load_json(path: str) -> Dict:
842
+ if path == "-":
843
+ return json.load(sys.stdin)
844
+ with open(path, encoding="utf-8") as fh:
845
+ return json.load(fh)
846
+
847
+
848
+ # ---------------------------------------------------------------------------
849
+ # CLI
850
+ # ---------------------------------------------------------------------------
851
+
852
+ def _build_arg_parser() -> argparse.ArgumentParser:
853
+ p = argparse.ArgumentParser(
854
+ description="Build CDXML from structured atom/bond JSON (ACS Document 1996 style).",
855
+ formatter_class=argparse.RawDescriptionHelpFormatter,
856
+ epilog=__doc__,
857
+ )
858
+ p.add_argument(
859
+ "--input", "-i",
860
+ default="-",
861
+ help="Input JSON file (default: stdin)",
862
+ )
863
+ p.add_argument(
864
+ "--output", "-o",
865
+ default="-",
866
+ help="Output CDXML file (default: stdout)",
867
+ )
868
+ p.add_argument(
869
+ "--mode", "-m",
870
+ choices=["molecule", "reaction"],
871
+ default="molecule",
872
+ help="Output mode: 'molecule' (single fragment) or 'reaction' (scheme with arrow)",
873
+ )
874
+ p.add_argument(
875
+ "--start-id",
876
+ type=int,
877
+ default=1000,
878
+ help="First XML element id to use (default: 1000)",
879
+ )
880
+ return p
881
+
882
+
883
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point.

    Loads the input JSON, builds either a molecule or a reaction document
    according to ``--mode``, and writes it to stdout or to the ``--output``
    file.  Returns 0 on success and 1 when the input lacks the required
    atoms (molecule mode) or reactants/products (reaction mode).
    """
    args = _build_arg_parser().parse_args(argv)
    data = _load_json(args.input)

    if args.mode == "molecule":
        atoms = data.get("atoms", [])
        bonds = data.get("bonds", [])
        if not atoms:
            print("ERROR: no atoms in input", file=sys.stderr)
            return 1
        cdxml = build_molecule_cdxml(atoms, bonds, start_id=args.start_id)
    else:
        # The parser restricts --mode to two choices, so this is reaction mode.
        reactants = data.get("reactants", [])
        products = data.get("products", [])
        conditions = data.get("conditions", {})
        if not (reactants and products):
            print("ERROR: reaction mode requires 'reactants' and 'products'", file=sys.stderr)
            return 1
        cdxml = build_reaction_cdxml(
            reactants, products, conditions,
            start_id=args.start_id,
        )

    if args.output != "-":
        with open(args.output, "w", encoding="utf-8") as out_file:
            out_file.write(cdxml)
        # Status goes to stderr so it never mixes with document output.
        print(f"Written to {args.output}", file=sys.stderr)
        return 0

    print(cdxml)
    return 0
917
+
918
+
919
# Script entry: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())