cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,808 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ole_embedder.py - Embed CDXML files as editable ChemDraw OLE objects in PPTX/DOCX.
4
+
5
+ Takes one or more CDXML files and creates a PowerPoint or Word document with
6
+ each structure embedded as a double-clickable, editable ChemDraw OLE object.
7
+
8
+ Uses binary OLE construction with CDXML BoundingBox sizing for correct
9
+ display dimensions. Requires ChemDraw COM for CDX conversion and EMF preview.
10
+
11
+ Usage:
12
+ # Single structure
13
+ python ole_embedder.py scheme.cdxml --pptx -o report.pptx
14
+ python ole_embedder.py scheme.cdxml --docx -o report.docx
15
+
16
+ # Multiple structures (one per slide / one per paragraph)
17
+ python ole_embedder.py s1.cdxml s2.cdxml s3.cdxml --pptx -o report.pptx
18
+
19
+ # With margin adjustment (points)
20
+ python ole_embedder.py scheme.cdxml --pptx --margin 2.0 -o report.pptx
21
+
22
+ Requirements:
23
+ - ChemDraw 16+ (COM automation for CDXML -> CDX + EMF)
24
+ - python-pptx (PPTX scaffolding)
25
+ - python-docx (DOCX scaffolding)
26
+ - lxml
27
+ """
28
+
29
+ import argparse
30
+ import io
31
+ import json
32
+ import os
33
+ import struct
34
+ import sys
35
+ import tempfile
36
+ import xml.etree.ElementTree as ET
37
+ import zipfile
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Constants
42
+ # ---------------------------------------------------------------------------
43
+
44
# ChemDraw OLE class identifier.
# UUID: 41BA6D21-A02E-11CE-8FD9-0020AFD1F20C
# Stored in the mixed-endian GUID layout: the first three fields are
# byte-swapped (little-endian), the trailing eight bytes are kept as written.
CHEMDRAW_CLSID = bytes.fromhex(
    "216dba41"        # 41BA6D21, little-endian
    "2ea0"            # A02E, little-endian
    "ce11"            # 11CE, little-endian
    "8fd9"            # 8FD9, as written
    "0020afd1f20c"    # 0020AFD1F20C, as written
)

# Compound-file geometry and the reserved sector markers used by the builder.
SECTOR_SIZE = 512
MINI_SECTOR_SIZE = 64
MINI_STREAM_CUTOFF = 4096
ENDOFCHAIN = 0xFFFFFFFE
FREESECT = 0xFFFFFFFF
NOSTREAM = 0xFFFFFFFF
FATSECT = 0xFFFFFFFD

# Stream payloads copied from a known-good ChemDraw OLE object (via Office COM).
# The CompObj payload is a 12-byte prefix, the class id, three length-prefixed
# NUL-terminated ASCII strings, and a 16-byte tail.
COMPOBJ_BYTES = b"".join((
    bytes.fromhex("0100feff 030a0000 ffffffff"),
    bytes.fromhex("216dba41 2ea0 ce11 8fd9 0020afd1f20c"),
    b"\x14\x00\x00\x00" b"CS ChemDraw Drawing\x00",
    b"\x1c\x00\x00\x00" b"ChemDraw Interchange Format\x00",
    b"\x16\x00\x00\x00" b"ChemDraw.Document.6.0\x00",
    bytes.fromhex("f439b271") + bytes(12),
))  # 126 bytes

OLE_BYTES = bytes.fromhex(
    "01000002 00000000 00000000 00000000 00000000"
)  # 20 bytes

OLEPRES000_BYTES = bytes.fromhex(
    "ffffffff 03000000 04000000 01000000"
    "ffffffff 02000000 00000000 00000000"
    "00000000 00000000 4e414e49 00000000"
)  # 48 bytes
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # ChemDraw COM — batch conversion
85
+ # ---------------------------------------------------------------------------
86
+
87
def batch_convert(cdxml_paths):
    """
    Open ChemDraw once and convert all CDXML files to CDX + EMF.

    Each input is opened through ChemDraw COM automation and saved twice into
    a private temporary directory (once with a ``.cdx`` name, once with an
    ``.emf`` name); the saved bytes are read back and the directory is removed.

    Args:
        cdxml_paths: iterable of CDXML file paths.

    Returns:
        list of dicts, one per input, in input order: [{
            'path': str,      # the path exactly as passed in
            'name': str,      # file name without directory or extension
            'cdx_data': bytes,
            'emf_data': bytes,
        }, ...]
        Display dimensions are NOT part of the result; they are computed
        separately from the CDXML file.

    Raises:
        ImportError: if ``win32com`` is not installed.
        Exception: any COM or file error from opening/saving a document is
            propagated after the document and application are released.
    """
    import win32com.client as win32

    results = []
    app = None
    try:
        app = win32.Dispatch("ChemDraw.Application")
        app.Visible = False

        for cdxml_path in cdxml_paths:
            cdxml_abs = os.path.abspath(cdxml_path)
            name = os.path.splitext(os.path.basename(cdxml_path))[0]

            # A private directory replaces tempfile.mktemp(): the names cannot
            # be guessed or pre-created by another process, and the files do
            # not exist before ChemDraw writes them.
            with tempfile.TemporaryDirectory(prefix="ole_embedder_") as tmp_dir:
                tmp_cdx = os.path.abspath(os.path.join(tmp_dir, "structure.cdx"))
                tmp_emf = os.path.abspath(os.path.join(tmp_dir, "structure.emf"))

                doc = None
                try:
                    doc = app.Documents.Open(cdxml_abs)

                    # NOTE(review): the output format is presumably selected by
                    # ChemDraw from the file extension - confirm.
                    doc.SaveAs(tmp_cdx)
                    with open(tmp_cdx, "rb") as f:
                        cdx_data = f.read()

                    doc.SaveAs(tmp_emf)
                    with open(tmp_emf, "rb") as f:
                        emf_data = f.read()

                    results.append({
                        'path': cdxml_path,
                        'name': name,
                        'cdx_data': cdx_data,
                        'emf_data': emf_data,
                    })
                finally:
                    # Best effort: a failure to close must not mask the real
                    # error (or a successful conversion).
                    try:
                        if doc is not None:
                            doc.Close(False)
                    except Exception:
                        pass

    finally:
        # Best effort shutdown of the ChemDraw instance started above.
        try:
            if app is not None:
                app.Quit()
        except Exception:
            pass

    return results
148
+
149
+
150
+ # ---------------------------------------------------------------------------
151
+ # CDXML dimension parsing
152
+ # ---------------------------------------------------------------------------
153
+
154
def _parse_bounding_box(value):
    """Return (x1, y1, x2, y2) as floats from an 'x1 y1 x2 y2' string, or None if absent/malformed."""
    if not value:
        return None
    parts = value.split()
    if len(parts) != 4:
        return None
    try:
        return tuple(float(part) for part in parts)
    except ValueError:
        return None


def get_cdxml_content_size(cdxml_path, margin_pt=0.0, scale=1.02):
    """
    Compute OLE display dimensions from CDXML content BoundingBox.

    Sources are tried in order:
      1. the root element's ``BoundingBox`` attribute,
      2. the union of every descendant ``BoundingBox`` (``page`` and
         ``CDXML`` elements excluded),
      3. the first ``page`` element carrying numeric ``Width``/``Height``
         (returned as-is: no margin, no scale),
      4. a fixed 5080000 x 2540000 EMU default.

    A missing or malformed attribute at one level falls through to the next
    level instead of raising.

    Args:
        cdxml_path: path (or file object) accepted by ``ET.parse``.
        margin_pt: margin in points added on every side for sources 1 and 2.
        scale: factor applied to sources 1 and 2 (default 1.02 = +2%) to
            compensate for the slight difference between the CDXML
            BoundingBox and ChemDraw's actual OLE extent.

    Returns:
        (width_emu, height_emu) as ints; 1 pt = 12700 EMU.
    """
    root = ET.parse(cdxml_path).getroot()

    w_pt = h_pt = None

    # Try root BoundingBox first (tightest content box)
    root_box = _parse_bounding_box(root.get('BoundingBox'))
    if root_box is not None:
        x1, y1, x2, y2 = root_box
        w_pt = (x2 - x1) + 2 * margin_pt
        h_pt = (y2 - y1) + 2 * margin_pt

    # Fallback: compute from all child BoundingBox attributes
    if w_pt is None:
        min_x, min_y = float("inf"), float("inf")
        max_x, max_y = float("-inf"), float("-inf")
        for elem in root.iter():
            if elem.tag in ("page", "CDXML"):
                continue
            child_box = _parse_bounding_box(elem.get("BoundingBox"))
            if child_box is None:
                continue
            cx1, cy1, cx2, cy2 = child_box
            min_x = min(min_x, cx1)
            min_y = min(min_y, cy1)
            max_x = max(max_x, cx2)
            max_y = max(max_y, cy2)

        if min_x < float("inf"):
            w_pt = (max_x - min_x) + 2 * margin_pt
            h_pt = (max_y - min_y) + 2 * margin_pt

    # Last resort: use page dimensions (scale not applied)
    if w_pt is None:
        for page in root.iter('page'):
            pw = page.get('Width')
            ph = page.get('Height')
            if pw and ph:
                try:
                    return int(float(pw) * 12700), int(float(ph) * 12700)
                except ValueError:
                    # Non-numeric page size: try the next page, then the default.
                    continue
        return 5080000, 2540000  # ~4" x 2" fallback

    # Apply scale factor
    w_pt *= scale
    h_pt *= scale

    return int(w_pt * 12700), int(h_pt * 12700)
215
+
216
+
217
+ # ---------------------------------------------------------------------------
218
+ # CFB (Compound File Binary) builder
219
+ # ---------------------------------------------------------------------------
220
+
221
def _make_dir_entry(name, entry_type, color, left_id, right_id, child_id,
                    clsid, start_sector, size):
    """Build a 128-byte CFB directory entry.

    Args:
        name: entry name; stored as NUL-terminated UTF-16-LE in the first
            64 bytes, so at most 31 UTF-16 code units.
        entry_type: object type byte (2=stream, 5=root).
        color: red-black tree color byte (0=red, 1=black).
        left_id, right_id, child_id: sibling/child directory indices
            (NOSTREAM when absent).
        clsid: 16-byte class id, or a falsy value to leave the field zeroed.
        start_sector: first sector (or mini-sector) of the entry's data.
        size: stream size in bytes; only the low 32 bits are stored.

    Returns:
        The entry as an immutable 128-byte ``bytes`` object.

    Raises:
        ValueError: if the encoded name does not fit in 64 bytes or the
            CLSID is not exactly 16 bytes. Either would otherwise overwrite
            neighbouring fields or change the entry length silently.
    """
    name_utf16 = name.encode("utf-16-le") + b"\x00\x00"
    name_len = len(name_utf16)
    if name_len > 64:
        raise ValueError(
            f"Directory entry name too long ({name_len} bytes encoded, max 64): {name!r}")
    if clsid and len(clsid) != 16:
        raise ValueError(f"CLSID must be 16 bytes, got {len(clsid)}")

    entry = bytearray(128)
    entry[0:name_len] = name_utf16

    # Name length in bytes, terminator included.
    struct.pack_into("<H", entry, 64, name_len)
    entry[66] = entry_type  # 2=stream, 5=root
    entry[67] = color       # 0=red, 1=black

    struct.pack_into("<I", entry, 68, left_id)
    struct.pack_into("<I", entry, 72, right_id)
    struct.pack_into("<I", entry, 76, child_id)

    if clsid:
        entry[80:96] = clsid

    struct.pack_into("<I", entry, 116, start_sector)
    struct.pack_into("<I", entry, 120, size & 0xFFFFFFFF)

    return bytes(entry)
245
+
246
+
247
def build_ole_compound_file(cdx_data):
    """
    Build a CFB file matching the known-good layout from Office COM.

    Layout for CDX data of MINI_STREAM_CUTOFF bytes or more (byte-for-byte
    the layout this function has always produced):
        Header (512 bytes)
        Sector 0-1: Free
        Sector 2:   Directory sector 1 (entries 0-3)
        Sector 3:   FAT sector
        Sector 4:   Directory sector 2 (entry 4 + 3 empty)
        Sector 5:   Mini-FAT sector
        Sector 6:   Mini-stream container (256 bytes used)
        Sectors 7+: CONTENTS data (CDX binary)

    CDX data smaller than MINI_STREAM_CUTOFF is stored in the mini stream
    instead (mini-sectors 4+), because the header written here declares that
    cutoff and readers therefore resolve the start sector of any smaller
    stream through the mini-FAT. In that case the mini-stream container
    grows to as many regular sectors as needed, starting at sector 6, and no
    separate CONTENTS sectors are written.

    Args:
        cdx_data: CDX binary to store in the CONTENTS stream.

    Returns:
        The complete compound file as bytes.

    Raises:
        ValueError: if the data needs more than the 128 sectors that a
            single FAT sector can describe.
    """
    cdx_len = len(cdx_data)
    in_mini_stream = cdx_len < MINI_STREAM_CUTOFF

    # Mini-sectors always occupied: OlePres000 (1) + Ole (1) + CompObj (2).
    fixed_mini_sectors = 4

    if in_mini_stream:
        cdx_mini_sectors = (cdx_len + MINI_SECTOR_SIZE - 1) // MINI_SECTOR_SIZE
        cdx_sectors = 0
    else:
        cdx_mini_sectors = 0
        cdx_sectors = (cdx_len + SECTOR_SIZE - 1) // SECTOR_SIZE

    mini_stream_size = (fixed_mini_sectors + cdx_mini_sectors) * MINI_SECTOR_SIZE
    container_start = 6
    container_sectors = (mini_stream_size + SECTOR_SIZE - 1) // SECTOR_SIZE
    cdx_start = container_start + container_sectors  # 7 in the large-data layout
    total_sectors = cdx_start + cdx_sectors

    if total_sectors > 128:
        raise ValueError(f"CDX data too large ({len(cdx_data)} bytes) for single-FAT CFB")

    # --- FAT ---
    fat = [FREESECT] * 128
    fat[2] = 4            # dir sector 1 -> sector 4
    fat[3] = FATSECT
    fat[4] = ENDOFCHAIN   # dir sector 2 end
    fat[5] = ENDOFCHAIN   # mini-FAT

    for i in range(container_sectors):   # mini-stream container chain
        sec = container_start + i
        fat[sec] = sec + 1 if i < container_sectors - 1 else ENDOFCHAIN

    for i in range(cdx_sectors):         # CONTENTS chain (large data only)
        sec = cdx_start + i
        fat[sec] = sec + 1 if i < cdx_sectors - 1 else ENDOFCHAIN

    # --- Mini-FAT ---
    mini_fat = [FREESECT] * 128
    mini_fat[0] = ENDOFCHAIN   # OlePres000
    mini_fat[1] = ENDOFCHAIN   # Ole
    mini_fat[2] = 3            # CompObj -> 3
    mini_fat[3] = ENDOFCHAIN   # CompObj end

    for i in range(cdx_mini_sectors):    # CONTENTS chain (small data only)
        msec = fixed_mini_sectors + i
        mini_fat[msec] = msec + 1 if i < cdx_mini_sectors - 1 else ENDOFCHAIN

    # --- Mini-stream container, already padded to whole sectors ---
    mini_stream = bytearray(container_sectors * SECTOR_SIZE)
    mini_stream[0:len(OLEPRES000_BYTES)] = OLEPRES000_BYTES        # mini-sector 0
    mini_stream[64:64 + len(OLE_BYTES)] = OLE_BYTES                # mini-sector 1
    mini_stream[128:128 + len(COMPOBJ_BYTES)] = COMPOBJ_BYTES      # mini-sectors 2-3
    if in_mini_stream and cdx_len:
        offset = fixed_mini_sectors * MINI_SECTOR_SIZE
        mini_stream[offset:offset + cdx_len] = cdx_data            # mini-sectors 4+

    # --- Directory entries ---
    if cdx_len == 0:
        contents_start = ENDOFCHAIN      # empty stream: no chain to point at
    elif in_mini_stream:
        contents_start = fixed_mini_sectors
    else:
        contents_start = cdx_start

    e0 = _make_dir_entry("Root Entry", 5, 0,
                         NOSTREAM, NOSTREAM, 1,
                         CHEMDRAW_CLSID, container_start, mini_stream_size)
    e1 = _make_dir_entry("\x01CompObj", 2, 1,
                         3, 2, NOSTREAM,
                         None, 2, len(COMPOBJ_BYTES))
    e2 = _make_dir_entry("CONTENTS", 2, 1,
                         NOSTREAM, 4, NOSTREAM,
                         None, contents_start, cdx_len)
    e3 = _make_dir_entry("\x01Ole", 2, 1,
                         NOSTREAM, NOSTREAM, NOSTREAM,
                         None, 1, len(OLE_BYTES))
    e4 = _make_dir_entry("\x02OlePres000", 2, 1,
                         NOSTREAM, NOSTREAM, NOSTREAM,
                         None, 0, len(OLEPRES000_BYTES))

    dir_sector_1 = e0 + e1 + e2 + e3
    dir_sector_2 = e4 + (b"\x00" * 128 * 3)

    # --- CFB header ---
    header = bytearray(SECTOR_SIZE)
    header[0:8] = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"   # signature
    struct.pack_into("<H", header, 24, 0x003E)            # minor version
    struct.pack_into("<H", header, 26, 0x0003)            # major version
    struct.pack_into("<H", header, 28, 0xFFFE)            # byte order
    struct.pack_into("<H", header, 30, 9)                 # sector shift (512)
    struct.pack_into("<H", header, 32, 6)                 # mini-sector shift (64)
    struct.pack_into("<I", header, 40, 0)                 # directory sector count
    struct.pack_into("<I", header, 44, 1)                 # FAT sector count
    struct.pack_into("<I", header, 48, 2)                 # first directory sector
    struct.pack_into("<I", header, 52, 0)                 # transaction signature
    struct.pack_into("<I", header, 56, MINI_STREAM_CUTOFF)
    struct.pack_into("<I", header, 60, 5)                 # first mini-FAT sector
    struct.pack_into("<I", header, 64, 1)                 # mini-FAT sector count
    struct.pack_into("<I", header, 68, ENDOFCHAIN)        # first DIFAT sector
    struct.pack_into("<I", header, 72, 0)                 # DIFAT sector count
    struct.pack_into("<I", header, 76, 3)                 # DIFAT[0] = FAT sector
    for i in range(1, 109):
        struct.pack_into("<I", header, 76 + i * 4, FREESECT)

    # --- Assemble ---
    out = io.BytesIO()
    out.write(bytes(header))
    out.write(b"\x00" * SECTOR_SIZE)                             # sector 0
    out.write(b"\x00" * SECTOR_SIZE)                             # sector 1
    out.write(dir_sector_1)                                      # sector 2
    out.write(b"".join(struct.pack("<I", x) for x in fat))       # sector 3
    out.write(dir_sector_2)                                      # sector 4
    out.write(b"".join(struct.pack("<I", x) for x in mini_fat))  # sector 5
    out.write(bytes(mini_stream))                                # sectors 6+
    if cdx_sectors:
        padding = cdx_sectors * SECTOR_SIZE - cdx_len
        out.write(cdx_data + b"\x00" * padding)                  # CONTENTS sectors

    return out.getvalue()
351
+
352
+
353
+ # ---------------------------------------------------------------------------
354
+ # OOXML helpers (shared)
355
+ # ---------------------------------------------------------------------------
356
+
357
def _ensure_content_types(ct_xml):
    """Return ``[Content_Types].xml`` with Default entries for .bin and .emf.

    Args:
        ct_xml: serialized content-types part (bytes or str accepted by lxml).

    Returns:
        The re-serialized part as UTF-8 bytes with an XML declaration. A
        ``Default`` element is appended for each of the ``bin`` and ``emf``
        extensions that is not already declared (extension match is
        case-insensitive).
    """
    from lxml import etree

    ct_ns = "http://schemas.openxmlformats.org/package/2006/content-types"
    default_tag = f"{{{ct_ns}}}Default"

    types_root = etree.fromstring(ct_xml)
    declared = {
        node.get("Extension", "").lower()
        for node in types_root.findall(default_tag)
    }

    # (extension, content type) pairs, appended in this order when missing.
    required = (
        ("bin", "application/vnd.openxmlformats-officedocument.oleObject"),
        ("emf", "image/x-emf"),
    )
    for extension, content_type in required:
        if extension in declared:
            continue
        etree.SubElement(types_root, default_tag, attrib={
            "Extension": extension,
            "ContentType": content_type,
        })

    return etree.tostring(
        types_root, xml_declaration=True, encoding="UTF-8", standalone=True)
380
+
381
+
382
+ # ---------------------------------------------------------------------------
383
+ # PPTX builder — one OLE object per slide
384
+ # ---------------------------------------------------------------------------
385
+
386
def build_pptx(items, output_path):
    """
    Create a PPTX with one editable ChemDraw OLE object per slide.

    A scaffold presentation with one blank slide per item is written with
    python-pptx into a private temporary directory, then re-packed into
    ``output_path`` with the OLE objects injected. The temporary directory is
    removed even if saving or injection fails.

    Args:
        items: list of dicts with 'ole_data', 'emf_data', 'width_emu',
            'height_emu', 'name'.
        output_path: destination .pptx path.
    """
    from pptx import Presentation

    prs = Presentation()

    # Prefer the layout named "Blank"; otherwise fall back to index 6.
    blank_layout = next(
        (layout for layout in prs.slide_layouts if layout.name == "Blank"),
        None)
    if blank_layout is None:
        blank_layout = prs.slide_layouts[6]

    # Create one slide per item
    for _ in items:
        prs.slides.add_slide(blank_layout)

    # A private directory replaces tempfile.mktemp(), whose returned name can
    # be claimed by another process before the file is created.
    with tempfile.TemporaryDirectory(prefix="ole_embedder_") as tmp_dir:
        tmp_pptx = os.path.join(tmp_dir, "scaffold.pptx")
        prs.save(tmp_pptx)
        _inject_pptx(tmp_pptx, output_path, items)
417
+
418
+
419
def _inject_pptx(source_pptx, output_path, items):
    """Inject OLE objects into each slide of a PPTX.

    Every entry of ``source_pptx`` is copied to ``output_path``. For item
    number k (1-based), ``ppt/slides/slide{k}.xml`` and its ``.rels`` part are
    rewritten to reference the OLE object, ``[Content_Types].xml`` gains the
    required defaults, and ``ppt/embeddings/oleObject{k}.bin`` plus
    ``ppt/media/olePreview{k}.emf`` are added.

    Args:
        source_pptx: path of the scaffold presentation to read.
        output_path: path of the presentation to write.
        items: list of dicts with at least 'ole_data' and 'emf_data' (the
            slide rewrite additionally reads the size keys).
    """
    # Map part names to the item they belong to, so each archive entry is
    # resolved with one lookup instead of a scan over all items.
    slide_parts = {}
    rels_parts = {}
    for ole_idx, item in enumerate(items, start=1):
        slide_parts[f"ppt/slides/slide{ole_idx}.xml"] = (item, ole_idx)
        rels_parts[f"ppt/slides/_rels/slide{ole_idx}.xml.rels"] = ole_idx

    with zipfile.ZipFile(source_pptx, "r") as zin, \
            zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zout:
        # Copy existing entries, modifying as needed
        for entry in zin.namelist():
            data = zin.read(entry)

            if entry in slide_parts:
                item, ole_idx = slide_parts[entry]
                data = _build_slide_xml(data, item, ole_idx)
            elif entry in rels_parts:
                data = _add_slide_rels(data, rels_parts[entry])
            elif entry == "[Content_Types].xml":
                data = _ensure_content_types(data)

            zout.writestr(entry, data)

        # Add OLE + EMF files
        for ole_idx, item in enumerate(items, start=1):
            zout.writestr(
                f"ppt/embeddings/oleObject{ole_idx}.bin",
                item['ole_data'])
            zout.writestr(
                f"ppt/media/olePreview{ole_idx}.emf",
                item['emf_data'])
455
+
456
+
457
def _build_slide_xml(slide_xml, item, ole_idx):
    """Add OLE mc:AlternateContent to a slide.

    Args:
        slide_xml: serialized slide part to extend.
        item: dict providing 'width_emu' and 'height_emu' for the object.
        ole_idx: 1-based object number; used for the relationship ids
            ("rIdOle{n}", "rIdOleImg{n}"), the shape names and the shape ids.

    Returns:
        The slide part re-serialized as UTF-8 bytes with an XML declaration.

    Raises:
        RuntimeError: if the slide has no p:spTree element.
    """
    from lxml import etree

    root = etree.fromstring(slide_xml)

    p_ns = "http://schemas.openxmlformats.org/presentationml/2006/main"
    mc_ns = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
    r_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    v_ns = "urn:schemas-microsoft-com:vml"

    sp_tree = root.find(f".//{{{p_ns}}}spTree")
    if sp_tree is None:
        raise RuntimeError("Could not find spTree in slide XML")

    w = item['width_emu']
    h = item['height_emu']

    # Center on slide (standard slide: 9144000 x 6858000 EMU)
    # Objects larger than the slide are pinned to the top-left corner (offset 0).
    left = max(0, (9144000 - w) // 2)
    top = max(0, (6858000 - h) // 2)

    ole_rel = f"rIdOle{ole_idx}"
    img_rel = f"rIdOleImg{ole_idx}"
    # Ten ids are reserved per object; base_id, +1 and +2 are used below.
    base_id = 100 + (ole_idx - 1) * 10

    # mc:Choice carries the bare OLE graphic frame; mc:Fallback repeats the
    # frame and adds a p:pic that references the preview image relationship.
    alt_xml = f"""<mc:AlternateContent
xmlns:mc="{mc_ns}" xmlns:p="{p_ns}"
xmlns:a="{a_ns}" xmlns:r="{r_ns}" xmlns:v="{v_ns}">
<mc:Choice Requires="v">
<p:graphicFrame>
<p:nvGraphicFramePr>
<p:cNvPr id="{base_id}" name="Object {ole_idx}"/>
<p:cNvGraphicFramePr>
<a:graphicFrameLocks noChangeAspect="1"/>
</p:cNvGraphicFramePr>
<p:nvPr/>
</p:nvGraphicFramePr>
<p:xfrm>
<a:off x="{left}" y="{top}"/>
<a:ext cx="{w}" cy="{h}"/>
</p:xfrm>
<a:graphic>
<a:graphicData uri="http://schemas.openxmlformats.org/presentationml/2006/ole">
<p:oleObj name="CS ChemDraw Drawing"
r:id="{ole_rel}"
imgW="{w}" imgH="{h}"
progId="ChemDraw.Document.6.0">
<p:embed/>
</p:oleObj>
</a:graphicData>
</a:graphic>
</p:graphicFrame>
</mc:Choice>
<mc:Fallback>
<p:graphicFrame>
<p:nvGraphicFramePr>
<p:cNvPr id="{base_id + 1}" name="Object {ole_idx}"/>
<p:cNvGraphicFramePr>
<a:graphicFrameLocks noChangeAspect="1"/>
</p:cNvGraphicFramePr>
<p:nvPr/>
</p:nvGraphicFramePr>
<p:xfrm>
<a:off x="{left}" y="{top}"/>
<a:ext cx="{w}" cy="{h}"/>
</p:xfrm>
<a:graphic>
<a:graphicData uri="http://schemas.openxmlformats.org/presentationml/2006/ole">
<p:oleObj name="CS ChemDraw Drawing"
r:id="{ole_rel}"
imgW="{w}" imgH="{h}"
progId="ChemDraw.Document.6.0">
<p:embed/>
<p:pic>
<p:nvPicPr>
<p:cNvPr id="{base_id + 2}" name="Preview {ole_idx}"/>
<p:cNvPicPr/>
<p:nvPr/>
</p:nvPicPr>
<p:blipFill>
<a:blip r:embed="{img_rel}"/>
<a:stretch><a:fillRect/></a:stretch>
</p:blipFill>
<p:spPr>
<a:xfrm>
<a:off x="{left}" y="{top}"/>
<a:ext cx="{w}" cy="{h}"/>
</a:xfrm>
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
</p:spPr>
</p:pic>
</p:oleObj>
</a:graphicData>
</a:graphic>
</p:graphicFrame>
</mc:Fallback>
</mc:AlternateContent>"""

    # The fragment declares its own namespaces, so it parses standalone.
    sp_tree.append(etree.fromstring(alt_xml))
    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True)
559
+
560
+
561
def _add_slide_rels(rels_xml, ole_idx):
    """Add OLE + image relationships to a slide's rels.

    Args:
        rels_xml: serialized ``slideN.xml.rels`` part.
        ole_idx: 1-based object number; determines the relationship ids
            ("rIdOle{n}", "rIdOleImg{n}") and the target file names.

    Returns:
        The rels part re-serialized as UTF-8 bytes with an XML declaration.
    """
    from lxml import etree

    root = etree.fromstring(rels_xml)

    # Create the children in the same namespace as the root element so they
    # are real members of the relationships vocabulary, whatever prefix (or
    # default namespace) the part uses.
    rel_ns = etree.QName(root).namespace
    rel_tag = f"{{{rel_ns}}}Relationship" if rel_ns else "Relationship"

    etree.SubElement(root, rel_tag, attrib={
        "Id": f"rIdOle{ole_idx}",
        "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject",
        "Target": f"../embeddings/oleObject{ole_idx}.bin",
    })
    etree.SubElement(root, rel_tag, attrib={
        "Id": f"rIdOleImg{ole_idx}",
        "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image",
        "Target": f"../media/olePreview{ole_idx}.emf",
    })

    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True)
579
+
580
+
581
+ # ---------------------------------------------------------------------------
582
+ # DOCX builder — one OLE object per paragraph
583
+ # ---------------------------------------------------------------------------
584
+
585
def build_docx(items, output_path):
    """
    Create a DOCX with editable ChemDraw OLE objects, one per paragraph.

    A scaffold document containing a single empty paragraph is written with
    python-docx into a private temporary directory, then re-packed into
    ``output_path`` with the OLE objects injected. The temporary directory is
    removed even if saving or injection fails.

    Args:
        items: list of dicts with 'ole_data', 'emf_data', 'width_emu',
            'height_emu', 'name'.
        output_path: destination .docx path.
    """
    from docx import Document

    doc = Document()
    doc.add_paragraph("")  # placeholder

    # A private directory replaces tempfile.mktemp(), whose returned name can
    # be claimed by another process before the file is created.
    with tempfile.TemporaryDirectory(prefix="ole_embedder_") as tmp_dir:
        tmp_docx = os.path.join(tmp_dir, "scaffold.docx")
        doc.save(tmp_docx)
        _inject_docx(tmp_docx, output_path, items)
603
+
604
+
605
def _inject_docx(source_docx, output_path, items):
    """Inject OLE objects into a DOCX.

    Every entry of ``source_docx`` is copied to ``output_path``; the main
    document part, its relationships part and ``[Content_Types].xml`` are
    rewritten on the way. For item number k (1-based),
    ``word/embeddings/oleObject{k}.bin`` and ``word/media/olePreview{k}.emf``
    are then added.

    Args:
        source_docx: path of the scaffold document to read.
        output_path: path of the document to write.
        items: list of dicts with at least 'ole_data' and 'emf_data' (the
            document rewrite additionally reads the size keys).
    """
    with zipfile.ZipFile(source_docx, "r") as zin, \
            zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zout:
        for entry in zin.namelist():
            data = zin.read(entry)

            if entry == "word/document.xml":
                data = _build_docx_xml(data, items)
            elif entry == "word/_rels/document.xml.rels":
                data = _add_docx_rels(data, len(items))
            elif entry == "[Content_Types].xml":
                data = _ensure_content_types(data)

            zout.writestr(entry, data)

        for ole_idx, item in enumerate(items, start=1):
            zout.writestr(
                f"word/embeddings/oleObject{ole_idx}.bin",
                item['ole_data'])
            zout.writestr(
                f"word/media/olePreview{ole_idx}.emf",
                item['emf_data'])
631
+
632
+
633
def _build_docx_xml(doc_xml, items):
    """Insert one OLE-object paragraph per item into ``word/document.xml``.

    Args:
        doc_xml: serialized main document part.
        items: list of dicts providing 'width_emu' and 'height_emu'.

    Returns:
        The document part re-serialized as UTF-8 bytes with an XML
        declaration. New paragraphs go immediately before the body's
        ``w:sectPr`` when one exists, otherwise at the end of the body, in
        item order.

    Raises:
        RuntimeError: if the document has no ``w:body`` element.
    """
    from lxml import etree

    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    document = etree.fromstring(doc_xml)

    body = document.find(f"{{{w_ns}}}body")
    if body is None:
        raise RuntimeError("Could not find body in document XML")

    # Pick the insertion operation once: before sectPr, or append to body.
    sect_pr = body.find(f"{{{w_ns}}}sectPr")
    insert_paragraph = body.append if sect_pr is None else sect_pr.addprevious

    for ole_idx, item in enumerate(items, start=1):
        # EMU -> points (914400 EMU per inch, 72 pt per inch) -> twips (20 per pt)
        w_pt = item['width_emu'] / 914400 * 72
        h_pt = item['height_emu'] / 914400 * 72
        w_twips = int(w_pt * 20)
        h_twips = int(h_pt * 20)

        # Ids count up from fixed bases, one per object.
        shape_id = f"_x0000_s{1025 + ole_idx}"
        object_id = f"_{1728379060 + ole_idx}"

        para_xml = f"""<w:p xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:w10="urn:schemas-microsoft-com:office:word">
<w:r>
<w:object w:dxaOrig="{w_twips}" w:dyaOrig="{h_twips}">
<v:shape id="{shape_id}"
type="#_x0000_t75"
style="width:{w_pt:.1f}pt;height:{h_pt:.1f}pt"
o:ole="">
<v:imagedata r:id="rIdOleImg{ole_idx}" o:title=""/>
</v:shape>
<o:OLEObject Type="Embed"
ProgID="ChemDraw.Document.6.0"
ShapeID="{shape_id}"
DrawAspect="Content"
ObjectID="{object_id}"
r:id="rIdOle{ole_idx}"/>
</w:object>
</w:r>
</w:p>"""

        insert_paragraph(etree.fromstring(para_xml))

    return etree.tostring(
        document, xml_declaration=True, encoding="UTF-8", standalone=True)
690
+
691
+
692
def _add_docx_rels(rels_xml, count):
    """Add OLE + image relationships for all objects.

    Args:
        rels_xml: serialized ``word/_rels/document.xml.rels`` part.
        count: number of embedded objects; relationships are added for
            object numbers 1..count ("rIdOle{n}" and "rIdOleImg{n}").

    Returns:
        The rels part re-serialized as UTF-8 bytes with an XML declaration.
    """
    from lxml import etree

    root = etree.fromstring(rels_xml)

    # Create the children in the same namespace as the root element so they
    # are real members of the relationships vocabulary, whatever prefix (or
    # default namespace) the part uses.
    rel_ns = etree.QName(root).namespace
    rel_tag = f"{{{rel_ns}}}Relationship" if rel_ns else "Relationship"

    for ole_idx in range(1, count + 1):
        etree.SubElement(root, rel_tag, attrib={
            "Id": f"rIdOle{ole_idx}",
            "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject",
            "Target": f"embeddings/oleObject{ole_idx}.bin",
        })
        etree.SubElement(root, rel_tag, attrib={
            "Id": f"rIdOleImg{ole_idx}",
            "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image",
            "Target": f"media/olePreview{ole_idx}.emf",
        })

    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True)
711
+
712
+
713
+ # ---------------------------------------------------------------------------
714
+ # CLI
715
+ # ---------------------------------------------------------------------------
716
+
717
def main(argv=None) -> int:
    """Command-line entry point.

    Parses the arguments, converts every input through ChemDraw, builds one
    OLE compound file per input and writes the requested PPTX or DOCX.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success, 1 if an input file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Embed CDXML files as editable ChemDraw OLE objects in PPTX or DOCX."
    )
    parser.add_argument("inputs", nargs="+", help="Input CDXML file(s)")
    parser.add_argument("-o", "--output", required=True, help="Output file path (.pptx or .docx)")
    parser.add_argument("--margin", type=float, default=0.0,
                        help="Margin in points around content (default: 0.0)")

    target = parser.add_mutually_exclusive_group(required=True)
    target.add_argument("--pptx", action="store_true", help="Create PowerPoint .pptx")
    target.add_argument("--docx", action="store_true", help="Create Word .docx")

    parser.add_argument("--json", action="store_true",
                        help="Output result as JSON to stdout")

    args = parser.parse_args(argv)

    def _log(*a, **kw):
        # With --json, stdout is reserved for the JSON result, so status
        # lines default to stderr; otherwise this is plain print().
        if args.json:
            kw.setdefault("file", sys.stderr)
        print(*a, **kw)

    # Validate inputs
    for path in args.inputs:
        if not os.path.isfile(path):
            print(f"Error: file not found: {path}", file=sys.stderr)
            return 1
        if not path.lower().endswith(".cdxml"):
            print(f"Warning: {path} is not a .cdxml file", file=sys.stderr)

    if args.pptx:
        ext, fmt = ".pptx", "PPTX"
    else:
        ext, fmt = ".docx", "DOCX"
    if not args.output.lower().endswith(ext):
        print(f"Warning: output file doesn't end with {ext}", file=sys.stderr)

    n = len(args.inputs)
    plural = 's' if n > 1 else ''
    _log(f"Embedding {n} structure{plural} into {fmt}...")

    # Step 1: Batch convert CDXML -> CDX + EMF
    _log(f"[1/3] Converting {n} CDXML file{plural} via ChemDraw COM...")
    converted = batch_convert(args.inputs)
    for entry in converted:
        _log(f"  {entry['name']}: CDX={len(entry['cdx_data']):,}B, EMF={len(entry['emf_data']):,}B")

    # Step 2: Compute dimensions + build OLE compound files
    _log("[2/3] Computing dimensions and building OLE files...")
    items = []
    for entry in converted:
        w_emu, h_emu = get_cdxml_content_size(entry['path'], margin_pt=args.margin)
        ole_data = build_ole_compound_file(entry['cdx_data'])

        _log(f"  {entry['name']}: {w_emu / 12700:.1f} x {h_emu / 12700:.1f} pt, OLE={len(ole_data):,}B")

        items.append({
            'ole_data': ole_data,
            'emf_data': entry['emf_data'],
            'width_emu': w_emu,
            'height_emu': h_emu,
            'name': entry['name'],
        })

    # Step 3: Build output document
    _log(f"[3/3] Building {fmt}...")
    builder = build_pptx if args.pptx else build_docx
    builder(items, args.output)

    if not args.json:
        print(f"\nDone! {args.output}")
        print(f"  {n} editable ChemDraw OLE object{plural}")
        if args.pptx and n > 1:
            print(f"  {n} slides (one per structure)")
        return 0

    summary = {
        "input_files": [os.path.abspath(p) for p in args.inputs],
        "output": os.path.abspath(args.output),
        "format": fmt.lower(),
        "num_objects_embedded": n,
    }
    print(json.dumps(summary, indent=2))
    return 0
805
+
806
+
807
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())