cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,722 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ doc_from_template.py — Fill a PowerPoint or Word template with text and ChemDraw OLE structures.
4
+
5
+ Two-pass approach:
6
+ Pass 1: python-pptx/python-docx replaces text placeholders (preserving formatting)
7
+ Pass 2: XML-level injection replaces CDXML placeholders with editable OLE objects
8
+
9
+ Usage:
10
+ python doc_from_template.py --template template.pptx --manifest manifest.json -o output.pptx
11
+ python doc_from_template.py --template template.docx --manifest manifest.json -o output.docx
12
+ python doc_from_template.py --create-test-template # creates templates/reaction_summary.pptx
13
+
14
+ Requirements:
15
+ - ChemDraw 16+ (COM automation for CDXML -> CDX + EMF) — only needed for cdxml slots
16
+ - python-pptx
17
+ - python-docx
18
+ - lxml
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import re
25
+ import shutil
26
+ import sys
27
+ import tempfile
28
+ import zipfile
29
+
30
+ from .ole_embedder import (
31
+ batch_convert,
32
+ get_cdxml_content_size,
33
+ build_ole_compound_file,
34
+ _ensure_content_types,
35
+ )
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # XML namespaces
40
+ # ---------------------------------------------------------------------------
41
+
42
# DrawingML main namespace (a:t, a:xfrm, a:off, a:graphic, ...).
A_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
# PresentationML namespace (p:spTree, p:sp, p:graphicFrame, p:oleObj, ...).
P_NS = "http://schemas.openxmlformats.org/presentationml/2006/main"
# Namespace of the r:id / r:embed attributes that point at relationship ids.
R_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
# Markup-compatibility namespace (mc:AlternateContent / mc:Choice / mc:Fallback).
MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006"
# VML namespace (v:shape, v:imagedata) used by the DOCX OLE paragraph.
V_NS = "urn:schemas-microsoft-com:vml"
# WordprocessingML namespace (w:body, w:p, w:r, w:t, w:object).
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Office namespace for the o:OLEObject element and o:* attributes.
O_NS = "urn:schemas-microsoft-com:office:office"
# Package-level relationships namespace (root namespace of .rels parts).
RELS_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
# Relationship "Type" URIs written into .rels entries for the embedded OLE
# binary and for its preview image respectively.
OLEOBJ_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject"
IMAGE_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Manifest loading
56
+ # ---------------------------------------------------------------------------
57
+
58
def load_manifest(manifest_path):
    """Load the JSON manifest and split its slots by type.

    The manifest is decoded as UTF-8 regardless of the platform's default
    encoding, and a leading byte-order mark is tolerated.  Relative CDXML
    paths are resolved against the directory that contains the manifest.

    Args:
        manifest_path: Path to the JSON manifest file.

    Returns:
        A ``(text_slots, cdxml_slots, warnings)`` tuple where

        * ``text_slots`` maps placeholder -> replacement string,
        * ``cdxml_slots`` is a list of ``{"placeholder": ..., "file": abs_path}``
          dicts (kept even when the file is missing, with a warning), and
        * ``warnings`` is a list of human-readable warning strings.

    Raises:
        KeyError: If a slot lacks ``"placeholder"``, a text slot lacks
            ``"value"``, or a cdxml slot lacks ``"file"``.
    """
    # JSON is UTF-8 by specification; relying on the locale encoding breaks
    # non-ASCII text on Windows.  "utf-8-sig" additionally strips a BOM,
    # which json.load() would otherwise reject.
    with open(manifest_path, encoding="utf-8-sig") as f:
        data = json.load(f)

    base_dir = os.path.dirname(os.path.abspath(manifest_path))
    text_slots = {}   # placeholder -> value
    cdxml_slots = []  # [{"placeholder": ..., "file": abs_path}, ...]
    warnings = []

    for slot in data.get("slots", []):
        ph = slot["placeholder"]
        stype = slot.get("type", "text")

        if stype == "text":
            value = slot["value"]
            # The text pass substitutes with str.replace(), which only accepts
            # strings; JSON numbers/booleans are therefore stringified here.
            text_slots[ph] = value if isinstance(value, str) else str(value)
        elif stype == "cdxml":
            fpath = slot["file"]
            if not os.path.isabs(fpath):
                fpath = os.path.join(base_dir, fpath)
            fpath = os.path.abspath(fpath)
            if not os.path.isfile(fpath):
                warnings.append(f"CDXML file not found: {fpath}")
            # Missing files are still recorded so later stages can report them.
            cdxml_slots.append({"placeholder": ph, "file": fpath})
        else:
            warnings.append(f"Unknown slot type '{stype}' for {ph}")

    return text_slots, cdxml_slots, warnings
89
+
90
+
91
+ # ---------------------------------------------------------------------------
92
+ # OLE preparation (ChemDraw COM batch conversion)
93
+ # ---------------------------------------------------------------------------
94
+
95
def prepare_ole_items(cdxml_slots, margin_pt=0.0):
    """Build the OLE payload for every distinct, existing CDXML file.

    Each file is converted once via ``batch_convert`` no matter how many
    slots reference it.  Slots whose file does not exist are ignored.

    Returns a dict keyed by absolute path; each value holds ``"ole_data"``,
    ``"emf_data"``, ``"width_emu"`` and ``"height_emu"``.
    """
    # Order-preserving de-duplication of the files that are actually on disk.
    pending = []
    already_queued = set()
    for slot in cdxml_slots:
        candidate = slot["file"]
        if candidate in already_queued:
            continue
        if os.path.isfile(candidate):
            already_queued.add(candidate)
            pending.append(candidate)

    if not pending:
        return {}

    prepared = {}
    for conversion in batch_convert(pending):
        source = conversion["path"]
        width, height = get_cdxml_content_size(source, margin_pt=margin_pt)
        prepared[os.path.abspath(source)] = dict(
            ole_data=build_ole_compound_file(conversion["cdx_data"]),
            emf_data=conversion["emf_data"],
            width_emu=width,
            height_emu=height,
        )
    return prepared
121
+
122
+
123
+ # ---------------------------------------------------------------------------
124
+ # Pass 1: Text replacement (python-pptx / python-docx)
125
+ # ---------------------------------------------------------------------------
126
+
127
+ def _replace_in_paragraph(paragraph, text_slots):
128
+ """Replace {{PLACEHOLDER}} patterns in a paragraph's runs.
129
+
130
+ Joins all run texts, performs replacements, puts result in first run.
131
+ Preserves the first run's formatting. Returns set of filled placeholder names.
132
+ """
133
+ runs = paragraph.runs
134
+ if not runs:
135
+ return set()
136
+
137
+ full_text = "".join(r.text or "" for r in runs)
138
+ filled = set()
139
+ new_text = full_text
140
+
141
+ for placeholder, value in text_slots.items():
142
+ if placeholder in new_text:
143
+ new_text = new_text.replace(placeholder, value)
144
+ filled.add(placeholder)
145
+
146
+ if filled:
147
+ runs[0].text = new_text
148
+ for r in runs[1:]:
149
+ r.text = ""
150
+
151
+ return filled
152
+
153
+
154
def pass1_pptx(template_path, text_slots, temp_path):
    """Replace text placeholders in a PPTX template and save to ``temp_path``.

    Besides top-level text boxes, paragraphs inside table cells and inside
    grouped shapes (recursively) are processed, so placeholders located there
    are no longer silently skipped.

    Args:
        template_path: Path of the source .pptx template.
        text_slots: Mapping of placeholder -> replacement text.
        temp_path: Path the modified presentation is written to.

    Returns:
        Set of placeholder names that were filled.
    """
    from pptx import Presentation

    def _iter_paragraphs(shapes):
        """Yield every text paragraph reachable from a shape collection."""
        for shape in shapes:
            if shape.has_text_frame:
                yield from shape.text_frame.paragraphs
            # Tables live in graphic frames; getattr keeps this safe for
            # shape classes that do not expose has_table.
            if getattr(shape, "has_table", False):
                for row in shape.table.rows:
                    for cell in row.cells:
                        yield from cell.text_frame.paragraphs
            # Group shapes expose their children through a .shapes collection.
            children = getattr(shape, "shapes", None)
            if children is not None:
                yield from _iter_paragraphs(children)

    prs = Presentation(template_path)
    filled = set()

    for slide in prs.slides:
        for para in _iter_paragraphs(slide.shapes):
            filled.update(_replace_in_paragraph(para, text_slots))

    prs.save(temp_path)
    return filled
172
+
173
+
174
def pass1_docx(template_path, text_slots, temp_path):
    """Fill text placeholders in a DOCX template and write it to ``temp_path``.

    Body paragraphs are handled first, followed by the paragraphs of every
    cell of every table returned by ``Document.tables``.

    Returns the set of placeholder names that were filled.
    """
    from docx import Document

    document = Document(template_path)

    def _all_paragraphs():
        """Yield body paragraphs, then table-cell paragraphs."""
        yield from document.paragraphs
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                for tbl_cell in tbl_row.cells:
                    yield from tbl_cell.paragraphs

    replaced = set()
    for paragraph in _all_paragraphs():
        replaced |= _replace_in_paragraph(paragraph, text_slots)

    document.save(temp_path)
    return replaced
195
+
196
+
197
+ # ---------------------------------------------------------------------------
198
+ # Pass 2 helpers: relationship + content type management
199
+ # ---------------------------------------------------------------------------
200
+
201
+ def _rels_path_for(entry):
202
+ """Compute the .rels file path for a ZIP entry (forward-slash paths)."""
203
+ idx = entry.rfind("/")
204
+ if idx < 0:
205
+ return f"_rels/{entry}.rels"
206
+ return f"{entry[:idx]}/_rels/{entry[idx + 1:]}.rels"
207
+
208
+
209
def _add_ole_rels(rels_xml, ole_idx, target_prefix):
    """Append the OLE and preview-image relationships to a .rels document.

    Two ``Relationship`` elements are added: ``rIdOle<idx>`` pointing at
    ``embeddings/oleObject<idx>.bin`` and ``rIdOleImg<idx>`` pointing at
    ``media/olePreview<idx>.emf``.

    Args:
        rels_xml: Serialized .rels XML (bytes).
        ole_idx: Index used in the relationship ids and target file names.
        target_prefix: ``'../'`` for PPTX slides, ``''`` for the DOCX document.

    Returns:
        The updated .rels document serialized as UTF-8 bytes with an XML
        declaration.
    """
    from lxml import etree

    root = etree.fromstring(rels_xml)

    # A bare "Relationship" tag would create the children in *no* namespace.
    # Qualify the tag with the root's own namespace so the new entries are
    # genuine siblings of the existing ones in the element tree.
    ns = etree.QName(root).namespace
    rel_tag = f"{{{ns}}}Relationship" if ns else "Relationship"

    new_rels = (
        (f"rIdOle{ole_idx}", OLEOBJ_TYPE,
         f"{target_prefix}embeddings/oleObject{ole_idx}.bin"),
        (f"rIdOleImg{ole_idx}", IMAGE_TYPE,
         f"{target_prefix}media/olePreview{ole_idx}.emf"),
    )
    for rel_id, rel_type, target in new_rels:
        etree.SubElement(root, rel_tag, attrib={
            "Id": rel_id,
            "Type": rel_type,
            "Target": target,
        })

    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True)
230
+
231
+
232
+ # ---------------------------------------------------------------------------
233
+ # Pass 2 — PPTX: replace CDXML placeholder shapes with OLE objects
234
+ # ---------------------------------------------------------------------------
235
+
236
def _make_pptx_ole_xml(ole_idx, x, y, w, h):
    """Build mc:AlternateContent XML for an OLE object in a PPTX slide.

    The element contains an ``mc:Choice`` branch with a bare ``p:oleObj`` and
    an ``mc:Fallback`` branch whose ``p:oleObj`` additionally carries a
    ``p:pic`` that references the preview image.

    Args:
        ole_idx: Index of the OLE object; used in the shape names and in the
            relationship ids ``rIdOle<idx>`` / ``rIdOleImg<idx>``.
        x, y: Values written to every ``a:off`` element.
        w, h: Values written to every ``a:ext`` element and to the
            ``imgW`` / ``imgH`` attributes.

    Returns:
        The parsed ``mc:AlternateContent`` lxml element.
    """
    from lxml import etree

    # Each OLE object gets its own block of ten shape ids starting at 10000;
    # bid, bid + 1 and bid + 2 are the ones used below.
    bid = 10000 + (ole_idx - 1) * 10
    # Relationship ids; they match the ids generated by _add_ole_rels().
    orel = f"rIdOle{ole_idx}"
    irel = f"rIdOleImg{ole_idx}"

    xml_str = f"""<mc:AlternateContent
        xmlns:mc="{MC_NS}" xmlns:p="{P_NS}"
        xmlns:a="{A_NS}" xmlns:r="{R_NS}" xmlns:v="{V_NS}">
      <mc:Choice Requires="v">
        <p:graphicFrame>
          <p:nvGraphicFramePr>
            <p:cNvPr id="{bid}" name="ChemDraw {ole_idx}"/>
            <p:cNvGraphicFramePr>
              <a:graphicFrameLocks noChangeAspect="1"/>
            </p:cNvGraphicFramePr>
            <p:nvPr/>
          </p:nvGraphicFramePr>
          <p:xfrm>
            <a:off x="{x}" y="{y}"/>
            <a:ext cx="{w}" cy="{h}"/>
          </p:xfrm>
          <a:graphic>
            <a:graphicData uri="http://schemas.openxmlformats.org/presentationml/2006/ole">
              <p:oleObj name="CS ChemDraw Drawing" r:id="{orel}"
                  imgW="{w}" imgH="{h}" progId="ChemDraw.Document.6.0">
                <p:embed/>
              </p:oleObj>
            </a:graphicData>
          </a:graphic>
        </p:graphicFrame>
      </mc:Choice>
      <mc:Fallback>
        <p:graphicFrame>
          <p:nvGraphicFramePr>
            <p:cNvPr id="{bid + 1}" name="ChemDraw {ole_idx}"/>
            <p:cNvGraphicFramePr>
              <a:graphicFrameLocks noChangeAspect="1"/>
            </p:cNvGraphicFramePr>
            <p:nvPr/>
          </p:nvGraphicFramePr>
          <p:xfrm>
            <a:off x="{x}" y="{y}"/>
            <a:ext cx="{w}" cy="{h}"/>
          </p:xfrm>
          <a:graphic>
            <a:graphicData uri="http://schemas.openxmlformats.org/presentationml/2006/ole">
              <p:oleObj name="CS ChemDraw Drawing" r:id="{orel}"
                  imgW="{w}" imgH="{h}" progId="ChemDraw.Document.6.0">
                <p:embed/>
                <p:pic>
                  <p:nvPicPr>
                    <p:cNvPr id="{bid + 2}" name="Preview {ole_idx}"/>
                    <p:cNvPicPr/><p:nvPr/>
                  </p:nvPicPr>
                  <p:blipFill>
                    <a:blip r:embed="{irel}"/>
                    <a:stretch><a:fillRect/></a:stretch>
                  </p:blipFill>
                  <p:spPr>
                    <a:xfrm>
                      <a:off x="{x}" y="{y}"/>
                      <a:ext cx="{w}" cy="{h}"/>
                    </a:xfrm>
                    <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
                  </p:spPr>
                </p:pic>
              </p:oleObj>
            </a:graphicData>
          </a:graphic>
        </p:graphicFrame>
      </mc:Fallback>
    </mc:AlternateContent>"""

    return etree.fromstring(xml_str)
313
+
314
+
315
def pass2_pptx(input_path, output_path, cdxml_slots, ole_items):
    """Swap CDXML placeholder text boxes for OLE graphic frames in a PPTX.

    Every ``ppt/slides/slideN.xml`` part is scanned.  A direct ``p:sp`` child
    of the slide's shape tree whose text contains a CDXML placeholder (with a
    converted file available in ``ole_items``) is removed, and an OLE graphic
    frame is appended at the shape's former offset.  The package is then
    rewritten to ``output_path`` with updated slide XML, slide relationships,
    content types, and the new ``.bin`` / ``.emf`` parts.

    Returns ``(ole_count, filled_placeholders_set)``.
    """
    from lxml import etree

    shape_tag = f"{{{P_NS}}}sp"
    text_tag = f"{{{A_NS}}}t"
    slide_pattern = re.compile(r"ppt/slides/slide\d+\.xml$")

    def _top_left(shape_el):
        """Return the shape's (x, y) offset, or (0, 0) when it has none."""
        transform = shape_el.find(f"{{{P_NS}}}spPr/{{{A_NS}}}xfrm")
        if transform is None:
            transform = shape_el.find(f".//{{{A_NS}}}xfrm")
        if transform is None:
            return 0, 0
        offset = transform.find(f"{{{A_NS}}}off")
        if offset is None:
            return 0, 0
        return int(offset.get("x", "0")), int(offset.get("y", "0"))

    placeholder_files = {slot["placeholder"]: slot["file"] for slot in cdxml_slots}
    embedded = 0
    matched = set()
    new_slide_xml = {}   # slide entry -> serialized, modified slide XML
    slide_objects = {}   # slide entry -> [(ole_idx, item), ...]

    # Phase 1: find placeholder shapes and build the modified slide XML.
    with zipfile.ZipFile(input_path, "r") as src:
        for name in src.namelist():
            if slide_pattern.match(name) is None:
                continue

            slide_root = etree.fromstring(src.read(name))
            tree = slide_root.find(f".//{{{P_NS}}}spTree")
            if tree is None:
                continue

            placed = []
            # Snapshot the children: the tree is mutated inside the loop.
            for shape_el in tuple(tree.findall(shape_tag)):
                shape_text = "".join(
                    node.text for node in shape_el.iter(text_tag) if node.text
                ).strip()

                # First placeholder that occurs in the text and has OLE data;
                # at most one placeholder is honoured per shape.
                hit = next(
                    (
                        (token, path)
                        for token, path in placeholder_files.items()
                        if token in shape_text and path in ole_items
                    ),
                    None,
                )
                if hit is None:
                    continue

                token, path = hit
                embedded += 1
                payload = ole_items[path]
                matched.add(token)

                left, top = _top_left(shape_el)
                tree.remove(shape_el)
                tree.append(_make_pptx_ole_xml(
                    embedded, left, top,
                    payload["width_emu"], payload["height_emu"],
                ))
                placed.append((embedded, payload))

            if placed:
                new_slide_xml[name] = etree.tostring(
                    slide_root, xml_declaration=True,
                    encoding="UTF-8", standalone=True)
                slide_objects[name] = placed

    # Phase 2: map each affected slide's .rels part to its OLE objects.
    rels_objects = {
        _rels_path_for(name): placed for name, placed in slide_objects.items()
    }

    # Phase 3: copy the package, patching parts on the way through.
    with zipfile.ZipFile(input_path, "r") as src, \
            zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as dst:
        for name in src.namelist():
            blob = src.read(name)

            if name in new_slide_xml:
                blob = new_slide_xml[name]

            if name in rels_objects:
                for ole_idx, _unused in rels_objects[name]:
                    blob = _add_ole_rels(blob, ole_idx, "../")

            if name == "[Content_Types].xml":
                blob = _ensure_content_types(blob)

            dst.writestr(name, blob)

        # New binary parts: the OLE compound file and its EMF preview.
        for placed in slide_objects.values():
            for ole_idx, payload in placed:
                dst.writestr(
                    f"ppt/embeddings/oleObject{ole_idx}.bin", payload["ole_data"])
                dst.writestr(
                    f"ppt/media/olePreview{ole_idx}.emf", payload["emf_data"])

    return embedded, matched
417
+
418
+
419
+ # ---------------------------------------------------------------------------
420
+ # Pass 2 — DOCX: replace CDXML placeholder paragraphs with OLE objects
421
+ # ---------------------------------------------------------------------------
422
+
423
def _make_docx_ole_para(ole_idx, w_emu, h_emu):
    """Return a ``w:p`` element that embeds one ChemDraw OLE object.

    The paragraph holds a single run with a ``w:object`` made of a VML
    ``v:shape`` (preview image, sized in points) and an ``o:OLEObject`` that
    references the embedded binary.  ``ole_idx`` feeds the shape id, the
    object id and both relationship ids.
    """
    from lxml import etree

    emu_per_point = 12700
    twips_per_point = 20

    width_pt = w_emu / emu_per_point
    height_pt = h_emu / emu_per_point
    orig_w = int(width_pt * twips_per_point)
    orig_h = int(height_pt * twips_per_point)

    vml_shape_id = f"_x0000_s{1026 + ole_idx}"
    ole_object_id = f"_{1728379061 + ole_idx}"

    markup = f"""<w:p xmlns:w="{W_NS}"
        xmlns:r="{R_NS}"
        xmlns:o="{O_NS}"
        xmlns:v="{V_NS}">
      <w:r>
        <w:object w:dxaOrig="{orig_w}" w:dyaOrig="{orig_h}">
          <v:shape id="{vml_shape_id}" type="#_x0000_t75"
              style="width:{width_pt:.1f}pt;height:{height_pt:.1f}pt"
              o:ole="">
            <v:imagedata r:id="rIdOleImg{ole_idx}" o:title=""/>
          </v:shape>
          <o:OLEObject Type="Embed"
              ProgID="ChemDraw.Document.6.0"
              ShapeID="{vml_shape_id}"
              DrawAspect="Content"
              ObjectID="{ole_object_id}"
              r:id="rIdOle{ole_idx}"/>
        </w:object>
      </w:r>
    </w:p>"""

    return etree.fromstring(markup)
456
+
457
+
458
def pass2_docx(input_path, output_path, cdxml_slots, ole_items):
    """Swap CDXML placeholder paragraphs for OLE objects in a DOCX.

    Only paragraphs that are direct children of ``w:body`` are examined.  A
    paragraph whose text contains a CDXML placeholder (with a converted file
    available in ``ole_items``) is replaced wholesale by an OLE paragraph.
    When nothing is replaced, or the document has no body, the input is
    copied to ``output_path`` unchanged.

    Returns ``(ole_count, filled_placeholders_set)``.
    """
    from lxml import etree

    para_tag = f"{{{W_NS}}}p"
    text_tag = f"{{{W_NS}}}t"

    placeholder_files = {slot["placeholder"]: slot["file"] for slot in cdxml_slots}
    embedded = 0
    matched = set()
    placed = []  # [(ole_idx, item), ...]

    with zipfile.ZipFile(input_path, "r") as src:
        doc_root = etree.fromstring(src.read("word/document.xml"))
        body = doc_root.find(f"{{{W_NS}}}body")
        if body is None:
            shutil.copy2(input_path, output_path)
            return 0, set()

        # Snapshot the paragraphs: the body is mutated inside the loop.
        for para_el in tuple(body.findall(para_tag)):
            para_text = "".join(
                node.text for node in para_el.iter(text_tag) if node.text
            ).strip()

            # First placeholder that occurs in the text and has OLE data;
            # at most one placeholder is honoured per paragraph.
            hit = next(
                (
                    (token, path)
                    for token, path in placeholder_files.items()
                    if token in para_text and path in ole_items
                ),
                None,
            )
            if hit is None:
                continue

            token, path = hit
            embedded += 1
            payload = ole_items[path]
            matched.add(token)

            replacement = _make_docx_ole_para(
                embedded, payload["width_emu"], payload["height_emu"])
            body.replace(para_el, replacement)
            placed.append((embedded, payload))

    if not placed:
        shutil.copy2(input_path, output_path)
        return 0, matched

    updated_document = etree.tostring(
        doc_root, xml_declaration=True, encoding="UTF-8", standalone=True)

    # Copy the package, patching the three affected parts on the way through.
    with zipfile.ZipFile(input_path, "r") as src, \
            zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as dst:
        for name in src.namelist():
            blob = src.read(name)

            if name == "word/document.xml":
                blob = updated_document
            elif name == "word/_rels/document.xml.rels":
                for ole_idx, _unused in placed:
                    blob = _add_ole_rels(blob, ole_idx, "")
            elif name == "[Content_Types].xml":
                blob = _ensure_content_types(blob)

            dst.writestr(name, blob)

        # New binary parts: the OLE compound file and its EMF preview.
        for ole_idx, payload in placed:
            dst.writestr(
                f"word/embeddings/oleObject{ole_idx}.bin", payload["ole_data"])
            dst.writestr(
                f"word/media/olePreview{ole_idx}.emf", payload["emf_data"])

    return embedded, matched
529
+
530
+
531
+ # ---------------------------------------------------------------------------
532
+ # Test template creation
533
+ # ---------------------------------------------------------------------------
534
+
535
def create_test_template(output_dir="templates"):
    """Write a one-slide PPTX test template and return its path.

    The slide carries three centred text boxes holding ``{{TITLE}}``,
    ``{{SUBTITLE}}`` and ``{{SCHEME}}``.  The file is saved as
    ``reaction_summary.pptx`` inside ``output_dir``, which is created when
    missing.
    """
    from pptx import Presentation
    from pptx.util import Inches, Pt
    from pptx.enum.text import PP_ALIGN

    os.makedirs(output_dir, exist_ok=True)

    presentation = Presentation()

    # Prefer the layout literally named "Blank"; otherwise fall back to index 6.
    layout = next(
        (candidate for candidate in presentation.slide_layouts
         if candidate.name == "Blank"),
        None,
    )
    if layout is None:
        layout = presentation.slide_layouts[6]

    slide = presentation.slides.add_slide(layout)

    def _add_box(top_in, height_in, label, point_size, bold=False):
        """Add a centred, 8-inch-wide text box 1 inch from the left edge."""
        box = slide.shapes.add_textbox(
            Inches(1), Inches(top_in), Inches(8), Inches(height_in))
        para = box.text_frame.paragraphs[0]
        para.text = label
        para.font.size = Pt(point_size)
        if bold:
            para.font.bold = True
        para.alignment = PP_ALIGN.CENTER

    _add_box(0.4, 0.7, "{{TITLE}}", 24, bold=True)   # title
    _add_box(1.2, 0.5, "{{SUBTITLE}}", 14)           # subtitle
    _add_box(2.5, 4, "{{SCHEME}}", 12)               # reaction scheme area

    destination = os.path.join(output_dir, "reaction_summary.pptx")
    presentation.save(destination)
    return destination
581
+
582
+
583
+ # ---------------------------------------------------------------------------
584
+ # CLI
585
+ # ---------------------------------------------------------------------------
586
+
587
def main(argv=None) -> int:
    """Command-line entry point: fill a PPTX/DOCX template from a manifest.

    Pass 1 replaces text placeholders and writes an intermediate temporary
    file; pass 2 embeds ChemDraw OLE objects for CDXML slots and writes the
    final output.  With ``--create-test-template`` a sample template is
    generated instead.

    Args:
        argv: Argument list (defaults to ``sys.argv[1:]`` via argparse).

    Returns:
        Process exit status: 0 on success, 1 when the template or manifest
        is missing or the template is neither ``.pptx`` nor ``.docx``.
    """
    parser = argparse.ArgumentParser(
        description="Fill a PowerPoint or Word template with text and "
                    "ChemDraw OLE structures from a JSON manifest."
    )
    parser.add_argument("--template", help="Template file (.pptx or .docx)")
    parser.add_argument("--manifest", help="JSON manifest file")
    parser.add_argument("-o", "--output", help="Output file path")
    parser.add_argument("--margin", type=float, default=0.0,
                        help="OLE margin in points (default: 0.0)")
    parser.add_argument("--json", action="store_true",
                        help="JSON output summary")
    parser.add_argument("--create-test-template", action="store_true",
                        help="Create templates/reaction_summary.pptx and exit")

    args = parser.parse_args(argv)

    # --create-test-template mode
    if args.create_test_template:
        out = create_test_template()
        if args.json:
            print(json.dumps({"template": out}))
        else:
            print(f"Created test template: {out}")
        return 0

    # Normal mode: validate required args
    if not args.template or not args.manifest or not args.output:
        parser.error("--template, --manifest, and -o are required")

    if not os.path.isfile(args.template):
        print(f"Error: template not found: {args.template}", file=sys.stderr)
        return 1
    if not os.path.isfile(args.manifest):
        print(f"Error: manifest not found: {args.manifest}", file=sys.stderr)
        return 1

    is_pptx = args.template.lower().endswith(".pptx")
    is_docx = args.template.lower().endswith(".docx")
    if not (is_pptx or is_docx):
        print("Error: template must be .pptx or .docx", file=sys.stderr)
        return 1

    # Load manifest
    text_slots, cdxml_slots, warnings = load_manifest(args.manifest)

    if not args.json:
        fmt = "PPTX" if is_pptx else "DOCX"
        print(f"Template: {args.template} ({fmt})")
        print(f"Manifest: {len(text_slots)} text slot(s), "
              f"{len(cdxml_slots)} CDXML slot(s)")

    # Pass 1 writes into an intermediate file.  mkstemp() creates that file
    # atomically under a unique name; the deprecated mktemp() only returned a
    # name, which another process could claim before it was used.
    ext = ".pptx" if is_pptx else ".docx"
    fd, tmp = tempfile.mkstemp(suffix=ext)
    os.close(fd)  # the passes reopen the file by path

    try:
        if not args.json:
            print("[1/2] Replacing text placeholders...")

        if is_pptx:
            text_filled = pass1_pptx(args.template, text_slots, tmp)
        else:
            text_filled = pass1_docx(args.template, text_slots, tmp)

        # Warn about unfilled text slots
        for ph in text_slots:
            if ph not in text_filled:
                warnings.append(f"Text placeholder not found in template: {ph}")

        # Pass 2: OLE embedding
        ole_count = 0
        cdxml_filled = set()
        ole_items = {}  # always bound, even when there are no CDXML slots

        if cdxml_slots:
            if not args.json:
                print("[2/2] Converting CDXML and embedding OLE objects...")

            ole_items = prepare_ole_items(cdxml_slots, margin_pt=args.margin)

            # Warn about conversion failures
            for slot in cdxml_slots:
                if os.path.isfile(slot["file"]) and slot["file"] not in ole_items:
                    warnings.append(f"Failed to convert CDXML: {slot['file']}")

            if ole_items:
                if is_pptx:
                    ole_count, cdxml_filled = pass2_pptx(
                        tmp, args.output, cdxml_slots, ole_items)
                else:
                    ole_count, cdxml_filled = pass2_docx(
                        tmp, args.output, cdxml_slots, ole_items)
            else:
                # Nothing could be converted: ship the pass-1 result as is.
                shutil.copy2(tmp, args.output)
        else:
            if not args.json:
                print("[2/2] No CDXML slots — skipping OLE embedding.")
            shutil.copy2(tmp, args.output)

        # Warn about CDXML slots that were converted but never placed
        for slot in cdxml_slots:
            if (slot["placeholder"] not in cdxml_filled
                    and slot["file"] in ole_items):
                warnings.append(
                    f"CDXML placeholder not found in template: "
                    f"{slot['placeholder']}")

        total_filled = len(text_filled) + ole_count

        if args.json:
            result = {
                "template": args.template,
                "output": args.output,
                "slots_filled": total_filled,
                "ole_objects": ole_count,
                "warnings": warnings,
            }
            print(json.dumps(result, indent=2))
        else:
            print(f"\nDone! {args.output}")
            print(f" Text slots filled: {len(text_filled)}")
            print(f" OLE objects embedded: {ole_count}")
            if warnings:
                print(" Warnings:")
                for w in warnings:
                    print(f" - {w}")

    finally:
        # Always remove the intermediate file, also when a pass raised.
        if os.path.exists(tmp):
            os.unlink(tmp)

    return 0
719
+
720
+
721
# Run the CLI when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())