cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,722 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
doc_from_template.py — Fill a PowerPoint or Word template with text and ChemDraw OLE structures.
|
|
4
|
+
|
|
5
|
+
Two-pass approach:
|
|
6
|
+
Pass 1: python-pptx/python-docx replaces text placeholders (preserving formatting)
|
|
7
|
+
Pass 2: XML-level injection replaces CDXML placeholders with editable OLE objects
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python doc_from_template.py --template template.pptx --manifest manifest.json -o output.pptx
|
|
11
|
+
python doc_from_template.py --template template.docx --manifest manifest.json -o output.docx
|
|
12
|
+
python doc_from_template.py --create-test-template # creates templates/reaction_summary.pptx
|
|
13
|
+
|
|
14
|
+
Requirements:
|
|
15
|
+
- ChemDraw 16+ (COM automation for CDXML -> CDX + EMF) — only needed for cdxml slots
|
|
16
|
+
- python-pptx
|
|
17
|
+
- python-docx
|
|
18
|
+
- lxml
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import json
|
|
23
|
+
import os
|
|
24
|
+
import re
|
|
25
|
+
import shutil
|
|
26
|
+
import sys
|
|
27
|
+
import tempfile
|
|
28
|
+
import zipfile
|
|
29
|
+
|
|
30
|
+
from .ole_embedder import (
|
|
31
|
+
batch_convert,
|
|
32
|
+
get_cdxml_content_size,
|
|
33
|
+
build_ole_compound_file,
|
|
34
|
+
_ensure_content_types,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# XML namespaces
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
# DrawingML main namespace (bound to the "a" prefix in the XML built below).
A_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
# PresentationML main namespace (bound to the "p" prefix; slide shape trees).
P_NS = "http://schemas.openxmlformats.org/presentationml/2006/main"
# Relationship-reference namespace (bound to "r"; r:id / r:embed attributes).
R_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
# Markup-compatibility namespace (bound to "mc"; AlternateContent wrappers).
MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006"
# VML namespace (bound to "v"; shapes/imagedata in the DOCX OLE paragraph).
V_NS = "urn:schemas-microsoft-com:vml"
# WordprocessingML main namespace (bound to "w"; document body/paragraphs).
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Office namespace (bound to "o"; the OLEObject element).
O_NS = "urn:schemas-microsoft-com:office:office"
# Namespace of the elements inside a package ``.rels`` part.
RELS_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
# Relationship ``Type`` value for an embedded OLE object part.
OLEOBJ_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject"
# Relationship ``Type`` value for an image part (used for the EMF preview).
IMAGE_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Manifest loading
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
def load_manifest(manifest_path):
    """Load a JSON manifest and split its slots into text and CDXML groups.

    The manifest is decoded as UTF-8 and a leading byte-order mark is
    tolerated, so non-ASCII values load the same way on every platform
    instead of depending on the locale's default encoding.

    Relative ``file`` entries of ``cdxml`` slots are resolved against the
    directory that contains the manifest.

    Args:
        manifest_path: Path to the JSON manifest. Slots are read from its
            top-level ``"slots"`` list (missing list -> no slots). Every slot
            needs ``"placeholder"``; ``"type"`` defaults to ``"text"``.
            Text slots need ``"value"``, cdxml slots need ``"file"``.

    Returns:
        ``(text_slots, cdxml_slots, warnings)`` where ``text_slots`` maps
        placeholder -> value, ``cdxml_slots`` is a list of
        ``{"placeholder": ..., "file": absolute_path}`` dicts (missing files
        are still listed, with a warning), and ``warnings`` is a list of
        human-readable strings.

    Raises:
        KeyError: If a slot lacks a key required for its type.
        json.JSONDecodeError: If the manifest is not valid JSON.
    """
    # "utf-8-sig" reads plain UTF-8 and also strips a BOM, which json.load
    # would otherwise reject.
    with open(manifest_path, encoding="utf-8-sig") as f:
        data = json.load(f)

    base_dir = os.path.dirname(os.path.abspath(manifest_path))
    text_slots = {}    # placeholder -> value
    cdxml_slots = []   # [{"placeholder": ..., "file": abs_path}, ...]
    warnings = []

    for slot in data.get("slots", []):
        ph = slot["placeholder"]
        stype = slot.get("type", "text")

        if stype == "text":
            text_slots[ph] = slot["value"]
        elif stype == "cdxml":
            fpath = slot["file"]
            if not os.path.isabs(fpath):
                fpath = os.path.join(base_dir, fpath)
            fpath = os.path.abspath(fpath)
            if not os.path.isfile(fpath):
                warnings.append(f"CDXML file not found: {fpath}")
            # Keep the slot even when the file is missing so later stages can
            # report on it consistently.
            cdxml_slots.append({"placeholder": ph, "file": fpath})
        else:
            warnings.append(f"Unknown slot type '{stype}' for {ph}")

    return text_slots, cdxml_slots, warnings
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
# OLE preparation (ChemDraw COM batch conversion)
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
def prepare_ole_items(cdxml_slots, margin_pt=0.0):
    """Batch-convert the distinct, existing CDXML files into OLE payloads.

    Args:
        cdxml_slots: Slot dicts carrying a ``"file"`` path; paths that do not
            exist on disk are ignored.
        margin_pt: Margin in points forwarded to ``get_cdxml_content_size``.

    Returns:
        Dict keyed by absolute file path; each value holds ``"ole_data"``,
        ``"emf_data"``, ``"width_emu"`` and ``"height_emu"``. Empty when no
        slot points at an existing file.
    """
    existing = [slot["file"] for slot in cdxml_slots if os.path.isfile(slot["file"])]
    # dict.fromkeys drops duplicates while keeping first-seen order.
    sources = list(dict.fromkeys(existing))
    if not sources:
        return {}

    prepared = {}
    for result in batch_convert(sources):
        key = os.path.abspath(result["path"])
        width, height = get_cdxml_content_size(result["path"], margin_pt=margin_pt)
        payload = build_ole_compound_file(result["cdx_data"])
        prepared[key] = {
            "ole_data": payload,
            "emf_data": result["emf_data"],
            "width_emu": width,
            "height_emu": height,
        }
    return prepared
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# ---------------------------------------------------------------------------
|
|
124
|
+
# Pass 1: Text replacement (python-pptx / python-docx)
|
|
125
|
+
# ---------------------------------------------------------------------------
|
|
126
|
+
|
|
127
|
+
def _replace_in_paragraph(paragraph, text_slots):
    """Replace {{PLACEHOLDER}} patterns in a paragraph's runs.

    The run texts are joined so a placeholder split across several runs is
    still found. When at least one placeholder matches, the whole replaced
    text is written to the first run (keeping that run's formatting) and all
    other runs are blanked.

    Args:
        paragraph: Object exposing ``runs``, each run having a writable
            ``text`` attribute (python-pptx / python-docx paragraphs).
        text_slots: Mapping of placeholder string -> replacement value.
            Non-string values (e.g. numbers from a JSON manifest) are
            converted with ``str()``; ``None`` becomes an empty string.

    Returns:
        Set of placeholder names that were found and replaced.
    """
    runs = paragraph.runs
    if not runs:
        return set()

    new_text = "".join(r.text or "" for r in runs)
    filled = set()

    for placeholder, value in text_slots.items():
        # An empty placeholder is "in" every string and would splice the value
        # between all characters, so it is never treated as a match.
        if not placeholder or placeholder not in new_text:
            continue
        # str.replace rejects non-string arguments; JSON manifests can carry
        # numbers, booleans or null as values.
        replacement = "" if value is None else str(value)
        new_text = new_text.replace(placeholder, replacement)
        filled.add(placeholder)

    if filled:
        runs[0].text = new_text
        for r in runs[1:]:
            r.text = ""

    return filled
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _iter_shape_paragraphs(shapes):
    """Yield every text paragraph reachable from *shapes*.

    Covers plain text frames, table cells and, recursively, the members of
    grouped shapes.
    """
    for shape in shapes:
        if getattr(shape, "has_text_frame", False):
            yield from shape.text_frame.paragraphs
        if getattr(shape, "has_table", False):
            for row in shape.table.rows:
                for cell in row.cells:
                    yield from cell.text_frame.paragraphs
        # Group shapes expose their members through a nested ``shapes``
        # collection; other shape kinds have no such attribute.
        members = getattr(shape, "shapes", None)
        if members is not None:
            yield from _iter_shape_paragraphs(members)


def pass1_pptx(template_path, text_slots, temp_path):
    """Replace text placeholders in a PPTX template and save to *temp_path*.

    Every slide is scanned. Besides top-level text boxes, paragraphs inside
    table cells and grouped shapes are processed as well, matching the table
    coverage of ``pass1_docx``.

    Args:
        template_path: Path of the source ``.pptx`` file.
        text_slots: Mapping of placeholder string -> replacement value.
        temp_path: Destination path for the modified presentation.

    Returns:
        Set of placeholder names that were filled on at least one slide.
    """
    from pptx import Presentation

    prs = Presentation(template_path)
    filled = set()

    for slide in prs.slides:
        for para in _iter_shape_paragraphs(slide.shapes):
            filled.update(_replace_in_paragraph(para, text_slots))

    prs.save(temp_path)
    return filled
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def pass1_docx(template_path, text_slots, temp_path):
    """Fill text placeholders in a DOCX template and write it to *temp_path*.

    Body paragraphs are processed first, then the paragraphs of every cell of
    every table exposed by ``Document.tables``.

    Returns the set of placeholder names that were replaced.
    """
    from docx import Document

    document = Document(template_path)
    replaced = set()

    for paragraph in document.paragraphs:
        replaced |= _replace_in_paragraph(paragraph, text_slots)

    cell_paragraphs = (
        paragraph
        for table in document.tables
        for row in table.rows
        for cell in row.cells
        for paragraph in cell.paragraphs
    )
    for paragraph in cell_paragraphs:
        replaced |= _replace_in_paragraph(paragraph, text_slots)

    document.save(temp_path)
    return replaced
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ---------------------------------------------------------------------------
|
|
198
|
+
# Pass 2 helpers: relationship + content type management
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
def _rels_path_for(entry):
    """Return the ``.rels`` part path belonging to a ZIP entry.

    The relationships part lives in a ``_rels`` folder next to the entry and
    is named ``<entry file name>.rels``. Paths use forward slashes.
    """
    folder, slash, filename = entry.rpartition("/")
    if not slash:
        # Entry sits at the archive root.
        return f"_rels/{entry}.rels"
    return f"{folder}/_rels/{filename}.rels"
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _add_ole_rels(rels_xml, ole_idx, target_prefix):
    """Add OLE + image relationship entries to a rels XML document.

    Two ``Relationship`` elements are appended: ``rIdOle<idx>`` pointing at
    ``embeddings/oleObject<idx>.bin`` and ``rIdOleImg<idx>`` pointing at
    ``media/olePreview<idx>.emf``.

    Args:
        rels_xml: Serialized ``.rels`` document (bytes or str).
        ole_idx: Index used in the relationship ids and target file names.
        target_prefix: '../' for PPTX slides, '' for DOCX document.

    Returns:
        The updated document serialized as UTF-8 bytes with an XML
        declaration.
    """
    from lxml import etree

    root = etree.fromstring(rels_xml)

    # Create the children in the package-relationships namespace explicitly.
    # An unqualified "Relationship" tag would be a no-namespace element that
    # only ends up in the right namespace if the serializer happens to omit
    # an xmlns="" reset.
    rel_tag = f"{{{RELS_NS}}}Relationship"
    new_rels = (
        (f"rIdOle{ole_idx}", OLEOBJ_TYPE,
         f"{target_prefix}embeddings/oleObject{ole_idx}.bin"),
        (f"rIdOleImg{ole_idx}", IMAGE_TYPE,
         f"{target_prefix}media/olePreview{ole_idx}.emf"),
    )
    for rel_id, rel_type, target in new_rels:
        etree.SubElement(root, rel_tag, attrib={
            "Id": rel_id,
            "Type": rel_type,
            "Target": target,
        })

    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# ---------------------------------------------------------------------------
|
|
233
|
+
# Pass 2 — PPTX: replace CDXML placeholder shapes with OLE objects
|
|
234
|
+
# ---------------------------------------------------------------------------
|
|
235
|
+
|
|
236
|
+
def _make_pptx_ole_xml(ole_idx, x, y, w, h):
    """Build mc:AlternateContent XML for an OLE object in a PPTX slide.

    The element holds an ``mc:Choice`` graphic frame with the bare OLE
    object and an ``mc:Fallback`` graphic frame that additionally carries a
    ``p:pic`` preview referencing the image relationship.

    Args:
        ole_idx: Index of the OLE object; selects the relationship ids
            ``rIdOle<idx>`` / ``rIdOleImg<idx>`` and the shape names.
        x, y: Values written to the ``a:off`` offsets.
        w, h: Values written to the ``a:ext`` extents and to
            ``imgW`` / ``imgH``.

    Returns:
        The parsed ``mc:AlternateContent`` lxml element.
    """
    from lxml import etree

    # Three shape ids are used per object: bid (Choice frame), bid + 1
    # (Fallback frame) and bid + 2 (preview picture). Stepping by 10 per
    # object keeps the ids of different objects apart.
    bid = 10000 + (ole_idx - 1) * 10
    orel = f"rIdOle{ole_idx}"
    irel = f"rIdOleImg{ole_idx}"

    xml_str = f"""<mc:AlternateContent
    xmlns:mc="{MC_NS}" xmlns:p="{P_NS}"
    xmlns:a="{A_NS}" xmlns:r="{R_NS}" xmlns:v="{V_NS}">
  <mc:Choice Requires="v">
    <p:graphicFrame>
      <p:nvGraphicFramePr>
        <p:cNvPr id="{bid}" name="ChemDraw {ole_idx}"/>
        <p:cNvGraphicFramePr>
          <a:graphicFrameLocks noChangeAspect="1"/>
        </p:cNvGraphicFramePr>
        <p:nvPr/>
      </p:nvGraphicFramePr>
      <p:xfrm>
        <a:off x="{x}" y="{y}"/>
        <a:ext cx="{w}" cy="{h}"/>
      </p:xfrm>
      <a:graphic>
        <a:graphicData uri="http://schemas.openxmlformats.org/presentationml/2006/ole">
          <p:oleObj name="CS ChemDraw Drawing" r:id="{orel}"
                    imgW="{w}" imgH="{h}" progId="ChemDraw.Document.6.0">
            <p:embed/>
          </p:oleObj>
        </a:graphicData>
      </a:graphic>
    </p:graphicFrame>
  </mc:Choice>
  <mc:Fallback>
    <p:graphicFrame>
      <p:nvGraphicFramePr>
        <p:cNvPr id="{bid + 1}" name="ChemDraw {ole_idx}"/>
        <p:cNvGraphicFramePr>
          <a:graphicFrameLocks noChangeAspect="1"/>
        </p:cNvGraphicFramePr>
        <p:nvPr/>
      </p:nvGraphicFramePr>
      <p:xfrm>
        <a:off x="{x}" y="{y}"/>
        <a:ext cx="{w}" cy="{h}"/>
      </p:xfrm>
      <a:graphic>
        <a:graphicData uri="http://schemas.openxmlformats.org/presentationml/2006/ole">
          <p:oleObj name="CS ChemDraw Drawing" r:id="{orel}"
                    imgW="{w}" imgH="{h}" progId="ChemDraw.Document.6.0">
            <p:embed/>
            <p:pic>
              <p:nvPicPr>
                <p:cNvPr id="{bid + 2}" name="Preview {ole_idx}"/>
                <p:cNvPicPr/><p:nvPr/>
              </p:nvPicPr>
              <p:blipFill>
                <a:blip r:embed="{irel}"/>
                <a:stretch><a:fillRect/></a:stretch>
              </p:blipFill>
              <p:spPr>
                <a:xfrm>
                  <a:off x="{x}" y="{y}"/>
                  <a:ext cx="{w}" cy="{h}"/>
                </a:xfrm>
                <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
              </p:spPr>
            </p:pic>
          </p:oleObj>
        </a:graphicData>
      </a:graphic>
    </p:graphicFrame>
  </mc:Fallback>
</mc:AlternateContent>"""

    return etree.fromstring(xml_str)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def pass2_pptx(input_path, output_path, cdxml_slots, ole_items):
    """Replace CDXML placeholder text boxes with OLE objects in PPTX.

    Each top-level ``p:sp`` shape whose text contains a CDXML placeholder is
    removed and an OLE graphic frame is appended to the slide's shape tree
    at the shape's offset. The package is then rewritten to *output_path*
    with updated slide XML, slide relationships, content types and the new
    ``.bin`` / ``.emf`` parts.

    OLE indices start after any ``oleObject<N>`` / ``olePreview<N>`` parts
    already present in the template, so new part names and relationship ids
    never duplicate existing ones (e.g. when a previously filled file is
    reused as the template).

    Args:
        input_path: Source ``.pptx`` (typically the pass-1 output).
        output_path: Destination ``.pptx``.
        cdxml_slots: Slot dicts with ``"placeholder"`` and ``"file"``.
        ole_items: Mapping file path -> dict with ``"ole_data"``,
            ``"emf_data"``, ``"width_emu"``, ``"height_emu"``.

    Returns (ole_count, filled_placeholders_set).
    """
    from lxml import etree

    ph_to_file = {s["placeholder"]: s["file"] for s in cdxml_slots}
    slide_re = re.compile(r"ppt/slides/slide\d+\.xml$")
    used_idx_re = re.compile(
        r"ppt/(?:embeddings/oleObject|media/olePreview)(\d+)\.")
    ole_count = 0
    filled_ph = set()

    # mods: slide_entry -> (modified_xml_bytes, [(ole_idx, item)])
    mods = {}

    with zipfile.ZipFile(input_path, "r") as zin:
        names = zin.namelist()

        # Highest OLE index already used by the template (0 when none).
        base_idx = max(
            (int(m.group(1)) for m in map(used_idx_re.match, names) if m),
            default=0,
        )

        # Scan slides for placeholder shapes and build the modifications.
        for entry in names:
            if not slide_re.match(entry):
                continue

            root = etree.fromstring(zin.read(entry))
            sp_tree = root.find(f".//{{{P_NS}}}spTree")
            if sp_tree is None:
                continue

            slide_oles = []

            # Iterate over a snapshot because matching shapes are removed.
            for sp in list(sp_tree.findall(f"{{{P_NS}}}sp")):
                texts = [t.text for t in sp.iter(f"{{{A_NS}}}t") if t.text]
                full = "".join(texts).strip()

                for ph, fpath in ph_to_file.items():
                    if ph not in full or fpath not in ole_items:
                        continue

                    ole_count += 1
                    ole_idx = base_idx + ole_count
                    item = ole_items[fpath]
                    filled_ph.add(ph)

                    # Get shape position from its transform
                    x = y = 0
                    xfrm = sp.find(f"{{{P_NS}}}spPr/{{{A_NS}}}xfrm")
                    if xfrm is None:
                        xfrm = sp.find(f".//{{{A_NS}}}xfrm")
                    if xfrm is not None:
                        off = xfrm.find(f"{{{A_NS}}}off")
                        if off is not None:
                            x = int(off.get("x", "0"))
                            y = int(off.get("y", "0"))

                    # Remove placeholder text box, add OLE graphic frame
                    sp_tree.remove(sp)
                    sp_tree.append(_make_pptx_ole_xml(
                        ole_idx, x, y,
                        item["width_emu"], item["height_emu"],
                    ))
                    slide_oles.append((ole_idx, item))
                    break  # one placeholder per shape

            if slide_oles:
                mods[entry] = (
                    etree.tostring(root, xml_declaration=True,
                                   encoding="UTF-8", standalone=True),
                    slide_oles,
                )

        # rels_entry -> [(ole_idx, item)]
        rels_updates = {
            _rels_path_for(slide_entry): oles
            for slide_entry, (_, oles) in mods.items()
        }

        # Write output ZIP
        with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zout:
            for entry in names:
                data = zin.read(entry)

                # Swap in modified slide XML
                if entry in mods:
                    data = mods[entry][0]

                # Add OLE relationships to affected slide rels
                if entry in rels_updates:
                    for idx, _ in rels_updates[entry]:
                        data = _add_ole_rels(data, idx, "../")

                # Ensure .bin and .emf content types exist
                if entry == "[Content_Types].xml":
                    data = _ensure_content_types(data)

                zout.writestr(entry, data)

            # Write OLE + EMF binary files
            for _, oles in mods.values():
                for idx, item in oles:
                    zout.writestr(
                        f"ppt/embeddings/oleObject{idx}.bin", item["ole_data"])
                    zout.writestr(
                        f"ppt/media/olePreview{idx}.emf", item["emf_data"])

    return ole_count, filled_ph
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
# ---------------------------------------------------------------------------
|
|
420
|
+
# Pass 2 — DOCX: replace CDXML placeholder paragraphs with OLE objects
|
|
421
|
+
# ---------------------------------------------------------------------------
|
|
422
|
+
|
|
423
|
+
def _make_docx_ole_para(ole_idx, w_emu, h_emu):
    """Build a DOCX paragraph containing a ChemDraw OLE object.

    Args:
        ole_idx: Index of the OLE object; selects the relationship ids
            ``rIdOle<idx>`` / ``rIdOleImg<idx>`` and makes the VML shape id
            and ``ObjectID`` differ per object.
        w_emu: Object width (divided by 12700 to obtain points).
        h_emu: Object height (divided by 12700 to obtain points).

    Returns:
        The parsed ``w:p`` lxml element with one run holding a ``w:object``.
    """
    from lxml import etree

    # Sizes are emitted twice: in points for the VML style attribute and as
    # integer twentieths of a point for w:dxaOrig / w:dyaOrig.
    w_pt = w_emu / 12700
    h_pt = h_emu / 12700
    w_twips = int(w_pt * 20)
    h_twips = int(h_pt * 20)
    # The same shape id is referenced by v:shape/@id and o:OLEObject/@ShapeID.
    shape_id = f"_x0000_s{1026 + ole_idx}"
    obj_id = f"_{1728379061 + ole_idx}"

    xml_str = f"""<w:p xmlns:w="{W_NS}"
     xmlns:r="{R_NS}"
     xmlns:o="{O_NS}"
     xmlns:v="{V_NS}">
  <w:r>
    <w:object w:dxaOrig="{w_twips}" w:dyaOrig="{h_twips}">
      <v:shape id="{shape_id}" type="#_x0000_t75"
               style="width:{w_pt:.1f}pt;height:{h_pt:.1f}pt"
               o:ole="">
        <v:imagedata r:id="rIdOleImg{ole_idx}" o:title=""/>
      </v:shape>
      <o:OLEObject Type="Embed"
                   ProgID="ChemDraw.Document.6.0"
                   ShapeID="{shape_id}"
                   DrawAspect="Content"
                   ObjectID="{obj_id}"
                   r:id="rIdOle{ole_idx}"/>
    </w:object>
  </w:r>
</w:p>"""

    return etree.fromstring(xml_str)
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def pass2_docx(input_path, output_path, cdxml_slots, ole_items):
    """Replace CDXML placeholder paragraphs with OLE objects in DOCX.

    Each direct-child paragraph of the document body whose text contains a
    CDXML placeholder is replaced by a paragraph holding the OLE object. The
    package is then rewritten to *output_path* with the updated document,
    relationships, content types and the new ``.bin`` / ``.emf`` parts. When
    the document has no body or nothing matches, the input is copied
    unchanged.

    OLE indices start after any ``oleObject<N>`` / ``olePreview<N>`` parts
    already present in the template, so new part names and relationship ids
    never duplicate existing ones.

    Args:
        input_path: Source ``.docx`` (typically the pass-1 output).
        output_path: Destination ``.docx``.
        cdxml_slots: Slot dicts with ``"placeholder"`` and ``"file"``.
        ole_items: Mapping file path -> dict with ``"ole_data"``,
            ``"emf_data"``, ``"width_emu"``, ``"height_emu"``.

    Returns (ole_count, filled_placeholders_set).
    """
    from lxml import etree

    ph_to_file = {s["placeholder"]: s["file"] for s in cdxml_slots}
    used_idx_re = re.compile(
        r"word/(?:embeddings/oleObject|media/olePreview)(\d+)\.")
    filled_ph = set()
    oles = []  # [(ole_idx, item)]

    with zipfile.ZipFile(input_path, "r") as zin:
        names = zin.namelist()
        root = etree.fromstring(zin.read("word/document.xml"))

    body = root.find(f"{{{W_NS}}}body")
    if body is None:
        shutil.copy2(input_path, output_path)
        return 0, set()

    # Highest OLE index already used by the template (0 when none).
    base_idx = max(
        (int(m.group(1)) for m in map(used_idx_re.match, names) if m),
        default=0,
    )

    # Search body paragraphs for CDXML placeholders; iterate over a snapshot
    # because matching paragraphs are replaced in the tree.
    for p_elem in list(body.findall(f"{{{W_NS}}}p")):
        texts = [t.text for t in p_elem.iter(f"{{{W_NS}}}t") if t.text]
        full = "".join(texts).strip()

        for ph, fpath in ph_to_file.items():
            if ph not in full or fpath not in ole_items:
                continue

            ole_idx = base_idx + len(oles) + 1
            item = ole_items[fpath]
            filled_ph.add(ph)

            new_p = _make_docx_ole_para(
                ole_idx, item["width_emu"], item["height_emu"])
            body.replace(p_elem, new_p)
            oles.append((ole_idx, item))
            break  # one placeholder per paragraph

    if not oles:
        shutil.copy2(input_path, output_path)
        return 0, filled_ph

    new_doc_xml = etree.tostring(
        root, xml_declaration=True, encoding="UTF-8", standalone=True)

    # Write output ZIP
    with zipfile.ZipFile(input_path, "r") as zin:
        with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zout:
            for entry in zin.namelist():
                data = zin.read(entry)

                if entry == "word/document.xml":
                    data = new_doc_xml

                if entry == "word/_rels/document.xml.rels":
                    for idx, _ in oles:
                        data = _add_ole_rels(data, idx, "")

                if entry == "[Content_Types].xml":
                    data = _ensure_content_types(data)

                zout.writestr(entry, data)

            for idx, item in oles:
                zout.writestr(
                    f"word/embeddings/oleObject{idx}.bin", item["ole_data"])
                zout.writestr(
                    f"word/media/olePreview{idx}.emf", item["emf_data"])

    return len(oles), filled_ph
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
# ---------------------------------------------------------------------------
|
|
532
|
+
# Test template creation
|
|
533
|
+
# ---------------------------------------------------------------------------
|
|
534
|
+
|
|
535
|
+
def create_test_template(output_dir="templates"):
    """Write a one-slide ``reaction_summary.pptx`` test template.

    The slide carries three centred text boxes holding ``{{TITLE}}``,
    ``{{SUBTITLE}}`` and ``{{SCHEME}}``. *output_dir* is created when it
    does not exist. Returns the path of the saved file.
    """
    from pptx import Presentation
    from pptx.util import Inches, Pt
    from pptx.enum.text import PP_ALIGN

    os.makedirs(output_dir, exist_ok=True)

    deck = Presentation()

    # Prefer the layout named "Blank"; otherwise fall back to layout index 6.
    layout = next(
        (candidate for candidate in deck.slide_layouts
         if candidate.name == "Blank"),
        None,
    )
    if layout is None:
        layout = deck.slide_layouts[6]

    slide = deck.slides.add_slide(layout)

    # (placeholder text, top in inches, height in inches, font size, bold)
    boxes = (
        ("{{TITLE}}", 0.4, 0.7, 24, True),
        ("{{SUBTITLE}}", 1.2, 0.5, 14, False),
        ("{{SCHEME}}", 2.5, 4, 12, False),
    )
    for text, top, height, size, bold in boxes:
        box = slide.shapes.add_textbox(
            Inches(1), Inches(top), Inches(8), Inches(height))
        para = box.text_frame.paragraphs[0]
        para.text = text
        para.font.size = Pt(size)
        if bold:
            para.font.bold = True
        para.alignment = PP_ALIGN.CENTER

    target = os.path.join(output_dir, "reaction_summary.pptx")
    deck.save(target)
    return target
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
# ---------------------------------------------------------------------------
|
|
584
|
+
# CLI
|
|
585
|
+
# ---------------------------------------------------------------------------
|
|
586
|
+
|
|
587
|
+
def main(argv=None) -> int:
    """Command-line entry point: fill a PPTX/DOCX template from a manifest.

    Pass 1 replaces text placeholders and writes an intermediate file; pass 2
    embeds ChemDraw OLE objects for CDXML slots and writes the final output.
    With ``--create-test-template`` only the sample template is written.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 on success, 1 when the template or manifest is missing, the
        template extension is unsupported, or the manifest cannot be loaded.
        Missing required options end the process through ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        description="Fill a PowerPoint or Word template with text and "
                    "ChemDraw OLE structures from a JSON manifest."
    )
    parser.add_argument("--template", help="Template file (.pptx or .docx)")
    parser.add_argument("--manifest", help="JSON manifest file")
    parser.add_argument("-o", "--output", help="Output file path")
    parser.add_argument("--margin", type=float, default=0.0,
                        help="OLE margin in points (default: 0.0)")
    parser.add_argument("--json", action="store_true",
                        help="JSON output summary")
    parser.add_argument("--create-test-template", action="store_true",
                        help="Create templates/reaction_summary.pptx and exit")

    args = parser.parse_args(argv)

    # --create-test-template mode
    if args.create_test_template:
        out = create_test_template()
        if args.json:
            print(json.dumps({"template": out}))
        else:
            print(f"Created test template: {out}")
        return 0

    # Normal mode: validate required args
    if not args.template or not args.manifest or not args.output:
        parser.error("--template, --manifest, and -o are required")

    if not os.path.isfile(args.template):
        print(f"Error: template not found: {args.template}", file=sys.stderr)
        return 1
    if not os.path.isfile(args.manifest):
        print(f"Error: manifest not found: {args.manifest}", file=sys.stderr)
        return 1

    is_pptx = args.template.lower().endswith(".pptx")
    is_docx = args.template.lower().endswith(".docx")
    if not (is_pptx or is_docx):
        print("Error: template must be .pptx or .docx", file=sys.stderr)
        return 1

    # Load manifest; report unreadable or malformed manifests as a normal CLI
    # error instead of a traceback.
    try:
        text_slots, cdxml_slots, warnings = load_manifest(args.manifest)
    except (OSError, ValueError, KeyError) as exc:
        print(f"Error: cannot load manifest {args.manifest}: {exc!r}",
              file=sys.stderr)
        return 1

    if not args.json:
        fmt = "PPTX" if is_pptx else "DOCX"
        print(f"Template: {args.template} ({fmt})")
        print(f"Manifest: {len(text_slots)} text slot(s), "
              f"{len(cdxml_slots)} CDXML slot(s)")

    # Pass 1: text replacement via python-pptx/python-docx
    ext = ".pptx" if is_pptx else ".docx"
    # mkstemp creates the file atomically; mktemp only returns a name that
    # another process could claim first. The descriptor is closed right away
    # because pass 1 reopens the file by path.
    fd, tmp = tempfile.mkstemp(suffix=ext)
    os.close(fd)

    try:
        if not args.json:
            print("[1/2] Replacing text placeholders...")

        if is_pptx:
            text_filled = pass1_pptx(args.template, text_slots, tmp)
        else:
            text_filled = pass1_docx(args.template, text_slots, tmp)

        # Warn about unfilled text slots
        for ph in text_slots:
            if ph not in text_filled:
                warnings.append(f"Text placeholder not found in template: {ph}")

        # Pass 2: OLE embedding
        ole_count = 0
        cdxml_filled = set()
        ole_items = {}

        if cdxml_slots:
            if not args.json:
                print("[2/2] Converting CDXML and embedding OLE objects...")

            ole_items = prepare_ole_items(cdxml_slots, margin_pt=args.margin)

            # Warn about conversion failures
            for slot in cdxml_slots:
                if os.path.isfile(slot["file"]) and slot["file"] not in ole_items:
                    warnings.append(f"Failed to convert CDXML: {slot['file']}")

            if ole_items:
                if is_pptx:
                    ole_count, cdxml_filled = pass2_pptx(
                        tmp, args.output, cdxml_slots, ole_items)
                else:
                    ole_count, cdxml_filled = pass2_docx(
                        tmp, args.output, cdxml_slots, ole_items)
            else:
                shutil.copy2(tmp, args.output)
        else:
            if not args.json:
                print("[2/2] No CDXML slots — skipping OLE embedding.")
            shutil.copy2(tmp, args.output)

        # Warn about unfilled CDXML slots (only for files that did convert;
        # missing or failed files were already reported above).
        for slot in cdxml_slots:
            if slot["placeholder"] not in cdxml_filled:
                if slot["file"] in ole_items:
                    warnings.append(
                        f"CDXML placeholder not found in template: "
                        f"{slot['placeholder']}")

        total_filled = len(text_filled) + ole_count

        if args.json:
            result = {
                "template": args.template,
                "output": args.output,
                "slots_filled": total_filled,
                "ole_objects": ole_count,
                "warnings": warnings,
            }
            print(json.dumps(result, indent=2))
        else:
            print(f"\nDone! {args.output}")
            print(f" Text slots filled: {len(text_filled)}")
            print(f" OLE objects embedded: {ole_count}")
            if warnings:
                print(f" Warnings:")
                for w in warnings:
                    print(f" - {w}")

    finally:
        if os.path.exists(tmp):
            os.unlink(tmp)

    return 0
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|