regen.mde 0.2.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +409 -295
  2. package/bin/build-corpus-editor.js +5 -3
  3. package/bin/postinstall.js +259 -187
  4. package/bin/regen-mdeditor-install.js +1 -1
  5. package/bin/regen-mdeditor-uninstall.js +1 -1
  6. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
  7. package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
  8. package/desktop/BuildCorpusEditor/Program.cs +85 -81
  9. package/dist/release/regen-mde-0.3.0-win-x64-setup.exe +0 -0
  10. package/dist/release/{regen.mde-0.2.2-win-x64.zip → regen-mde-0.3.0-win-x64.zip} +0 -0
  11. package/dist/release/regen-mde-0.7.0-win-x64-setup.exe +0 -0
  12. package/dist/release/regen-mde-0.7.0-win-x64.zip +0 -0
  13. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  14. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  15. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  16. package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
  17. package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
  18. package/dist/windows-editor/wwwroot/index.html +3 -3
  19. package/editor-web/index.html +1 -1
  20. package/editor-web/src/main.jsx +1044 -399
  21. package/editor-web/src/styles.css +846 -602
  22. package/installer/install-regen-mde.ps1 +49 -10
  23. package/installer/regen-mde.nsi +16 -16
  24. package/package.json +90 -86
  25. package/pyproject.toml +35 -33
  26. package/requirements.txt +6 -4
  27. package/scripts/package-windows-editor.ps1 +8 -8
  28. package/scripts/release-dual.mjs +105 -0
  29. package/scripts/run-editor-implementation-plane.ps1 +29 -6
  30. package/src/build_corpus/docx_exporter.py +1055 -798
  31. package/src/build_corpus/equations.py +80 -0
  32. package/src/build_corpus/exporter.py +1488 -1195
  33. package/src/build_corpus/frontmatter.py +302 -0
  34. package/src/build_corpus/ppt_exporter.py +543 -532
  35. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  36. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
  37. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
@@ -1,532 +1,543 @@
1
- from __future__ import annotations
2
-
3
- import base64
4
- import hashlib
5
- import io
6
- import json
7
- import math
8
- import mimetypes
9
- import re
10
- import subprocess
11
- import tempfile
12
- from dataclasses import dataclass, field
13
- from pathlib import Path
14
- from typing import Optional
15
- from zipfile import ZipFile
16
- from xml.etree import ElementTree as ET
17
-
18
- from PIL import Image
19
-
20
- NS = {
21
- "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
22
- "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
23
- "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
24
- }
25
-
26
-
27
- @dataclass
28
- class ShapeBlock:
29
- kind: str
30
- x: int
31
- y: int
32
- cx: int
33
- cy: int
34
- markdown: str
35
- text_norm: str = ""
36
- image_sig: str = ""
37
- is_title: bool = False
38
-
39
-
40
- @dataclass
41
- class SlideContent:
42
- index: int
43
- title: str
44
- blocks: list[ShapeBlock] = field(default_factory=list)
45
-
46
-
47
- @dataclass
48
- class PresentationStats:
49
- slides: int = 0
50
- removed_repetitive_blocks: int = 0
51
- tables: int = 0
52
- text_blocks: int = 0
53
- images: int = 0
54
- removed_logo_images: int = 0
55
- low_dpi_images: int = 0
56
- warnings: list[str] = field(default_factory=list)
57
-
58
-
59
- def clean_text(value: str) -> str:
60
- return re.sub(r"\s+", " ", (value or "").replace("\u00a0", " ")).strip()
61
-
62
-
63
- def md_escape_cell(value: str) -> str:
64
- return clean_text(value).replace("|", r"\|").replace("\n", "<br>")
65
-
66
-
67
- def normalize_for_repeat(value: str) -> str:
68
- return re.sub(r"\s+", " ", value).strip().lower()
69
-
70
-
71
- def local_name(tag: str) -> str:
72
- return tag.rsplit("}", 1)[-1] if "}" in tag else tag
73
-
74
-
75
- def emu_box(node: ET.Element) -> tuple[int, int, int, int]:
76
- xfrm = node.find("./p:spPr/a:xfrm", NS)
77
- if xfrm is None:
78
- return 0, 0, 0, 0
79
- off = xfrm.find("./a:off", NS)
80
- ext = xfrm.find("./a:ext", NS)
81
- try:
82
- x = int((off.attrib.get("x", "0") if off is not None else "0"))
83
- y = int((off.attrib.get("y", "0") if off is not None else "0"))
84
- cx = int((ext.attrib.get("cx", "0") if ext is not None else "0"))
85
- cy = int((ext.attrib.get("cy", "0") if ext is not None else "0"))
86
- return x, y, cx, cy
87
- except ValueError:
88
- return 0, 0, 0, 0
89
-
90
-
91
- class PptxMarkdownExporter:
92
- def __init__(self, input_path: Path, output_dir: Path, image_mode: str = "assets"):
93
- self.input_path = input_path
94
- self.output_dir = output_dir
95
- self.output_md = output_dir / f"{input_path.stem}.md"
96
- self.report_path = output_dir / "export-report.json"
97
- self.assets_dir = output_dir / "assets"
98
- self.asset_ref_prefix = self.assets_dir.name
99
- self.image_mode = image_mode
100
- self.media_map: dict[str, str] = {}
101
- self.media_bytes: dict[str, bytes] = {}
102
- self.variant_map: dict[str, str] = {}
103
- self.stats = PresentationStats()
104
-
105
- def export(self) -> dict:
106
- self.output_dir.mkdir(parents=True, exist_ok=True)
107
- if self.image_mode == "assets":
108
- self.assets_dir.mkdir(parents=True, exist_ok=True)
109
-
110
- with ZipFile(self.input_path) as zf:
111
- self._copy_media(zf)
112
- slides = self._parse_all_slides(zf)
113
-
114
- self.stats.slides = len(slides)
115
- self._remove_repetitive_text_blocks(slides)
116
- self._remove_repeated_logo_images(slides)
117
- markdown = self._render_markdown(slides)
118
- self.output_md.write_text(markdown, encoding="utf-8")
119
-
120
- report = {
121
- "input": str(self.input_path),
122
- "output": str(self.output_md),
123
- "stats": self.stats.__dict__,
124
- }
125
- self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
126
- return report
127
-
128
- def _parse_all_slides(self, zf: ZipFile) -> list[SlideContent]:
129
- slide_paths = self._ordered_slide_paths(zf)
130
- slides: list[SlideContent] = []
131
- for i, slide_path in enumerate(slide_paths, 1):
132
- root = ET.fromstring(zf.read(slide_path))
133
- rels = self._relationships(zf, slide_path)
134
- content = SlideContent(index=i, title="")
135
- self._walk_shape_tree(root.find("./p:cSld/p:spTree", NS), content, rels, slide_path)
136
- self._sort_blocks(content)
137
- if not content.title:
138
- image_only = bool(content.blocks) and all(b.kind == "image" for b in content.blocks)
139
- content.title = "Image" if image_only else f"Slide {i}"
140
- slides.append(content)
141
- return slides
142
-
143
- def _walk_shape_tree(
144
- self,
145
- node: Optional[ET.Element],
146
- content: SlideContent,
147
- rels: dict[str, str],
148
- slide_path: str,
149
- ) -> None:
150
- if node is None:
151
- return
152
- for child in list(node):
153
- tag = local_name(child.tag)
154
- if tag == "sp":
155
- block = self._parse_text_shape(child)
156
- if block is not None:
157
- if block.is_title and not content.title:
158
- content.title = clean_text(block.markdown.replace("- ", " "))
159
- else:
160
- self.stats.text_blocks += 1
161
- content.blocks.append(block)
162
- elif tag == "pic":
163
- block = self._parse_picture_shape(child, rels, slide_path)
164
- if block is not None:
165
- self.stats.images += 1
166
- content.blocks.append(block)
167
- elif tag == "graphicFrame":
168
- block = self._parse_table_shape(child)
169
- if block is not None:
170
- self.stats.tables += 1
171
- content.blocks.append(block)
172
- elif tag == "grpSp":
173
- self._walk_shape_tree(child, content, rels, slide_path)
174
-
175
- def _parse_text_shape(self, shape: ET.Element) -> Optional[ShapeBlock]:
176
- ph = shape.find("./p:nvSpPr/p:nvPr/p:ph", NS)
177
- ph_type = (ph.attrib.get("type", "") if ph is not None else "").lower()
178
- if ph_type in {"ftr", "dt", "sldnum"}:
179
- return None
180
-
181
- tx_body = shape.find("./p:txBody", NS)
182
- if tx_body is None:
183
- return None
184
-
185
- lines: list[str] = []
186
- for p in tx_body.findall("./a:p", NS):
187
- txt = self._paragraph_text(p)
188
- if not txt:
189
- continue
190
- lvl = self._paragraph_level(p)
191
- if lvl is None:
192
- lines.append(txt)
193
- else:
194
- lines.append(f"{' ' * max(lvl, 0)}- {txt}")
195
- if not lines:
196
- return None
197
- x, y, cx, cy = emu_box(shape)
198
- md = "\n".join(lines)
199
- return ShapeBlock(
200
- kind="text",
201
- x=x,
202
- y=y,
203
- cx=cx,
204
- cy=cy,
205
- markdown=md,
206
- text_norm=normalize_for_repeat(md),
207
- is_title=ph_type in {"title", "ctrtitle"},
208
- )
209
-
210
- def _parse_table_shape(self, frame: ET.Element) -> Optional[ShapeBlock]:
211
- tbl = frame.find(".//a:tbl", NS)
212
- if tbl is None:
213
- return None
214
- rows: list[list[str]] = []
215
- for tr in tbl.findall("./a:tr", NS):
216
- row: list[str] = []
217
- for tc in tr.findall("./a:tc", NS):
218
- cell_lines = []
219
- for p in tc.findall(".//a:p", NS):
220
- txt = self._paragraph_text(p)
221
- if txt:
222
- cell_lines.append(txt)
223
- row.append(md_escape_cell("\n".join(cell_lines)))
224
- rows.append(row)
225
- if not rows:
226
- return None
227
- width = max(len(r) for r in rows)
228
- padded = [r + [""] * (width - len(r)) for r in rows]
229
- md_lines = []
230
- md_lines.append("| " + " | ".join(padded[0]) + " |")
231
- md_lines.append("| " + " | ".join("---" for _ in range(width)) + " |")
232
- for row in padded[1:]:
233
- md_lines.append("| " + " | ".join(row) + " |")
234
- x, y, cx, cy = emu_box(frame)
235
- md = "\n".join(md_lines)
236
- return ShapeBlock(kind="table", x=x, y=y, cx=cx, cy=cy, markdown=md, text_norm=normalize_for_repeat(md))
237
-
238
- def _parse_picture_shape(self, pic: ET.Element, rels: dict[str, str], slide_path: str) -> Optional[ShapeBlock]:
239
- blip = pic.find(".//a:blip", NS)
240
- if blip is None:
241
- return None
242
- rid = blip.attrib.get(f"{{{NS['r']}}}embed") or blip.attrib.get(f"{{{NS['r']}}}link")
243
- if not rid:
244
- return None
245
- target = rels.get(rid, "")
246
- if not target:
247
- return None
248
- part = self._resolve_part_path(slide_path, target)
249
- ref = self.media_map.get(part, "")
250
- if not ref:
251
- return None
252
-
253
- src_rect = self._source_crop(pic)
254
- image_bytes = self.media_bytes.get(part, b"")
255
- if src_rect is not None:
256
- cropped = self._render_cropped_variant(part, src_rect)
257
- if cropped:
258
- ref = cropped
259
- image_bytes = self._variant_bytes(part, src_rect) or image_bytes
260
-
261
- x, y, cx, cy = emu_box(pic)
262
- self._track_dpi(cx, cy, image_bytes, Path(part).suffix.lower())
263
- sig = hashlib.sha256(image_bytes).hexdigest()[:16]
264
- return ShapeBlock(
265
- kind="image",
266
- x=x,
267
- y=y,
268
- cx=cx,
269
- cy=cy,
270
- markdown=f"![image]({ref})",
271
- image_sig=sig,
272
- )
273
-
274
- def _track_dpi(self, cx: int, cy: int, data: bytes, suffix: str) -> None:
275
- dims = self._image_dimensions(data, suffix)
276
- if not dims or cx <= 0 or cy <= 0:
277
- return
278
- w, h = dims
279
- dpi_x = w / (cx / 914400)
280
- dpi_y = h / (cy / 914400)
281
- if min(dpi_x, dpi_y) < 150:
282
- self.stats.low_dpi_images += 1
283
-
284
- def _sort_blocks(self, slide: SlideContent) -> None:
285
- row_height = 0.35 * 914400
286
- slide.blocks.sort(key=lambda b: (round(b.y / row_height), b.y, b.x))
287
-
288
- def _remove_repetitive_text_blocks(self, slides: list[SlideContent]) -> None:
289
- counts: dict[str, int] = {}
290
- for slide in slides:
291
- seen = set()
292
- for b in slide.blocks:
293
- if b.kind != "text" or not b.text_norm:
294
- continue
295
- if b.text_norm in seen:
296
- continue
297
- seen.add(b.text_norm)
298
- counts[b.text_norm] = counts.get(b.text_norm, 0) + 1
299
- threshold = max(4, math.ceil(len(slides) * 0.6)) if slides else 99999
300
- repetitive = {k for k, v in counts.items() if v >= threshold}
301
- for slide in slides:
302
- kept: list[ShapeBlock] = []
303
- for b in slide.blocks:
304
- if b.kind == "text" and b.text_norm and b.text_norm in repetitive:
305
- self.stats.removed_repetitive_blocks += 1
306
- continue
307
- kept.append(b)
308
- slide.blocks = kept
309
-
310
- def _remove_repeated_logo_images(self, slides: list[SlideContent]) -> None:
311
- counts: dict[str, int] = {}
312
- geom_counts: dict[str, int] = {}
313
- for slide in slides:
314
- for b in slide.blocks:
315
- if b.kind == "image" and b.image_sig:
316
- counts[b.image_sig] = counts.get(b.image_sig, 0) + 1
317
- if self._looks_like_corner_logo(b):
318
- g = self._logo_geom_bucket(b)
319
- geom_counts[g] = geom_counts.get(g, 0) + 1
320
- threshold = max(3, math.ceil(len(slides) * 0.15)) if slides else 99999
321
- repetitive = {k for k, v in counts.items() if v >= threshold}
322
- repetitive_geom = {k for k, v in geom_counts.items() if v >= threshold}
323
- for slide in slides:
324
- kept: list[ShapeBlock] = []
325
- for b in slide.blocks:
326
- if b.kind == "image" and self._looks_like_corner_logo(b):
327
- geom_match = self._logo_geom_bucket(b) in repetitive_geom
328
- hash_match = b.image_sig in repetitive
329
- if geom_match or hash_match:
330
- self.stats.removed_logo_images += 1
331
- continue
332
- kept.append(b)
333
- slide.blocks = kept
334
-
335
- @staticmethod
336
- def _looks_like_corner_logo(b: ShapeBlock) -> bool:
337
- # Small and near top/bottom edge is a typical logo/footer marker.
338
- w_in = b.cx / 914400 if b.cx else 0
339
- h_in = b.cy / 914400 if b.cy else 0
340
- y_in = b.y / 914400 if b.y else 0
341
- return w_in <= 2.2 and h_in <= 1.2 and (y_in <= 1.4 or y_in >= 5.5)
342
-
343
- @staticmethod
344
- def _logo_geom_bucket(b: ShapeBlock) -> str:
345
- return f"{round(b.x/300000)}:{round(b.y/300000)}:{round(b.cx/300000)}:{round(b.cy/300000)}"
346
-
347
- def _render_markdown(self, slides: list[SlideContent]) -> str:
348
- out: list[str] = []
349
- for slide in slides:
350
- out.append(f"## Slide {slide.index}: {slide.title}")
351
- if slide.blocks:
352
- out.append("")
353
- for b in slide.blocks:
354
- out.append(b.markdown)
355
- out.append("")
356
- else:
357
- out.append("")
358
- if out and out[-1] != "":
359
- out.append("")
360
- return "\n".join(out).strip() + "\n"
361
-
362
- def _copy_media(self, zf: ZipFile) -> None:
363
- for name in zf.namelist():
364
- if not name.startswith("ppt/media/"):
365
- continue
366
- data = zf.read(name)
367
- self.media_bytes[name] = data
368
- mime = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
369
- if self.image_mode == "base64":
370
- self.media_map[name] = f"data:{mime};base64,{base64.b64encode(data).decode('ascii')}"
371
- elif self.image_mode == "assets":
372
- target = self.assets_dir / Path(name).name
373
- target.write_bytes(data)
374
- self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
375
- else:
376
- self.media_map[name] = ""
377
-
378
- def _source_crop(self, pic: ET.Element) -> tuple[int, int, int, int] | None:
379
- src = pic.find("./p:blipFill/a:srcRect", NS)
380
- if src is None:
381
- return None
382
- try:
383
- return (
384
- int(src.attrib.get("l", "0")),
385
- int(src.attrib.get("t", "0")),
386
- int(src.attrib.get("r", "0")),
387
- int(src.attrib.get("b", "0")),
388
- )
389
- except ValueError:
390
- return None
391
-
392
- def _variant_bytes(self, part: str, rect: tuple[int, int, int, int]) -> bytes | None:
393
- key = f"{part}:{rect[0]}:{rect[1]}:{rect[2]}:{rect[3]}:bytes"
394
- ref = self.variant_map.get(key, "")
395
- if not ref:
396
- return None
397
- return base64.b64decode(ref)
398
-
399
- def _render_cropped_variant(self, part: str, src_rect: tuple[int, int, int, int]) -> str:
400
- cache_key_ref = f"{part}:{src_rect[0]}:{src_rect[1]}:{src_rect[2]}:{src_rect[3]}:{self.image_mode}"
401
- cache_key_bytes = f"{part}:{src_rect[0]}:{src_rect[1]}:{src_rect[2]}:{src_rect[3]}:bytes"
402
- if cache_key_ref in self.variant_map:
403
- return self.variant_map[cache_key_ref]
404
- data = self.media_bytes.get(part, b"")
405
- if not data:
406
- return ""
407
- ext = Path(part).suffix.lower()
408
- if ext not in {".png", ".jpg", ".jpeg"}:
409
- return ""
410
- with Image.open(io.BytesIO(data)) as img:
411
- width, height = img.size
412
- l, t, r, b = src_rect
413
- left = max(0, min(width, round(width * (l / 100000.0))))
414
- top = max(0, min(height, round(height * (t / 100000.0))))
415
- right = max(0, min(width, round(width * ((100000 - r) / 100000.0))))
416
- bottom = max(0, min(height, round(height * ((100000 - b) / 100000.0))))
417
- if right <= left or bottom <= top:
418
- return ""
419
- cropped = img.crop((left, top, right, bottom))
420
- out = io.BytesIO()
421
- fmt = "PNG" if ext == ".png" else "JPEG"
422
- kwargs = {"quality": 95} if fmt == "JPEG" else {}
423
- cropped.save(out, format=fmt, **kwargs)
424
- out_bytes = out.getvalue()
425
- self.variant_map[cache_key_bytes] = base64.b64encode(out_bytes).decode("ascii")
426
- digest = hashlib.sha256(out_bytes).hexdigest()[:12]
427
- out_ext = ".jpg" if ext == ".jpeg" else ext
428
- name = f"{Path(part).stem}.crop.{digest}{out_ext}"
429
- if self.image_mode == "base64":
430
- mime = mimetypes.guess_type(name)[0] or "application/octet-stream"
431
- ref = f"data:{mime};base64,{base64.b64encode(out_bytes).decode('ascii')}"
432
- else:
433
- target = self.assets_dir / name
434
- target.write_bytes(out_bytes)
435
- ref = f"{self.asset_ref_prefix}/{target.name}"
436
- self.variant_map[cache_key_ref] = ref
437
- return ref
438
-
439
- @staticmethod
440
- def _image_dimensions(data: bytes, suffix: str) -> tuple[int, int] | None:
441
- if suffix == ".png" and data.startswith(b"\x89PNG\r\n\x1a\n") and len(data) >= 24:
442
- return int.from_bytes(data[16:20], "big"), int.from_bytes(data[20:24], "big")
443
- if suffix in {".jpg", ".jpeg"} and data.startswith(b"\xff\xd8"):
444
- i = 2
445
- while i + 9 < len(data):
446
- if data[i] != 0xFF:
447
- i += 1
448
- continue
449
- marker = data[i + 1]
450
- if marker in {0xC0, 0xC1, 0xC2, 0xC3, 0xC9, 0xCA, 0xCB}:
451
- if i + 9 <= len(data):
452
- return int.from_bytes(data[i + 7:i + 9], "big"), int.from_bytes(data[i + 5:i + 7], "big")
453
- if i + 4 > len(data):
454
- break
455
- seg_len = int.from_bytes(data[i + 2:i + 4], "big")
456
- i += 2 + seg_len
457
- return None
458
-
459
- def _paragraph_text(self, p: ET.Element) -> str:
460
- chunks = []
461
- for n in p.iter():
462
- if local_name(n.tag) == "t" and n.text:
463
- chunks.append(n.text)
464
- return clean_text("".join(chunks))
465
-
466
- def _paragraph_level(self, p: ET.Element) -> Optional[int]:
467
- ppr = p.find("./a:pPr", NS)
468
- if ppr is None:
469
- return None
470
- lvl = ppr.attrib.get("lvl")
471
- if lvl is None:
472
- return None
473
- try:
474
- return int(lvl)
475
- except ValueError:
476
- return None
477
-
478
- def _ordered_slide_paths(self, zf: ZipFile) -> list[str]:
479
- presentation = ET.fromstring(zf.read("ppt/presentation.xml"))
480
- rels = self._relationships(zf, "ppt/presentation.xml")
481
- paths: list[str] = []
482
- for sld_id in presentation.findall("./p:sldIdLst/p:sldId", NS):
483
- rid = sld_id.attrib.get(f"{{{NS['r']}}}id")
484
- target = rels.get(rid or "")
485
- if not target:
486
- continue
487
- full = self._resolve_part_path("ppt/presentation.xml", target)
488
- if full in zf.namelist():
489
- paths.append(full)
490
- return paths
491
-
492
- def _relationships(self, zf: ZipFile, part: str) -> dict[str, str]:
493
- rels_path = str(Path(part).parent / "_rels" / f"{Path(part).name}.rels").replace("\\", "/")
494
- if rels_path not in zf.namelist():
495
- return {}
496
- root = ET.fromstring(zf.read(rels_path))
497
- return {rel.attrib["Id"]: rel.attrib.get("Target", "") for rel in root if "Id" in rel.attrib}
498
-
499
- @staticmethod
500
- def _resolve_part_path(part: str, target: str) -> str:
501
- joined = (Path(part).parent / target).as_posix()
502
- norm: list[str] = []
503
- for piece in joined.split("/"):
504
- if piece in {"", "."}:
505
- continue
506
- if piece == "..":
507
- if norm:
508
- norm.pop()
509
- continue
510
- norm.append(piece)
511
- return "/".join(norm)
512
-
513
-
514
- def export_presentation(input_path: Path, output_root: Path, out_same_dir: bool, image_mode: str = "assets") -> dict:
515
- if input_path.suffix.lower() == ".ppt":
516
- input_path = convert_legacy_ppt(input_path)
517
- output_dir = input_path.parent if out_same_dir else output_root / input_path.stem
518
- return PptxMarkdownExporter(input_path, output_dir, image_mode=image_mode).export()
519
-
520
-
521
- def convert_legacy_ppt(input_path: Path) -> Path:
522
- with tempfile.TemporaryDirectory(prefix="build-corpus-ppt-") as tmp:
523
- tmp_dir = Path(tmp)
524
- cmd = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", str(tmp_dir), str(input_path)]
525
- result = subprocess.run(cmd, capture_output=True, text=True)
526
- converted = tmp_dir / f"{input_path.stem}.pptx"
527
- if result.returncode != 0 or not converted.exists():
528
- message = (result.stderr or result.stdout or "conversion failed").strip()
529
- raise RuntimeError(f"Legacy .ppt conversion requires LibreOffice; failed for {input_path}: {message}")
530
- persistent = input_path.with_suffix(".converted.pptx")
531
- persistent.write_bytes(converted.read_bytes())
532
- return persistent
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import hashlib
5
+ import io
6
+ import json
7
+ import math
8
+ import mimetypes
9
+ import re
10
+ import subprocess
11
+ import tempfile
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+ from typing import Optional
15
+ from zipfile import ZipFile
16
+ from xml.etree import ElementTree as ET
17
+
18
+ from PIL import Image
19
+
20
+ try:
21
+ from .frontmatter import add_mdk_frontmatter, read_frontmatter_from_zip
22
+ except ImportError: # pragma: no cover - allows direct script execution
23
+ from build_corpus.frontmatter import add_mdk_frontmatter, read_frontmatter_from_zip
24
+
25
+ NS = {
26
+ "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
27
+ "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
28
+ "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
29
+ }
30
+
31
+
32
+ @dataclass
33
+ class ShapeBlock:
34
+ kind: str
35
+ x: int
36
+ y: int
37
+ cx: int
38
+ cy: int
39
+ markdown: str
40
+ text_norm: str = ""
41
+ image_sig: str = ""
42
+ is_title: bool = False
43
+
44
+
45
+ @dataclass
46
+ class SlideContent:
47
+ index: int
48
+ title: str
49
+ blocks: list[ShapeBlock] = field(default_factory=list)
50
+
51
+
52
+ @dataclass
53
+ class PresentationStats:
54
+ slides: int = 0
55
+ removed_repetitive_blocks: int = 0
56
+ tables: int = 0
57
+ text_blocks: int = 0
58
+ images: int = 0
59
+ removed_logo_images: int = 0
60
+ low_dpi_images: int = 0
61
+ warnings: list[str] = field(default_factory=list)
62
+
63
+
64
+ def clean_text(value: str) -> str:
65
+ return re.sub(r"\s+", " ", (value or "").replace("\u00a0", " ")).strip()
66
+
67
+
68
+ def md_escape_cell(value: str) -> str:
69
+ return clean_text(value).replace("|", r"\|").replace("\n", "<br>")
70
+
71
+
72
+ def normalize_for_repeat(value: str) -> str:
73
+ return re.sub(r"\s+", " ", value).strip().lower()
74
+
75
+
76
+ def local_name(tag: str) -> str:
77
+ return tag.rsplit("}", 1)[-1] if "}" in tag else tag
78
+
79
+
80
+ def emu_box(node: ET.Element) -> tuple[int, int, int, int]:
81
+ xfrm = node.find("./p:spPr/a:xfrm", NS)
82
+ if xfrm is None:
83
+ return 0, 0, 0, 0
84
+ off = xfrm.find("./a:off", NS)
85
+ ext = xfrm.find("./a:ext", NS)
86
+ try:
87
+ x = int((off.attrib.get("x", "0") if off is not None else "0"))
88
+ y = int((off.attrib.get("y", "0") if off is not None else "0"))
89
+ cx = int((ext.attrib.get("cx", "0") if ext is not None else "0"))
90
+ cy = int((ext.attrib.get("cy", "0") if ext is not None else "0"))
91
+ return x, y, cx, cy
92
+ except ValueError:
93
+ return 0, 0, 0, 0
94
+
95
+
96
+ class PptxMarkdownExporter:
97
+ def __init__(self, input_path: Path, output_dir: Path, image_mode: str = "assets", emit_frontmatter: bool = True):
98
+ self.input_path = input_path
99
+ self.output_dir = output_dir
100
+ self.output_md = output_dir / f"{input_path.stem}.md"
101
+ self.report_path = output_dir / "export-report.json"
102
+ self.assets_dir = output_dir / "assets"
103
+ self.asset_ref_prefix = self.assets_dir.name
104
+ self.image_mode = image_mode
105
+ self.emit_frontmatter = emit_frontmatter
106
+ self.media_map: dict[str, str] = {}
107
+ self.media_bytes: dict[str, bytes] = {}
108
+ self.variant_map: dict[str, str] = {}
109
+ self.stats = PresentationStats()
110
+
111
+ def export(self) -> dict:
112
+ self.output_dir.mkdir(parents=True, exist_ok=True)
113
+ if self.image_mode == "assets":
114
+ self.assets_dir.mkdir(parents=True, exist_ok=True)
115
+
116
+ prior_frontmatter: str | None = None
117
+ with ZipFile(self.input_path) as zf:
118
+ self._copy_media(zf)
119
+ slides = self._parse_all_slides(zf)
120
+ if self.emit_frontmatter:
121
+ prior_frontmatter = read_frontmatter_from_zip(zf)
122
+
123
+ self.stats.slides = len(slides)
124
+ self._remove_repetitive_text_blocks(slides)
125
+ self._remove_repeated_logo_images(slides)
126
+ markdown = self._render_markdown(slides)
127
+ if self.emit_frontmatter:
128
+ markdown = add_mdk_frontmatter(markdown, self.input_path, prior_frontmatter)
129
+ self.output_md.write_text(markdown, encoding="utf-8")
130
+
131
+ report = {
132
+ "input": str(self.input_path),
133
+ "output": str(self.output_md),
134
+ "stats": self.stats.__dict__,
135
+ }
136
+ self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
137
+ return report
138
+
139
+ def _parse_all_slides(self, zf: ZipFile) -> list[SlideContent]:
140
+ slide_paths = self._ordered_slide_paths(zf)
141
+ slides: list[SlideContent] = []
142
+ for i, slide_path in enumerate(slide_paths, 1):
143
+ root = ET.fromstring(zf.read(slide_path))
144
+ rels = self._relationships(zf, slide_path)
145
+ content = SlideContent(index=i, title="")
146
+ self._walk_shape_tree(root.find("./p:cSld/p:spTree", NS), content, rels, slide_path)
147
+ self._sort_blocks(content)
148
+ if not content.title:
149
+ image_only = bool(content.blocks) and all(b.kind == "image" for b in content.blocks)
150
+ content.title = "Image" if image_only else f"Slide {i}"
151
+ slides.append(content)
152
+ return slides
153
+
154
+ def _walk_shape_tree(
155
+ self,
156
+ node: Optional[ET.Element],
157
+ content: SlideContent,
158
+ rels: dict[str, str],
159
+ slide_path: str,
160
+ ) -> None:
161
+ if node is None:
162
+ return
163
+ for child in list(node):
164
+ tag = local_name(child.tag)
165
+ if tag == "sp":
166
+ block = self._parse_text_shape(child)
167
+ if block is not None:
168
+ if block.is_title and not content.title:
169
+ content.title = clean_text(block.markdown.replace("- ", " "))
170
+ else:
171
+ self.stats.text_blocks += 1
172
+ content.blocks.append(block)
173
+ elif tag == "pic":
174
+ block = self._parse_picture_shape(child, rels, slide_path)
175
+ if block is not None:
176
+ self.stats.images += 1
177
+ content.blocks.append(block)
178
+ elif tag == "graphicFrame":
179
+ block = self._parse_table_shape(child)
180
+ if block is not None:
181
+ self.stats.tables += 1
182
+ content.blocks.append(block)
183
+ elif tag == "grpSp":
184
+ self._walk_shape_tree(child, content, rels, slide_path)
185
+
186
+ def _parse_text_shape(self, shape: ET.Element) -> Optional[ShapeBlock]:
187
+ ph = shape.find("./p:nvSpPr/p:nvPr/p:ph", NS)
188
+ ph_type = (ph.attrib.get("type", "") if ph is not None else "").lower()
189
+ if ph_type in {"ftr", "dt", "sldnum"}:
190
+ return None
191
+
192
+ tx_body = shape.find("./p:txBody", NS)
193
+ if tx_body is None:
194
+ return None
195
+
196
+ lines: list[str] = []
197
+ for p in tx_body.findall("./a:p", NS):
198
+ txt = self._paragraph_text(p)
199
+ if not txt:
200
+ continue
201
+ lvl = self._paragraph_level(p)
202
+ if lvl is None:
203
+ lines.append(txt)
204
+ else:
205
+ lines.append(f"{' ' * max(lvl, 0)}- {txt}")
206
+ if not lines:
207
+ return None
208
+ x, y, cx, cy = emu_box(shape)
209
+ md = "\n".join(lines)
210
+ return ShapeBlock(
211
+ kind="text",
212
+ x=x,
213
+ y=y,
214
+ cx=cx,
215
+ cy=cy,
216
+ markdown=md,
217
+ text_norm=normalize_for_repeat(md),
218
+ is_title=ph_type in {"title", "ctrtitle"},
219
+ )
220
+
221
+ def _parse_table_shape(self, frame: ET.Element) -> Optional[ShapeBlock]:
222
+ tbl = frame.find(".//a:tbl", NS)
223
+ if tbl is None:
224
+ return None
225
+ rows: list[list[str]] = []
226
+ for tr in tbl.findall("./a:tr", NS):
227
+ row: list[str] = []
228
+ for tc in tr.findall("./a:tc", NS):
229
+ cell_lines = []
230
+ for p in tc.findall(".//a:p", NS):
231
+ txt = self._paragraph_text(p)
232
+ if txt:
233
+ cell_lines.append(txt)
234
+ row.append(md_escape_cell("\n".join(cell_lines)))
235
+ rows.append(row)
236
+ if not rows:
237
+ return None
238
+ width = max(len(r) for r in rows)
239
+ padded = [r + [""] * (width - len(r)) for r in rows]
240
+ md_lines = []
241
+ md_lines.append("| " + " | ".join(padded[0]) + " |")
242
+ md_lines.append("| " + " | ".join("---" for _ in range(width)) + " |")
243
+ for row in padded[1:]:
244
+ md_lines.append("| " + " | ".join(row) + " |")
245
+ x, y, cx, cy = emu_box(frame)
246
+ md = "\n".join(md_lines)
247
+ return ShapeBlock(kind="table", x=x, y=y, cx=cx, cy=cy, markdown=md, text_norm=normalize_for_repeat(md))
248
+
249
+ def _parse_picture_shape(self, pic: ET.Element, rels: dict[str, str], slide_path: str) -> Optional[ShapeBlock]:
250
+ blip = pic.find(".//a:blip", NS)
251
+ if blip is None:
252
+ return None
253
+ rid = blip.attrib.get(f"{{{NS['r']}}}embed") or blip.attrib.get(f"{{{NS['r']}}}link")
254
+ if not rid:
255
+ return None
256
+ target = rels.get(rid, "")
257
+ if not target:
258
+ return None
259
+ part = self._resolve_part_path(slide_path, target)
260
+ ref = self.media_map.get(part, "")
261
+ if not ref:
262
+ return None
263
+
264
+ src_rect = self._source_crop(pic)
265
+ image_bytes = self.media_bytes.get(part, b"")
266
+ if src_rect is not None:
267
+ cropped = self._render_cropped_variant(part, src_rect)
268
+ if cropped:
269
+ ref = cropped
270
+ image_bytes = self._variant_bytes(part, src_rect) or image_bytes
271
+
272
+ x, y, cx, cy = emu_box(pic)
273
+ self._track_dpi(cx, cy, image_bytes, Path(part).suffix.lower())
274
+ sig = hashlib.sha256(image_bytes).hexdigest()[:16]
275
+ return ShapeBlock(
276
+ kind="image",
277
+ x=x,
278
+ y=y,
279
+ cx=cx,
280
+ cy=cy,
281
+ markdown=f"![image]({ref})",
282
+ image_sig=sig,
283
+ )
284
+
285
+ def _track_dpi(self, cx: int, cy: int, data: bytes, suffix: str) -> None:
286
+ dims = self._image_dimensions(data, suffix)
287
+ if not dims or cx <= 0 or cy <= 0:
288
+ return
289
+ w, h = dims
290
+ dpi_x = w / (cx / 914400)
291
+ dpi_y = h / (cy / 914400)
292
+ if min(dpi_x, dpi_y) < 150:
293
+ self.stats.low_dpi_images += 1
294
+
295
+ def _sort_blocks(self, slide: SlideContent) -> None:
296
+ row_height = 0.35 * 914400
297
+ slide.blocks.sort(key=lambda b: (round(b.y / row_height), b.y, b.x))
298
+
299
+ def _remove_repetitive_text_blocks(self, slides: list[SlideContent]) -> None:
300
+ counts: dict[str, int] = {}
301
+ for slide in slides:
302
+ seen = set()
303
+ for b in slide.blocks:
304
+ if b.kind != "text" or not b.text_norm:
305
+ continue
306
+ if b.text_norm in seen:
307
+ continue
308
+ seen.add(b.text_norm)
309
+ counts[b.text_norm] = counts.get(b.text_norm, 0) + 1
310
+ threshold = max(4, math.ceil(len(slides) * 0.6)) if slides else 99999
311
+ repetitive = {k for k, v in counts.items() if v >= threshold}
312
+ for slide in slides:
313
+ kept: list[ShapeBlock] = []
314
+ for b in slide.blocks:
315
+ if b.kind == "text" and b.text_norm and b.text_norm in repetitive:
316
+ self.stats.removed_repetitive_blocks += 1
317
+ continue
318
+ kept.append(b)
319
+ slide.blocks = kept
320
+
321
+ def _remove_repeated_logo_images(self, slides: list[SlideContent]) -> None:
322
+ counts: dict[str, int] = {}
323
+ geom_counts: dict[str, int] = {}
324
+ for slide in slides:
325
+ for b in slide.blocks:
326
+ if b.kind == "image" and b.image_sig:
327
+ counts[b.image_sig] = counts.get(b.image_sig, 0) + 1
328
+ if self._looks_like_corner_logo(b):
329
+ g = self._logo_geom_bucket(b)
330
+ geom_counts[g] = geom_counts.get(g, 0) + 1
331
+ threshold = max(3, math.ceil(len(slides) * 0.15)) if slides else 99999
332
+ repetitive = {k for k, v in counts.items() if v >= threshold}
333
+ repetitive_geom = {k for k, v in geom_counts.items() if v >= threshold}
334
+ for slide in slides:
335
+ kept: list[ShapeBlock] = []
336
+ for b in slide.blocks:
337
+ if b.kind == "image" and self._looks_like_corner_logo(b):
338
+ geom_match = self._logo_geom_bucket(b) in repetitive_geom
339
+ hash_match = b.image_sig in repetitive
340
+ if geom_match or hash_match:
341
+ self.stats.removed_logo_images += 1
342
+ continue
343
+ kept.append(b)
344
+ slide.blocks = kept
345
+
346
+ @staticmethod
347
+ def _looks_like_corner_logo(b: ShapeBlock) -> bool:
348
+ # Small and near top/bottom edge is a typical logo/footer marker.
349
+ w_in = b.cx / 914400 if b.cx else 0
350
+ h_in = b.cy / 914400 if b.cy else 0
351
+ y_in = b.y / 914400 if b.y else 0
352
+ return w_in <= 2.2 and h_in <= 1.2 and (y_in <= 1.4 or y_in >= 5.5)
353
+
354
+ @staticmethod
355
+ def _logo_geom_bucket(b: ShapeBlock) -> str:
356
+ return f"{round(b.x/300000)}:{round(b.y/300000)}:{round(b.cx/300000)}:{round(b.cy/300000)}"
357
+
358
+ def _render_markdown(self, slides: list[SlideContent]) -> str:
359
+ out: list[str] = []
360
+ for slide in slides:
361
+ out.append(f"## Slide {slide.index}: {slide.title}")
362
+ if slide.blocks:
363
+ out.append("")
364
+ for b in slide.blocks:
365
+ out.append(b.markdown)
366
+ out.append("")
367
+ else:
368
+ out.append("")
369
+ if out and out[-1] != "":
370
+ out.append("")
371
+ return "\n".join(out).strip() + "\n"
372
+
373
+ def _copy_media(self, zf: ZipFile) -> None:
374
+ for name in zf.namelist():
375
+ if not name.startswith("ppt/media/"):
376
+ continue
377
+ data = zf.read(name)
378
+ self.media_bytes[name] = data
379
+ mime = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
380
+ if self.image_mode == "base64":
381
+ self.media_map[name] = f"data:{mime};base64,{base64.b64encode(data).decode('ascii')}"
382
+ elif self.image_mode == "assets":
383
+ target = self.assets_dir / Path(name).name
384
+ target.write_bytes(data)
385
+ self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
386
+ else:
387
+ self.media_map[name] = ""
388
+
389
def _source_crop(self, pic: ET.Element) -> tuple[int, int, int, int] | None:
    """Read the picture's ``srcRect`` as ``(l, t, r, b)`` integers.

    Missing attributes default to 0. Returns ``None`` when there is no
    ``srcRect`` element or when any attribute is not an integer.
    """
    src = pic.find("./p:blipFill/a:srcRect", NS)
    if src is None:
        return None
    try:
        left, top, right, bottom = (int(src.attrib.get(side, "0")) for side in ("l", "t", "r", "b"))
    except ValueError:
        return None
    return left, top, right, bottom
402
+
403
+ def _variant_bytes(self, part: str, rect: tuple[int, int, int, int]) -> bytes | None:
404
+ key = f"{part}:{rect[0]}:{rect[1]}:{rect[2]}:{rect[3]}:bytes"
405
+ ref = self.variant_map.get(key, "")
406
+ if not ref:
407
+ return None
408
+ return base64.b64decode(ref)
409
+
410
+ def _render_cropped_variant(self, part: str, src_rect: tuple[int, int, int, int]) -> str:
411
+ cache_key_ref = f"{part}:{src_rect[0]}:{src_rect[1]}:{src_rect[2]}:{src_rect[3]}:{self.image_mode}"
412
+ cache_key_bytes = f"{part}:{src_rect[0]}:{src_rect[1]}:{src_rect[2]}:{src_rect[3]}:bytes"
413
+ if cache_key_ref in self.variant_map:
414
+ return self.variant_map[cache_key_ref]
415
+ data = self.media_bytes.get(part, b"")
416
+ if not data:
417
+ return ""
418
+ ext = Path(part).suffix.lower()
419
+ if ext not in {".png", ".jpg", ".jpeg"}:
420
+ return ""
421
+ with Image.open(io.BytesIO(data)) as img:
422
+ width, height = img.size
423
+ l, t, r, b = src_rect
424
+ left = max(0, min(width, round(width * (l / 100000.0))))
425
+ top = max(0, min(height, round(height * (t / 100000.0))))
426
+ right = max(0, min(width, round(width * ((100000 - r) / 100000.0))))
427
+ bottom = max(0, min(height, round(height * ((100000 - b) / 100000.0))))
428
+ if right <= left or bottom <= top:
429
+ return ""
430
+ cropped = img.crop((left, top, right, bottom))
431
+ out = io.BytesIO()
432
+ fmt = "PNG" if ext == ".png" else "JPEG"
433
+ kwargs = {"quality": 95} if fmt == "JPEG" else {}
434
+ cropped.save(out, format=fmt, **kwargs)
435
+ out_bytes = out.getvalue()
436
+ self.variant_map[cache_key_bytes] = base64.b64encode(out_bytes).decode("ascii")
437
+ digest = hashlib.sha256(out_bytes).hexdigest()[:12]
438
+ out_ext = ".jpg" if ext == ".jpeg" else ext
439
+ name = f"{Path(part).stem}.crop.{digest}{out_ext}"
440
+ if self.image_mode == "base64":
441
+ mime = mimetypes.guess_type(name)[0] or "application/octet-stream"
442
+ ref = f"data:{mime};base64,{base64.b64encode(out_bytes).decode('ascii')}"
443
+ else:
444
+ target = self.assets_dir / name
445
+ target.write_bytes(out_bytes)
446
+ ref = f"{self.asset_ref_prefix}/{target.name}"
447
+ self.variant_map[cache_key_ref] = ref
448
+ return ref
449
+
450
+ @staticmethod
451
+ def _image_dimensions(data: bytes, suffix: str) -> tuple[int, int] | None:
452
+ if suffix == ".png" and data.startswith(b"\x89PNG\r\n\x1a\n") and len(data) >= 24:
453
+ return int.from_bytes(data[16:20], "big"), int.from_bytes(data[20:24], "big")
454
+ if suffix in {".jpg", ".jpeg"} and data.startswith(b"\xff\xd8"):
455
+ i = 2
456
+ while i + 9 < len(data):
457
+ if data[i] != 0xFF:
458
+ i += 1
459
+ continue
460
+ marker = data[i + 1]
461
+ if marker in {0xC0, 0xC1, 0xC2, 0xC3, 0xC9, 0xCA, 0xCB}:
462
+ if i + 9 <= len(data):
463
+ return int.from_bytes(data[i + 7:i + 9], "big"), int.from_bytes(data[i + 5:i + 7], "big")
464
+ if i + 4 > len(data):
465
+ break
466
+ seg_len = int.from_bytes(data[i + 2:i + 4], "big")
467
+ i += 2 + seg_len
468
+ return None
469
+
470
def _paragraph_text(self, p: ET.Element) -> str:
    """Concatenate the text of every ``t`` descendant of ``p`` and pass it through ``clean_text``."""
    pieces = [node.text for node in p.iter() if local_name(node.tag) == "t" and node.text]
    return clean_text("".join(pieces))
476
+
477
def _paragraph_level(self, p: ET.Element) -> Optional[int]:
    """Return the paragraph's ``lvl`` attribute as an int.

    Returns ``None`` when the paragraph has no ``a:pPr`` child, the
    attribute is absent, or its value is not an integer.
    """
    ppr = p.find("./a:pPr", NS)
    raw_level = None if ppr is None else ppr.attrib.get("lvl")
    if raw_level is None:
        return None
    try:
        return int(raw_level)
    except ValueError:
        return None
488
+
489
def _ordered_slide_paths(self, zf: ZipFile) -> list[str]:
    """Return slide part paths in the order listed by ``ppt/presentation.xml``.

    Each ``p:sldId`` entry is resolved through the presentation's
    relationships. Entries with no relationship target, or whose resolved
    part is not in the archive, are skipped.
    """
    presentation = ET.fromstring(zf.read("ppt/presentation.xml"))
    rels = self._relationships(zf, "ppt/presentation.xml")
    # Build the member set once; namelist() returns a list, so testing it per
    # slide would be a linear scan on every iteration.
    members = set(zf.namelist())
    rid_attr = f"{{{NS['r']}}}id"

    paths: list[str] = []
    for sld_id in presentation.findall("./p:sldIdLst/p:sldId", NS):
        target = rels.get(sld_id.attrib.get(rid_attr) or "")
        if not target:
            continue
        full = self._resolve_part_path("ppt/presentation.xml", target)
        if full in members:
            paths.append(full)
    return paths
502
+
503
+ def _relationships(self, zf: ZipFile, part: str) -> dict[str, str]:
504
+ rels_path = str(Path(part).parent / "_rels" / f"{Path(part).name}.rels").replace("\\", "/")
505
+ if rels_path not in zf.namelist():
506
+ return {}
507
+ root = ET.fromstring(zf.read(rels_path))
508
+ return {rel.attrib["Id"]: rel.attrib.get("Target", "") for rel in root if "Id" in rel.attrib}
509
+
510
+ @staticmethod
511
+ def _resolve_part_path(part: str, target: str) -> str:
512
+ joined = (Path(part).parent / target).as_posix()
513
+ norm: list[str] = []
514
+ for piece in joined.split("/"):
515
+ if piece in {"", "."}:
516
+ continue
517
+ if piece == "..":
518
+ if norm:
519
+ norm.pop()
520
+ continue
521
+ norm.append(piece)
522
+ return "/".join(norm)
523
+
524
+
525
def export_presentation(input_path: Path, output_root: Path, out_same_dir: bool, image_mode: str = "assets", emit_frontmatter: bool = True) -> dict:
    """Export a presentation with ``PptxMarkdownExporter`` and return its result.

    A ``.ppt`` input is first converted with ``convert_legacy_ppt``. Output
    goes next to the (possibly converted) input when ``out_same_dir`` is
    true, otherwise into ``output_root/<input stem>``.
    """
    source = convert_legacy_ppt(input_path) if input_path.suffix.lower() == ".ppt" else input_path
    if out_same_dir:
        output_dir = source.parent
    else:
        output_dir = output_root / source.stem
    exporter = PptxMarkdownExporter(source, output_dir, image_mode=image_mode, emit_frontmatter=emit_frontmatter)
    return exporter.export()
530
+
531
+
532
def convert_legacy_ppt(input_path: Path) -> Path:
    """Convert a legacy ``.ppt`` file to ``.pptx`` using LibreOffice.

    Runs ``soffice --headless --convert-to pptx`` into a temporary directory
    and copies the result next to the input as ``<stem>.converted.pptx``.

    Args:
        input_path: Path of the ``.ppt`` file.

    Returns:
        Path of the persistent converted file.

    Raises:
        RuntimeError: If ``soffice`` cannot be found, exits with a non-zero
            status, or does not produce the expected output file.
    """
    with tempfile.TemporaryDirectory(prefix="build-corpus-ppt-") as tmp:
        tmp_dir = Path(tmp)
        cmd = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", str(tmp_dir), str(input_path)]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
        except FileNotFoundError as err:
            # A missing executable would otherwise surface as a bare
            # FileNotFoundError with no hint that LibreOffice is required.
            raise RuntimeError(
                f"Legacy .ppt conversion requires LibreOffice; failed for {input_path}: soffice executable not found"
            ) from err
        converted = tmp_dir / f"{input_path.stem}.pptx"
        if result.returncode != 0 or not converted.exists():
            message = (result.stderr or result.stdout or "conversion failed").strip()
            raise RuntimeError(f"Legacy .ppt conversion requires LibreOffice; failed for {input_path}: {message}")
        # Copy out before the temporary directory is removed.
        persistent = input_path.with_suffix(".converted.pptx")
        persistent.write_bytes(converted.read_bytes())
        return persistent