regen.mde 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/LICENSE +16 -0
  2. package/README.md +295 -0
  3. package/bin/build-corpus-editor.js +81 -0
  4. package/bin/build-corpus.js +41 -0
  5. package/bin/postinstall.js +187 -0
  6. package/bin/regen-mdeditor-install.js +27 -0
  7. package/bin/regen-mdeditor-uninstall.js +19 -0
  8. package/bin/validate-katex.js +93 -0
  9. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
  10. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
  11. package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
  12. package/desktop/BuildCorpusEditor/Program.cs +81 -0
  13. package/desktop/BuildCorpusEditor/app.manifest +16 -0
  14. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  15. package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
  16. package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
  17. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  18. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  19. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  20. package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
  21. package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
  22. package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
  23. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
  24. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
  25. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
  26. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
  27. package/dist/windows-editor/WebView2Loader.dll +0 -0
  28. package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
  29. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
  30. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
  31. package/dist/windows-editor/wwwroot/index.html +22 -0
  32. package/editor-web/index.html +21 -0
  33. package/editor-web/src/main.jsx +399 -0
  34. package/editor-web/src/styles.css +602 -0
  35. package/editor-web/vite.config.js +13 -0
  36. package/examples/build-corpus.config.example.json +21 -0
  37. package/installer/install-regen-mde.ps1 +175 -0
  38. package/installer/regen-mde.nsi +81 -0
  39. package/package.json +86 -0
  40. package/pyproject.toml +33 -0
  41. package/requirements.txt +4 -0
  42. package/scripts/build-windows-editor.ps1 +47 -0
  43. package/scripts/package-windows-editor.ps1 +90 -0
  44. package/scripts/run-corpus.ps1 +28 -0
  45. package/scripts/run-editor-implementation-plane.ps1 +203 -0
  46. package/scripts/run-required-tests.ps1 +98 -0
  47. package/scripts/run-smoke.ps1 +28 -0
  48. package/src/build_corpus/__init__.py +3 -0
  49. package/src/build_corpus/docx_exporter.py +798 -0
  50. package/src/build_corpus/exporter.py +1195 -0
  51. package/src/build_corpus/ppt_exporter.py +532 -0
  52. package/src/build_corpus/templates/__init__.py +1 -0
  53. package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
  54. package/src/build_corpus/validate_assets.py +46 -0
  55. package/tools/audit_corpus.py +203 -0
  56. package/tools/collect_microsoft_word_templates.py +228 -0
  57. package/tools/collect_online_docx_corpus.py +272 -0
  58. package/tools/collect_online_pptx_corpus.py +252 -0
  59. package/tools/compare_pptx_inputs_outputs.py +87 -0
  60. package/tools/roundtrip_docx_corpus.py +171 -0
@@ -0,0 +1,532 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import hashlib
5
+ import io
6
+ import json
7
+ import math
8
+ import mimetypes
9
+ import re
10
+ import subprocess
11
+ import tempfile
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+ from typing import Optional
15
+ from zipfile import ZipFile
16
+ from xml.etree import ElementTree as ET
17
+
18
+ from PIL import Image
19
+
20
# Namespace prefixes used in every ElementTree path query in this module:
# "a" = DrawingML, "p" = PresentationML, "r" = package relationships.
NS = dict(
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
    p="http://schemas.openxmlformats.org/presentationml/2006/main",
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
)
25
+
26
+
27
@dataclass
class ShapeBlock:
    """One piece of slide content plus the box it occupies on the slide."""

    # Block category; the parsers in this module emit "text", "table" or "image".
    kind: str
    # Offset (x, y) and extent (cx, cy) as read from the shape's transform.
    x: int
    y: int
    cx: int
    cy: int
    # Markdown rendering of the block.
    markdown: str
    # Whitespace-collapsed, lower-cased text used to spot repeated blocks.
    text_norm: str = ""
    # Short content hash of the image bytes (images only).
    image_sig: str = ""
    # True when the shape is a title / centred-title placeholder.
    is_title: bool = False
38
+
39
+
40
@dataclass
class SlideContent:
    """Parsed content of a single slide."""

    # 1-based position of the slide in presentation order.
    index: int
    # Slide title; empty until a title placeholder or fallback fills it in.
    title: str
    # Content blocks collected from the slide's shape tree.
    blocks: list[ShapeBlock] = field(default_factory=list)
45
+
46
+
47
@dataclass
class PresentationStats:
    """Counters gathered during one export; serialised into the report."""

    # Number of slides parsed.
    slides: int = 0
    # Text blocks dropped because they repeat across slides.
    removed_repetitive_blocks: int = 0
    # Blocks found while walking the shape trees, by kind.
    tables: int = 0
    text_blocks: int = 0
    images: int = 0
    # Images dropped as repeated corner logos.
    removed_logo_images: int = 0
    # Images whose effective resolution on the slide is below the DPI cutoff.
    low_dpi_images: int = 0
    # Free-form warning messages.
    warnings: list[str] = field(default_factory=list)
57
+
58
+
59
def clean_text(value: str) -> str:
    """Return *value* with whitespace runs collapsed to single spaces.

    Non-breaking spaces are treated as ordinary spaces, leading and trailing
    whitespace is removed, and a falsy *value* yields an empty string.
    """
    text = value or ""
    text = text.replace("\u00a0", " ")
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
61
+
62
+
63
def md_escape_cell(value: str) -> str:
    """Render *value* as the content of a single Markdown table cell.

    Each input line is whitespace-normalised on its own, empty lines are
    dropped, and the remaining lines are joined with ``<br>`` so that a
    multi-paragraph cell stays on one table row. Pipe characters are escaped.

    The lines are cleaned individually because ``clean_text`` collapses
    newlines into spaces; cleaning the whole value first would erase the line
    breaks before they could be turned into ``<br>``.
    """
    cleaned = (clean_text(line) for line in (value or "").split("\n"))
    joined = "<br>".join(line for line in cleaned if line)
    return joined.replace("|", r"\|")
65
+
66
+
67
def normalize_for_repeat(value: str) -> str:
    """Return a canonical form of *value* for repeated-text comparison.

    Whitespace runs become single spaces, the ends are trimmed and the result
    is lower-cased.
    """
    collapsed = re.sub(r"\s+", " ", value)
    return collapsed.strip().lower()
69
+
70
+
71
def local_name(tag: str) -> str:
    """Strip the ``{namespace}`` prefix from an ElementTree tag name."""
    # rpartition returns the whole string as the tail when "}" is absent.
    return tag.rpartition("}")[2]
73
+
74
+
75
def emu_box(node: ET.Element) -> tuple[int, int, int, int]:
    """Return ``(x, y, cx, cy)`` from the transform of a shape-like element.

    The transform is looked up in ``p:spPr/a:xfrm`` (shapes and pictures)
    and, failing that, in a direct ``p:xfrm`` child, which is where
    ``p:graphicFrame`` elements (tables) store their position. Without the
    second lookup every table would be reported at the origin.

    Missing elements or attributes count as 0. A non-integer attribute value
    makes the whole box fall back to ``(0, 0, 0, 0)``.
    """
    xfrm = node.find("./p:spPr/a:xfrm", NS)
    if xfrm is None:
        xfrm = node.find("./p:xfrm", NS)
    if xfrm is None:
        return 0, 0, 0, 0

    off = xfrm.find("./a:off", NS)
    ext = xfrm.find("./a:ext", NS)
    off_attrs = off.attrib if off is not None else {}
    ext_attrs = ext.attrib if ext is not None else {}
    try:
        return (
            int(off_attrs.get("x", "0")),
            int(off_attrs.get("y", "0")),
            int(ext_attrs.get("cx", "0")),
            int(ext_attrs.get("cy", "0")),
        )
    except ValueError:
        return 0, 0, 0, 0
89
+
90
+
91
+ class PptxMarkdownExporter:
92
def __init__(self, input_path: Path, output_dir: Path, image_mode: str = "assets"):
    """Set up paths and empty caches for exporting *input_path*.

    Args:
        input_path: Presentation file to read.
        output_dir: Directory receiving the Markdown file, the JSON report
            and the ``assets`` sub-directory.
        image_mode: How images are referenced; ``"assets"`` and ``"base64"``
            are the modes handled elsewhere in this class.
    """
    # Inputs as given by the caller.
    self.input_path = input_path
    self.output_dir = output_dir
    self.image_mode = image_mode

    # Output locations derived from the output directory.
    self.output_md = output_dir / f"{input_path.stem}.md"
    self.report_path = output_dir / "export-report.json"
    self.assets_dir = output_dir / "assets"
    self.asset_ref_prefix = self.assets_dir.name

    # Per-export caches keyed by package part name / crop variant key.
    self.media_map: dict[str, str] = {}
    self.media_bytes: dict[str, bytes] = {}
    self.variant_map: dict[str, str] = {}
    self.stats = PresentationStats()
104
+
105
def export(self) -> dict:
    """Run the export and return the report dictionary.

    Creates the output directory (and the assets directory in ``"assets"``
    mode), extracts media, parses all slides, prunes repeated text and logo
    images, then writes the Markdown file and ``export-report.json``.
    """
    self.output_dir.mkdir(parents=True, exist_ok=True)
    if self.image_mode == "assets":
        self.assets_dir.mkdir(parents=True, exist_ok=True)

    with ZipFile(self.input_path) as archive:
        # Media must be loaded first: picture parsing looks refs up in it.
        self._copy_media(archive)
        slides = self._parse_all_slides(archive)

    self.stats.slides = len(slides)
    for prune in (self._remove_repetitive_text_blocks, self._remove_repeated_logo_images):
        prune(slides)

    self.output_md.write_text(self._render_markdown(slides), encoding="utf-8")

    report = dict(
        input=str(self.input_path),
        output=str(self.output_md),
        stats=self.stats.__dict__,
    )
    serialized = json.dumps(report, indent=2, ensure_ascii=False)
    self.report_path.write_text(serialized, encoding="utf-8")
    return report
127
+
128
def _parse_all_slides(self, zf: ZipFile) -> list[SlideContent]:
    """Parse every slide of the package, in presentation order.

    Slides without a title placeholder are titled ``"Image"`` when they hold
    only images, otherwise ``"Slide <n>"``.
    """
    parsed: list[SlideContent] = []
    for number, part_name in enumerate(self._ordered_slide_paths(zf), start=1):
        slide_xml = ET.fromstring(zf.read(part_name))
        relationships = self._relationships(zf, part_name)
        slide = SlideContent(index=number, title="")
        shape_tree = slide_xml.find("./p:cSld/p:spTree", NS)
        self._walk_shape_tree(shape_tree, slide, relationships, part_name)
        self._sort_blocks(slide)
        if not slide.title:
            if slide.blocks and all(block.kind == "image" for block in slide.blocks):
                slide.title = "Image"
            else:
                slide.title = f"Slide {number}"
        parsed.append(slide)
    return parsed
142
+
143
def _walk_shape_tree(
    self,
    node: Optional[ET.Element],
    content: SlideContent,
    rels: dict[str, str],
    slide_path: str,
) -> None:
    """Collect text, picture and table blocks from a shape tree.

    Group shapes (``grpSp``) are descended into recursively. The first title
    placeholder becomes ``content.title`` instead of a block; every other
    parsed shape is appended to ``content.blocks`` and counted in the stats.

    Args:
        node: Shape-tree or group element; ``None`` is a no-op.
        content: Slide being filled in.
        rels: Relationship id -> target map of the slide part.
        slide_path: Package path of the slide part, used to resolve targets.
    """
    if node is None:
        return
    for child in node:
        tag = local_name(child.tag)
        if tag == "sp":
            block = self._parse_text_shape(child)
            if block is None:
                continue
            if block.is_title and not content.title:
                # Drop only the bullet markers that start a line; a blanket
                # replace of "- " would also eat dashes inside the title
                # text itself (e.g. "Q1 - Results").
                unbulleted = re.sub(r"(?m)^ *- ", "", block.markdown)
                content.title = clean_text(unbulleted)
            else:
                self.stats.text_blocks += 1
                content.blocks.append(block)
        elif tag == "pic":
            block = self._parse_picture_shape(child, rels, slide_path)
            if block is not None:
                self.stats.images += 1
                content.blocks.append(block)
        elif tag == "graphicFrame":
            block = self._parse_table_shape(child)
            if block is not None:
                self.stats.tables += 1
                content.blocks.append(block)
        elif tag == "grpSp":
            self._walk_shape_tree(child, content, rels, slide_path)
174
+
175
def _parse_text_shape(self, shape: ET.Element) -> Optional[ShapeBlock]:
    """Turn a ``p:sp`` shape into a text block, or ``None`` if it has no text.

    Footer, date and slide-number placeholders are skipped. Paragraphs that
    carry an explicit ``lvl`` become list items indented by that level;
    paragraphs without one are emitted as plain lines.
    """
    placeholder = shape.find("./p:nvSpPr/p:nvPr/p:ph", NS)
    ph_type = "" if placeholder is None else placeholder.attrib.get("type", "").lower()
    if ph_type in {"ftr", "dt", "sldnum"}:
        return None

    body = shape.find("./p:txBody", NS)
    if body is None:
        return None

    rendered: list[str] = []
    for para in body.findall("./a:p", NS):
        text = self._paragraph_text(para)
        if not text:
            continue
        level = self._paragraph_level(para)
        if level is not None:
            text = f"{' ' * max(level, 0)}- {text}"
        rendered.append(text)
    if not rendered:
        return None

    markdown = "\n".join(rendered)
    x, y, cx, cy = emu_box(shape)
    return ShapeBlock(
        kind="text",
        x=x,
        y=y,
        cx=cx,
        cy=cy,
        markdown=markdown,
        text_norm=normalize_for_repeat(markdown),
        is_title=ph_type in {"title", "ctrtitle"},
    )
209
+
210
def _parse_table_shape(self, frame: ET.Element) -> Optional[ShapeBlock]:
    """Turn a graphic frame holding an ``a:tbl`` into a Markdown table block.

    Returns ``None`` when the frame has no table or the table has no rows.
    The first row is used as the header; short rows are padded with empty
    cells to the widest row.
    """
    table = frame.find(".//a:tbl", NS)
    if table is None:
        return None

    def cell_markdown(cell: ET.Element) -> str:
        # Non-empty paragraph texts of one cell, newline-joined then escaped.
        texts = (self._paragraph_text(p) for p in cell.findall(".//a:p", NS))
        return md_escape_cell("\n".join(t for t in texts if t))

    def as_row(cells: list[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    grid = [
        [cell_markdown(tc) for tc in tr.findall("./a:tc", NS)]
        for tr in table.findall("./a:tr", NS)
    ]
    if not grid:
        return None

    width = max(map(len, grid))
    for cells in grid:
        cells.extend([""] * (width - len(cells)))

    header, *body = grid
    md_lines = [as_row(header), as_row(["---"] * width)]
    md_lines.extend(as_row(cells) for cells in body)

    markdown = "\n".join(md_lines)
    x, y, cx, cy = emu_box(frame)
    return ShapeBlock(
        kind="table",
        x=x,
        y=y,
        cx=cx,
        cy=cy,
        markdown=markdown,
        text_norm=normalize_for_repeat(markdown),
    )
237
+
238
def _parse_picture_shape(self, pic: ET.Element, rels: dict[str, str], slide_path: str) -> Optional[ShapeBlock]:
    """Turn a ``p:pic`` element into an image block.

    Returns ``None`` when the picture has no blip, no relationship id, no
    relationship target, or no entry in ``self.media_map``. When the picture
    has a source crop that can be rendered, the cropped variant is referenced
    (and hashed) instead of the original media.
    """
    blip = pic.find(".//a:blip", NS)
    if blip is None:
        return None

    rel_ns = NS["r"]
    rid = blip.attrib.get(f"{{{rel_ns}}}embed") or blip.attrib.get(f"{{{rel_ns}}}link")
    target = rels.get(rid, "") if rid else ""
    if not target:
        return None

    part = self._resolve_part_path(slide_path, target)
    ref = self.media_map.get(part, "")
    if not ref:
        return None

    payload = self.media_bytes.get(part, b"")
    crop = self._source_crop(pic)
    if crop is not None:
        cropped_ref = self._render_cropped_variant(part, crop)
        if cropped_ref:
            ref = cropped_ref
            payload = self._variant_bytes(part, crop) or payload

    x, y, cx, cy = emu_box(pic)
    self._track_dpi(cx, cy, payload, Path(part).suffix.lower())
    signature = hashlib.sha256(payload).hexdigest()[:16]
    return ShapeBlock(
        kind="image",
        x=x,
        y=y,
        cx=cx,
        cy=cy,
        markdown=f"![image]({ref})",
        image_sig=signature,
    )
273
+
274
+ def _track_dpi(self, cx: int, cy: int, data: bytes, suffix: str) -> None:
275
+ dims = self._image_dimensions(data, suffix)
276
+ if not dims or cx <= 0 or cy <= 0:
277
+ return
278
+ w, h = dims
279
+ dpi_x = w / (cx / 914400)
280
+ dpi_y = h / (cy / 914400)
281
+ if min(dpi_x, dpi_y) < 150:
282
+ self.stats.low_dpi_images += 1
283
+
284
+ def _sort_blocks(self, slide: SlideContent) -> None:
285
+ row_height = 0.35 * 914400
286
+ slide.blocks.sort(key=lambda b: (round(b.y / row_height), b.y, b.x))
287
+
288
+ def _remove_repetitive_text_blocks(self, slides: list[SlideContent]) -> None:
289
+ counts: dict[str, int] = {}
290
+ for slide in slides:
291
+ seen = set()
292
+ for b in slide.blocks:
293
+ if b.kind != "text" or not b.text_norm:
294
+ continue
295
+ if b.text_norm in seen:
296
+ continue
297
+ seen.add(b.text_norm)
298
+ counts[b.text_norm] = counts.get(b.text_norm, 0) + 1
299
+ threshold = max(4, math.ceil(len(slides) * 0.6)) if slides else 99999
300
+ repetitive = {k for k, v in counts.items() if v >= threshold}
301
+ for slide in slides:
302
+ kept: list[ShapeBlock] = []
303
+ for b in slide.blocks:
304
+ if b.kind == "text" and b.text_norm and b.text_norm in repetitive:
305
+ self.stats.removed_repetitive_blocks += 1
306
+ continue
307
+ kept.append(b)
308
+ slide.blocks = kept
309
+
310
+ def _remove_repeated_logo_images(self, slides: list[SlideContent]) -> None:
311
+ counts: dict[str, int] = {}
312
+ geom_counts: dict[str, int] = {}
313
+ for slide in slides:
314
+ for b in slide.blocks:
315
+ if b.kind == "image" and b.image_sig:
316
+ counts[b.image_sig] = counts.get(b.image_sig, 0) + 1
317
+ if self._looks_like_corner_logo(b):
318
+ g = self._logo_geom_bucket(b)
319
+ geom_counts[g] = geom_counts.get(g, 0) + 1
320
+ threshold = max(3, math.ceil(len(slides) * 0.15)) if slides else 99999
321
+ repetitive = {k for k, v in counts.items() if v >= threshold}
322
+ repetitive_geom = {k for k, v in geom_counts.items() if v >= threshold}
323
+ for slide in slides:
324
+ kept: list[ShapeBlock] = []
325
+ for b in slide.blocks:
326
+ if b.kind == "image" and self._looks_like_corner_logo(b):
327
+ geom_match = self._logo_geom_bucket(b) in repetitive_geom
328
+ hash_match = b.image_sig in repetitive
329
+ if geom_match or hash_match:
330
+ self.stats.removed_logo_images += 1
331
+ continue
332
+ kept.append(b)
333
+ slide.blocks = kept
334
+
335
+ @staticmethod
336
+ def _looks_like_corner_logo(b: ShapeBlock) -> bool:
337
+ # Small and near top/bottom edge is a typical logo/footer marker.
338
+ w_in = b.cx / 914400 if b.cx else 0
339
+ h_in = b.cy / 914400 if b.cy else 0
340
+ y_in = b.y / 914400 if b.y else 0
341
+ return w_in <= 2.2 and h_in <= 1.2 and (y_in <= 1.4 or y_in >= 5.5)
342
+
343
+ @staticmethod
344
+ def _logo_geom_bucket(b: ShapeBlock) -> str:
345
+ return f"{round(b.x/300000)}:{round(b.y/300000)}:{round(b.cx/300000)}:{round(b.cy/300000)}"
346
+
347
+ def _render_markdown(self, slides: list[SlideContent]) -> str:
348
+ out: list[str] = []
349
+ for slide in slides:
350
+ out.append(f"## Slide {slide.index}: {slide.title}")
351
+ if slide.blocks:
352
+ out.append("")
353
+ for b in slide.blocks:
354
+ out.append(b.markdown)
355
+ out.append("")
356
+ else:
357
+ out.append("")
358
+ if out and out[-1] != "":
359
+ out.append("")
360
+ return "\n".join(out).strip() + "\n"
361
+
362
+ def _copy_media(self, zf: ZipFile) -> None:
363
+ for name in zf.namelist():
364
+ if not name.startswith("ppt/media/"):
365
+ continue
366
+ data = zf.read(name)
367
+ self.media_bytes[name] = data
368
+ mime = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
369
+ if self.image_mode == "base64":
370
+ self.media_map[name] = f"data:{mime};base64,{base64.b64encode(data).decode('ascii')}"
371
+ elif self.image_mode == "assets":
372
+ target = self.assets_dir / Path(name).name
373
+ target.write_bytes(data)
374
+ self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
375
+ else:
376
+ self.media_map[name] = ""
377
+
378
def _source_crop(self, pic: ET.Element) -> tuple[int, int, int, int] | None:
    """Return the picture's ``a:srcRect`` as ``(l, t, r, b)`` integers.

    Missing attributes count as 0. Returns ``None`` when the picture has no
    ``srcRect`` or any attribute is not an integer.
    """
    rect = pic.find("./p:blipFill/a:srcRect", NS)
    if rect is None:
        return None
    try:
        left, top, right, bottom = (int(rect.attrib.get(side, "0")) for side in ("l", "t", "r", "b"))
    except ValueError:
        return None
    return left, top, right, bottom
391
+
392
+ def _variant_bytes(self, part: str, rect: tuple[int, int, int, int]) -> bytes | None:
393
+ key = f"{part}:{rect[0]}:{rect[1]}:{rect[2]}:{rect[3]}:bytes"
394
+ ref = self.variant_map.get(key, "")
395
+ if not ref:
396
+ return None
397
+ return base64.b64decode(ref)
398
+
399
def _render_cropped_variant(self, part: str, src_rect: tuple[int, int, int, int]) -> str:
    """Create (or reuse) a cropped copy of a media part and return its ref.

    Args:
        part: Package path of the source image.
        src_rect: ``(l, t, r, b)`` insets in 1/100000 of the image size.

    Returns:
        A data URI in ``"base64"`` mode, otherwise an ``assets/<file>`` path
        to the written file. An empty string means "no cropped variant":
        the part is unknown, is not PNG/JPEG, the crop box is degenerate, or
        the image could not be decoded or re-encoded. In the last case a
        message is appended to ``self.stats.warnings`` so that one bad image
        does not abort the whole export.
    """
    cache_key_ref = f"{part}:{src_rect[0]}:{src_rect[1]}:{src_rect[2]}:{src_rect[3]}:{self.image_mode}"
    cache_key_bytes = f"{part}:{src_rect[0]}:{src_rect[1]}:{src_rect[2]}:{src_rect[3]}:bytes"
    if cache_key_ref in self.variant_map:
        return self.variant_map[cache_key_ref]

    data = self.media_bytes.get(part, b"")
    if not data:
        return ""
    ext = Path(part).suffix.lower()
    if ext not in {".png", ".jpg", ".jpeg"}:
        return ""

    fmt = "PNG" if ext == ".png" else "JPEG"
    kwargs = {"quality": 95} if fmt == "JPEG" else {}
    try:
        with Image.open(io.BytesIO(data)) as img:
            width, height = img.size
            l, t, r, b = src_rect
            # Convert the fractional insets to pixel edges, clamped to the image.
            left = max(0, min(width, round(width * (l / 100000.0))))
            top = max(0, min(height, round(height * (t / 100000.0))))
            right = max(0, min(width, round(width * ((100000 - r) / 100000.0))))
            bottom = max(0, min(height, round(height * ((100000 - b) / 100000.0))))
            if right <= left or bottom <= top:
                return ""
            cropped = img.crop((left, top, right, bottom))
            out = io.BytesIO()
            cropped.save(out, format=fmt, **kwargs)
    except OSError as exc:
        # Pillow reports undecodable data and unsupported mode/format
        # combinations as OSError (UnidentifiedImageError is a subclass).
        self.stats.warnings.append(f"Could not crop {part}: {exc}")
        return ""

    out_bytes = out.getvalue()
    self.variant_map[cache_key_bytes] = base64.b64encode(out_bytes).decode("ascii")
    digest = hashlib.sha256(out_bytes).hexdigest()[:12]
    out_ext = ".jpg" if ext == ".jpeg" else ext
    name = f"{Path(part).stem}.crop.{digest}{out_ext}"
    if self.image_mode == "base64":
        mime = mimetypes.guess_type(name)[0] or "application/octet-stream"
        ref = f"data:{mime};base64,{base64.b64encode(out_bytes).decode('ascii')}"
    else:
        target = self.assets_dir / name
        target.write_bytes(out_bytes)
        ref = f"{self.asset_ref_prefix}/{target.name}"
    self.variant_map[cache_key_ref] = ref
    return ref
438
+
439
+ @staticmethod
440
+ def _image_dimensions(data: bytes, suffix: str) -> tuple[int, int] | None:
441
+ if suffix == ".png" and data.startswith(b"\x89PNG\r\n\x1a\n") and len(data) >= 24:
442
+ return int.from_bytes(data[16:20], "big"), int.from_bytes(data[20:24], "big")
443
+ if suffix in {".jpg", ".jpeg"} and data.startswith(b"\xff\xd8"):
444
+ i = 2
445
+ while i + 9 < len(data):
446
+ if data[i] != 0xFF:
447
+ i += 1
448
+ continue
449
+ marker = data[i + 1]
450
+ if marker in {0xC0, 0xC1, 0xC2, 0xC3, 0xC9, 0xCA, 0xCB}:
451
+ if i + 9 <= len(data):
452
+ return int.from_bytes(data[i + 7:i + 9], "big"), int.from_bytes(data[i + 5:i + 7], "big")
453
+ if i + 4 > len(data):
454
+ break
455
+ seg_len = int.from_bytes(data[i + 2:i + 4], "big")
456
+ i += 2 + seg_len
457
+ return None
458
+
459
def _paragraph_text(self, p: ET.Element) -> str:
    """Concatenate the text of all ``t`` elements under *p*, whitespace-cleaned."""
    runs = [node.text for node in p.iter() if local_name(node.tag) == "t" and node.text]
    return clean_text("".join(runs))
465
+
466
def _paragraph_level(self, p: ET.Element) -> Optional[int]:
    """Return the paragraph's explicit ``lvl`` attribute as an int.

    ``None`` means the paragraph has no ``a:pPr``, no ``lvl`` attribute, or a
    value that is not an integer.
    """
    props = p.find("./a:pPr", NS)
    raw = None if props is None else props.attrib.get("lvl")
    if raw is None:
        return None
    try:
        return int(raw)
    except ValueError:
        return None
477
+
478
def _ordered_slide_paths(self, zf: ZipFile) -> list[str]:
    """Return the slide part paths in the order listed by the presentation.

    The order comes from ``p:sldIdLst`` in ``ppt/presentation.xml``. Each
    entry is resolved through the presentation's relationships; entries with
    no relationship target or whose part is missing from the archive are
    skipped.
    """
    presentation = ET.fromstring(zf.read("ppt/presentation.xml"))
    rels = self._relationships(zf, "ppt/presentation.xml")
    # Build the member set once; calling namelist() inside the loop would
    # rebuild and linearly scan the list for every slide.
    members = set(zf.namelist())
    rid_attr = f"{{{NS['r']}}}id"

    paths: list[str] = []
    for sld_id in presentation.findall("./p:sldIdLst/p:sldId", NS):
        target = rels.get(sld_id.attrib.get(rid_attr) or "")
        if not target:
            continue
        full = self._resolve_part_path("ppt/presentation.xml", target)
        if full in members:
            paths.append(full)
    return paths
491
+
492
+ def _relationships(self, zf: ZipFile, part: str) -> dict[str, str]:
493
+ rels_path = str(Path(part).parent / "_rels" / f"{Path(part).name}.rels").replace("\\", "/")
494
+ if rels_path not in zf.namelist():
495
+ return {}
496
+ root = ET.fromstring(zf.read(rels_path))
497
+ return {rel.attrib["Id"]: rel.attrib.get("Target", "") for rel in root if "Id" in rel.attrib}
498
+
499
+ @staticmethod
500
+ def _resolve_part_path(part: str, target: str) -> str:
501
+ joined = (Path(part).parent / target).as_posix()
502
+ norm: list[str] = []
503
+ for piece in joined.split("/"):
504
+ if piece in {"", "."}:
505
+ continue
506
+ if piece == "..":
507
+ if norm:
508
+ norm.pop()
509
+ continue
510
+ norm.append(piece)
511
+ return "/".join(norm)
512
+
513
+
514
def export_presentation(input_path: Path, output_root: Path, out_same_dir: bool, image_mode: str = "assets") -> dict:
    """Export one presentation to Markdown and return the export report.

    Legacy ``.ppt`` files are first converted to ``.pptx``. Output goes next
    to the (possibly converted) input when *out_same_dir* is true, otherwise
    into ``output_root/<input stem>``.
    """
    source = input_path
    if source.suffix.lower() == ".ppt":
        source = convert_legacy_ppt(source)

    if out_same_dir:
        target_dir = source.parent
    else:
        target_dir = output_root / source.stem

    exporter = PptxMarkdownExporter(source, target_dir, image_mode=image_mode)
    return exporter.export()
519
+
520
+
521
def convert_legacy_ppt(input_path: Path) -> Path:
    """Convert a legacy ``.ppt`` file to ``.pptx`` with LibreOffice.

    ``soffice`` is run headless into a temporary directory and the result is
    copied next to the input as ``<stem>.converted.pptx``.

    Args:
        input_path: Path of the ``.ppt`` file.

    Returns:
        Path of the persistent converted file.

    Raises:
        RuntimeError: If ``soffice`` cannot be started (for example because
            it is not installed), exits with a non-zero status, or produces
            no output file.
    """
    with tempfile.TemporaryDirectory(prefix="build-corpus-ppt-") as tmp:
        tmp_dir = Path(tmp)
        cmd = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", str(tmp_dir), str(input_path)]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
        except OSError as exc:
            # A missing or non-executable binary raises before any exit code
            # exists; report it the same way as a failed conversion.
            raise RuntimeError(
                f"Legacy .ppt conversion requires LibreOffice; failed for {input_path}: {exc}"
            ) from exc
        converted = tmp_dir / f"{input_path.stem}.pptx"
        if result.returncode != 0 or not converted.exists():
            message = (result.stderr or result.stdout or "conversion failed").strip()
            raise RuntimeError(f"Legacy .ppt conversion requires LibreOffice; failed for {input_path}: {message}")
        # Copy out before the temporary directory is deleted.
        persistent = input_path.with_suffix(".converted.pptx")
        persistent.write_bytes(converted.read_bytes())
        return persistent
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+
8
+
9
# Matches Markdown image syntax ![alt](target) and captures everything
# between the parentheses.
IMAGE_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
10
+
11
+
12
def validate_file(path: Path) -> dict:
    """Check that every local image reference in a Markdown file exists.

    References starting with ``http://``, ``https://`` or ``data:`` are not
    checked. A local reference counts as present when it exists relative to
    the Markdown file's directory or relative to the current directory.

    Returns:
        A dict with the file path, the total number of image references, the
        number of missing ones and up to 50 missing samples.
    """
    markdown = path.read_text(encoding="utf-8", errors="replace")
    refs = IMAGE_RE.findall(markdown)
    remote_prefixes = ("http://", "https://", "data:")

    missing = [
        ref
        for ref in refs
        if not ref.startswith(remote_prefixes)
        and not ((path.parent / ref).exists() or Path(ref).exists())
    ]
    return dict(
        file=str(path),
        image_refs=len(refs),
        missing_refs=len(missing),
        missing_samples=missing[:50],
    )
27
+
28
+
29
def collect_markdown(path: Path) -> list[Path]:
    """Return the files to validate for *path*.

    A file is returned as a single-element list (whatever its suffix);
    anything else is searched recursively for ``*.md`` files, sorted.
    """
    if not path.is_file():
        return sorted(path.rglob("*.md"))
    return [path]
33
+
34
+
35
def main() -> None:
    """Validate the Markdown file or directory given on the command line.

    Prints a JSON summary and exits with status 1 when any file has missing
    image references, otherwise with status 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("target", type=Path)
    target = parser.parse_args().target

    results = []
    for markdown_path in collect_markdown(target):
        results.append(validate_file(markdown_path))

    summary = {"files": len(results), "results": results}
    print(json.dumps(summary, indent=2))

    has_missing = any(entry["missing_refs"] for entry in results)
    raise SystemExit(1 if has_missing else 0)
43
+
44
+
45
+ if __name__ == "__main__":
46
+ main()