regen.mde 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +16 -0
- package/README.md +295 -0
- package/bin/build-corpus-editor.js +81 -0
- package/bin/build-corpus.js +41 -0
- package/bin/postinstall.js +187 -0
- package/bin/regen-mdeditor-install.js +27 -0
- package/bin/regen-mdeditor-uninstall.js +19 -0
- package/bin/validate-katex.js +93 -0
- package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
- package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
- package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
- package/desktop/BuildCorpusEditor/Program.cs +81 -0
- package/desktop/BuildCorpusEditor/app.manifest +16 -0
- package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
- package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
- package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
- package/dist/windows-editor/WebView2Loader.dll +0 -0
- package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
- package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
- package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
- package/dist/windows-editor/wwwroot/index.html +22 -0
- package/editor-web/index.html +21 -0
- package/editor-web/src/main.jsx +399 -0
- package/editor-web/src/styles.css +602 -0
- package/editor-web/vite.config.js +13 -0
- package/examples/build-corpus.config.example.json +21 -0
- package/installer/install-regen-mde.ps1 +175 -0
- package/installer/regen-mde.nsi +81 -0
- package/package.json +86 -0
- package/pyproject.toml +33 -0
- package/requirements.txt +4 -0
- package/scripts/build-windows-editor.ps1 +47 -0
- package/scripts/package-windows-editor.ps1 +90 -0
- package/scripts/run-corpus.ps1 +28 -0
- package/scripts/run-editor-implementation-plane.ps1 +203 -0
- package/scripts/run-required-tests.ps1 +98 -0
- package/scripts/run-smoke.ps1 +28 -0
- package/src/build_corpus/__init__.py +3 -0
- package/src/build_corpus/docx_exporter.py +798 -0
- package/src/build_corpus/exporter.py +1195 -0
- package/src/build_corpus/ppt_exporter.py +532 -0
- package/src/build_corpus/templates/__init__.py +1 -0
- package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
- package/src/build_corpus/validate_assets.py +46 -0
- package/tools/audit_corpus.py +203 -0
- package/tools/collect_microsoft_word_templates.py +228 -0
- package/tools/collect_online_docx_corpus.py +272 -0
- package/tools/collect_online_pptx_corpus.py +252 -0
- package/tools/compare_pptx_inputs_outputs.py +87 -0
- package/tools/roundtrip_docx_corpus.py +171 -0
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import hashlib
|
|
5
|
+
import io
|
|
6
|
+
import json
|
|
7
|
+
import math
|
|
8
|
+
import mimetypes
|
|
9
|
+
import re
|
|
10
|
+
import subprocess
|
|
11
|
+
import tempfile
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional
|
|
15
|
+
from zipfile import ZipFile
|
|
16
|
+
from xml.etree import ElementTree as ET
|
|
17
|
+
|
|
18
|
+
from PIL import Image
|
|
19
|
+
|
|
20
|
+
# Prefix -> namespace URI table passed to every ElementTree find()/findall()
# call in this module: "a" is DrawingML, "p" is PresentationML and "r" is the
# officeDocument relationships namespace.
NS = dict(
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
    p="http://schemas.openxmlformats.org/presentationml/2006/main",
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ShapeBlock:
    """One unit of slide content, already rendered to Markdown."""

    # Content category; the exporter assigns "text", "table" or "image".
    kind: str
    # Position (x, y) and size (cx, cy) as read from the shape's a:off / a:ext
    # attributes. The exporter divides these by 914400 to obtain inches.
    x: int
    y: int
    cx: int
    cy: int
    # Markdown emitted for this block.
    markdown: str
    # Whitespace-collapsed, lower-cased text used to detect repeated blocks.
    text_norm: str = ""
    # Truncated SHA-256 of the image bytes, used to detect repeated images.
    image_sig: str = ""
    # True when the shape is a title / centered-title placeholder.
    is_title: bool = False
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class SlideContent:
    """Everything extracted from a single slide."""

    # 1-based position of the slide in presentation order.
    index: int
    # Slide title; the parser fills in a fallback when no title shape exists.
    title: str
    # Content blocks belonging to the slide (a fresh list per instance).
    blocks: list[ShapeBlock] = field(default_factory=list)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class PresentationStats:
    """Counters collected during one export; serialized into the report."""

    # Number of slides parsed.
    slides: int = 0
    # Text blocks dropped because the same text recurs across many slides.
    removed_repetitive_blocks: int = 0
    # Tables, non-title text blocks and pictures encountered while parsing.
    tables: int = 0
    text_blocks: int = 0
    images: int = 0
    # Small edge-positioned images dropped as recurring logos.
    removed_logo_images: int = 0
    # Images whose effective resolution on the slide is below 150 DPI.
    low_dpi_images: int = 0
    # Free-form warning messages (a fresh list per instance).
    warnings: list[str] = field(default_factory=list)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def clean_text(value: str) -> str:
    """Collapse every whitespace run in *value* to one space and trim the ends.

    Non-breaking spaces (U+00A0) are treated like ordinary spaces. A falsy
    *value* (``None`` or ``""``) yields ``""``.
    """
    return re.sub(r"\s+", " ", (value or "").replace("\u00a0", " ")).strip()


def md_escape_cell(value: str) -> str:
    """Make *value* safe to place inside a single Markdown table cell.

    Each newline-separated line is cleaned with :func:`clean_text`; non-empty
    lines are joined with ``<br>`` and literal pipes are backslash-escaped so
    they cannot terminate the cell.

    The lines are split *before* cleaning: cleaning the whole value first
    would collapse the newlines into spaces and the ``<br>`` separator could
    never be produced.
    """
    cleaned = (clean_text(line) for line in (value or "").split("\n"))
    return "<br>".join(line for line in cleaned if line).replace("|", r"\|")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def normalize_for_repeat(value: str) -> str:
    """Return a comparison key for *value*: whitespace-collapsed, trimmed, lower-cased."""
    squeezed = re.sub(r"\s+", " ", value)
    trimmed = squeezed.strip()
    return trimmed.lower()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def local_name(tag: str) -> str:
    """Strip a leading ``{namespace}`` qualifier from an ElementTree tag name."""
    # rpartition yields ("", "", tag) when there is no "}", so unqualified
    # tags come back unchanged.
    _, _, bare = tag.rpartition("}")
    return bare
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def emu_box(node: ET.Element) -> tuple[int, int, int, int]:
    """Return ``(x, y, cx, cy)`` for a shape element.

    The values are read from the ``a:off`` (x, y) and ``a:ext`` (cx, cy)
    children of the shape's transform. The transform is looked up under
    ``p:spPr/a:xfrm`` first and, failing that, as a direct ``p:xfrm`` child,
    which is where ``p:graphicFrame`` elements (tables) keep it. Without the
    second lookup every table would be reported at the origin.

    Returns ``(0, 0, 0, 0)`` when no transform is found or when any attribute
    is not an integer. A missing ``a:off`` / ``a:ext`` element or attribute
    contributes zeros.
    """
    xfrm = node.find("./p:spPr/a:xfrm", NS)
    if xfrm is None:
        xfrm = node.find("./p:xfrm", NS)
    if xfrm is None:
        return 0, 0, 0, 0

    off = xfrm.find("./a:off", NS)
    ext = xfrm.find("./a:ext", NS)
    try:
        x = int(off.attrib.get("x", "0")) if off is not None else 0
        y = int(off.attrib.get("y", "0")) if off is not None else 0
        cx = int(ext.attrib.get("cx", "0")) if ext is not None else 0
        cy = int(ext.attrib.get("cy", "0")) if ext is not None else 0
    except ValueError:
        # One malformed attribute invalidates the whole box.
        return 0, 0, 0, 0
    return x, y, cx, cy
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class PptxMarkdownExporter:
    """Convert one ``.pptx`` package into a Markdown file plus a JSON report.

    The exporter reads the package as a zip archive, extracts text, tables
    and pictures from every slide, drops text and small edge images that
    recur across many slides, and writes ``<stem>.md`` and
    ``export-report.json`` into the output directory.
    """

    def __init__(self, input_path: Path, output_dir: Path, image_mode: str = "assets"):
        """Record paths and create empty caches; nothing is read or written here.

        Args:
            input_path: The ``.pptx`` file to read.
            output_dir: Directory receiving the Markdown, report and assets.
            image_mode: ``"assets"`` writes media files into ``assets/``;
                ``"base64"`` inlines them as data URIs. With any other value
                media get an empty reference and pictures are skipped.
        """
        self.input_path = input_path
        self.output_dir = output_dir
        self.output_md = output_dir / f"{input_path.stem}.md"
        self.report_path = output_dir / "export-report.json"
        self.assets_dir = output_dir / "assets"
        self.asset_ref_prefix = self.assets_dir.name
        self.image_mode = image_mode
        # Package part name -> reference used in Markdown (path or data URI).
        self.media_map: dict[str, str] = {}
        # Package part name -> raw bytes of that media part.
        self.media_bytes: dict[str, bytes] = {}
        # Cache of cropped variants: "<part>:<l>:<t>:<r>:<b>:<mode>" -> reference,
        # and "<part>:<l>:<t>:<r>:<b>:bytes" -> base64 of the cropped bytes.
        self.variant_map: dict[str, str] = {}
        self.stats = PresentationStats()

    def export(self) -> dict:
        """Run the full export and return the report dictionary.

        Creates the output directory (and the assets directory in
        ``"assets"`` mode), writes the Markdown file and the JSON report, and
        returns the same report with keys ``input``, ``output`` and ``stats``.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)
        if self.image_mode == "assets":
            self.assets_dir.mkdir(parents=True, exist_ok=True)

        with ZipFile(self.input_path) as zf:
            # Media must be loaded first: picture parsing looks parts up in
            # media_map / media_bytes.
            self._copy_media(zf)
            slides = self._parse_all_slides(zf)

        self.stats.slides = len(slides)
        self._remove_repetitive_text_blocks(slides)
        self._remove_repeated_logo_images(slides)
        markdown = self._render_markdown(slides)
        self.output_md.write_text(markdown, encoding="utf-8")

        report = {
            "input": str(self.input_path),
            "output": str(self.output_md),
            "stats": self.stats.__dict__,
        }
        self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
        return report

    def _parse_all_slides(self, zf: ZipFile) -> list[SlideContent]:
        """Parse every slide in presentation order into ``SlideContent`` objects."""
        slides: list[SlideContent] = []
        for i, slide_path in enumerate(self._ordered_slide_paths(zf), 1):
            root = ET.fromstring(zf.read(slide_path))
            rels = self._relationships(zf, slide_path)
            content = SlideContent(index=i, title="")
            self._walk_shape_tree(root.find("./p:cSld/p:spTree", NS), content, rels, slide_path)
            self._sort_blocks(content)
            if not content.title:
                # No title placeholder: use "Image" for picture-only slides,
                # otherwise a positional fallback.
                image_only = bool(content.blocks) and all(b.kind == "image" for b in content.blocks)
                content.title = "Image" if image_only else f"Slide {i}"
            slides.append(content)
        return slides

    def _walk_shape_tree(
        self,
        node: Optional[ET.Element],
        content: SlideContent,
        rels: dict[str, str],
        slide_path: str,
    ) -> None:
        """Collect blocks from *node*'s children, recursing into group shapes.

        The first title shape becomes ``content.title`` and is not stored as
        a block; every other parsed shape is appended to ``content.blocks``
        and counted in ``self.stats``.
        """
        if node is None:
            return
        for child in node:
            tag = local_name(child.tag)
            if tag == "sp":
                block = self._parse_text_shape(child)
                if block is not None:
                    if block.is_title and not content.title:
                        content.title = clean_text(block.markdown.replace("- ", " "))
                    else:
                        self.stats.text_blocks += 1
                        content.blocks.append(block)
            elif tag == "pic":
                block = self._parse_picture_shape(child, rels, slide_path)
                if block is not None:
                    self.stats.images += 1
                    content.blocks.append(block)
            elif tag == "graphicFrame":
                block = self._parse_table_shape(child)
                if block is not None:
                    self.stats.tables += 1
                    content.blocks.append(block)
            elif tag == "grpSp":
                self._walk_shape_tree(child, content, rels, slide_path)

    def _parse_text_shape(self, shape: ET.Element) -> Optional[ShapeBlock]:
        """Turn a ``p:sp`` shape into a text block, or ``None`` if it has no text.

        Footer, date and slide-number placeholders are skipped. Paragraphs
        with an explicit ``lvl`` attribute become list items indented by that
        level; other paragraphs are emitted as plain lines.
        """
        ph = shape.find("./p:nvSpPr/p:nvPr/p:ph", NS)
        ph_type = (ph.attrib.get("type", "") if ph is not None else "").lower()
        if ph_type in {"ftr", "dt", "sldnum"}:
            return None

        tx_body = shape.find("./p:txBody", NS)
        if tx_body is None:
            return None

        lines: list[str] = []
        for p in tx_body.findall("./a:p", NS):
            txt = self._paragraph_text(p)
            if not txt:
                continue
            lvl = self._paragraph_level(p)
            if lvl is None:
                lines.append(txt)
            else:
                lines.append(f"{' ' * max(lvl, 0)}- {txt}")
        if not lines:
            return None
        x, y, cx, cy = emu_box(shape)
        md = "\n".join(lines)
        return ShapeBlock(
            kind="text",
            x=x,
            y=y,
            cx=cx,
            cy=cy,
            markdown=md,
            text_norm=normalize_for_repeat(md),
            is_title=ph_type in {"title", "ctrtitle"},
        )

    def _parse_table_shape(self, frame: ET.Element) -> Optional[ShapeBlock]:
        """Turn a graphic frame containing ``a:tbl`` into a Markdown table block.

        The first row is used as the header row. Short rows are padded with
        empty cells to the width of the widest row. Returns ``None`` when the
        frame holds no table or the table has no rows.
        """
        tbl = frame.find(".//a:tbl", NS)
        if tbl is None:
            return None
        rows: list[list[str]] = []
        for tr in tbl.findall("./a:tr", NS):
            row: list[str] = []
            for tc in tr.findall("./a:tc", NS):
                cell_lines = []
                for p in tc.findall(".//a:p", NS):
                    txt = self._paragraph_text(p)
                    if txt:
                        cell_lines.append(txt)
                row.append(md_escape_cell("\n".join(cell_lines)))
            rows.append(row)
        if not rows:
            return None
        width = max(len(r) for r in rows)
        padded = [r + [""] * (width - len(r)) for r in rows]
        md_lines = [
            "| " + " | ".join(padded[0]) + " |",
            "| " + " | ".join("---" for _ in range(width)) + " |",
        ]
        for row in padded[1:]:
            md_lines.append("| " + " | ".join(row) + " |")
        x, y, cx, cy = emu_box(frame)
        md = "\n".join(md_lines)
        return ShapeBlock(kind="table", x=x, y=y, cx=cx, cy=cy, markdown=md, text_norm=normalize_for_repeat(md))

    def _parse_picture_shape(self, pic: ET.Element, rels: dict[str, str], slide_path: str) -> Optional[ShapeBlock]:
        """Turn a ``p:pic`` shape into an image block.

        Returns ``None`` when the picture has no blip, no relationship id, no
        relationship target, or when the target has no usable reference in
        ``media_map`` (unknown part, or an image mode that produces no
        references). A ``a:srcRect`` crop is applied when a cropped variant
        can be rendered; otherwise the uncropped media is referenced.
        """
        blip = pic.find(".//a:blip", NS)
        if blip is None:
            return None
        rid = blip.attrib.get(f"{{{NS['r']}}}embed") or blip.attrib.get(f"{{{NS['r']}}}link")
        if not rid:
            return None
        target = rels.get(rid, "")
        if not target:
            return None
        part = self._resolve_part_path(slide_path, target)
        ref = self.media_map.get(part, "")
        if not ref:
            return None

        src_rect = self._source_crop(pic)
        image_bytes = self.media_bytes.get(part, b"")
        if src_rect is not None:
            cropped = self._render_cropped_variant(part, src_rect)
            if cropped:
                ref = cropped
                image_bytes = self._variant_bytes(part, src_rect) or image_bytes

        x, y, cx, cy = emu_box(pic)
        self._track_dpi(cx, cy, image_bytes, Path(part).suffix.lower())
        sig = hashlib.sha256(image_bytes).hexdigest()[:16]
        return ShapeBlock(
            kind="image",
            x=x,
            y=y,
            cx=cx,
            cy=cy,
            # Emit the resolved reference as a Markdown image. Leaving this
            # empty would drop every picture from the output and leave `ref`
            # unused. NOTE(review): the alt text "image" is a placeholder.
            markdown=f"![image]({ref})",
            image_sig=sig,
        )

    def _track_dpi(self, cx: int, cy: int, data: bytes, suffix: str) -> None:
        """Count the image as low-DPI when it is shown at under 150 pixels per inch.

        *cx* / *cy* are the displayed extent (914400 units per inch). Nothing
        is counted when the pixel size cannot be determined or an extent is
        not positive.
        """
        dims = self._image_dimensions(data, suffix)
        if not dims or cx <= 0 or cy <= 0:
            return
        w, h = dims
        dpi_x = w / (cx / 914400)
        dpi_y = h / (cy / 914400)
        if min(dpi_x, dpi_y) < 150:
            self.stats.low_dpi_images += 1

    def _sort_blocks(self, slide: SlideContent) -> None:
        """Order blocks top-to-bottom, then left-to-right, in place.

        Blocks are first bucketed into 0.35-inch horizontal bands so shapes
        at nearly the same height sort together.
        """
        row_height = 0.35 * 914400
        slide.blocks.sort(key=lambda b: (round(b.y / row_height), b.y, b.x))

    def _remove_repetitive_text_blocks(self, slides: list[SlideContent]) -> None:
        """Drop text blocks whose normalized text recurs across many slides.

        A text counts once per slide. It is treated as repetitive when it
        occurs on at least ``max(4, ceil(0.6 * len(slides)))`` slides.
        Removed blocks are counted in ``stats.removed_repetitive_blocks``.
        """
        counts: dict[str, int] = {}
        for slide in slides:
            seen = set()
            for b in slide.blocks:
                if b.kind != "text" or not b.text_norm:
                    continue
                if b.text_norm in seen:
                    continue
                seen.add(b.text_norm)
                counts[b.text_norm] = counts.get(b.text_norm, 0) + 1
        threshold = max(4, math.ceil(len(slides) * 0.6)) if slides else 99999
        repetitive = {k for k, v in counts.items() if v >= threshold}
        for slide in slides:
            kept: list[ShapeBlock] = []
            for b in slide.blocks:
                if b.kind == "text" and b.text_norm and b.text_norm in repetitive:
                    self.stats.removed_repetitive_blocks += 1
                    continue
                kept.append(b)
            slide.blocks = kept

    def _remove_repeated_logo_images(self, slides: list[SlideContent]) -> None:
        """Drop small edge-positioned images that recur across slides.

        An image is removed when it looks like a corner logo and either its
        content hash or its coarse geometry bucket occurs at least
        ``max(3, ceil(0.15 * len(slides)))`` times. Removed images are
        counted in ``stats.removed_logo_images``.
        """
        counts: dict[str, int] = {}
        geom_counts: dict[str, int] = {}
        for slide in slides:
            for b in slide.blocks:
                if b.kind == "image" and b.image_sig:
                    counts[b.image_sig] = counts.get(b.image_sig, 0) + 1
                    if self._looks_like_corner_logo(b):
                        g = self._logo_geom_bucket(b)
                        geom_counts[g] = geom_counts.get(g, 0) + 1
        threshold = max(3, math.ceil(len(slides) * 0.15)) if slides else 99999
        repetitive = {k for k, v in counts.items() if v >= threshold}
        repetitive_geom = {k for k, v in geom_counts.items() if v >= threshold}
        for slide in slides:
            kept: list[ShapeBlock] = []
            for b in slide.blocks:
                if b.kind == "image" and self._looks_like_corner_logo(b):
                    geom_match = self._logo_geom_bucket(b) in repetitive_geom
                    hash_match = b.image_sig in repetitive
                    if geom_match or hash_match:
                        self.stats.removed_logo_images += 1
                        continue
                kept.append(b)
            slide.blocks = kept

    @staticmethod
    def _looks_like_corner_logo(b: ShapeBlock) -> bool:
        """Return True for a small image placed near the top or bottom edge."""
        # Small and near top/bottom edge is a typical logo/footer marker.
        w_in = b.cx / 914400 if b.cx else 0
        h_in = b.cy / 914400 if b.cy else 0
        y_in = b.y / 914400 if b.y else 0
        return w_in <= 2.2 and h_in <= 1.2 and (y_in <= 1.4 or y_in >= 5.5)

    @staticmethod
    def _logo_geom_bucket(b: ShapeBlock) -> str:
        """Return a coarse position/size key (each value rounded to 300000 units)."""
        return f"{round(b.x/300000)}:{round(b.y/300000)}:{round(b.cx/300000)}:{round(b.cy/300000)}"

    def _render_markdown(self, slides: list[SlideContent]) -> str:
        """Render all slides as one Markdown document ending in a single newline.

        Each slide becomes a ``## Slide <n>: <title>`` heading followed by
        its blocks, separated by blank lines.
        """
        out: list[str] = []
        for slide in slides:
            out.append(f"## Slide {slide.index}: {slide.title}")
            if slide.blocks:
                out.append("")
                for b in slide.blocks:
                    out.append(b.markdown)
                    out.append("")
            else:
                out.append("")
            if out and out[-1] != "":
                out.append("")
        return "\n".join(out).strip() + "\n"

    def _copy_media(self, zf: ZipFile) -> None:
        """Load every file under ``ppt/media/`` and build its Markdown reference.

        Bytes go into ``media_bytes``. ``media_map`` receives a data URI in
        ``"base64"`` mode, an ``assets/<name>`` path (after writing the file)
        in ``"assets"`` mode, and an empty string for any other mode.
        """
        for name in zf.namelist():
            # Directory entries end with "/" and carry no media; treating one
            # as a file would write a bogus asset named after the folder.
            if not name.startswith("ppt/media/") or name.endswith("/"):
                continue
            data = zf.read(name)
            self.media_bytes[name] = data
            mime = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
            if self.image_mode == "base64":
                self.media_map[name] = f"data:{mime};base64,{base64.b64encode(data).decode('ascii')}"
            elif self.image_mode == "assets":
                target = self.assets_dir / Path(name).name
                target.write_bytes(data)
                self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
            else:
                self.media_map[name] = ""

    def _source_crop(self, pic: ET.Element) -> tuple[int, int, int, int] | None:
        """Return the ``a:srcRect`` crop as ``(l, t, r, b)``, or ``None``.

        Missing attributes default to 0. ``None`` is returned when the
        element is absent or any attribute is not an integer.
        """
        src = pic.find("./p:blipFill/a:srcRect", NS)
        if src is None:
            return None
        try:
            return (
                int(src.attrib.get("l", "0")),
                int(src.attrib.get("t", "0")),
                int(src.attrib.get("r", "0")),
                int(src.attrib.get("b", "0")),
            )
        except ValueError:
            return None

    def _variant_bytes(self, part: str, rect: tuple[int, int, int, int]) -> bytes | None:
        """Return the cached bytes of a cropped variant, or ``None`` if not cached."""
        key = f"{part}:{rect[0]}:{rect[1]}:{rect[2]}:{rect[3]}:bytes"
        ref = self.variant_map.get(key, "")
        if not ref:
            return None
        return base64.b64decode(ref)

    def _render_cropped_variant(self, part: str, src_rect: tuple[int, int, int, int]) -> str:
        """Crop a PNG/JPEG media part and return a reference to the result.

        *src_rect* holds the left/top/right/bottom insets in 1/100000 of the
        image size. Results are cached per part, rectangle and image mode.
        Returns ``""`` when the part is unknown, is not PNG/JPEG, the crop
        is empty, or Pillow cannot decode or re-encode the image. The caller
        then falls back to the uncropped media.
        """
        cache_key_ref = f"{part}:{src_rect[0]}:{src_rect[1]}:{src_rect[2]}:{src_rect[3]}:{self.image_mode}"
        cache_key_bytes = f"{part}:{src_rect[0]}:{src_rect[1]}:{src_rect[2]}:{src_rect[3]}:bytes"
        if cache_key_ref in self.variant_map:
            return self.variant_map[cache_key_ref]
        data = self.media_bytes.get(part, b"")
        if not data:
            return ""
        ext = Path(part).suffix.lower()
        if ext not in {".png", ".jpg", ".jpeg"}:
            return ""
        out = io.BytesIO()
        try:
            with Image.open(io.BytesIO(data)) as img:
                width, height = img.size
                l, t, r, b = src_rect
                left = max(0, min(width, round(width * (l / 100000.0))))
                top = max(0, min(height, round(height * (t / 100000.0))))
                right = max(0, min(width, round(width * ((100000 - r) / 100000.0))))
                bottom = max(0, min(height, round(height * ((100000 - b) / 100000.0))))
                if right <= left or bottom <= top:
                    return ""
                cropped = img.crop((left, top, right, bottom))
                fmt = "PNG" if ext == ".png" else "JPEG"
                kwargs = {"quality": 95} if fmt == "JPEG" else {}
                cropped.save(out, format=fmt, **kwargs)
        except OSError:
            # Pillow signals undecodable/truncated input and unsupported
            # save modes with OSError; one bad picture must not abort the
            # whole export.
            return ""
        out_bytes = out.getvalue()
        self.variant_map[cache_key_bytes] = base64.b64encode(out_bytes).decode("ascii")
        digest = hashlib.sha256(out_bytes).hexdigest()[:12]
        out_ext = ".jpg" if ext == ".jpeg" else ext
        name = f"{Path(part).stem}.crop.{digest}{out_ext}"
        if self.image_mode == "base64":
            mime = mimetypes.guess_type(name)[0] or "application/octet-stream"
            ref = f"data:{mime};base64,{base64.b64encode(out_bytes).decode('ascii')}"
        else:
            target = self.assets_dir / name
            target.write_bytes(out_bytes)
            ref = f"{self.asset_ref_prefix}/{target.name}"
        self.variant_map[cache_key_ref] = ref
        return ref

    @staticmethod
    def _image_dimensions(data: bytes, suffix: str) -> tuple[int, int] | None:
        """Return ``(width, height)`` in pixels for PNG/JPEG bytes, else ``None``.

        PNG sizes come from the fixed offsets of the leading IHDR chunk.
        JPEG sizes come from the first start-of-frame segment found while
        walking the marker segments.
        """
        if suffix == ".png" and data.startswith(b"\x89PNG\r\n\x1a\n") and len(data) >= 24:
            return int.from_bytes(data[16:20], "big"), int.from_bytes(data[20:24], "big")
        if suffix in {".jpg", ".jpeg"} and data.startswith(b"\xff\xd8"):
            i = 2
            # The bound guarantees data[i + 8] exists, which covers both the
            # segment length and the frame header fields read below.
            while i + 9 < len(data):
                if data[i] != 0xFF:
                    i += 1
                    continue
                marker = data[i + 1]
                if marker == 0xFF:
                    # Fill byte before the real marker: advance one byte
                    # instead of misreading it as a segment header.
                    i += 1
                    continue
                if marker in {0xC0, 0xC1, 0xC2, 0xC3, 0xC9, 0xCA, 0xCB}:
                    # Frame header: length(2) precision(1) height(2) width(2).
                    return int.from_bytes(data[i + 7:i + 9], "big"), int.from_bytes(data[i + 5:i + 7], "big")
                if marker == 0x01 or 0xD0 <= marker <= 0xD9:
                    # Stand-alone markers carry no length field.
                    i += 2
                    continue
                seg_len = int.from_bytes(data[i + 2:i + 4], "big")
                i += 2 + seg_len
        return None

    def _paragraph_text(self, p: ET.Element) -> str:
        """Concatenate all ``t`` text runs under *p* and clean the result."""
        chunks = [n.text for n in p.iter() if local_name(n.tag) == "t" and n.text]
        return clean_text("".join(chunks))

    def _paragraph_level(self, p: ET.Element) -> Optional[int]:
        """Return the paragraph's explicit ``lvl`` attribute as an int, else ``None``."""
        ppr = p.find("./a:pPr", NS)
        if ppr is None:
            return None
        lvl = ppr.attrib.get("lvl")
        if lvl is None:
            return None
        try:
            return int(lvl)
        except ValueError:
            return None

    def _ordered_slide_paths(self, zf: ZipFile) -> list[str]:
        """Return slide part names in the order listed by ``ppt/presentation.xml``.

        Slide ids without a relationship target, or whose target is not in
        the archive, are skipped.
        """
        presentation = ET.fromstring(zf.read("ppt/presentation.xml"))
        rels = self._relationships(zf, "ppt/presentation.xml")
        # Build the membership set once rather than a fresh list per slide.
        names = set(zf.namelist())
        paths: list[str] = []
        for sld_id in presentation.findall("./p:sldIdLst/p:sldId", NS):
            rid = sld_id.attrib.get(f"{{{NS['r']}}}id")
            target = rels.get(rid or "")
            if not target:
                continue
            full = self._resolve_part_path("ppt/presentation.xml", target)
            if full in names:
                paths.append(full)
        return paths

    def _relationships(self, zf: ZipFile, part: str) -> dict[str, str]:
        """Return ``{relationship id: target}`` for *part*.

        Reads ``<dir>/_rels/<name>.rels`` next to the part. Returns an empty
        dict when that file is not in the archive.
        """
        rels_path = str(Path(part).parent / "_rels" / f"{Path(part).name}.rels").replace("\\", "/")
        try:
            raw = zf.read(rels_path)
        except KeyError:
            # ZipFile.read raises KeyError for a name missing from the archive.
            return {}
        root = ET.fromstring(raw)
        return {rel.attrib["Id"]: rel.attrib.get("Target", "") for rel in root if "Id" in rel.attrib}

    @staticmethod
    def _resolve_part_path(part: str, target: str) -> str:
        """Resolve a relationship *target* relative to *part*'s directory.

        ``.`` and empty segments are dropped and ``..`` removes the previous
        segment, so the result is a slash-separated name with no leading
        slash.
        """
        joined = (Path(part).parent / target).as_posix()
        norm: list[str] = []
        for piece in joined.split("/"):
            if piece in {"", "."}:
                continue
            if piece == "..":
                if norm:
                    norm.pop()
                continue
            norm.append(piece)
        return "/".join(norm)
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def export_presentation(input_path: Path, output_root: Path, out_same_dir: bool, image_mode: str = "assets") -> dict:
    """Export one presentation to Markdown and return the exporter's report.

    A ``.ppt`` input is first converted to ``.pptx``; the converted file is
    then the one that gets exported. Output goes next to the (possibly
    converted) input when *out_same_dir* is true, otherwise into
    ``output_root / <stem>``.
    """
    source = input_path
    if source.suffix.lower() == ".ppt":
        source = convert_legacy_ppt(source)

    if out_same_dir:
        destination = source.parent
    else:
        destination = output_root / source.stem

    exporter = PptxMarkdownExporter(source, destination, image_mode=image_mode)
    return exporter.export()
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def convert_legacy_ppt(input_path: Path) -> Path:
    """Convert a legacy ``.ppt`` file to ``.pptx`` using headless LibreOffice.

    ``soffice`` converts into a temporary directory; the result is then
    copied next to the input as ``<stem>.converted.pptx`` so it outlives the
    temporary directory.

    Returns:
        Path of the persistent converted file.

    Raises:
        RuntimeError: If ``soffice`` cannot be launched, exits with a
            non-zero status, or does not produce the expected output file.
    """
    with tempfile.TemporaryDirectory(prefix="build-corpus-ppt-") as tmp:
        tmp_dir = Path(tmp)
        cmd = ["soffice", "--headless", "--convert-to", "pptx", "--outdir", str(tmp_dir), str(input_path)]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
        except OSError as exc:
            # A missing or non-executable soffice raises before any process
            # runs. Report it through the same error as a failed conversion
            # so callers see one failure mode with an actionable message.
            raise RuntimeError(
                f"Legacy .ppt conversion requires LibreOffice; failed for {input_path}: {exc}"
            ) from exc
        converted = tmp_dir / f"{input_path.stem}.pptx"
        if result.returncode != 0 or not converted.exists():
            message = (result.stderr or result.stdout or "conversion failed").strip()
            raise RuntimeError(f"Legacy .ppt conversion requires LibreOffice; failed for {input_path}: {message}")
        # Copy out before the temporary directory is removed on exit.
        persistent = input_path.with_suffix(".converted.pptx")
        persistent.write_bytes(converted.read_bytes())
        return persistent
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
Binary file
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Matches a Markdown image and captures the raw text between the parentheses.
IMAGE_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")

# Splits `destination "title"` / `destination 'title'` into its destination.
_TITLED_DEST_RE = re.compile(r"""(.+?)\s+(?:"[^"]*"|'[^']*')$""")


def _image_destination(ref: str) -> str:
    """Return the link destination from the raw text captured by ``IMAGE_RE``.

    Handles the ``<destination>`` form and strips an optional trailing quoted
    title. Anything else is returned stripped of surrounding whitespace.
    """
    dest = ref.strip()
    if dest.startswith("<"):
        end = dest.find(">")
        if end != -1:
            return dest[1:end]
    titled = _TITLED_DEST_RE.match(dest)
    return titled.group(1) if titled else dest


def validate_file(path: Path) -> dict:
    """Check that every local image referenced by a Markdown file exists.

    ``http://``, ``https://`` and ``data:`` references are not checked. A
    local reference counts as present when it exists relative to the Markdown
    file's directory or relative to the current working directory, either as
    written or after percent-decoding.

    Returns:
        A dict with ``file``, ``image_refs`` (all references found),
        ``missing_refs`` (count) and ``missing_samples`` (at most 50 of the
        missing references, as written in the file).
    """
    from urllib.parse import unquote

    text = path.read_text(encoding="utf-8", errors="replace")
    refs = IMAGE_RE.findall(text)
    missing = []
    for ref in refs:
        dest = _image_destination(ref)
        if dest.startswith(("http://", "https://", "data:")):
            continue
        # Try the literal destination first, then its percent-decoded form
        # (e.g. "my%20image.png" -> "my image.png").
        candidates = {dest, unquote(dest)}
        if not any((path.parent / c).exists() or Path(c).exists() for c in candidates):
            missing.append(ref)
    return {
        "file": str(path),
        "image_refs": len(refs),
        "missing_refs": len(missing),
        "missing_samples": missing[:50],
    }
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def collect_markdown(path: Path) -> list[Path]:
    """Return the Markdown files to validate for *path*.

    A file is returned as a one-element list regardless of its extension;
    anything else is searched recursively for ``*.md`` and returned sorted.
    """
    if not path.is_file():
        found = list(path.rglob("*.md"))
        found.sort()
        return found
    return [path]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def main() -> None:
    """Command-line entry point: validate image references under a target path.

    Prints a JSON summary of all checked Markdown files and exits with
    status 1 when any file has a missing reference, otherwise 0.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("target", type=Path)
    target = cli.parse_args().target

    results = [validate_file(md_path) for md_path in collect_markdown(target)]
    summary = {"files": len(results), "results": results}
    print(json.dumps(summary, indent=2))

    has_missing = any(entry["missing_refs"] for entry in results)
    raise SystemExit(1 if has_missing else 0)


if __name__ == "__main__":
    main()
|