regen.mde 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/LICENSE +16 -0
  2. package/README.md +295 -0
  3. package/bin/build-corpus-editor.js +81 -0
  4. package/bin/build-corpus.js +41 -0
  5. package/bin/postinstall.js +187 -0
  6. package/bin/regen-mdeditor-install.js +27 -0
  7. package/bin/regen-mdeditor-uninstall.js +19 -0
  8. package/bin/validate-katex.js +93 -0
  9. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
  10. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
  11. package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
  12. package/desktop/BuildCorpusEditor/Program.cs +81 -0
  13. package/desktop/BuildCorpusEditor/app.manifest +16 -0
  14. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  15. package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
  16. package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
  17. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  18. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  19. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  20. package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
  21. package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
  22. package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
  23. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
  24. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
  25. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
  26. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
  27. package/dist/windows-editor/WebView2Loader.dll +0 -0
  28. package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
  29. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
  30. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
  31. package/dist/windows-editor/wwwroot/index.html +22 -0
  32. package/editor-web/index.html +21 -0
  33. package/editor-web/src/main.jsx +399 -0
  34. package/editor-web/src/styles.css +602 -0
  35. package/editor-web/vite.config.js +13 -0
  36. package/examples/build-corpus.config.example.json +21 -0
  37. package/installer/install-regen-mde.ps1 +175 -0
  38. package/installer/regen-mde.nsi +81 -0
  39. package/package.json +86 -0
  40. package/pyproject.toml +33 -0
  41. package/requirements.txt +4 -0
  42. package/scripts/build-windows-editor.ps1 +47 -0
  43. package/scripts/package-windows-editor.ps1 +90 -0
  44. package/scripts/run-corpus.ps1 +28 -0
  45. package/scripts/run-editor-implementation-plane.ps1 +203 -0
  46. package/scripts/run-required-tests.ps1 +98 -0
  47. package/scripts/run-smoke.ps1 +28 -0
  48. package/src/build_corpus/__init__.py +3 -0
  49. package/src/build_corpus/docx_exporter.py +798 -0
  50. package/src/build_corpus/exporter.py +1195 -0
  51. package/src/build_corpus/ppt_exporter.py +532 -0
  52. package/src/build_corpus/templates/__init__.py +1 -0
  53. package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
  54. package/src/build_corpus/validate_assets.py +46 -0
  55. package/tools/audit_corpus.py +203 -0
  56. package/tools/collect_microsoft_word_templates.py +228 -0
  57. package/tools/collect_online_docx_corpus.py +272 -0
  58. package/tools/collect_online_pptx_corpus.py +252 -0
  59. package/tools/compare_pptx_inputs_outputs.py +87 -0
  60. package/tools/roundtrip_docx_corpus.py +171 -0
@@ -0,0 +1,252 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import hashlib
5
+ import json
6
+ import re
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Iterable
10
+ from urllib.parse import parse_qs, quote_plus, unquote, urlparse
11
+ from zipfile import BadZipFile, ZipFile
12
+
13
+ import requests
14
+
15
+
16
# Web-search queries; main() passes each one to search() to discover PPTX links.
DEFAULT_QUERIES = [
    'filetype:pptx "roadmap"',
    'filetype:pptx "quarterly business review"',
    'filetype:pptx "deck"',
    'filetype:pptx "strategy presentation"',
    'filetype:pptx "project update"',
    'filetype:pptx "financial presentation"',
    'filetype:pptx "education presentation"',
]

# Search terms sent to the GitHub repository search API by github_pptx_urls().
GITHUB_REPO_QUERIES = [
    "pptx presentation template",
    "pptx deck examples",
    "powerpoint pptx samples",
]

# Headers attached to every HTTP request made by this script.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

# Content-Type values that download() accepts without also requiring ".pptx" in the final URL.
PPTX_CT_HINTS = (
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/octet-stream",
    "application/zip",
    "binary/octet-stream",
)
45
+
46
+
47
def extract_urls(html: str) -> list[str]:
    """Return every http(s) URL found in *html*, unwrapping search redirects.

    Bing ``/ck/a`` links are replaced by their ``u`` (or ``r``) query
    parameter and DuckDuckGo ``/l/?`` links by their ``uddg`` parameter.
    Trailing ``.``, ``,`` and ``;`` are stripped from each result.

    Args:
        html: Raw markup of a search result page.

    Returns:
        URLs in the order they appear; duplicates are not removed.
    """
    urls: list[str] = []
    for match in re.findall(r'https?://[^"\'<>\s)]+', html):
        # Undo the HTML entity first: it is the outermost layer of escaping.
        url = match.replace("&amp;", "&")
        # Parse the wrapper while it is still percent-encoded. Decoding the
        # whole link first would turn an encoded "&" inside the wrapped target
        # into a parameter separator and truncate the target's query string.
        parsed = urlparse(url)
        if parsed.netloc.endswith("bing.com") and parsed.path == "/ck/a":
            params = parse_qs(parsed.query)
            for key in ("u", "r"):
                if key in params:
                    url = params[key][0]
                    break
        if "duckduckgo.com/l/?" in url:
            params = parse_qs(urlparse(url).query)
            if "uddg" in params:
                url = params["uddg"][0]
        # Final decode keeps direct (non-wrapped) URLs in the same decoded form as before.
        urls.append(unquote(url).rstrip(".,;"))
    return urls
65
+
66
+
67
def search(query: str, pages: int = 2) -> Iterable[str]:
    """Yield URLs scraped from Bing and DuckDuckGo result pages for *query*.

    Each engine is asked for *pages* result pages. Pages that answer with an
    HTTP status of 400 or above, or that fail with a requests error, are
    skipped. A 0.4 second pause follows every page that was scraped.
    """
    encoded_query = quote_plus(query)
    templates = (
        "https://www.bing.com/search?q={query}&first={offset}",
        "https://html.duckduckgo.com/html/?q={query}&s={offset}",
    )
    for template in templates:
        for page_number in range(pages):
            page_url = template.format(query=encoded_query, offset=page_number * 10 + 1)
            try:
                reply = requests.get(page_url, headers=HEADERS, timeout=20)
                if reply.status_code < 400:
                    yield from extract_urls(reply.text)
                    time.sleep(0.4)
            except requests.RequestException:
                continue
84
+
85
+
86
def github_json(url: str) -> dict | None:
    """Fetch *url* with the GitHub JSON media type and return the decoded object.

    Args:
        url: GitHub REST API URL to request.

    Returns:
        The decoded JSON object, or ``None`` when the request fails, the
        status code is 400 or above, the body is not valid JSON, or the
        payload is not a JSON object.
    """
    try:
        response = requests.get(
            url,
            headers={**HEADERS, "Accept": "application/vnd.github+json"},
            timeout=25,
        )
        if response.status_code >= 400:
            return None
        payload = response.json()
    except (requests.RequestException, ValueError):
        # A non-JSON body can surface as a plain ValueError rather than a
        # RequestException; treat it like any other failed lookup so one bad
        # response does not abort the whole crawl.
        return None
    # Callers use .get() on the result, so only hand back JSON objects.
    return payload if isinstance(payload, dict) else None
94
+
95
+
96
def github_pptx_urls(max_repos_per_query: int = 10) -> Iterable[str]:
    """Yield raw.githubusercontent.com URLs for .pptx blobs in matching repositories.

    Every entry of GITHUB_REPO_QUERIES is sent to the repository search API.
    For each repository not visited before, the recursive git tree of its
    default branch (``main`` when none is reported) is listed and one URL is
    produced per blob whose path ends in ``.pptx``. A 0.25 second pause
    follows each listed repository.
    """
    visited: set[str] = set()
    for term in GITHUB_REPO_QUERIES:
        listing = github_json(
            "https://api.github.com/search/repositories"
            + f"?q={quote_plus(term)}&per_page={max_repos_per_query}"
        )
        if not listing:
            continue
        for repository in listing.get("items", []):
            repo_name = repository.get("full_name")
            ref = repository.get("default_branch") or "main"
            if not repo_name or repo_name in visited:
                continue
            visited.add(repo_name)
            tree = github_json(f"https://api.github.com/repos/{repo_name}/git/trees/{ref}?recursive=1")
            if not tree:
                continue
            for node in tree.get("tree", []):
                node_path = node.get("path", "")
                if node.get("type") != "blob":
                    continue
                if not node_path.lower().endswith(".pptx"):
                    continue
                yield f"https://raw.githubusercontent.com/{repo_name}/{ref}/{quote_path(node_path)}"
            time.sleep(0.25)
121
+
122
+
123
def quote_path(path: str) -> str:
    """Percent-encode each ``/``-separated segment of *path*, writing spaces as ``%20``."""
    encoded_segments = []
    for segment in path.split("/"):
        # quote_plus writes spaces as "+"; raw file URLs need "%20" instead.
        encoded_segments.append(quote_plus(segment).replace("+", "%20"))
    return "/".join(encoded_segments)
125
+
126
+
127
def looks_like_pptx_url(url: str) -> bool:
    """Return True when *url* mentions ``.pptx`` and none of the excluded markers.

    The comparison is case-insensitive. URLs containing ``?format=pdf``,
    ``/view?`` or ``webcache`` are rejected.
    """
    lowered = url.lower()
    if ".pptx" not in lowered:
        return False
    for marker in ("?format=pdf", "/view?", "webcache"):
        if marker in lowered:
            return False
    return True
130
+
131
+
132
def safe_name(index: int, url: str, content: bytes) -> str:
    """Build the output file name ``<index>-<digest>-<name>`` for a download.

    ``<name>`` is the last path segment of *url* (or ``online-<index>.pptx``
    when the path has none), with every run of characters outside
    ``A-Za-z0-9._ -`` replaced by ``_`` and ``.pptx`` appended if missing.
    ``<digest>`` is the first ten hex digits of the SHA-256 of *content*.
    """
    prefix = f"{index:03d}"
    basename = Path(unquote(urlparse(url).path)).name
    if not basename:
        basename = f"online-{prefix}.pptx"
    cleaned = re.sub(r"[^A-Za-z0-9._ -]+", "_", basename)
    if not cleaned.lower().endswith(".pptx"):
        cleaned = cleaned + ".pptx"
    fingerprint = hashlib.sha256(content).hexdigest()[:10]
    return "-".join((prefix, fingerprint, cleaned))
140
+
141
+
142
def validate_pptx(path: Path) -> dict:
    """Check that *path* is a PPTX package and count its slides and media parts.

    Returns:
        ``{"slides": ..., "media_parts": ...}``.

    Raises:
        BadZipFile: If ``[Content_Types].xml`` or ``ppt/presentation.xml`` is
            absent (ZipFile itself raises it for unreadable archives).
    """
    with ZipFile(path) as archive:
        members = set(archive.namelist())
    required_parts = ("[Content_Types].xml", "ppt/presentation.xml")
    if any(part not in members for part in required_parts):
        raise BadZipFile("not a PowerPoint PPTX package")
    slide_total = sum(
        1 for member in members if member.startswith("ppt/slides/slide") and member.endswith(".xml")
    )
    media_total = sum(1 for member in members if member.startswith("ppt/media/"))
    return {"slides": slide_total, "media_parts": media_total}
153
+
154
+
155
def download(url: str, out_dir: Path, index: int, max_mb: int) -> dict | None:
    """Download *url* into *out_dir* and keep it only if it validates as a PPTX.

    The response is streamed and abandoned once more than *max_mb* MiB have
    arrived. A response whose content type is neither listed in
    PPTX_CT_HINTS nor mentions ``presentationml`` is accepted only when the
    final URL contains ``.pptx``. The payload must start with ``PK``; it is
    written to disk, checked with validate_pptx, and removed again when that
    check fails.

    Returns:
        A manifest entry (file, source_url, bytes, content_type plus the
        counts returned by validate_pptx), or ``None`` when the candidate is
        rejected or the request fails.
    """
    byte_cap = max_mb * 1024 * 1024
    try:
        with requests.get(url, headers=HEADERS, timeout=30, stream=True, allow_redirects=True) as response:
            if response.status_code >= 400:
                return None
            final_url = response.url
            declared_type = response.headers.get("content-type", "").split(";")[0].strip().lower()
            type_accepted = (
                not declared_type
                or declared_type in PPTX_CT_HINTS
                or "presentationml" in declared_type
            )
            if not type_accepted and ".pptx" not in final_url.lower():
                return None
            received = bytearray()
            for piece in response.iter_content(chunk_size=1024 * 256):
                if not piece:
                    continue
                if len(received) + len(piece) > byte_cap:
                    return None
                received.extend(piece)
            payload = bytes(received)
    except requests.RequestException:
        return None

    # Every ZIP container, PPTX included, begins with the "PK" signature.
    if payload[:2] != b"PK":
        return None

    target = out_dir / safe_name(index, final_url, payload)
    target.write_bytes(payload)
    try:
        counts = validate_pptx(target)
    except (BadZipFile, KeyError):
        target.unlink(missing_ok=True)
        return None

    entry = {
        "file": str(target),
        "source_url": final_url,
        "bytes": len(payload),
        "content_type": declared_type,
    }
    entry.update(counts)
    return entry
196
+
197
+
198
def _keep_candidate(
    url: str,
    out_dir: Path,
    index: int,
    max_mb: int,
    seen: set[str],
    manifest: list[dict],
) -> bool:
    """Download *url* unless already tried; record and report it when it is kept."""
    normalized = url.split("#", 1)[0]  # the fragment never changes the downloaded bytes
    if normalized in seen:
        return False
    seen.add(normalized)
    item = download(normalized, out_dir, index, max_mb)
    if not item:
        return False
    manifest.append(item)
    print(f"kept {len(manifest):02d}: {Path(item['file']).name} slides={item['slides']}", flush=True)
    return True


def main() -> int:
    """Collect PPTX files from GitHub and web search until ``--target`` are kept.

    Command-line options:
        --out: Directory that receives the downloads and the manifest (required).
        --target: Number of valid decks to collect (default 20).
        --max-mb: Per-file size limit in MiB (default 50).

    Writes ``online-pptx-manifest.json`` into the output directory.

    Returns:
        0 when at least ``--target`` decks were kept, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--target", type=int, default=20)
    parser.add_argument("--max-mb", type=int, default=50)
    args = parser.parse_args()

    args.out.mkdir(parents=True, exist_ok=True)
    seen: set[str] = set()
    manifest: list[dict] = []
    # Continue numbering after any decks already present in the output directory.
    index = sum(1 for _ in args.out.glob("*.pptx")) + 1

    def target_reached() -> bool:
        return len(manifest) >= args.target

    print("source: github repositories", flush=True)
    # Test the target before pulling from a generator: merely asking it for
    # the next URL can trigger network requests.
    if not target_reached():
        for url in github_pptx_urls():
            if _keep_candidate(url, args.out, index, args.max_mb, seen, manifest):
                index += 1
                if target_reached():
                    break

    for query in DEFAULT_QUERIES:
        if target_reached():
            break
        print(f"search: {query}", flush=True)
        for url in search(query):
            if not looks_like_pptx_url(url):
                continue
            if _keep_candidate(url, args.out, index, args.max_mb, seen, manifest):
                index += 1
                if target_reached():
                    break

    manifest_path = args.out / "online-pptx-manifest.json"
    manifest_path.write_text(json.dumps({"count": len(manifest), "items": manifest}, indent=2), encoding="utf-8")
    print(f"saved manifest: {manifest_path}", flush=True)
    return 0 if target_reached() else 1
249
+
250
+
251
+ if __name__ == "__main__":
252
+ raise SystemExit(main())
@@ -0,0 +1,87 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+ from zipfile import ZipFile
8
+
9
+
10
def count_input_features(path: Path) -> dict:
    """Count slides, media parts, table tags and slide media references in a PPTX.

    Returns:
        ``slides_in`` (slide XML parts), ``images_in_package`` (entries under
        ``ppt/media/``), ``images_in_slides`` (``/media/`` targets found in the
        slides' relationship files) and ``tables_in`` (``<a:tbl`` tags found in
        slide XML).
    """
    with ZipFile(path) as archive:
        members = archive.namelist()
        known_members = set(members)
        slide_parts = [
            member
            for member in members
            if member.startswith("ppt/slides/slide") and member.endswith(".xml")
        ]
        media_total = sum(1 for member in members if member.startswith("ppt/media/"))
        table_total = 0
        referenced_total = 0
        for part in slide_parts:
            table_total += len(re.findall(rb"<a:tbl\b", archive.read(part)))
            part_path = Path(part)
            # A slide's relationships live in "_rels/<slide file>.rels" beside it.
            rels_name = f"{part_path.parent.as_posix()}/_rels/{part_path.name}.rels"
            if rels_name not in known_members:
                continue
            referenced_total += len(re.findall(rb"/media/[^\"']+", archive.read(rels_name)))
    return {
        "slides_in": len(slide_parts),
        "images_in_package": media_total,
        "images_in_slides": referenced_total,
        "tables_in": table_total,
    }
32
+
33
+
34
def count_output_features(md_path: Path) -> dict:
    """Count slide headings, image references and table lines in a Markdown file.

    A missing file is treated as empty text, so every count is zero.
    """
    markdown = ""
    if md_path.exists():
        markdown = md_path.read_text(encoding="utf-8")
    slide_headings = re.findall(r"^## Slide \d+:", markdown, flags=re.MULTILINE)
    image_refs = re.findall(r"!\[[^\]]*\]\([^)]+\)|<img\s+[^>]*src=", markdown, flags=re.IGNORECASE)
    table_lines = re.findall(r"^\|\s.*\|\s*$", markdown, flags=re.MULTILINE)
    return {
        "slides_out": len(slide_headings),
        "images_out": len(image_refs),
        "table_lines_out": len(table_lines),
    }
44
+
45
+
46
def main() -> int:
    """Compare each manifest PPTX with its converted Markdown and write a JSON report.

    Command-line options:
        --manifest: JSON manifest whose ``items`` carry a ``file`` path (required).
        --out: Conversion output root; Markdown is read from
            ``<out>/<stem>/<stem>.md`` (required).
        --report: Destination path of the JSON report (required).

    The summary section is also printed to stdout. Always returns 0.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--manifest", type=Path, required=True)
    cli.add_argument("--out", type=Path, required=True, help="conversion output root")
    cli.add_argument("--report", type=Path, required=True)
    options = cli.parse_args()

    manifest = json.loads(options.manifest.read_text(encoding="utf-8"))
    rows = []
    for entry in manifest.get("items", []):
        deck = Path(entry["file"])
        before = count_input_features(deck)
        after = count_output_features(options.out / deck.stem / f"{deck.stem}.md")
        row = {"file": deck.name}
        row.update(before)
        row.update(after)
        row["slide_delta"] = after["slides_out"] - before["slides_in"]
        row["image_delta"] = after["images_out"] - before["images_in_slides"]
        rows.append(row)

    def column_total(key: str) -> int:
        return sum(row[key] for row in rows)

    summary = {
        "count": len(rows),
        "slides_in_total": column_total("slides_in"),
        "slides_out_total": column_total("slides_out"),
        "images_in_package_total": column_total("images_in_package"),
        "images_in_slides_total": column_total("images_in_slides"),
        "images_out_total": column_total("images_out"),
        "files_with_slide_mismatch": sum(1 for row in rows if row["slide_delta"] != 0),
        "files_with_image_gap": sum(1 for row in rows if row["images_out"] < row["images_in_slides"]),
    }
    options.report.parent.mkdir(parents=True, exist_ok=True)
    options.report.write_text(json.dumps({"summary": summary, "rows": rows}, indent=2), encoding="utf-8")
    print(json.dumps(summary, indent=2))
    return 0
84
+
85
+
86
+ if __name__ == "__main__":
87
+ raise SystemExit(main())
@@ -0,0 +1,171 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import difflib
5
+ import json
6
+ import re
7
+ import shutil
8
+ import sys
9
+ from collections import Counter
10
+ from pathlib import Path
11
+
12
# Two levels up from this file: the directory that contains the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
# Put src/ at the front of sys.path so the build_corpus imports that follow can be resolved from it.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
16
+
17
+ from build_corpus.docx_exporter import export_markdown_to_docx
18
+ from build_corpus.exporter import BuildCorpusExporter
19
+
20
+
21
def markdown_stats(text: str) -> dict[str, int]:
    """Return regex-based counts of Markdown constructs found in *text*.

    The result also carries the character count (``chars``) and the line
    count (``lines``). ``code_fences`` is the number of fence lines halved,
    so it counts opening/closing pairs.
    """

    def tally(pattern: str, flags: int = 0) -> int:
        return len(re.findall(pattern, text, flags))

    stats = {}
    stats["headings"] = tally(r"^#+\s", re.M)
    stats["tables"] = tally(r"^\| .* \|$", re.M)
    stats["images"] = tally(r"!\[[^\]]*\]\([^)]+\)")
    stats["links"] = tally(r"(?<!!)\[[^\]]+\]\([^)]+\)")
    stats["math_inline"] = tally(r"\$[^$\n]+\$")
    stats["math_block"] = tally(r"^\$\$$", re.M)
    stats["code_fences"] = tally(r"^```", re.M) // 2
    stats["lists"] = tally(r"^(?:\s*)(?:[-*+]|\d+\.)\s+", re.M)
    stats["chars"] = len(text)
    stats["lines"] = len(text.splitlines())
    return stats
34
+
35
+
36
def compare_markdown(source: str, roundtrip: str) -> dict:
    """Diff two Markdown texts line by line and attach per-text statistics.

    Returns:
        ``diff_lines`` (added/removed lines in the unified diff, excluding
        the ``+++``/``---`` file headers), ``diff_preview`` (first 40 diff
        lines), ``source_stats`` and ``roundtrip_stats``.
    """
    delta = list(
        difflib.unified_diff(
            source.splitlines(),
            roundtrip.splitlines(),
            fromfile="pass1.md",
            tofile="pass2.md",
            lineterm="",
        )
    )
    changed = 0
    for row in delta:
        if row.startswith(("+++", "---")):
            continue  # file header rows, not content changes
        if row.startswith(("+", "-")):
            changed += 1
    return {
        "diff_lines": changed,
        "diff_preview": delta[:40],
        "source_stats": markdown_stats(source),
        "roundtrip_stats": markdown_stats(roundtrip),
    }
50
+
51
+
52
def collect_inputs(path: Path) -> list[Path]:
    """Return *path* itself when it is a file, else the sorted ``*.docx`` files under it.

    Files whose names start with ``~$`` are left out of the directory scan.
    """
    if not path.is_file():
        found = [candidate for candidate in path.rglob("*.docx") if not candidate.name.startswith("~$")]
        found.sort()
        return found
    return [path]
56
+
57
+
58
def run_one(docx_path: Path, out_root: Path) -> dict:
    """Run one DOCX -> Markdown -> DOCX -> Markdown round trip and compare the results.

    Artifacts go to ``<out_root>/<docx stem>/pass1`` and ``.../pass2``; an
    existing ``<out_root>/<docx stem>`` directory is deleted first.

    Returns:
        The compare_markdown() result for the two Markdown files, extended
        with the paths involved and the two exporter reports.
    """
    work_dir = out_root / docx_path.stem
    first_dir = work_dir / "pass1"
    second_dir = work_dir / "pass2"
    if work_dir.exists():
        shutil.rmtree(work_dir)
    for directory in (first_dir, second_dir):
        directory.mkdir(parents=True, exist_ok=True)

    first_report = BuildCorpusExporter(docx_path, first_dir).export()
    first_md = Path(first_report["output"])
    rebuilt = export_markdown_to_docx(first_md, first_dir, out_same_dir=False)
    rebuilt_docx = Path(rebuilt["output"])
    second_report = BuildCorpusExporter(rebuilt_docx, second_dir).export()
    second_md = Path(second_report["output"])

    outcome = compare_markdown(
        first_md.read_text(encoding="utf-8"),
        second_md.read_text(encoding="utf-8"),
    )
    outcome.update(
        {
            "input": str(docx_path),
            "pass1_markdown": str(first_md),
            "pass2_markdown": str(second_md),
            "roundtrip_docx": str(rebuilt_docx),
            "pass1_report": first_report,
            "pass2_report": second_report,
        }
    )
    return outcome
85
+
86
+
87
def summarize(results: list[dict]) -> dict:
    """Aggregate per-file round-trip results into a corpus-level summary.

    Returns:
        File count, number of files with a diff, average and maximum diff
        size, per-category mismatch/warning counters, and up to 25 results
        with the largest diffs (each diff preview cut to 12 lines).
    """
    tracked = ("headings", "tables", "images", "links", "math_inline", "math_block", "code_fences", "lists")
    tally = Counter()
    diff_sizes = []
    for entry in results:
        diff_size = entry.get("diff_lines", 0)
        diff_sizes.append(diff_size)
        before = entry["source_stats"]
        after = entry["roundtrip_stats"]
        for feature in tracked:
            if before.get(feature) != after.get(feature):
                tally[f"{feature}_mismatch"] += 1
        if diff_size:
            tally["files_with_diff"] += 1
        for stage in ("pass1", "pass2"):
            if entry.get(f"{stage}_report", {}).get("stats", {}).get("warnings"):
                tally[f"{stage}_warnings"] += 1

    examples = []
    for entry in results:
        examples.append(
            {
                "input": entry["input"],
                "diff_lines": entry["diff_lines"],
                "source_stats": entry["source_stats"],
                "roundtrip_stats": entry["roundtrip_stats"],
                "diff_preview": entry["diff_preview"][:12],
            }
        )
    examples.sort(key=lambda example: example["diff_lines"], reverse=True)

    return {
        "files": len(results),
        "files_with_diff": tally.get("files_with_diff", 0),
        "avg_diff_lines": (sum(diff_sizes) / len(results)) if results else 0,
        "max_diff_lines": max(diff_sizes, default=0),
        "mismatch_counts": dict(tally),
        "worst_examples": examples[:25],
    }
124
+
125
+
126
def main() -> int:
    """Run the round-trip audit over a DOCX file or directory and write a report.

    Command-line options:
        --source: Single DOCX file or directory of DOCX files (required).
        --out: Output directory for artifacts and ``roundtrip-report.json`` (required).
        --limit: Maximum number of files to process; 0 or less means no limit.

    A file that raises is recorded with ``diff_lines`` of -1 and left out of
    the summary. Always returns 0.
    """
    cli = argparse.ArgumentParser(description="Run DOCX -> Markdown -> DOCX -> Markdown round-trip audits across a corpus.")
    cli.add_argument("--source", type=Path, required=True, help="Single DOCX file or directory of DOCX files")
    cli.add_argument("--out", type=Path, required=True, help="Output directory for round-trip artifacts and report")
    cli.add_argument("--limit", type=int, default=0, help="Optional max number of files to process")
    options = cli.parse_args()

    documents = collect_inputs(options.source)
    if options.limit > 0:
        documents = documents[: options.limit]
    total = len(documents)

    options.out.mkdir(parents=True, exist_ok=True)
    outcomes: list[dict] = []
    for position, document in enumerate(documents, 1):
        try:
            outcome = run_one(document, options.out)
            outcomes.append(outcome)
            print(f"[{position}/{total}] {document.name} diff={outcome['diff_lines']}", flush=True)
        except Exception as exc:
            # One broken document must not stop the audit; record it and move on.
            outcomes.append(
                {
                    "input": str(document),
                    "error": repr(exc),
                    "diff_lines": -1,
                    "source_stats": {},
                    "roundtrip_stats": {},
                    "diff_preview": [],
                }
            )
            print(f"[{position}/{total}] {document.name} FAILED {exc!r}", flush=True)

    # Failures carry diff_lines == -1 and are excluded from the summary.
    completed = [outcome for outcome in outcomes if outcome.get("diff_lines", 0) >= 0]
    summary = summarize(completed)
    report_path = options.out / "roundtrip-report.json"
    report_path.write_text(
        json.dumps(
            {
                "source": str(options.source),
                "out": str(options.out),
                "summary": summary,
                "results": outcomes,
            },
            indent=2,
        ),
        encoding="utf-8",
    )
    print(json.dumps(summary, indent=2))
    print(f"WROTE {report_path}")
    return 0
168
+
169
+
170
+ if __name__ == "__main__":
171
+ raise SystemExit(main())