regen.mde 0.2.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +16 -16
- package/README.md +409 -295
- package/bin/build-corpus-editor.js +83 -81
- package/bin/build-corpus.js +41 -41
- package/bin/postinstall.js +259 -187
- package/bin/regen-mdeditor-install.js +27 -27
- package/bin/regen-mdeditor-uninstall.js +19 -19
- package/bin/validate-katex.js +93 -93
- package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
- package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
- package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
- package/desktop/BuildCorpusEditor/Program.cs +85 -81
- package/desktop/BuildCorpusEditor/app.manifest +16 -16
- package/dist/release/regen-mde-0.8.0-win-x64.zip +0 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +1 -1
- package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
- package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
- package/dist/windows-editor/wwwroot/index.html +22 -22
- package/editor-web/index.html +21 -21
- package/editor-web/src/main.jsx +1044 -399
- package/editor-web/src/styles.css +846 -602
- package/editor-web/vite.config.js +13 -13
- package/examples/build-corpus.config.example.json +21 -21
- package/installer/install-regen-mde.ps1 +214 -175
- package/installer/regen-mde.nsi +81 -81
- package/package.json +10 -6
- package/pyproject.toml +4 -3
- package/requirements.txt +5 -4
- package/scripts/build-windows-editor.ps1 +47 -47
- package/scripts/package-windows-editor.ps1 +90 -90
- package/scripts/release-dual.mjs +105 -0
- package/scripts/run-corpus.ps1 +28 -28
- package/scripts/run-editor-implementation-plane.ps1 +226 -203
- package/scripts/run-required-tests.ps1 +98 -98
- package/scripts/run-smoke.ps1 +28 -28
- package/src/build_corpus/__init__.py +1 -1
- package/src/build_corpus/docx_exporter.py +1055 -798
- package/src/build_corpus/equations.py +1345 -0
- package/src/build_corpus/exporter.py +1488 -1195
- package/src/build_corpus/frontmatter.py +302 -0
- package/src/build_corpus/ppt_exporter.py +543 -532
- package/src/build_corpus/templates/__init__.py +1 -1
- package/src/build_corpus/validate_assets.py +46 -46
- package/tools/audit_corpus.py +203 -203
- package/tools/collect_microsoft_word_templates.py +228 -228
- package/tools/collect_online_docx_corpus.py +272 -272
- package/tools/collect_online_pptx_corpus.py +252 -252
- package/tools/compare_pptx_inputs_outputs.py +87 -87
- package/tools/roundtrip_docx_corpus.py +171 -171
- package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
- package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
- package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
- package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
|
@@ -1,87 +1,87 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import argparse
|
|
4
|
-
import json
|
|
5
|
-
import re
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from zipfile import ZipFile
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def count_input_features(path: Path) -> dict:
|
|
11
|
-
with ZipFile(path) as zf:
|
|
12
|
-
names = zf.namelist()
|
|
13
|
-
slide_count = len([n for n in names if n.startswith("ppt/slides/slide") and n.endswith(".xml")])
|
|
14
|
-
media_count = len([n for n in names if n.startswith("ppt/media/")])
|
|
15
|
-
table_hint = 0
|
|
16
|
-
slide_referenced_images = 0
|
|
17
|
-
for n in names:
|
|
18
|
-
if not (n.startswith("ppt/slides/slide") and n.endswith(".xml")):
|
|
19
|
-
continue
|
|
20
|
-
xml = zf.read(n)
|
|
21
|
-
table_hint += len(re.findall(rb"<a:tbl\b", xml))
|
|
22
|
-
rels_path = f"{Path(n).parent.as_posix()}/_rels/{Path(n).name}.rels"
|
|
23
|
-
if rels_path in names:
|
|
24
|
-
rels = zf.read(rels_path)
|
|
25
|
-
slide_referenced_images += len(re.findall(rb"/media/[^\"']+", rels))
|
|
26
|
-
return {
|
|
27
|
-
"slides_in": slide_count,
|
|
28
|
-
"images_in_package": media_count,
|
|
29
|
-
"images_in_slides": slide_referenced_images,
|
|
30
|
-
"tables_in": table_hint,
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def count_output_features(md_path: Path) -> dict:
|
|
35
|
-
text = md_path.read_text(encoding="utf-8") if md_path.exists() else ""
|
|
36
|
-
slides_out = len(re.findall(r"^## Slide \d+:", text, flags=re.MULTILINE))
|
|
37
|
-
images_out = len(re.findall(r"!\[[^\]]*\]\([^)]+\)|<img\s+[^>]*src=", text, flags=re.IGNORECASE))
|
|
38
|
-
tables_out = len(re.findall(r"^\|\s.*\|\s*$", text, flags=re.MULTILINE))
|
|
39
|
-
return {
|
|
40
|
-
"slides_out": slides_out,
|
|
41
|
-
"images_out": images_out,
|
|
42
|
-
"table_lines_out": tables_out,
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def main() -> int:
|
|
47
|
-
parser = argparse.ArgumentParser()
|
|
48
|
-
parser.add_argument("--manifest", type=Path, required=True)
|
|
49
|
-
parser.add_argument("--out", type=Path, required=True, help="conversion output root")
|
|
50
|
-
parser.add_argument("--report", type=Path, required=True)
|
|
51
|
-
args = parser.parse_args()
|
|
52
|
-
|
|
53
|
-
payload = json.loads(args.manifest.read_text(encoding="utf-8"))
|
|
54
|
-
items = payload.get("items", [])
|
|
55
|
-
rows = []
|
|
56
|
-
for item in items:
|
|
57
|
-
source = Path(item["file"])
|
|
58
|
-
in_stats = count_input_features(source)
|
|
59
|
-
md = args.out / source.stem / f"{source.stem}.md"
|
|
60
|
-
out_stats = count_output_features(md)
|
|
61
|
-
rows.append({
|
|
62
|
-
"file": source.name,
|
|
63
|
-
**in_stats,
|
|
64
|
-
**out_stats,
|
|
65
|
-
"slide_delta": out_stats["slides_out"] - in_stats["slides_in"],
|
|
66
|
-
"image_delta": out_stats["images_out"] - in_stats["images_in_slides"],
|
|
67
|
-
})
|
|
68
|
-
|
|
69
|
-
summary = {
|
|
70
|
-
"count": len(rows),
|
|
71
|
-
"slides_in_total": sum(r["slides_in"] for r in rows),
|
|
72
|
-
"slides_out_total": sum(r["slides_out"] for r in rows),
|
|
73
|
-
"images_in_package_total": sum(r["images_in_package"] for r in rows),
|
|
74
|
-
"images_in_slides_total": sum(r["images_in_slides"] for r in rows),
|
|
75
|
-
"images_out_total": sum(r["images_out"] for r in rows),
|
|
76
|
-
"files_with_slide_mismatch": len([r for r in rows if r["slide_delta"] != 0]),
|
|
77
|
-
"files_with_image_gap": len([r for r in rows if r["images_out"] < r["images_in_slides"]]),
|
|
78
|
-
}
|
|
79
|
-
report = {"summary": summary, "rows": rows}
|
|
80
|
-
args.report.parent.mkdir(parents=True, exist_ok=True)
|
|
81
|
-
args.report.write_text(json.dumps(report, indent=2), encoding="utf-8")
|
|
82
|
-
print(json.dumps(summary, indent=2))
|
|
83
|
-
return 0
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
if __name__ == "__main__":
|
|
87
|
-
raise SystemExit(main())
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from zipfile import ZipFile
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def count_input_features(path: Path) -> dict:
    """Count slides, media entries and table hints inside a PPTX package.

    Args:
        path: Location of the ``.pptx`` (zip) archive to inspect.

    Returns:
        A dict with four integer entries:

        * ``slides_in`` - archive members named ``ppt/slides/slide*.xml``.
        * ``images_in_package`` - archive members under ``ppt/media/``.
        * ``images_in_slides`` - ``/media/...`` targets found in the
          relationship part (``_rels/<slide>.rels``) of each slide.
        * ``tables_in`` - ``<a:tbl`` openings found in the slide XML.
    """
    table_hint = 0
    slide_referenced_images = 0
    with ZipFile(path) as zf:
        names = zf.namelist()
        # Set lookup keeps the per-slide ``.rels`` membership test O(1);
        # probing the list directly rescanned the whole listing per slide.
        known_names = set(names)
        # Filter the slide parts once and reuse the result for both the
        # count and the scan below.
        slide_parts = [
            n for n in names
            if n.startswith("ppt/slides/slide") and n.endswith(".xml")
        ]
        media_count = sum(1 for n in names if n.startswith("ppt/media/"))
        for n in slide_parts:
            xml = zf.read(n)
            # ``\b`` keeps ``<a:tblGrid`` / ``<a:tblPr`` from counting as tables.
            table_hint += len(re.findall(rb"<a:tbl\b", xml))
            part = Path(n)
            rels_path = f"{part.parent.as_posix()}/_rels/{part.name}.rels"
            if rels_path in known_names:
                rels = zf.read(rels_path)
                slide_referenced_images += len(re.findall(rb"/media/[^\"']+", rels))
    return {
        "slides_in": len(slide_parts),
        "images_in_package": media_count,
        "images_in_slides": slide_referenced_images,
        "tables_in": table_hint,
    }
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def count_output_features(md_path: Path) -> dict:
    """Count slide headings, images and table rows in a converted Markdown file.

    Args:
        md_path: Expected location of the Markdown produced for one deck.

    Returns:
        A dict with ``slides_out`` (lines starting with ``## Slide <n>:``),
        ``images_out`` (Markdown image links plus ``<img ... src=`` tags) and
        ``table_lines_out`` (lines that look like pipe-table rows, separator
        rows included). All three are ``0`` when there is no file to read.
    """
    # ``is_file`` instead of ``exists``: a directory sitting at the expected
    # path is handled like a missing output rather than making ``read_text``
    # raise.
    text = md_path.read_text(encoding="utf-8") if md_path.is_file() else ""
    slides_out = len(re.findall(r"^## Slide \d+:", text, flags=re.MULTILINE))
    images_out = len(re.findall(r"!\[[^\]]*\]\([^)]+\)|<img\s+[^>]*src=", text, flags=re.IGNORECASE))
    tables_out = len(re.findall(r"^\|\s.*\|\s*$", text, flags=re.MULTILINE))
    return {
        "slides_out": slides_out,
        "images_out": images_out,
        "table_lines_out": tables_out,
    }
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def main() -> int:
    """Compare PPTX inputs listed in a manifest with their Markdown outputs.

    Reads the ``--manifest`` JSON, collects input and output feature counts
    for every entry under ``items``, writes the full report (summary plus
    per-file rows) to ``--report`` and prints the summary to stdout.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--manifest", type=Path, required=True)
    cli.add_argument("--out", type=Path, required=True, help="conversion output root")
    cli.add_argument("--report", type=Path, required=True)
    args = cli.parse_args()

    manifest = json.loads(args.manifest.read_text(encoding="utf-8"))
    rows = []
    for entry in manifest.get("items", []):
        source = Path(entry["file"])
        in_stats = count_input_features(source)
        # The Markdown is looked up at <out>/<stem>/<stem>.md.
        out_stats = count_output_features(args.out / source.stem / f"{source.stem}.md")
        row = {"file": source.name}
        row.update(in_stats)
        row.update(out_stats)
        row["slide_delta"] = out_stats["slides_out"] - in_stats["slides_in"]
        row["image_delta"] = out_stats["images_out"] - in_stats["images_in_slides"]
        rows.append(row)

    def column_total(key: str) -> int:
        # Sum one numeric column across all per-file rows.
        return sum(row[key] for row in rows)

    summary = {
        "count": len(rows),
        "slides_in_total": column_total("slides_in"),
        "slides_out_total": column_total("slides_out"),
        "images_in_package_total": column_total("images_in_package"),
        "images_in_slides_total": column_total("images_in_slides"),
        "images_out_total": column_total("images_out"),
        "files_with_slide_mismatch": sum(1 for row in rows if row["slide_delta"] != 0),
        "files_with_image_gap": sum(1 for row in rows if row["images_out"] < row["images_in_slides"]),
    }
    args.report.parent.mkdir(parents=True, exist_ok=True)
    args.report.write_text(json.dumps({"summary": summary, "rows": rows}, indent=2), encoding="utf-8")
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -1,171 +1,171 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import argparse
|
|
4
|
-
import difflib
|
|
5
|
-
import json
|
|
6
|
-
import re
|
|
7
|
-
import shutil
|
|
8
|
-
import sys
|
|
9
|
-
from collections import Counter
|
|
10
|
-
from pathlib import Path
|
|
11
|
-
|
|
12
|
-
ROOT = Path(__file__).resolve().parents[1]
|
|
13
|
-
SRC = ROOT / "src"
|
|
14
|
-
if str(SRC) not in sys.path:
|
|
15
|
-
sys.path.insert(0, str(SRC))
|
|
16
|
-
|
|
17
|
-
from build_corpus.docx_exporter import export_markdown_to_docx
|
|
18
|
-
from build_corpus.exporter import BuildCorpusExporter
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def markdown_stats(text: str) -> dict[str, int]:
|
|
22
|
-
return {
|
|
23
|
-
"headings": len(re.findall(r"^#+\s", text, re.M)),
|
|
24
|
-
"tables": len(re.findall(r"^\| .* \|$", text, re.M)),
|
|
25
|
-
"images": len(re.findall(r"!\[[^\]]*\]\([^)]+\)", text)),
|
|
26
|
-
"links": len(re.findall(r"(?<!!)\[[^\]]+\]\([^)]+\)", text)),
|
|
27
|
-
"math_inline": len(re.findall(r"\$[^$\n]+\$", text)),
|
|
28
|
-
"math_block": len(re.findall(r"^\$\$$", text, re.M)),
|
|
29
|
-
"code_fences": len(re.findall(r"^```", text, re.M)) // 2,
|
|
30
|
-
"lists": len(re.findall(r"^(?:\s*)(?:[-*+]|\d+\.)\s+", text, re.M)),
|
|
31
|
-
"chars": len(text),
|
|
32
|
-
"lines": len(text.splitlines()),
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def compare_markdown(source: str, roundtrip: str) -> dict:
|
|
37
|
-
source_lines = source.splitlines()
|
|
38
|
-
roundtrip_lines = roundtrip.splitlines()
|
|
39
|
-
diff = list(difflib.unified_diff(source_lines, roundtrip_lines, fromfile="pass1.md", tofile="pass2.md", lineterm=""))
|
|
40
|
-
return {
|
|
41
|
-
"diff_lines": sum(
|
|
42
|
-
1
|
|
43
|
-
for line in diff
|
|
44
|
-
if line.startswith(("+", "-")) and not line.startswith(("+++", "---"))
|
|
45
|
-
),
|
|
46
|
-
"diff_preview": diff[:40],
|
|
47
|
-
"source_stats": markdown_stats(source),
|
|
48
|
-
"roundtrip_stats": markdown_stats(roundtrip),
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def collect_inputs(path: Path) -> list[Path]:
|
|
53
|
-
if path.is_file():
|
|
54
|
-
return [path]
|
|
55
|
-
return sorted(doc for doc in path.rglob("*.docx") if not doc.name.startswith("~$"))
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def run_one(docx_path: Path, out_root: Path) -> dict:
|
|
59
|
-
slug = docx_path.stem
|
|
60
|
-
file_root = out_root / slug
|
|
61
|
-
pass1_root = file_root / "pass1"
|
|
62
|
-
pass2_root = file_root / "pass2"
|
|
63
|
-
if file_root.exists():
|
|
64
|
-
shutil.rmtree(file_root)
|
|
65
|
-
pass1_root.mkdir(parents=True, exist_ok=True)
|
|
66
|
-
pass2_root.mkdir(parents=True, exist_ok=True)
|
|
67
|
-
|
|
68
|
-
first = BuildCorpusExporter(docx_path, pass1_root).export()
|
|
69
|
-
md1 = Path(first["output"])
|
|
70
|
-
second = export_markdown_to_docx(md1, pass1_root, out_same_dir=False)
|
|
71
|
-
regenerated_docx = Path(second["output"])
|
|
72
|
-
third = BuildCorpusExporter(regenerated_docx, pass2_root).export()
|
|
73
|
-
md2 = Path(third["output"])
|
|
74
|
-
|
|
75
|
-
source_text = md1.read_text(encoding="utf-8")
|
|
76
|
-
roundtrip_text = md2.read_text(encoding="utf-8")
|
|
77
|
-
compare = compare_markdown(source_text, roundtrip_text)
|
|
78
|
-
compare["input"] = str(docx_path)
|
|
79
|
-
compare["pass1_markdown"] = str(md1)
|
|
80
|
-
compare["pass2_markdown"] = str(md2)
|
|
81
|
-
compare["roundtrip_docx"] = str(regenerated_docx)
|
|
82
|
-
compare["pass1_report"] = first
|
|
83
|
-
compare["pass2_report"] = third
|
|
84
|
-
return compare
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def summarize(results: list[dict]) -> dict:
|
|
88
|
-
counter: Counter[str] = Counter()
|
|
89
|
-
total_diff = 0
|
|
90
|
-
for result in results:
|
|
91
|
-
total_diff += result.get("diff_lines", 0)
|
|
92
|
-
source_stats = result["source_stats"]
|
|
93
|
-
roundtrip_stats = result["roundtrip_stats"]
|
|
94
|
-
for key in ("headings", "tables", "images", "links", "math_inline", "math_block", "code_fences", "lists"):
|
|
95
|
-
if source_stats.get(key) != roundtrip_stats.get(key):
|
|
96
|
-
counter[f"{key}_mismatch"] += 1
|
|
97
|
-
if result.get("diff_lines", 0):
|
|
98
|
-
counter["files_with_diff"] += 1
|
|
99
|
-
if result.get("pass1_report", {}).get("stats", {}).get("warnings"):
|
|
100
|
-
counter["pass1_warnings"] += 1
|
|
101
|
-
if result.get("pass2_report", {}).get("stats", {}).get("warnings"):
|
|
102
|
-
counter["pass2_warnings"] += 1
|
|
103
|
-
return {
|
|
104
|
-
"files": len(results),
|
|
105
|
-
"files_with_diff": counter.get("files_with_diff", 0),
|
|
106
|
-
"avg_diff_lines": (total_diff / len(results)) if results else 0,
|
|
107
|
-
"max_diff_lines": max((result.get("diff_lines", 0) for result in results), default=0),
|
|
108
|
-
"mismatch_counts": dict(counter),
|
|
109
|
-
"worst_examples": sorted(
|
|
110
|
-
[
|
|
111
|
-
{
|
|
112
|
-
"input": result["input"],
|
|
113
|
-
"diff_lines": result["diff_lines"],
|
|
114
|
-
"source_stats": result["source_stats"],
|
|
115
|
-
"roundtrip_stats": result["roundtrip_stats"],
|
|
116
|
-
"diff_preview": result["diff_preview"][:12],
|
|
117
|
-
}
|
|
118
|
-
for result in results
|
|
119
|
-
],
|
|
120
|
-
key=lambda item: item["diff_lines"],
|
|
121
|
-
reverse=True,
|
|
122
|
-
)[:25],
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
def main() -> int:
|
|
127
|
-
parser = argparse.ArgumentParser(description="Run DOCX -> Markdown -> DOCX -> Markdown round-trip audits across a corpus.")
|
|
128
|
-
parser.add_argument("--source", type=Path, required=True, help="Single DOCX file or directory of DOCX files")
|
|
129
|
-
parser.add_argument("--out", type=Path, required=True, help="Output directory for round-trip artifacts and report")
|
|
130
|
-
parser.add_argument("--limit", type=int, default=0, help="Optional max number of files to process")
|
|
131
|
-
args = parser.parse_args()
|
|
132
|
-
|
|
133
|
-
inputs = collect_inputs(args.source)
|
|
134
|
-
if args.limit > 0:
|
|
135
|
-
inputs = inputs[: args.limit]
|
|
136
|
-
|
|
137
|
-
args.out.mkdir(parents=True, exist_ok=True)
|
|
138
|
-
results: list[dict] = []
|
|
139
|
-
for index, docx_path in enumerate(inputs, 1):
|
|
140
|
-
try:
|
|
141
|
-
result = run_one(docx_path, args.out)
|
|
142
|
-
results.append(result)
|
|
143
|
-
print(f"[{index}/{len(inputs)}] {docx_path.name} diff={result['diff_lines']}", flush=True)
|
|
144
|
-
except Exception as exc:
|
|
145
|
-
failure = {
|
|
146
|
-
"input": str(docx_path),
|
|
147
|
-
"error": repr(exc),
|
|
148
|
-
"diff_lines": -1,
|
|
149
|
-
"source_stats": {},
|
|
150
|
-
"roundtrip_stats": {},
|
|
151
|
-
"diff_preview": [],
|
|
152
|
-
}
|
|
153
|
-
results.append(failure)
|
|
154
|
-
print(f"[{index}/{len(inputs)}] {docx_path.name} FAILED {exc!r}", flush=True)
|
|
155
|
-
|
|
156
|
-
summary = summarize([result for result in results if result.get("diff_lines", 0) >= 0])
|
|
157
|
-
report = {
|
|
158
|
-
"source": str(args.source),
|
|
159
|
-
"out": str(args.out),
|
|
160
|
-
"summary": summary,
|
|
161
|
-
"results": results,
|
|
162
|
-
}
|
|
163
|
-
report_path = args.out / "roundtrip-report.json"
|
|
164
|
-
report_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
|
|
165
|
-
print(json.dumps(summary, indent=2))
|
|
166
|
-
print(f"WROTE {report_path}")
|
|
167
|
-
return 0
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
if __name__ == "__main__":
|
|
171
|
-
raise SystemExit(main())
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import difflib
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import sys
|
|
9
|
+
from collections import Counter
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
ROOT = Path(__file__).resolve().parents[1]
|
|
13
|
+
SRC = ROOT / "src"
|
|
14
|
+
if str(SRC) not in sys.path:
|
|
15
|
+
sys.path.insert(0, str(SRC))
|
|
16
|
+
|
|
17
|
+
from build_corpus.docx_exporter import export_markdown_to_docx
|
|
18
|
+
from build_corpus.exporter import BuildCorpusExporter
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def markdown_stats(text: str) -> dict[str, int]:
    """Return rough, regex-based counts of Markdown constructs in *text*.

    The counts are heuristics: each one is simply the number of matches of a
    pattern. ``math_block`` counts lines consisting solely of ``$$`` and
    ``code_fences`` is the number of lines starting with three backticks,
    halved. ``chars`` and ``lines`` describe the raw text size.
    """

    def occurrences(pattern: str, flags: int = 0) -> int:
        # Number of non-overlapping matches of ``pattern`` in the text.
        return sum(1 for _ in re.finditer(pattern, text, flags))

    stats: dict[str, int] = {}
    stats["headings"] = occurrences(r"^#+\s", re.M)
    stats["tables"] = occurrences(r"^\| .* \|$", re.M)
    stats["images"] = occurrences(r"!\[[^\]]*\]\([^)]+\)")
    # The look-behind keeps image links out of the plain link count.
    stats["links"] = occurrences(r"(?<!!)\[[^\]]+\]\([^)]+\)")
    stats["math_inline"] = occurrences(r"\$[^$\n]+\$")
    stats["math_block"] = occurrences(r"^\$\$$", re.M)
    stats["code_fences"] = occurrences(r"^```", re.M) // 2
    stats["lists"] = occurrences(r"^(?:\s*)(?:[-*+]|\d+\.)\s+", re.M)
    stats["chars"] = len(text)
    stats["lines"] = len(text.splitlines())
    return stats
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def compare_markdown(source: str, roundtrip: str) -> dict:
    """Diff two Markdown texts line by line and attach per-text statistics.

    Args:
        source: Markdown from the first export pass.
        roundtrip: Markdown from the second (round-tripped) export pass.

    Returns:
        A dict with ``diff_lines`` (number of added plus removed lines in the
        unified diff), ``diff_preview`` (the first 40 diff lines) and the
        ``markdown_stats`` of both texts under ``source_stats`` and
        ``roundtrip_stats``.
    """
    diff = list(
        difflib.unified_diff(
            source.splitlines(),
            roundtrip.splitlines(),
            fromfile="pass1.md",
            tofile="pass2.md",
            lineterm="",
        )
    )
    # A non-empty unified diff starts with exactly two header lines
    # ("--- pass1.md", "+++ pass2.md"). Skip them by position: filtering on
    # the "---"/"+++" prefix also discarded real changes, e.g. a removed
    # horizontal rule or front-matter fence "---" (rendered as "----") or an
    # added line that itself starts with "++".
    changed_lines = sum(1 for line in diff[2:] if line.startswith(("+", "-")))
    return {
        "diff_lines": changed_lines,
        "diff_preview": diff[:40],
        "source_stats": markdown_stats(source),
        "roundtrip_stats": markdown_stats(roundtrip),
    }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def collect_inputs(path: Path) -> list[Path]:
    """Resolve *path* to the sorted list of DOCX files to process.

    A path that is a file is returned as the only entry, whatever its
    extension. Anything else is searched recursively for ``*.docx``, leaving
    out names that start with ``~$``.
    """
    if not path.is_file():
        found = [
            candidate
            for candidate in path.rglob("*.docx")
            if not candidate.name.startswith("~$")
        ]
        found.sort()
        return found
    return [path]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def run_one(docx_path: Path, out_root: Path) -> dict:
    """Round-trip one DOCX file and compare the two Markdown passes.

    Steps: DOCX -> Markdown into ``<out_root>/<stem>/pass1``, that Markdown
    -> DOCX (also written under ``pass1``), regenerated DOCX -> Markdown into
    ``<out_root>/<stem>/pass2``. The two Markdown files are then diffed with
    ``compare_markdown``.

    Returns:
        The ``compare_markdown`` result extended with the input path, the
        paths of both Markdown files and the regenerated DOCX, and the
        exporter reports of both passes.
    """
    # NOTE: the working directory is keyed on the file stem only, so two
    # inputs sharing a stem use (and wipe) the same directory.
    work_dir = out_root / docx_path.stem
    pass1_dir = work_dir / "pass1"
    pass2_dir = work_dir / "pass2"
    # Drop artifacts left by an earlier run before recreating the layout.
    if work_dir.exists():
        shutil.rmtree(work_dir)
    for directory in (pass1_dir, pass2_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Each step's report is expected to expose the written file as "output".
    pass1_report = BuildCorpusExporter(docx_path, pass1_dir).export()
    pass1_md = Path(pass1_report["output"])
    docx_report = export_markdown_to_docx(pass1_md, pass1_dir, out_same_dir=False)
    rebuilt_docx = Path(docx_report["output"])
    pass2_report = BuildCorpusExporter(rebuilt_docx, pass2_dir).export()
    pass2_md = Path(pass2_report["output"])

    outcome = compare_markdown(
        pass1_md.read_text(encoding="utf-8"),
        pass2_md.read_text(encoding="utf-8"),
    )
    outcome.update(
        {
            "input": str(docx_path),
            "pass1_markdown": str(pass1_md),
            "pass2_markdown": str(pass2_md),
            "roundtrip_docx": str(rebuilt_docx),
            "pass1_report": pass1_report,
            "pass2_report": pass2_report,
        }
    )
    return outcome
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def summarize(results: list[dict]) -> dict:
    """Aggregate per-file round-trip results into one summary dict.

    Args:
        results: Per-file result dicts; each must provide ``source_stats``,
            ``roundtrip_stats``, ``input``, ``diff_lines`` and
            ``diff_preview``.

    Returns:
        A dict with the file count, the number of files with a non-zero diff,
        average and maximum ``diff_lines``, the per-feature mismatch tallies
        and up to 25 entries with the largest diffs (previews trimmed to 12
        lines).
    """
    tracked = ("headings", "tables", "images", "links", "math_inline", "math_block", "code_fences", "lists")
    tally: Counter[str] = Counter()
    diff_sum = 0
    for entry in results:
        changed = entry.get("diff_lines", 0)
        diff_sum += changed
        before = entry["source_stats"]
        after = entry["roundtrip_stats"]
        # One tick per feature whose count differs between the two passes.
        tally.update(
            f"{feature}_mismatch"
            for feature in tracked
            if before.get(feature) != after.get(feature)
        )
        if changed:
            tally["files_with_diff"] += 1
        for label in ("pass1", "pass2"):
            if entry.get(f"{label}_report", {}).get("stats", {}).get("warnings"):
                tally[f"{label}_warnings"] += 1

    ranked = [
        {
            "input": entry["input"],
            "diff_lines": entry["diff_lines"],
            "source_stats": entry["source_stats"],
            "roundtrip_stats": entry["roundtrip_stats"],
            "diff_preview": entry["diff_preview"][:12],
        }
        for entry in results
    ]
    # Largest diffs first; the sort is stable, so ties keep input order.
    ranked.sort(key=lambda item: item["diff_lines"], reverse=True)

    average = diff_sum / len(results) if results else 0
    return {
        "files": len(results),
        "files_with_diff": tally.get("files_with_diff", 0),
        "avg_diff_lines": average,
        "max_diff_lines": max((entry.get("diff_lines", 0) for entry in results), default=0),
        "mismatch_counts": dict(tally),
        "worst_examples": ranked[:25],
    }
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def main() -> int:
    """Run the round-trip audit over a DOCX file or directory.

    Every input goes through ``run_one``; a failure is recorded as a result
    entry with ``diff_lines`` set to ``-1`` and the run moves on to the next
    file. The summary (computed over the non-failed entries only) and all
    results are written to ``<out>/roundtrip-report.json``; the summary is
    also printed.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser(description="Run DOCX -> Markdown -> DOCX -> Markdown round-trip audits across a corpus.")
    cli.add_argument("--source", type=Path, required=True, help="Single DOCX file or directory of DOCX files")
    cli.add_argument("--out", type=Path, required=True, help="Output directory for round-trip artifacts and report")
    cli.add_argument("--limit", type=int, default=0, help="Optional max number of files to process")
    args = cli.parse_args()

    inputs = collect_inputs(args.source)
    # A limit of zero (or less) means "process everything".
    if args.limit > 0:
        inputs = inputs[: args.limit]

    args.out.mkdir(parents=True, exist_ok=True)
    total = len(inputs)
    results: list[dict] = []
    for index, docx_path in enumerate(inputs, 1):
        progress = f"[{index}/{total}] {docx_path.name}"
        try:
            outcome = run_one(docx_path, args.out)
            results.append(outcome)
            print(f"{progress} diff={outcome['diff_lines']}", flush=True)
        except Exception as exc:
            # Record the failure with the same keys as a normal result.
            results.append(
                {
                    "input": str(docx_path),
                    "error": repr(exc),
                    "diff_lines": -1,
                    "source_stats": {},
                    "roundtrip_stats": {},
                    "diff_preview": [],
                }
            )
            print(f"{progress} FAILED {exc!r}", flush=True)

    # Failed entries carry diff_lines == -1 and are left out of the summary.
    completed = [entry for entry in results if entry.get("diff_lines", 0) >= 0]
    summary = summarize(completed)
    report_path = args.out / "roundtrip-report.json"
    report_path.write_text(
        json.dumps(
            {
                "source": str(args.source),
                "out": str(args.out),
                "summary": summary,
                "results": results,
            },
            indent=2,
        ),
        encoding="utf-8",
    )
    print(json.dumps(summary, indent=2))
    print(f"WROTE {report_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
Binary file
|
|
Binary file
|