regen.mde 0.2.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +16 -16
- package/README.md +409 -295
- package/bin/build-corpus-editor.js +83 -81
- package/bin/build-corpus.js +41 -41
- package/bin/postinstall.js +259 -187
- package/bin/regen-mdeditor-install.js +27 -27
- package/bin/regen-mdeditor-uninstall.js +19 -19
- package/bin/validate-katex.js +93 -93
- package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
- package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
- package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
- package/desktop/BuildCorpusEditor/Program.cs +85 -81
- package/desktop/BuildCorpusEditor/app.manifest +16 -16
- package/dist/release/regen-mde-0.8.0-win-x64.zip +0 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +1 -1
- package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
- package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
- package/dist/windows-editor/wwwroot/index.html +22 -22
- package/editor-web/index.html +21 -21
- package/editor-web/src/main.jsx +1044 -399
- package/editor-web/src/styles.css +846 -602
- package/editor-web/vite.config.js +13 -13
- package/examples/build-corpus.config.example.json +21 -21
- package/installer/install-regen-mde.ps1 +214 -175
- package/installer/regen-mde.nsi +81 -81
- package/package.json +10 -6
- package/pyproject.toml +4 -3
- package/requirements.txt +5 -4
- package/scripts/build-windows-editor.ps1 +47 -47
- package/scripts/package-windows-editor.ps1 +90 -90
- package/scripts/release-dual.mjs +105 -0
- package/scripts/run-corpus.ps1 +28 -28
- package/scripts/run-editor-implementation-plane.ps1 +226 -203
- package/scripts/run-required-tests.ps1 +98 -98
- package/scripts/run-smoke.ps1 +28 -28
- package/src/build_corpus/__init__.py +1 -1
- package/src/build_corpus/docx_exporter.py +1055 -798
- package/src/build_corpus/equations.py +1345 -0
- package/src/build_corpus/exporter.py +1488 -1195
- package/src/build_corpus/frontmatter.py +302 -0
- package/src/build_corpus/ppt_exporter.py +543 -532
- package/src/build_corpus/templates/__init__.py +1 -1
- package/src/build_corpus/validate_assets.py +46 -46
- package/tools/audit_corpus.py +203 -203
- package/tools/collect_microsoft_word_templates.py +228 -228
- package/tools/collect_online_docx_corpus.py +272 -272
- package/tools/collect_online_pptx_corpus.py +252 -252
- package/tools/compare_pptx_inputs_outputs.py +87 -87
- package/tools/roundtrip_docx_corpus.py +171 -171
- package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
- package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
- package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
- package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
|
|
@@ -1,46 +1,46 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import argparse
|
|
4
|
-
import json
|
|
5
|
-
import re
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
# Markdown image syntax; group 1 is everything between the parentheses.
IMAGE_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")

# Destinations with these prefixes never name a file on disk.
_REMOTE_PREFIXES = ("http://", "https://", "data:")
# Optional quoted title Markdown allows after the destination: ![alt](path "Title")
_IMAGE_TITLE_RE = re.compile(r"""\s+(?:"[^"]*"|'[^']*')\s*$""")


def _image_destination(ref: str) -> str:
    """Reduce the text captured by IMAGE_RE to the bare link destination.

    Drops an optional trailing quoted title and the ``<...>`` wrapper that
    Markdown uses for destinations containing spaces.
    """
    destination = _IMAGE_TITLE_RE.sub("", ref.strip())
    if len(destination) > 1 and destination[0] == "<" and destination[-1] == ">":
        destination = destination[1:-1]
    return destination


def _resolves(base: Path, candidate: str) -> bool:
    """Return True when *candidate* exists relative to *base* or the working directory."""
    try:
        return (base / candidate).exists() or Path(candidate).exists()
    except (OSError, ValueError):
        # A name the filesystem rejects outright (too long, embedded NUL) cannot exist.
        return False


def validate_file(path: Path) -> dict:
    """Check that every local image referenced by a Markdown file exists.

    Args:
        path: Markdown file to scan. It is read as UTF-8 with undecodable
            bytes replaced.

    Returns:
        A dict with ``file`` (the path as a string), ``image_refs`` (number of
        image references found, remote ones included), ``missing_refs``
        (number of unresolved local references) and ``missing_samples`` (at
        most the first 50 unresolved references, exactly as written).
    """
    from urllib.parse import unquote

    text = path.read_text(encoding="utf-8", errors="replace")
    refs = IMAGE_RE.findall(text)
    missing = []
    for ref in refs:
        destination = _image_destination(ref)
        if destination.lower().startswith(_REMOTE_PREFIXES):
            continue
        # Probe the raw capture first so anything that resolved before still
        # does, then the cleaned destination and its percent-decoded form.
        candidates = dict.fromkeys((ref, destination, unquote(destination)))
        if not any(_resolves(path.parent, candidate) for candidate in candidates if candidate):
            missing.append(ref)
    return {
        "file": str(path),
        "image_refs": len(refs),
        "missing_refs": len(missing),
        "missing_samples": missing[:50],
    }
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def collect_markdown(path: Path) -> list[Path]:
    """Return the Markdown files designated by *path*.

    Args:
        path: Either a single file, returned as-is whatever its suffix, or a
            directory that is searched recursively for ``*.md`` files.

    Returns:
        The matching files in sorted order.
    """
    if path.is_file():
        return [path]
    # rglob also yields directories whose name ends in ".md"; reading one as
    # text would raise, so keep regular files only.
    return sorted(candidate for candidate in path.rglob("*.md") if candidate.is_file())
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def main() -> None:
    """Validate image references for the Markdown target given on the command line.

    Prints a JSON report (file count plus one result per file) to stdout and
    exits with status 1 when any file has missing references, 0 otherwise.
    A target that does not exist is reported as a usage error by argparse
    (exit status 2) instead of passing silently with zero files.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("target", type=Path)
    args = parser.parse_args()

    if not args.target.exists():
        parser.error(f"target does not exist: {args.target}")

    results = [validate_file(path) for path in collect_markdown(args.target)]
    print(json.dumps({"files": len(results), "results": results}, indent=2))
    raise SystemExit(1 if any(result["missing_refs"] for result in results) else 0)


if __name__ == "__main__":
    main()
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Markdown image syntax; group 1 is everything between the parentheses.
IMAGE_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")

# Destinations with these prefixes never name a file on disk.
_REMOTE_PREFIXES = ("http://", "https://", "data:")
# Optional quoted title Markdown allows after the destination: ![alt](path "Title")
_IMAGE_TITLE_RE = re.compile(r"""\s+(?:"[^"]*"|'[^']*')\s*$""")


def _image_destination(ref: str) -> str:
    """Reduce the text captured by IMAGE_RE to the bare link destination.

    Drops an optional trailing quoted title and the ``<...>`` wrapper that
    Markdown uses for destinations containing spaces.
    """
    destination = _IMAGE_TITLE_RE.sub("", ref.strip())
    if len(destination) > 1 and destination[0] == "<" and destination[-1] == ">":
        destination = destination[1:-1]
    return destination


def _resolves(base: Path, candidate: str) -> bool:
    """Return True when *candidate* exists relative to *base* or the working directory."""
    try:
        return (base / candidate).exists() or Path(candidate).exists()
    except (OSError, ValueError):
        # A name the filesystem rejects outright (too long, embedded NUL) cannot exist.
        return False


def validate_file(path: Path) -> dict:
    """Check that every local image referenced by a Markdown file exists.

    Args:
        path: Markdown file to scan. It is read as UTF-8 with undecodable
            bytes replaced.

    Returns:
        A dict with ``file`` (the path as a string), ``image_refs`` (number of
        image references found, remote ones included), ``missing_refs``
        (number of unresolved local references) and ``missing_samples`` (at
        most the first 50 unresolved references, exactly as written).
    """
    from urllib.parse import unquote

    text = path.read_text(encoding="utf-8", errors="replace")
    refs = IMAGE_RE.findall(text)
    missing = []
    for ref in refs:
        destination = _image_destination(ref)
        if destination.lower().startswith(_REMOTE_PREFIXES):
            continue
        # Probe the raw capture first so anything that resolved before still
        # does, then the cleaned destination and its percent-decoded form.
        candidates = dict.fromkeys((ref, destination, unquote(destination)))
        if not any(_resolves(path.parent, candidate) for candidate in candidates if candidate):
            missing.append(ref)
    return {
        "file": str(path),
        "image_refs": len(refs),
        "missing_refs": len(missing),
        "missing_samples": missing[:50],
    }
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def collect_markdown(path: Path) -> list[Path]:
    """Return the Markdown files designated by *path*.

    Args:
        path: Either a single file, returned as-is whatever its suffix, or a
            directory that is searched recursively for ``*.md`` files.

    Returns:
        The matching files in sorted order.
    """
    if path.is_file():
        return [path]
    # rglob also yields directories whose name ends in ".md"; reading one as
    # text would raise, so keep regular files only.
    return sorted(candidate for candidate in path.rglob("*.md") if candidate.is_file())
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def main() -> None:
    """Validate image references for the Markdown target given on the command line.

    Prints a JSON report (file count plus one result per file) to stdout and
    exits with status 1 when any file has missing references, 0 otherwise.
    A target that does not exist is reported as a usage error by argparse
    (exit status 2) instead of passing silently with zero files.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("target", type=Path)
    args = parser.parse_args()

    if not args.target.exists():
        parser.error(f"target does not exist: {args.target}")

    results = [validate_file(path) for path in collect_markdown(args.target)]
    print(json.dumps({"files": len(results), "results": results}, indent=2))
    raise SystemExit(1 if any(result["missing_refs"] for result in results) else 0)


if __name__ == "__main__":
    main()
|
package/tools/audit_corpus.py
CHANGED
|
@@ -1,203 +1,203 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import argparse
|
|
4
|
-
import json
|
|
5
|
-
import re
|
|
6
|
-
from dataclasses import dataclass
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from zipfile import ZipFile
|
|
9
|
-
from xml.etree import ElementTree as ET
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
# WordprocessingML namespace, plus the "{namespace}" prefix ElementTree puts on tags.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
W = f"{{{W_NS}}}"
# Office Math elements (oMath, oMathPara) live in their own namespace, so they
# can never match when looked up under the WordprocessingML prefix.
MATH_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
MATH = f"{{{MATH_NS}}}"

# Elements whose text payload counts as paragraph content.
TEXT_TAGS = {
    f"{W}t",
    f"{W}delText",
    f"{W}instrText",
}

# Text elements plus elements that count as content by their mere presence.
CONTENT_TAGS = TEXT_TAGS | {
    f"{W}drawing",
    f"{W}object",
    f"{W}pict",
    f"{MATH}oMath",
    f"{MATH}oMathPara",
    f"{W}noBreakHyphen",
    f"{W}softHyphen",
    f"{W}tab",
    f"{W}br",
    f"{W}cr",
}

# A Markdown image: ![alt](destination).
IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# An image immediately followed, or immediately preceded, by a character that
# is neither whitespace nor one of the brackets/punctuation marks listed in
# the lookarounds.
IMAGE_GLUE_RE = re.compile(
    r"!\[[^\]]*\]\([^)]+\)(?=[^\s<>)\].,;:!?])|(?<=[^\s<(\[.,;:!?])!\[[^\]]*\]\([^)]+\)"
)
# Four or more asterisks that do not start at the beginning of a line and are
# not immediately followed by the end of a line.
FOUR_PLUS_STARS_RE = re.compile(r"(?<!^)\*{4,}(?!$)", re.M)
# A single backtick. Not referenced by the code visible in this file view.
ODD_BACKTICK_LINE_RE = re.compile(r"`")
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@dataclass
class SourceStats:
    """Paragraph counters collected from a source document."""

    # Every paragraph element encountered.
    total_paragraphs: int = 0
    # Paragraphs for which paragraph_has_content() returned True.
    nonempty_paragraphs: int = 0
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def local_name(tag: str) -> str:
    """Return *tag* without its ``{namespace}`` prefix.

    A tag that carries no namespace is returned unchanged.
    """
    # rsplit yields the whole string as its only item when no "}" is present,
    # so no separate membership test is needed.
    return tag.rsplit("}", 1)[-1]
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def paragraph_has_content(p: ET.Element) -> bool:
    """Return True when paragraph element *p* carries content.

    Content is either a TEXT_TAGS element whose text is not blank, or any
    other CONTENT_TAGS element. The paragraph-properties subtree is ignored.

    Args:
        p: Paragraph element to inspect, together with its descendants.
    """
    # Computed once per call rather than once per visited node.
    marker_tags = CONTENT_TAGS - TEXT_TAGS
    properties_tag = f"{W}pPr"
    pending = [p]
    while pending:
        node = pending.pop()
        if node.tag == properties_tag:
            # w:pPr holds formatting only. Its w:tabs/w:tab children define
            # tab stops and must not be mistaken for tab characters in a run.
            continue
        if node.tag in TEXT_TAGS and (node.text or "").strip():
            return True
        if node.tag in marker_tags:
            return True
        pending.extend(node)
    return False
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def source_stats(docx_path: Path) -> SourceStats:
    """Count the paragraphs in the ``word/document.xml`` part of a .docx file.

    Args:
        docx_path: Path to the .docx archive.

    Returns:
        A SourceStats with the number of paragraph elements found anywhere in
        the part and how many of them paragraph_has_content() accepts.
    """
    # The archive is only needed to read the part, so close it before walking the tree.
    with ZipFile(docx_path) as zf:
        root = ET.fromstring(zf.read("word/document.xml"))

    stats = SourceStats()
    for p in root.iter(f"{W}p"):
        stats.total_paragraphs += 1
        if paragraph_has_content(p):
            stats.nonempty_paragraphs += 1
    return stats
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def load_json(path: Path) -> dict | list:
    """Parse the JSON document stored at *path* and return the decoded value."""
    # "utf-8-sig" decodes plain UTF-8 unchanged and drops a leading byte-order
    # mark, which json.loads would otherwise reject.
    return json.loads(path.read_text(encoding="utf-8-sig"))
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
# Fenced-code marker: up to three spaces of indent, then three or more ` or ~.
_CODE_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")


def count_odd_backtick_lines(text: str) -> int:
    """Count lines of *text* that contain an odd number of backticks.

    Fence lines of fenced code blocks and the lines inside such blocks are
    skipped: a three-backtick fence is odd by construction and backticks
    inside code are literal, so neither indicates an unbalanced inline span.

    Args:
        text: Markdown source.

    Returns:
        The number of lines outside fenced code whose backtick count is odd.
    """
    count = 0
    open_fence = ""  # marker of the fenced block currently open, if any
    for line in text.splitlines():
        fence = _CODE_FENCE_RE.match(line)
        marker = fence.group(1) if fence else ""
        rest = line[fence.end():] if fence else ""
        if open_fence:
            # A block is closed by the same fence character, at least as long
            # as the opening marker, with nothing but whitespace after it.
            if marker[:1] == open_fence[0] and len(marker) >= len(open_fence) and not rest.strip():
                open_fence = ""
            continue
        # A backtick fence may not be followed by more backticks on its line;
        # such a line is inline code, not a fence.
        if marker and not (marker[0] == "`" and "`" in rest):
            open_fence = marker
            continue
        if line.count("`") % 2:
            count += 1
    return count
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def scan_markdown(md_path: Path) -> dict[str, int]:
    """Collect formatting counters from a Markdown file.

    Args:
        md_path: Markdown file, read as UTF-8 with undecodable bytes replaced.

    Returns:
        A dict with ``image_count``, ``image_glue_count``,
        ``four_plus_stars_count`` and ``odd_backtick_line_count``.
    """
    text = md_path.read_text(encoding="utf-8", errors="replace")
    four_plus = 0
    for match in FOUR_PLUS_STARS_RE.finditer(text):
        # The matched text is asterisks only, so stripping "*" from it can
        # never leave anything. Inspect the containing line instead and
        # ignore lines made solely of asterisks and whitespace.
        line_start = text.rfind("\n", 0, match.start()) + 1
        line_end = text.find("\n", match.end())
        line = text[line_start:] if line_end < 0 else text[line_start:line_end]
        if line.replace("*", "").strip():
            four_plus += 1
    return {
        "image_count": len(IMAGE_RE.findall(text)),
        "image_glue_count": len(IMAGE_GLUE_RE.findall(text)),
        "four_plus_stars_count": four_plus,
        "odd_backtick_line_count": count_odd_backtick_lines(text),
    }
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def audit_entry(entry: dict) -> dict:
    """Audit one batch-report entry against its source document and outputs.

    Args:
        entry: Batch-report item. ``input`` and ``output`` are required;
            ``stats`` is optional and compared with the export report.

    Returns:
        A dict that always contains ``input``, ``output`` and ``problems``.
        When the source document, the Markdown output and a well-formed
        ``export-report.json`` next to the output are all present, it also
        carries the paragraph, image and formatting counters and the report's
        warnings.
    """
    input_path = Path(entry["input"])
    output_path = Path(entry["output"])
    export_report_path = output_path.parent / "export-report.json"

    problems: list[str] = []

    def incomplete(problem: str) -> dict:
        # Shared shape of every early return: the audit cannot go further.
        problems.append(problem)
        return {
            "input": str(input_path),
            "output": str(output_path),
            "problems": problems,
        }

    # Report a missing source as a problem for this entry rather than letting
    # the exception abort the audit of the whole batch.
    if not input_path.exists():
        return incomplete("missing_source_document")
    source = source_stats(input_path)

    if not output_path.exists():
        return incomplete("missing_markdown_output")
    if not export_report_path.exists():
        return incomplete("missing_export_report")

    export_report = load_json(export_report_path)
    if not isinstance(export_report, dict):
        # load_json may return a list, which has no "stats" to read.
        return incomplete("invalid_export_report")

    report_stats = export_report.get("stats", {})
    batch_stats = entry.get("stats", {})
    markdown = scan_markdown(output_path)

    rendered_block_count = (
        int(report_stats.get("paragraphs", 0))
        + int(report_stats.get("headings", 0))
        + int(report_stats.get("lists", 0))
        + int(report_stats.get("code_blocks", 0))
    )
    report_images = int(report_stats.get("images", 0))

    if report_stats != batch_stats:
        problems.append("batch_report_mismatch")
    if rendered_block_count != source.nonempty_paragraphs:
        problems.append("paragraph_count_mismatch")
    if markdown["image_count"] != report_images:
        problems.append("image_count_mismatch")
    if markdown["image_glue_count"]:
        problems.append("image_glue")
    if markdown["four_plus_stars_count"]:
        problems.append("four_plus_stars")
    if markdown["odd_backtick_line_count"]:
        problems.append("odd_backtick_lines")

    return {
        "input": str(input_path),
        "output": str(output_path),
        "source_total_paragraphs": source.total_paragraphs,
        "source_nonempty_paragraphs": source.nonempty_paragraphs,
        "report_rendered_blocks": rendered_block_count,
        "report_images": report_images,
        "markdown_images": markdown["image_count"],
        "image_glue_count": markdown["image_glue_count"],
        "four_plus_stars_count": markdown["four_plus_stars_count"],
        "odd_backtick_line_count": markdown["odd_backtick_line_count"],
        "warnings": list(report_stats.get("warnings", [])),
        "problems": problems,
    }
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def summarize(results: list[dict]) -> dict:
    """Aggregate per-file audit results into a summary.

    Args:
        results: Dicts as returned by audit_entry().

    Returns:
        A dict with ``files_audited``, ``files_with_problems``,
        ``problem_counts`` (occurrences per problem label, in first-seen
        order) and ``problem_examples`` (at most the first 25 results that
        have problems).
    """
    # Filter once and reuse. Results whose "problems" is missing, empty or
    # None contribute nothing to the counts.
    flagged = [result for result in results if result.get("problems")]
    problem_counts: dict[str, int] = {}
    for result in flagged:
        for problem in result["problems"]:
            problem_counts[problem] = problem_counts.get(problem, 0) + 1
    return {
        "files_audited": len(results),
        "files_with_problems": len(flagged),
        "problem_counts": problem_counts,
        "problem_examples": flagged[:25],
    }
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
def main() -> int:
    """Audit every entry of a batch report and write the audit report.

    Reads the JSON array given by ``--batch-report``, audits each entry,
    writes the full payload to ``--out`` (default: ``corpus-audit-report.json``
    next to the batch report) and prints the summary to stdout.

    Returns:
        0 once the report has been written, whether or not problems were found.

    Raises:
        SystemExit: When the batch report is not a JSON array.
    """
    parser = argparse.ArgumentParser(description="Audit build-corpus batch output against source .docx files.")
    parser.add_argument("--batch-report", required=True, help="Path to build-corpus-batch-report.json")
    parser.add_argument("--out", default="", help="Optional path for the audit JSON report")
    args = parser.parse_args()

    batch_report_path = Path(args.batch_report).resolve()
    entries = load_json(batch_report_path)
    if not isinstance(entries, list):
        raise SystemExit("Batch report must be a JSON array.")

    results = [audit_entry(entry) for entry in entries]
    summary = summarize(results)
    payload = {
        "batch_report": str(batch_report_path),
        "summary": summary,
        "results": results,
    }

    out_path = Path(args.out).resolve() if args.out else batch_report_path.with_name("corpus-audit-report.json")
    # An --out path may point into a directory that does not exist yet; create
    # it so the finished audit is not lost at the very last step.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(json.dumps(summary, indent=2))
    print(f"WROTE {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from zipfile import ZipFile
|
|
9
|
+
from xml.etree import ElementTree as ET
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# WordprocessingML namespace, plus the "{namespace}" prefix ElementTree puts on tags.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
W = f"{{{W_NS}}}"
# Office Math elements (oMath, oMathPara) live in their own namespace, so they
# can never match when looked up under the WordprocessingML prefix.
MATH_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
MATH = f"{{{MATH_NS}}}"

# Elements whose text payload counts as paragraph content.
TEXT_TAGS = {
    f"{W}t",
    f"{W}delText",
    f"{W}instrText",
}

# Text elements plus elements that count as content by their mere presence.
CONTENT_TAGS = TEXT_TAGS | {
    f"{W}drawing",
    f"{W}object",
    f"{W}pict",
    f"{MATH}oMath",
    f"{MATH}oMathPara",
    f"{W}noBreakHyphen",
    f"{W}softHyphen",
    f"{W}tab",
    f"{W}br",
    f"{W}cr",
}

# A Markdown image: ![alt](destination).
IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# An image immediately followed, or immediately preceded, by a character that
# is neither whitespace nor one of the brackets/punctuation marks listed in
# the lookarounds.
IMAGE_GLUE_RE = re.compile(
    r"!\[[^\]]*\]\([^)]+\)(?=[^\s<>)\].,;:!?])|(?<=[^\s<(\[.,;:!?])!\[[^\]]*\]\([^)]+\)"
)
# Four or more asterisks that do not start at the beginning of a line and are
# not immediately followed by the end of a line.
FOUR_PLUS_STARS_RE = re.compile(r"(?<!^)\*{4,}(?!$)", re.M)
# A single backtick. Not referenced by the code visible in this file view.
ODD_BACKTICK_LINE_RE = re.compile(r"`")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class SourceStats:
    """Paragraph counters collected from a source document."""

    # Every paragraph element encountered.
    total_paragraphs: int = 0
    # Paragraphs for which paragraph_has_content() returned True.
    nonempty_paragraphs: int = 0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def local_name(tag: str) -> str:
    """Return *tag* without its ``{namespace}`` prefix.

    A tag that carries no namespace is returned unchanged.
    """
    # rsplit yields the whole string as its only item when no "}" is present,
    # so no separate membership test is needed.
    return tag.rsplit("}", 1)[-1]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def paragraph_has_content(p: ET.Element) -> bool:
    """Return True when paragraph element *p* carries content.

    Content is either a TEXT_TAGS element whose text is not blank, or any
    other CONTENT_TAGS element. The paragraph-properties subtree is ignored.

    Args:
        p: Paragraph element to inspect, together with its descendants.
    """
    # Computed once per call rather than once per visited node.
    marker_tags = CONTENT_TAGS - TEXT_TAGS
    properties_tag = f"{W}pPr"
    pending = [p]
    while pending:
        node = pending.pop()
        if node.tag == properties_tag:
            # w:pPr holds formatting only. Its w:tabs/w:tab children define
            # tab stops and must not be mistaken for tab characters in a run.
            continue
        if node.tag in TEXT_TAGS and (node.text or "").strip():
            return True
        if node.tag in marker_tags:
            return True
        pending.extend(node)
    return False
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def source_stats(docx_path: Path) -> SourceStats:
    """Count the paragraphs in the ``word/document.xml`` part of a .docx file.

    Args:
        docx_path: Path to the .docx archive.

    Returns:
        A SourceStats with the number of paragraph elements found anywhere in
        the part and how many of them paragraph_has_content() accepts.
    """
    # The archive is only needed to read the part, so close it before walking the tree.
    with ZipFile(docx_path) as zf:
        root = ET.fromstring(zf.read("word/document.xml"))

    stats = SourceStats()
    for p in root.iter(f"{W}p"):
        stats.total_paragraphs += 1
        if paragraph_has_content(p):
            stats.nonempty_paragraphs += 1
    return stats
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def load_json(path: Path) -> dict | list:
    """Parse the JSON document stored at *path* and return the decoded value."""
    # "utf-8-sig" decodes plain UTF-8 unchanged and drops a leading byte-order
    # mark, which json.loads would otherwise reject.
    return json.loads(path.read_text(encoding="utf-8-sig"))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Fenced-code marker: up to three spaces of indent, then three or more ` or ~.
_CODE_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")


def count_odd_backtick_lines(text: str) -> int:
    """Count lines of *text* that contain an odd number of backticks.

    Fence lines of fenced code blocks and the lines inside such blocks are
    skipped: a three-backtick fence is odd by construction and backticks
    inside code are literal, so neither indicates an unbalanced inline span.

    Args:
        text: Markdown source.

    Returns:
        The number of lines outside fenced code whose backtick count is odd.
    """
    count = 0
    open_fence = ""  # marker of the fenced block currently open, if any
    for line in text.splitlines():
        fence = _CODE_FENCE_RE.match(line)
        marker = fence.group(1) if fence else ""
        rest = line[fence.end():] if fence else ""
        if open_fence:
            # A block is closed by the same fence character, at least as long
            # as the opening marker, with nothing but whitespace after it.
            if marker[:1] == open_fence[0] and len(marker) >= len(open_fence) and not rest.strip():
                open_fence = ""
            continue
        # A backtick fence may not be followed by more backticks on its line;
        # such a line is inline code, not a fence.
        if marker and not (marker[0] == "`" and "`" in rest):
            open_fence = marker
            continue
        if line.count("`") % 2:
            count += 1
    return count
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def scan_markdown(md_path: Path) -> dict[str, int]:
    """Collect formatting counters from a Markdown file.

    Args:
        md_path: Markdown file, read as UTF-8 with undecodable bytes replaced.

    Returns:
        A dict with ``image_count``, ``image_glue_count``,
        ``four_plus_stars_count`` and ``odd_backtick_line_count``.
    """
    text = md_path.read_text(encoding="utf-8", errors="replace")
    four_plus = 0
    for match in FOUR_PLUS_STARS_RE.finditer(text):
        # The matched text is asterisks only, so stripping "*" from it can
        # never leave anything. Inspect the containing line instead and
        # ignore lines made solely of asterisks and whitespace.
        line_start = text.rfind("\n", 0, match.start()) + 1
        line_end = text.find("\n", match.end())
        line = text[line_start:] if line_end < 0 else text[line_start:line_end]
        if line.replace("*", "").strip():
            four_plus += 1
    return {
        "image_count": len(IMAGE_RE.findall(text)),
        "image_glue_count": len(IMAGE_GLUE_RE.findall(text)),
        "four_plus_stars_count": four_plus,
        "odd_backtick_line_count": count_odd_backtick_lines(text),
    }
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def audit_entry(entry: dict) -> dict:
    """Audit one batch-report entry against its source document and outputs.

    Args:
        entry: Batch-report item. ``input`` and ``output`` are required;
            ``stats`` is optional and compared with the export report.

    Returns:
        A dict that always contains ``input``, ``output`` and ``problems``.
        When the source document, the Markdown output and a well-formed
        ``export-report.json`` next to the output are all present, it also
        carries the paragraph, image and formatting counters and the report's
        warnings.
    """
    input_path = Path(entry["input"])
    output_path = Path(entry["output"])
    export_report_path = output_path.parent / "export-report.json"

    problems: list[str] = []

    def incomplete(problem: str) -> dict:
        # Shared shape of every early return: the audit cannot go further.
        problems.append(problem)
        return {
            "input": str(input_path),
            "output": str(output_path),
            "problems": problems,
        }

    # Report a missing source as a problem for this entry rather than letting
    # the exception abort the audit of the whole batch.
    if not input_path.exists():
        return incomplete("missing_source_document")
    source = source_stats(input_path)

    if not output_path.exists():
        return incomplete("missing_markdown_output")
    if not export_report_path.exists():
        return incomplete("missing_export_report")

    export_report = load_json(export_report_path)
    if not isinstance(export_report, dict):
        # load_json may return a list, which has no "stats" to read.
        return incomplete("invalid_export_report")

    report_stats = export_report.get("stats", {})
    batch_stats = entry.get("stats", {})
    markdown = scan_markdown(output_path)

    rendered_block_count = (
        int(report_stats.get("paragraphs", 0))
        + int(report_stats.get("headings", 0))
        + int(report_stats.get("lists", 0))
        + int(report_stats.get("code_blocks", 0))
    )
    report_images = int(report_stats.get("images", 0))

    if report_stats != batch_stats:
        problems.append("batch_report_mismatch")
    if rendered_block_count != source.nonempty_paragraphs:
        problems.append("paragraph_count_mismatch")
    if markdown["image_count"] != report_images:
        problems.append("image_count_mismatch")
    if markdown["image_glue_count"]:
        problems.append("image_glue")
    if markdown["four_plus_stars_count"]:
        problems.append("four_plus_stars")
    if markdown["odd_backtick_line_count"]:
        problems.append("odd_backtick_lines")

    return {
        "input": str(input_path),
        "output": str(output_path),
        "source_total_paragraphs": source.total_paragraphs,
        "source_nonempty_paragraphs": source.nonempty_paragraphs,
        "report_rendered_blocks": rendered_block_count,
        "report_images": report_images,
        "markdown_images": markdown["image_count"],
        "image_glue_count": markdown["image_glue_count"],
        "four_plus_stars_count": markdown["four_plus_stars_count"],
        "odd_backtick_line_count": markdown["odd_backtick_line_count"],
        "warnings": list(report_stats.get("warnings", [])),
        "problems": problems,
    }
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def summarize(results: list[dict]) -> dict:
    """Aggregate per-file audit results into a summary.

    Args:
        results: Dicts as returned by audit_entry().

    Returns:
        A dict with ``files_audited``, ``files_with_problems``,
        ``problem_counts`` (occurrences per problem label, in first-seen
        order) and ``problem_examples`` (at most the first 25 results that
        have problems).
    """
    # Filter once and reuse. Results whose "problems" is missing, empty or
    # None contribute nothing to the counts.
    flagged = [result for result in results if result.get("problems")]
    problem_counts: dict[str, int] = {}
    for result in flagged:
        for problem in result["problems"]:
            problem_counts[problem] = problem_counts.get(problem, 0) + 1
    return {
        "files_audited": len(results),
        "files_with_problems": len(flagged),
        "problem_counts": problem_counts,
        "problem_examples": flagged[:25],
    }
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def main() -> int:
    """Audit every entry of a batch report and write the audit report.

    Reads the JSON array given by ``--batch-report``, audits each entry,
    writes the full payload to ``--out`` (default: ``corpus-audit-report.json``
    next to the batch report) and prints the summary to stdout.

    Returns:
        0 once the report has been written, whether or not problems were found.

    Raises:
        SystemExit: When the batch report is not a JSON array.
    """
    parser = argparse.ArgumentParser(description="Audit build-corpus batch output against source .docx files.")
    parser.add_argument("--batch-report", required=True, help="Path to build-corpus-batch-report.json")
    parser.add_argument("--out", default="", help="Optional path for the audit JSON report")
    args = parser.parse_args()

    batch_report_path = Path(args.batch_report).resolve()
    entries = load_json(batch_report_path)
    if not isinstance(entries, list):
        raise SystemExit("Batch report must be a JSON array.")

    results = [audit_entry(entry) for entry in entries]
    summary = summarize(results)
    payload = {
        "batch_report": str(batch_report_path),
        "summary": summary,
        "results": results,
    }

    out_path = Path(args.out).resolve() if args.out else batch_report_path.with_name("corpus-audit-report.json")
    # An --out path may point into a directory that does not exist yet; create
    # it so the finished audit is not lost at the very last step.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(json.dumps(summary, indent=2))
    print(f"WROTE {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|