regen.mde 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/LICENSE +16 -16
  2. package/bin/build-corpus-editor.js +83 -83
  3. package/bin/build-corpus.js +41 -41
  4. package/bin/regen-mdeditor-install.js +27 -27
  5. package/bin/regen-mdeditor-uninstall.js +19 -19
  6. package/bin/validate-katex.js +93 -93
  7. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
  8. package/desktop/BuildCorpusEditor/EditorForm.cs +58 -58
  9. package/desktop/BuildCorpusEditor/app.manifest +16 -16
  10. package/dist/release/{regen-mde-0.8.0-win-x64.zip → regen-mde-0.6.1-win-x64.zip} +0 -0
  11. package/dist/release/{regen-mde-0.8.1-win-x64.zip → regen-mde-0.8.2-win-x64.zip} +0 -0
  12. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  13. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  14. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  15. package/dist/windows-editor/wwwroot/index.html +20 -20
  16. package/editor-web/index.html +21 -21
  17. package/editor-web/src/main.jsx +107 -107
  18. package/editor-web/src/styles.css +99 -99
  19. package/editor-web/vite.config.js +13 -13
  20. package/examples/build-corpus.config.example.json +21 -21
  21. package/installer/install-regen-mde.ps1 +214 -214
  22. package/installer/regen-mde.nsi +81 -81
  23. package/package.json +1 -1
  24. package/pyproject.toml +1 -1
  25. package/scripts/build-windows-editor.ps1 +47 -47
  26. package/scripts/package-windows-editor.ps1 +90 -90
  27. package/scripts/run-corpus.ps1 +28 -28
  28. package/scripts/run-editor-implementation-plane.ps1 +226 -226
  29. package/scripts/run-required-tests.ps1 +98 -98
  30. package/scripts/run-smoke.ps1 +28 -28
  31. package/src/build_corpus/__init__.py +3 -3
  32. package/src/build_corpus/docx_exporter.py +10 -4
  33. package/src/build_corpus/equations.py +1345 -1345
  34. package/src/build_corpus/templates/__init__.py +1 -1
  35. package/src/build_corpus/validate_assets.py +46 -46
  36. package/tools/audit_corpus.py +203 -203
  37. package/tools/collect_microsoft_word_templates.py +228 -228
  38. package/tools/collect_online_docx_corpus.py +272 -272
  39. package/tools/collect_online_pptx_corpus.py +252 -252
  40. package/tools/compare_pptx_inputs_outputs.py +87 -87
  41. package/tools/roundtrip_docx_corpus.py +171 -171
@@ -1 +1 @@
1
-
1
+
@@ -1,46 +1,46 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import json
5
- import re
6
- from pathlib import Path
7
-
8
-
9
- IMAGE_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
10
-
11
-
12
- def validate_file(path: Path) -> dict:
13
- text = path.read_text(encoding="utf-8", errors="replace")
14
- refs = IMAGE_RE.findall(text)
15
- missing = []
16
- for ref in refs:
17
- if ref.startswith(("http://", "https://", "data:")):
18
- continue
19
- if not (path.parent / ref).exists() and not Path(ref).exists():
20
- missing.append(ref)
21
- return {
22
- "file": str(path),
23
- "image_refs": len(refs),
24
- "missing_refs": len(missing),
25
- "missing_samples": missing[:50],
26
- }
27
-
28
-
29
- def collect_markdown(path: Path) -> list[Path]:
30
- if path.is_file():
31
- return [path]
32
- return sorted(path.rglob("*.md"))
33
-
34
-
35
- def main() -> None:
36
- parser = argparse.ArgumentParser()
37
- parser.add_argument("target", type=Path)
38
- args = parser.parse_args()
39
-
40
- results = [validate_file(path) for path in collect_markdown(args.target)]
41
- print(json.dumps({"files": len(results), "results": results}, indent=2))
42
- raise SystemExit(1 if any(result["missing_refs"] for result in results) else 0)
43
-
44
-
45
- if __name__ == "__main__":
46
- main()
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+
8
+
9
+ IMAGE_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
10
+
11
+
12
+ def validate_file(path: Path) -> dict:
13
+ text = path.read_text(encoding="utf-8", errors="replace")
14
+ refs = IMAGE_RE.findall(text)
15
+ missing = []
16
+ for ref in refs:
17
+ if ref.startswith(("http://", "https://", "data:")):
18
+ continue
19
+ if not (path.parent / ref).exists() and not Path(ref).exists():
20
+ missing.append(ref)
21
+ return {
22
+ "file": str(path),
23
+ "image_refs": len(refs),
24
+ "missing_refs": len(missing),
25
+ "missing_samples": missing[:50],
26
+ }
27
+
28
+
29
+ def collect_markdown(path: Path) -> list[Path]:
30
+ if path.is_file():
31
+ return [path]
32
+ return sorted(path.rglob("*.md"))
33
+
34
+
35
+ def main() -> None:
36
+ parser = argparse.ArgumentParser()
37
+ parser.add_argument("target", type=Path)
38
+ args = parser.parse_args()
39
+
40
+ results = [validate_file(path) for path in collect_markdown(args.target)]
41
+ print(json.dumps({"files": len(results), "results": results}, indent=2))
42
+ raise SystemExit(1 if any(result["missing_refs"] for result in results) else 0)
43
+
44
+
45
+ if __name__ == "__main__":
46
+ main()
@@ -1,203 +1,203 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import json
5
- import re
6
- from dataclasses import dataclass
7
- from pathlib import Path
8
- from zipfile import ZipFile
9
- from xml.etree import ElementTree as ET
10
-
11
-
12
- W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
13
- W = f"{{{W_NS}}}"
14
-
15
- TEXT_TAGS = {
16
- f"{W}t",
17
- f"{W}delText",
18
- f"{W}instrText",
19
- }
20
-
21
- CONTENT_TAGS = TEXT_TAGS | {
22
- f"{W}drawing",
23
- f"{W}object",
24
- f"{W}pict",
25
- f"{W}oMath",
26
- f"{W}oMathPara",
27
- f"{W}noBreakHyphen",
28
- f"{W}softHyphen",
29
- f"{W}tab",
30
- f"{W}br",
31
- f"{W}cr",
32
- }
33
-
34
- IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
35
- IMAGE_GLUE_RE = re.compile(
36
- r"!\[[^\]]*\]\([^)]+\)(?=[^\s<>)\].,;:!?])|(?<=[^\s<(\[.,;:!?])!\[[^\]]*\]\([^)]+\)"
37
- )
38
- FOUR_PLUS_STARS_RE = re.compile(r"(?<!^)\*{4,}(?!$)", re.M)
39
- ODD_BACKTICK_LINE_RE = re.compile(r"`")
40
-
41
-
42
- @dataclass
43
- class SourceStats:
44
- total_paragraphs: int = 0
45
- nonempty_paragraphs: int = 0
46
-
47
-
48
- def local_name(tag: str) -> str:
49
- return tag.rsplit("}", 1)[-1] if "}" in tag else tag
50
-
51
-
52
- def paragraph_has_content(p: ET.Element) -> bool:
53
- for node in p.iter():
54
- if node.tag in TEXT_TAGS and (node.text or "").strip():
55
- return True
56
- if node.tag in CONTENT_TAGS - TEXT_TAGS:
57
- return True
58
- return False
59
-
60
-
61
- def source_stats(docx_path: Path) -> SourceStats:
62
- stats = SourceStats()
63
- with ZipFile(docx_path) as zf:
64
- root = ET.fromstring(zf.read("word/document.xml"))
65
- for p in root.iter(f"{W}p"):
66
- stats.total_paragraphs += 1
67
- if paragraph_has_content(p):
68
- stats.nonempty_paragraphs += 1
69
- return stats
70
-
71
-
72
- def load_json(path: Path) -> dict | list:
73
- return json.loads(path.read_text(encoding="utf-8"))
74
-
75
-
76
- def count_odd_backtick_lines(text: str) -> int:
77
- count = 0
78
- for line in text.splitlines():
79
- if line.count("`") % 2:
80
- count += 1
81
- return count
82
-
83
-
84
- def scan_markdown(md_path: Path) -> dict[str, int]:
85
- text = md_path.read_text(encoding="utf-8", errors="replace")
86
- four_plus = 0
87
- for match in FOUR_PLUS_STARS_RE.finditer(text):
88
- if match.group(0).strip("*"):
89
- four_plus += 1
90
- return {
91
- "image_count": len(IMAGE_RE.findall(text)),
92
- "image_glue_count": len(IMAGE_GLUE_RE.findall(text)),
93
- "four_plus_stars_count": four_plus,
94
- "odd_backtick_line_count": count_odd_backtick_lines(text),
95
- }
96
-
97
-
98
- def audit_entry(entry: dict) -> dict:
99
- input_path = Path(entry["input"])
100
- output_path = Path(entry["output"])
101
- export_report_path = output_path.parent / "export-report.json"
102
-
103
- problems: list[str] = []
104
- source = source_stats(input_path)
105
-
106
- if not output_path.exists():
107
- problems.append("missing_markdown_output")
108
- return {
109
- "input": str(input_path),
110
- "output": str(output_path),
111
- "problems": problems,
112
- }
113
-
114
- if not export_report_path.exists():
115
- problems.append("missing_export_report")
116
- return {
117
- "input": str(input_path),
118
- "output": str(output_path),
119
- "problems": problems,
120
- }
121
-
122
- export_report = load_json(export_report_path)
123
- report_stats = export_report.get("stats", {})
124
- batch_stats = entry.get("stats", {})
125
- markdown = scan_markdown(output_path)
126
-
127
- rendered_block_count = (
128
- int(report_stats.get("paragraphs", 0))
129
- + int(report_stats.get("headings", 0))
130
- + int(report_stats.get("lists", 0))
131
- + int(report_stats.get("code_blocks", 0))
132
- )
133
-
134
- if report_stats != batch_stats:
135
- problems.append("batch_report_mismatch")
136
- if rendered_block_count != source.nonempty_paragraphs:
137
- problems.append("paragraph_count_mismatch")
138
- if markdown["image_count"] != int(report_stats.get("images", 0)):
139
- problems.append("image_count_mismatch")
140
- if markdown["image_glue_count"]:
141
- problems.append("image_glue")
142
- if markdown["four_plus_stars_count"]:
143
- problems.append("four_plus_stars")
144
- if markdown["odd_backtick_line_count"]:
145
- problems.append("odd_backtick_lines")
146
-
147
- return {
148
- "input": str(input_path),
149
- "output": str(output_path),
150
- "source_total_paragraphs": source.total_paragraphs,
151
- "source_nonempty_paragraphs": source.nonempty_paragraphs,
152
- "report_rendered_blocks": rendered_block_count,
153
- "report_images": int(report_stats.get("images", 0)),
154
- "markdown_images": markdown["image_count"],
155
- "image_glue_count": markdown["image_glue_count"],
156
- "four_plus_stars_count": markdown["four_plus_stars_count"],
157
- "odd_backtick_line_count": markdown["odd_backtick_line_count"],
158
- "warnings": list(report_stats.get("warnings", [])),
159
- "problems": problems,
160
- }
161
-
162
-
163
- def summarize(results: list[dict]) -> dict:
164
- problem_counts: dict[str, int] = {}
165
- for result in results:
166
- for problem in result.get("problems", []):
167
- problem_counts[problem] = problem_counts.get(problem, 0) + 1
168
- return {
169
- "files_audited": len(results),
170
- "files_with_problems": sum(1 for result in results if result.get("problems")),
171
- "problem_counts": problem_counts,
172
- "problem_examples": [result for result in results if result.get("problems")][:25],
173
- }
174
-
175
-
176
- def main() -> int:
177
- parser = argparse.ArgumentParser(description="Audit build-corpus batch output against source .docx files.")
178
- parser.add_argument("--batch-report", required=True, help="Path to build-corpus-batch-report.json")
179
- parser.add_argument("--out", default="", help="Optional path for the audit JSON report")
180
- args = parser.parse_args()
181
-
182
- batch_report_path = Path(args.batch_report).resolve()
183
- entries = load_json(batch_report_path)
184
- if not isinstance(entries, list):
185
- raise SystemExit("Batch report must be a JSON array.")
186
-
187
- results = [audit_entry(entry) for entry in entries]
188
- summary = summarize(results)
189
- payload = {
190
- "batch_report": str(batch_report_path),
191
- "summary": summary,
192
- "results": results,
193
- }
194
-
195
- out_path = Path(args.out).resolve() if args.out else batch_report_path.with_name("corpus-audit-report.json")
196
- out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
197
- print(json.dumps(summary, indent=2))
198
- print(f"WROTE {out_path}")
199
- return 0
200
-
201
-
202
- if __name__ == "__main__":
203
- raise SystemExit(main())
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from zipfile import ZipFile
9
+ from xml.etree import ElementTree as ET
10
+
11
+
12
+ W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
13
+ W = f"{{{W_NS}}}"
14
+
15
+ TEXT_TAGS = {
16
+ f"{W}t",
17
+ f"{W}delText",
18
+ f"{W}instrText",
19
+ }
20
+
21
+ CONTENT_TAGS = TEXT_TAGS | {
22
+ f"{W}drawing",
23
+ f"{W}object",
24
+ f"{W}pict",
25
+ f"{W}oMath",
26
+ f"{W}oMathPara",
27
+ f"{W}noBreakHyphen",
28
+ f"{W}softHyphen",
29
+ f"{W}tab",
30
+ f"{W}br",
31
+ f"{W}cr",
32
+ }
33
+
34
+ IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
35
+ IMAGE_GLUE_RE = re.compile(
36
+ r"!\[[^\]]*\]\([^)]+\)(?=[^\s<>)\].,;:!?])|(?<=[^\s<(\[.,;:!?])!\[[^\]]*\]\([^)]+\)"
37
+ )
38
+ FOUR_PLUS_STARS_RE = re.compile(r"(?<!^)\*{4,}(?!$)", re.M)
39
+ ODD_BACKTICK_LINE_RE = re.compile(r"`")
40
+
41
+
42
+ @dataclass
43
+ class SourceStats:
44
+ total_paragraphs: int = 0
45
+ nonempty_paragraphs: int = 0
46
+
47
+
48
+ def local_name(tag: str) -> str:
49
+ return tag.rsplit("}", 1)[-1] if "}" in tag else tag
50
+
51
+
52
+ def paragraph_has_content(p: ET.Element) -> bool:
53
+ for node in p.iter():
54
+ if node.tag in TEXT_TAGS and (node.text or "").strip():
55
+ return True
56
+ if node.tag in CONTENT_TAGS - TEXT_TAGS:
57
+ return True
58
+ return False
59
+
60
+
61
+ def source_stats(docx_path: Path) -> SourceStats:
62
+ stats = SourceStats()
63
+ with ZipFile(docx_path) as zf:
64
+ root = ET.fromstring(zf.read("word/document.xml"))
65
+ for p in root.iter(f"{W}p"):
66
+ stats.total_paragraphs += 1
67
+ if paragraph_has_content(p):
68
+ stats.nonempty_paragraphs += 1
69
+ return stats
70
+
71
+
72
+ def load_json(path: Path) -> dict | list:
73
+ return json.loads(path.read_text(encoding="utf-8"))
74
+
75
+
76
+ def count_odd_backtick_lines(text: str) -> int:
77
+ count = 0
78
+ for line in text.splitlines():
79
+ if line.count("`") % 2:
80
+ count += 1
81
+ return count
82
+
83
+
84
+ def scan_markdown(md_path: Path) -> dict[str, int]:
85
+ text = md_path.read_text(encoding="utf-8", errors="replace")
86
+ four_plus = 0
87
+ for match in FOUR_PLUS_STARS_RE.finditer(text):
88
+ if match.group(0).strip("*"):
89
+ four_plus += 1
90
+ return {
91
+ "image_count": len(IMAGE_RE.findall(text)),
92
+ "image_glue_count": len(IMAGE_GLUE_RE.findall(text)),
93
+ "four_plus_stars_count": four_plus,
94
+ "odd_backtick_line_count": count_odd_backtick_lines(text),
95
+ }
96
+
97
+
98
+ def audit_entry(entry: dict) -> dict:
99
+ input_path = Path(entry["input"])
100
+ output_path = Path(entry["output"])
101
+ export_report_path = output_path.parent / "export-report.json"
102
+
103
+ problems: list[str] = []
104
+ source = source_stats(input_path)
105
+
106
+ if not output_path.exists():
107
+ problems.append("missing_markdown_output")
108
+ return {
109
+ "input": str(input_path),
110
+ "output": str(output_path),
111
+ "problems": problems,
112
+ }
113
+
114
+ if not export_report_path.exists():
115
+ problems.append("missing_export_report")
116
+ return {
117
+ "input": str(input_path),
118
+ "output": str(output_path),
119
+ "problems": problems,
120
+ }
121
+
122
+ export_report = load_json(export_report_path)
123
+ report_stats = export_report.get("stats", {})
124
+ batch_stats = entry.get("stats", {})
125
+ markdown = scan_markdown(output_path)
126
+
127
+ rendered_block_count = (
128
+ int(report_stats.get("paragraphs", 0))
129
+ + int(report_stats.get("headings", 0))
130
+ + int(report_stats.get("lists", 0))
131
+ + int(report_stats.get("code_blocks", 0))
132
+ )
133
+
134
+ if report_stats != batch_stats:
135
+ problems.append("batch_report_mismatch")
136
+ if rendered_block_count != source.nonempty_paragraphs:
137
+ problems.append("paragraph_count_mismatch")
138
+ if markdown["image_count"] != int(report_stats.get("images", 0)):
139
+ problems.append("image_count_mismatch")
140
+ if markdown["image_glue_count"]:
141
+ problems.append("image_glue")
142
+ if markdown["four_plus_stars_count"]:
143
+ problems.append("four_plus_stars")
144
+ if markdown["odd_backtick_line_count"]:
145
+ problems.append("odd_backtick_lines")
146
+
147
+ return {
148
+ "input": str(input_path),
149
+ "output": str(output_path),
150
+ "source_total_paragraphs": source.total_paragraphs,
151
+ "source_nonempty_paragraphs": source.nonempty_paragraphs,
152
+ "report_rendered_blocks": rendered_block_count,
153
+ "report_images": int(report_stats.get("images", 0)),
154
+ "markdown_images": markdown["image_count"],
155
+ "image_glue_count": markdown["image_glue_count"],
156
+ "four_plus_stars_count": markdown["four_plus_stars_count"],
157
+ "odd_backtick_line_count": markdown["odd_backtick_line_count"],
158
+ "warnings": list(report_stats.get("warnings", [])),
159
+ "problems": problems,
160
+ }
161
+
162
+
163
+ def summarize(results: list[dict]) -> dict:
164
+ problem_counts: dict[str, int] = {}
165
+ for result in results:
166
+ for problem in result.get("problems", []):
167
+ problem_counts[problem] = problem_counts.get(problem, 0) + 1
168
+ return {
169
+ "files_audited": len(results),
170
+ "files_with_problems": sum(1 for result in results if result.get("problems")),
171
+ "problem_counts": problem_counts,
172
+ "problem_examples": [result for result in results if result.get("problems")][:25],
173
+ }
174
+
175
+
176
+ def main() -> int:
177
+ parser = argparse.ArgumentParser(description="Audit build-corpus batch output against source .docx files.")
178
+ parser.add_argument("--batch-report", required=True, help="Path to build-corpus-batch-report.json")
179
+ parser.add_argument("--out", default="", help="Optional path for the audit JSON report")
180
+ args = parser.parse_args()
181
+
182
+ batch_report_path = Path(args.batch_report).resolve()
183
+ entries = load_json(batch_report_path)
184
+ if not isinstance(entries, list):
185
+ raise SystemExit("Batch report must be a JSON array.")
186
+
187
+ results = [audit_entry(entry) for entry in entries]
188
+ summary = summarize(results)
189
+ payload = {
190
+ "batch_report": str(batch_report_path),
191
+ "summary": summary,
192
+ "results": results,
193
+ }
194
+
195
+ out_path = Path(args.out).resolve() if args.out else batch_report_path.with_name("corpus-audit-report.json")
196
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
197
+ print(json.dumps(summary, indent=2))
198
+ print(f"WROTE {out_path}")
199
+ return 0
200
+
201
+
202
+ if __name__ == "__main__":
203
+ raise SystemExit(main())