regen.mde 0.2.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/LICENSE +16 -16
  2. package/README.md +409 -295
  3. package/bin/build-corpus-editor.js +83 -81
  4. package/bin/build-corpus.js +41 -41
  5. package/bin/postinstall.js +259 -187
  6. package/bin/regen-mdeditor-install.js +27 -27
  7. package/bin/regen-mdeditor-uninstall.js +19 -19
  8. package/bin/validate-katex.js +93 -93
  9. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
  10. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
  11. package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
  12. package/desktop/BuildCorpusEditor/Program.cs +85 -81
  13. package/desktop/BuildCorpusEditor/app.manifest +16 -16
  14. package/dist/release/regen-mde-0.8.0-win-x64.zip +0 -0
  15. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  16. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  17. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  18. package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +1 -1
  19. package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
  20. package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
  21. package/dist/windows-editor/wwwroot/index.html +22 -22
  22. package/editor-web/index.html +21 -21
  23. package/editor-web/src/main.jsx +1044 -399
  24. package/editor-web/src/styles.css +846 -602
  25. package/editor-web/vite.config.js +13 -13
  26. package/examples/build-corpus.config.example.json +21 -21
  27. package/installer/install-regen-mde.ps1 +214 -175
  28. package/installer/regen-mde.nsi +81 -81
  29. package/package.json +10 -6
  30. package/pyproject.toml +4 -3
  31. package/requirements.txt +5 -4
  32. package/scripts/build-windows-editor.ps1 +47 -47
  33. package/scripts/package-windows-editor.ps1 +90 -90
  34. package/scripts/release-dual.mjs +105 -0
  35. package/scripts/run-corpus.ps1 +28 -28
  36. package/scripts/run-editor-implementation-plane.ps1 +226 -203
  37. package/scripts/run-required-tests.ps1 +98 -98
  38. package/scripts/run-smoke.ps1 +28 -28
  39. package/src/build_corpus/__init__.py +1 -1
  40. package/src/build_corpus/docx_exporter.py +1055 -798
  41. package/src/build_corpus/equations.py +1345 -0
  42. package/src/build_corpus/exporter.py +1488 -1195
  43. package/src/build_corpus/frontmatter.py +302 -0
  44. package/src/build_corpus/ppt_exporter.py +543 -532
  45. package/src/build_corpus/templates/__init__.py +1 -1
  46. package/src/build_corpus/validate_assets.py +46 -46
  47. package/tools/audit_corpus.py +203 -203
  48. package/tools/collect_microsoft_word_templates.py +228 -228
  49. package/tools/collect_online_docx_corpus.py +272 -272
  50. package/tools/collect_online_pptx_corpus.py +252 -252
  51. package/tools/compare_pptx_inputs_outputs.py +87 -87
  52. package/tools/roundtrip_docx_corpus.py +171 -171
  53. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  54. package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
  55. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
  56. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
@@ -1 +1 @@
1
-
1
+
@@ -1,46 +1,46 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import json
5
- import re
6
- from pathlib import Path
7
-
8
-
9
- IMAGE_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
10
-
11
-
12
- def validate_file(path: Path) -> dict:
13
- text = path.read_text(encoding="utf-8", errors="replace")
14
- refs = IMAGE_RE.findall(text)
15
- missing = []
16
- for ref in refs:
17
- if ref.startswith(("http://", "https://", "data:")):
18
- continue
19
- if not (path.parent / ref).exists() and not Path(ref).exists():
20
- missing.append(ref)
21
- return {
22
- "file": str(path),
23
- "image_refs": len(refs),
24
- "missing_refs": len(missing),
25
- "missing_samples": missing[:50],
26
- }
27
-
28
-
29
- def collect_markdown(path: Path) -> list[Path]:
30
- if path.is_file():
31
- return [path]
32
- return sorted(path.rglob("*.md"))
33
-
34
-
35
- def main() -> None:
36
- parser = argparse.ArgumentParser()
37
- parser.add_argument("target", type=Path)
38
- args = parser.parse_args()
39
-
40
- results = [validate_file(path) for path in collect_markdown(args.target)]
41
- print(json.dumps({"files": len(results), "results": results}, indent=2))
42
- raise SystemExit(1 if any(result["missing_refs"] for result in results) else 0)
43
-
44
-
45
- if __name__ == "__main__":
46
- main()
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+
8
+
9
+ IMAGE_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
10
+
11
+
12
+ def validate_file(path: Path) -> dict:
13
+ text = path.read_text(encoding="utf-8", errors="replace")
14
+ refs = IMAGE_RE.findall(text)
15
+ missing = []
16
+ for ref in refs:
17
+ if ref.startswith(("http://", "https://", "data:")):
18
+ continue
19
+ if not (path.parent / ref).exists() and not Path(ref).exists():
20
+ missing.append(ref)
21
+ return {
22
+ "file": str(path),
23
+ "image_refs": len(refs),
24
+ "missing_refs": len(missing),
25
+ "missing_samples": missing[:50],
26
+ }
27
+
28
+
29
+ def collect_markdown(path: Path) -> list[Path]:
30
+ if path.is_file():
31
+ return [path]
32
+ return sorted(path.rglob("*.md"))
33
+
34
+
35
+ def main() -> None:
36
+ parser = argparse.ArgumentParser()
37
+ parser.add_argument("target", type=Path)
38
+ args = parser.parse_args()
39
+
40
+ results = [validate_file(path) for path in collect_markdown(args.target)]
41
+ print(json.dumps({"files": len(results), "results": results}, indent=2))
42
+ raise SystemExit(1 if any(result["missing_refs"] for result in results) else 0)
43
+
44
+
45
+ if __name__ == "__main__":
46
+ main()
@@ -1,203 +1,203 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import json
5
- import re
6
- from dataclasses import dataclass
7
- from pathlib import Path
8
- from zipfile import ZipFile
9
- from xml.etree import ElementTree as ET
10
-
11
-
12
- W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
13
- W = f"{{{W_NS}}}"
14
-
15
- TEXT_TAGS = {
16
- f"{W}t",
17
- f"{W}delText",
18
- f"{W}instrText",
19
- }
20
-
21
- CONTENT_TAGS = TEXT_TAGS | {
22
- f"{W}drawing",
23
- f"{W}object",
24
- f"{W}pict",
25
- f"{W}oMath",
26
- f"{W}oMathPara",
27
- f"{W}noBreakHyphen",
28
- f"{W}softHyphen",
29
- f"{W}tab",
30
- f"{W}br",
31
- f"{W}cr",
32
- }
33
-
34
- IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
35
- IMAGE_GLUE_RE = re.compile(
36
- r"!\[[^\]]*\]\([^)]+\)(?=[^\s<>)\].,;:!?])|(?<=[^\s<(\[.,;:!?])!\[[^\]]*\]\([^)]+\)"
37
- )
38
- FOUR_PLUS_STARS_RE = re.compile(r"(?<!^)\*{4,}(?!$)", re.M)
39
- ODD_BACKTICK_LINE_RE = re.compile(r"`")
40
-
41
-
42
- @dataclass
43
- class SourceStats:
44
- total_paragraphs: int = 0
45
- nonempty_paragraphs: int = 0
46
-
47
-
48
- def local_name(tag: str) -> str:
49
- return tag.rsplit("}", 1)[-1] if "}" in tag else tag
50
-
51
-
52
- def paragraph_has_content(p: ET.Element) -> bool:
53
- for node in p.iter():
54
- if node.tag in TEXT_TAGS and (node.text or "").strip():
55
- return True
56
- if node.tag in CONTENT_TAGS - TEXT_TAGS:
57
- return True
58
- return False
59
-
60
-
61
- def source_stats(docx_path: Path) -> SourceStats:
62
- stats = SourceStats()
63
- with ZipFile(docx_path) as zf:
64
- root = ET.fromstring(zf.read("word/document.xml"))
65
- for p in root.iter(f"{W}p"):
66
- stats.total_paragraphs += 1
67
- if paragraph_has_content(p):
68
- stats.nonempty_paragraphs += 1
69
- return stats
70
-
71
-
72
- def load_json(path: Path) -> dict | list:
73
- return json.loads(path.read_text(encoding="utf-8"))
74
-
75
-
76
- def count_odd_backtick_lines(text: str) -> int:
77
- count = 0
78
- for line in text.splitlines():
79
- if line.count("`") % 2:
80
- count += 1
81
- return count
82
-
83
-
84
- def scan_markdown(md_path: Path) -> dict[str, int]:
85
- text = md_path.read_text(encoding="utf-8", errors="replace")
86
- four_plus = 0
87
- for match in FOUR_PLUS_STARS_RE.finditer(text):
88
- if match.group(0).strip("*"):
89
- four_plus += 1
90
- return {
91
- "image_count": len(IMAGE_RE.findall(text)),
92
- "image_glue_count": len(IMAGE_GLUE_RE.findall(text)),
93
- "four_plus_stars_count": four_plus,
94
- "odd_backtick_line_count": count_odd_backtick_lines(text),
95
- }
96
-
97
-
98
- def audit_entry(entry: dict) -> dict:
99
- input_path = Path(entry["input"])
100
- output_path = Path(entry["output"])
101
- export_report_path = output_path.parent / "export-report.json"
102
-
103
- problems: list[str] = []
104
- source = source_stats(input_path)
105
-
106
- if not output_path.exists():
107
- problems.append("missing_markdown_output")
108
- return {
109
- "input": str(input_path),
110
- "output": str(output_path),
111
- "problems": problems,
112
- }
113
-
114
- if not export_report_path.exists():
115
- problems.append("missing_export_report")
116
- return {
117
- "input": str(input_path),
118
- "output": str(output_path),
119
- "problems": problems,
120
- }
121
-
122
- export_report = load_json(export_report_path)
123
- report_stats = export_report.get("stats", {})
124
- batch_stats = entry.get("stats", {})
125
- markdown = scan_markdown(output_path)
126
-
127
- rendered_block_count = (
128
- int(report_stats.get("paragraphs", 0))
129
- + int(report_stats.get("headings", 0))
130
- + int(report_stats.get("lists", 0))
131
- + int(report_stats.get("code_blocks", 0))
132
- )
133
-
134
- if report_stats != batch_stats:
135
- problems.append("batch_report_mismatch")
136
- if rendered_block_count != source.nonempty_paragraphs:
137
- problems.append("paragraph_count_mismatch")
138
- if markdown["image_count"] != int(report_stats.get("images", 0)):
139
- problems.append("image_count_mismatch")
140
- if markdown["image_glue_count"]:
141
- problems.append("image_glue")
142
- if markdown["four_plus_stars_count"]:
143
- problems.append("four_plus_stars")
144
- if markdown["odd_backtick_line_count"]:
145
- problems.append("odd_backtick_lines")
146
-
147
- return {
148
- "input": str(input_path),
149
- "output": str(output_path),
150
- "source_total_paragraphs": source.total_paragraphs,
151
- "source_nonempty_paragraphs": source.nonempty_paragraphs,
152
- "report_rendered_blocks": rendered_block_count,
153
- "report_images": int(report_stats.get("images", 0)),
154
- "markdown_images": markdown["image_count"],
155
- "image_glue_count": markdown["image_glue_count"],
156
- "four_plus_stars_count": markdown["four_plus_stars_count"],
157
- "odd_backtick_line_count": markdown["odd_backtick_line_count"],
158
- "warnings": list(report_stats.get("warnings", [])),
159
- "problems": problems,
160
- }
161
-
162
-
163
- def summarize(results: list[dict]) -> dict:
164
- problem_counts: dict[str, int] = {}
165
- for result in results:
166
- for problem in result.get("problems", []):
167
- problem_counts[problem] = problem_counts.get(problem, 0) + 1
168
- return {
169
- "files_audited": len(results),
170
- "files_with_problems": sum(1 for result in results if result.get("problems")),
171
- "problem_counts": problem_counts,
172
- "problem_examples": [result for result in results if result.get("problems")][:25],
173
- }
174
-
175
-
176
- def main() -> int:
177
- parser = argparse.ArgumentParser(description="Audit build-corpus batch output against source .docx files.")
178
- parser.add_argument("--batch-report", required=True, help="Path to build-corpus-batch-report.json")
179
- parser.add_argument("--out", default="", help="Optional path for the audit JSON report")
180
- args = parser.parse_args()
181
-
182
- batch_report_path = Path(args.batch_report).resolve()
183
- entries = load_json(batch_report_path)
184
- if not isinstance(entries, list):
185
- raise SystemExit("Batch report must be a JSON array.")
186
-
187
- results = [audit_entry(entry) for entry in entries]
188
- summary = summarize(results)
189
- payload = {
190
- "batch_report": str(batch_report_path),
191
- "summary": summary,
192
- "results": results,
193
- }
194
-
195
- out_path = Path(args.out).resolve() if args.out else batch_report_path.with_name("corpus-audit-report.json")
196
- out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
197
- print(json.dumps(summary, indent=2))
198
- print(f"WROTE {out_path}")
199
- return 0
200
-
201
-
202
- if __name__ == "__main__":
203
- raise SystemExit(main())
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from zipfile import ZipFile
9
+ from xml.etree import ElementTree as ET
10
+
11
+
12
+ W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
13
+ W = f"{{{W_NS}}}"
14
+
15
+ TEXT_TAGS = {
16
+ f"{W}t",
17
+ f"{W}delText",
18
+ f"{W}instrText",
19
+ }
20
+
21
+ CONTENT_TAGS = TEXT_TAGS | {
22
+ f"{W}drawing",
23
+ f"{W}object",
24
+ f"{W}pict",
25
+ f"{W}oMath",
26
+ f"{W}oMathPara",
27
+ f"{W}noBreakHyphen",
28
+ f"{W}softHyphen",
29
+ f"{W}tab",
30
+ f"{W}br",
31
+ f"{W}cr",
32
+ }
33
+
34
+ IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
35
+ IMAGE_GLUE_RE = re.compile(
36
+ r"!\[[^\]]*\]\([^)]+\)(?=[^\s<>)\].,;:!?])|(?<=[^\s<(\[.,;:!?])!\[[^\]]*\]\([^)]+\)"
37
+ )
38
+ FOUR_PLUS_STARS_RE = re.compile(r"(?<!^)\*{4,}(?!$)", re.M)
39
+ ODD_BACKTICK_LINE_RE = re.compile(r"`")
40
+
41
+
42
+ @dataclass
43
+ class SourceStats:
44
+ total_paragraphs: int = 0
45
+ nonempty_paragraphs: int = 0
46
+
47
+
48
+ def local_name(tag: str) -> str:
49
+ return tag.rsplit("}", 1)[-1] if "}" in tag else tag
50
+
51
+
52
+ def paragraph_has_content(p: ET.Element) -> bool:
53
+ for node in p.iter():
54
+ if node.tag in TEXT_TAGS and (node.text or "").strip():
55
+ return True
56
+ if node.tag in CONTENT_TAGS - TEXT_TAGS:
57
+ return True
58
+ return False
59
+
60
+
61
+ def source_stats(docx_path: Path) -> SourceStats:
62
+ stats = SourceStats()
63
+ with ZipFile(docx_path) as zf:
64
+ root = ET.fromstring(zf.read("word/document.xml"))
65
+ for p in root.iter(f"{W}p"):
66
+ stats.total_paragraphs += 1
67
+ if paragraph_has_content(p):
68
+ stats.nonempty_paragraphs += 1
69
+ return stats
70
+
71
+
72
+ def load_json(path: Path) -> dict | list:
73
+ return json.loads(path.read_text(encoding="utf-8"))
74
+
75
+
76
+ def count_odd_backtick_lines(text: str) -> int:
77
+ count = 0
78
+ for line in text.splitlines():
79
+ if line.count("`") % 2:
80
+ count += 1
81
+ return count
82
+
83
+
84
+ def scan_markdown(md_path: Path) -> dict[str, int]:
85
+ text = md_path.read_text(encoding="utf-8", errors="replace")
86
+ four_plus = 0
87
+ for match in FOUR_PLUS_STARS_RE.finditer(text):
88
+ if match.group(0).strip("*"):
89
+ four_plus += 1
90
+ return {
91
+ "image_count": len(IMAGE_RE.findall(text)),
92
+ "image_glue_count": len(IMAGE_GLUE_RE.findall(text)),
93
+ "four_plus_stars_count": four_plus,
94
+ "odd_backtick_line_count": count_odd_backtick_lines(text),
95
+ }
96
+
97
+
98
+ def audit_entry(entry: dict) -> dict:
99
+ input_path = Path(entry["input"])
100
+ output_path = Path(entry["output"])
101
+ export_report_path = output_path.parent / "export-report.json"
102
+
103
+ problems: list[str] = []
104
+ source = source_stats(input_path)
105
+
106
+ if not output_path.exists():
107
+ problems.append("missing_markdown_output")
108
+ return {
109
+ "input": str(input_path),
110
+ "output": str(output_path),
111
+ "problems": problems,
112
+ }
113
+
114
+ if not export_report_path.exists():
115
+ problems.append("missing_export_report")
116
+ return {
117
+ "input": str(input_path),
118
+ "output": str(output_path),
119
+ "problems": problems,
120
+ }
121
+
122
+ export_report = load_json(export_report_path)
123
+ report_stats = export_report.get("stats", {})
124
+ batch_stats = entry.get("stats", {})
125
+ markdown = scan_markdown(output_path)
126
+
127
+ rendered_block_count = (
128
+ int(report_stats.get("paragraphs", 0))
129
+ + int(report_stats.get("headings", 0))
130
+ + int(report_stats.get("lists", 0))
131
+ + int(report_stats.get("code_blocks", 0))
132
+ )
133
+
134
+ if report_stats != batch_stats:
135
+ problems.append("batch_report_mismatch")
136
+ if rendered_block_count != source.nonempty_paragraphs:
137
+ problems.append("paragraph_count_mismatch")
138
+ if markdown["image_count"] != int(report_stats.get("images", 0)):
139
+ problems.append("image_count_mismatch")
140
+ if markdown["image_glue_count"]:
141
+ problems.append("image_glue")
142
+ if markdown["four_plus_stars_count"]:
143
+ problems.append("four_plus_stars")
144
+ if markdown["odd_backtick_line_count"]:
145
+ problems.append("odd_backtick_lines")
146
+
147
+ return {
148
+ "input": str(input_path),
149
+ "output": str(output_path),
150
+ "source_total_paragraphs": source.total_paragraphs,
151
+ "source_nonempty_paragraphs": source.nonempty_paragraphs,
152
+ "report_rendered_blocks": rendered_block_count,
153
+ "report_images": int(report_stats.get("images", 0)),
154
+ "markdown_images": markdown["image_count"],
155
+ "image_glue_count": markdown["image_glue_count"],
156
+ "four_plus_stars_count": markdown["four_plus_stars_count"],
157
+ "odd_backtick_line_count": markdown["odd_backtick_line_count"],
158
+ "warnings": list(report_stats.get("warnings", [])),
159
+ "problems": problems,
160
+ }
161
+
162
+
163
+ def summarize(results: list[dict]) -> dict:
164
+ problem_counts: dict[str, int] = {}
165
+ for result in results:
166
+ for problem in result.get("problems", []):
167
+ problem_counts[problem] = problem_counts.get(problem, 0) + 1
168
+ return {
169
+ "files_audited": len(results),
170
+ "files_with_problems": sum(1 for result in results if result.get("problems")),
171
+ "problem_counts": problem_counts,
172
+ "problem_examples": [result for result in results if result.get("problems")][:25],
173
+ }
174
+
175
+
176
+ def main() -> int:
177
+ parser = argparse.ArgumentParser(description="Audit build-corpus batch output against source .docx files.")
178
+ parser.add_argument("--batch-report", required=True, help="Path to build-corpus-batch-report.json")
179
+ parser.add_argument("--out", default="", help="Optional path for the audit JSON report")
180
+ args = parser.parse_args()
181
+
182
+ batch_report_path = Path(args.batch_report).resolve()
183
+ entries = load_json(batch_report_path)
184
+ if not isinstance(entries, list):
185
+ raise SystemExit("Batch report must be a JSON array.")
186
+
187
+ results = [audit_entry(entry) for entry in entries]
188
+ summary = summarize(results)
189
+ payload = {
190
+ "batch_report": str(batch_report_path),
191
+ "summary": summary,
192
+ "results": results,
193
+ }
194
+
195
+ out_path = Path(args.out).resolve() if args.out else batch_report_path.with_name("corpus-audit-report.json")
196
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
197
+ print(json.dumps(summary, indent=2))
198
+ print(f"WROTE {out_path}")
199
+ return 0
200
+
201
+
202
+ if __name__ == "__main__":
203
+ raise SystemExit(main())