regen.mde 0.8.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/LICENSE +16 -16
  2. package/README.md +2 -1
  3. package/bin/build-corpus-editor.js +83 -83
  4. package/bin/build-corpus.js +41 -41
  5. package/bin/regen-mdeditor-install.js +27 -27
  6. package/bin/regen-mdeditor-uninstall.js +19 -19
  7. package/bin/validate-katex.js +93 -93
  8. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
  9. package/desktop/BuildCorpusEditor/EditorForm.cs +48 -0
  10. package/desktop/BuildCorpusEditor/app.manifest +16 -16
  11. package/dist/release/{regen-mde-0.8.0-win-x64.zip → regen-mde-0.6.1-win-x64.zip} +0 -0
  12. package/dist/release/regen-mde-0.8.2-win-x64.zip +0 -0
  13. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  14. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  15. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  16. package/dist/windows-editor/wwwroot/assets/{index-C_VxJk4k.js → index-BB0sbZaD.js} +107 -107
  17. package/dist/windows-editor/wwwroot/assets/index-CtOv7qsC.css +1 -0
  18. package/dist/windows-editor/wwwroot/index.html +22 -22
  19. package/editor-web/index.html +21 -21
  20. package/editor-web/src/main.jsx +91 -53
  21. package/editor-web/src/styles.css +65 -1
  22. package/editor-web/vite.config.js +13 -13
  23. package/examples/build-corpus.config.example.json +21 -21
  24. package/installer/install-regen-mde.ps1 +214 -214
  25. package/installer/regen-mde.nsi +81 -81
  26. package/package.json +1 -1
  27. package/pyproject.toml +1 -1
  28. package/scripts/build-windows-editor.ps1 +47 -47
  29. package/scripts/package-windows-editor.ps1 +90 -90
  30. package/scripts/run-corpus.ps1 +28 -28
  31. package/scripts/run-editor-implementation-plane.ps1 +226 -226
  32. package/scripts/run-required-tests.ps1 +98 -98
  33. package/scripts/run-smoke.ps1 +28 -28
  34. package/src/build_corpus/__init__.py +3 -3
  35. package/src/build_corpus/docx_exporter.py +10 -4
  36. package/src/build_corpus/equations.py +1345 -1345
  37. package/src/build_corpus/templates/__init__.py +1 -1
  38. package/src/build_corpus/validate_assets.py +46 -46
  39. package/tools/audit_corpus.py +203 -203
  40. package/tools/collect_microsoft_word_templates.py +228 -228
  41. package/tools/collect_online_docx_corpus.py +272 -272
  42. package/tools/collect_online_pptx_corpus.py +252 -252
  43. package/tools/compare_pptx_inputs_outputs.py +87 -87
  44. package/tools/roundtrip_docx_corpus.py +171 -171
  45. package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +0 -1
@@ -1,272 +1,272 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import hashlib
5
- import json
6
- import re
7
- import sys
8
- import time
9
- from pathlib import Path
10
- from typing import Iterable
11
- from urllib.parse import parse_qs, quote_plus, unquote, urlparse
12
- from zipfile import BadZipFile, ZipFile
13
-
14
- import requests
15
-
16
-
17
- DEFAULT_QUERIES = [
18
- 'filetype:docx "equation"',
19
- 'filetype:docx "equations"',
20
- 'filetype:docx "Cambria Math"',
21
- 'filetype:docx "math equations"',
22
- 'filetype:docx "Microsoft Equation"',
23
- 'filetype:docx "OMML"',
24
- 'filetype:docx "quadratic equation"',
25
- 'filetype:docx "integral"',
26
- 'filetype:docx "matrix"',
27
- 'filetype:docx "calculus"',
28
- 'filetype:docx "physics" "equation"',
29
- 'filetype:docx "engineering" "equation"',
30
- 'filetype:docx "statistics" "equation"',
31
- 'filetype:docx "algebra" "equation"',
32
- 'filetype:docx "geometry" "equation"',
33
- ]
34
-
35
- GITHUB_REPO_QUERIES = [
36
- "docx equation math",
37
- "docx equations latex",
38
- "docx omml latex",
39
- "word equation docx",
40
- "markdown docx equation",
41
- "pandoc docx equation",
42
- "math docx converter",
43
- "equation-heavy docx",
44
- "ooxml math docx",
45
- "Cambria Math docx",
46
- ]
47
-
48
- HEADERS = {
49
- "User-Agent": (
50
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
51
- "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
52
- )
53
- }
54
-
55
- DOCX_CT_HINTS = (
56
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
57
- "application/octet-stream",
58
- "application/zip",
59
- "binary/octet-stream",
60
- )
61
-
62
-
63
- def extract_urls(html: str) -> list[str]:
64
- raw = re.findall(r'https?://[^"\'<>\s)]+', html)
65
- urls: list[str] = []
66
- for url in raw:
67
- url = unquote(url).replace("&amp;", "&")
68
- parsed = urlparse(url)
69
- if parsed.netloc.endswith("bing.com") and parsed.path == "/ck/a":
70
- qs = parse_qs(parsed.query)
71
- for key in ("u", "r"):
72
- if key in qs:
73
- url = unquote(qs[key][0])
74
- break
75
- if "duckduckgo.com/l/?" in url:
76
- qs = parse_qs(urlparse(url).query)
77
- if "uddg" in qs:
78
- url = unquote(qs["uddg"][0])
79
- urls.append(url.rstrip(".,;"))
80
- return urls
81
-
82
-
83
- def search(query: str, pages: int = 2) -> Iterable[str]:
84
- endpoints = [
85
- "https://www.bing.com/search?q={query}&first={offset}",
86
- "https://html.duckduckgo.com/html/?q={query}&s={offset}",
87
- ]
88
- for endpoint in endpoints:
89
- for page in range(pages):
90
- offset = page * 10 + 1
91
- url = endpoint.format(query=quote_plus(query), offset=offset)
92
- try:
93
- response = requests.get(url, headers=HEADERS, timeout=20)
94
- if response.status_code >= 400:
95
- continue
96
- yield from extract_urls(response.text)
97
- time.sleep(0.4)
98
- except requests.RequestException:
99
- continue
100
-
101
-
102
- def github_json(url: str) -> dict | None:
103
- try:
104
- response = requests.get(url, headers={**HEADERS, "Accept": "application/vnd.github+json"}, timeout=25)
105
- if response.status_code >= 400:
106
- return None
107
- return response.json()
108
- except requests.RequestException:
109
- return None
110
-
111
-
112
- def github_docx_urls(max_repos_per_query: int = 8) -> Iterable[str]:
113
- seen_repos: set[str] = set()
114
- for query in GITHUB_REPO_QUERIES:
115
- search_url = (
116
- "https://api.github.com/search/repositories"
117
- f"?q={quote_plus(query)}&per_page={max_repos_per_query}"
118
- )
119
- payload = github_json(search_url)
120
- if not payload:
121
- continue
122
- for repo in payload.get("items", []):
123
- full_name = repo.get("full_name")
124
- branch = repo.get("default_branch") or "main"
125
- if not full_name or full_name in seen_repos:
126
- continue
127
- seen_repos.add(full_name)
128
- tree_url = f"https://api.github.com/repos/{full_name}/git/trees/{branch}?recursive=1"
129
- tree = github_json(tree_url)
130
- if not tree:
131
- continue
132
- for item in tree.get("tree", []):
133
- path = item.get("path", "")
134
- if item.get("type") == "blob" and path.lower().endswith(".docx"):
135
- yield f"https://raw.githubusercontent.com/{full_name}/{branch}/{quote_path(path)}"
136
- time.sleep(0.25)
137
-
138
-
139
- def quote_path(path: str) -> str:
140
- return "/".join(quote_plus(part).replace("+", "%20") for part in path.split("/"))
141
-
142
-
143
- def looks_like_docx_url(url: str) -> bool:
144
- low = url.lower()
145
- return ".docx" in low and not any(bad in low for bad in ("?format=pdf", "/view?", "webcache"))
146
-
147
-
148
- def safe_name(index: int, url: str, content: bytes) -> str:
149
- parsed = urlparse(url)
150
- stem = Path(unquote(parsed.path)).name or f"online-{index:03d}.docx"
151
- stem = re.sub(r"[^A-Za-z0-9._ -]+", "_", stem)
152
- if not stem.lower().endswith(".docx"):
153
- stem += ".docx"
154
- digest = hashlib.sha256(content).hexdigest()[:10]
155
- return f"{index:03d}-{digest}-{stem}"
156
-
157
-
158
- def validate_docx(path: Path) -> dict:
159
- with ZipFile(path) as zf:
160
- names = set(zf.namelist())
161
- if "[Content_Types].xml" not in names or "word/document.xml" not in names:
162
- raise BadZipFile("not a Word DOCX package")
163
- document_xml = zf.read("word/document.xml")
164
- return {
165
- "omml_nodes": len(re.findall(rb"<m:oMath\b|<m:oMathPara\b", document_xml)),
166
- "media_parts": len([name for name in names if name.startswith("word/media/")]),
167
- "document_xml_bytes": len(document_xml),
168
- }
169
-
170
-
171
- def download(url: str, out_dir: Path, index: int, max_mb: int) -> dict | None:
172
- try:
173
- with requests.get(url, headers=HEADERS, timeout=30, stream=True, allow_redirects=True) as response:
174
- if response.status_code >= 400:
175
- return None
176
- content_type = response.headers.get("content-type", "").split(";")[0].strip().lower()
177
- if content_type and content_type not in DOCX_CT_HINTS and "wordprocessingml" not in content_type:
178
- if ".docx" not in response.url.lower():
179
- return None
180
- chunks: list[bytes] = []
181
- total = 0
182
- limit = max_mb * 1024 * 1024
183
- for chunk in response.iter_content(chunk_size=1024 * 256):
184
- if not chunk:
185
- continue
186
- total += len(chunk)
187
- if total > limit:
188
- return None
189
- chunks.append(chunk)
190
- content = b"".join(chunks)
191
- except requests.RequestException:
192
- return None
193
-
194
- if not content.startswith(b"PK"):
195
- return None
196
-
197
- out_path = out_dir / safe_name(index, response.url, content)
198
- out_path.write_bytes(content)
199
- try:
200
- stats = validate_docx(out_path)
201
- except (BadZipFile, KeyError):
202
- out_path.unlink(missing_ok=True)
203
- return None
204
-
205
- return {
206
- "file": str(out_path),
207
- "source_url": response.url,
208
- "bytes": len(content),
209
- "content_type": content_type,
210
- **stats,
211
- }
212
-
213
-
214
- def main() -> int:
215
- parser = argparse.ArgumentParser()
216
- parser.add_argument("--out", type=Path, required=True)
217
- parser.add_argument("--target", type=int, default=50)
218
- parser.add_argument("--max-mb", type=int, default=40)
219
- parser.add_argument("--prefer-equations", action="store_true")
220
- args = parser.parse_args()
221
-
222
- args.out.mkdir(parents=True, exist_ok=True)
223
- seen: set[str] = set()
224
- manifest: list[dict] = []
225
- index = len(list(args.out.glob("*.docx"))) + 1
226
-
227
- print("source: github repositories", flush=True)
228
- for url in github_docx_urls():
229
- if len(manifest) >= args.target:
230
- break
231
- normalized = url.split("#", 1)[0]
232
- if normalized in seen:
233
- continue
234
- seen.add(normalized)
235
- item = download(normalized, args.out, index, args.max_mb)
236
- if not item:
237
- continue
238
- manifest.append(item)
239
- index += 1
240
- print(f"kept {len(manifest):02d}: {Path(item['file']).name} omml={item['omml_nodes']}", flush=True)
241
-
242
- for query in DEFAULT_QUERIES:
243
- print(f"search: {query}", flush=True)
244
- for url in search(query):
245
- if len(manifest) >= args.target:
246
- break
247
- if not looks_like_docx_url(url):
248
- continue
249
- normalized = url.split("#", 1)[0]
250
- if normalized in seen:
251
- continue
252
- seen.add(normalized)
253
- item = download(normalized, args.out, index, args.max_mb)
254
- if not item:
255
- continue
256
- if args.prefer_equations and item["omml_nodes"] == 0 and len(manifest) < args.target // 2:
257
- Path(item["file"]).unlink(missing_ok=True)
258
- continue
259
- manifest.append(item)
260
- index += 1
261
- print(f"kept {len(manifest):02d}: {Path(item['file']).name} omml={item['omml_nodes']}", flush=True)
262
- if len(manifest) >= args.target:
263
- break
264
-
265
- manifest_path = args.out / "online-docx-manifest.json"
266
- manifest_path.write_text(json.dumps({"count": len(manifest), "items": manifest}, indent=2), encoding="utf-8")
267
- print(f"saved manifest: {manifest_path}", flush=True)
268
- return 0 if len(manifest) >= args.target else 1
269
-
270
-
271
- if __name__ == "__main__":
272
- raise SystemExit(main())
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import hashlib
5
+ import json
6
+ import re
7
+ import sys
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Iterable
11
+ from urllib.parse import parse_qs, quote_plus, unquote, urlparse
12
+ from zipfile import BadZipFile, ZipFile
13
+
14
+ import requests
15
+
16
+
17
# Web-search queries that main() feeds to search(), one after another.
DEFAULT_QUERIES = [
    'filetype:docx "equation"',
    'filetype:docx "equations"',
    'filetype:docx "Cambria Math"',
    'filetype:docx "math equations"',
    'filetype:docx "Microsoft Equation"',
    'filetype:docx "OMML"',
    'filetype:docx "quadratic equation"',
    'filetype:docx "integral"',
    'filetype:docx "matrix"',
    'filetype:docx "calculus"',
    'filetype:docx "physics" "equation"',
    'filetype:docx "engineering" "equation"',
    'filetype:docx "statistics" "equation"',
    'filetype:docx "algebra" "equation"',
    'filetype:docx "geometry" "equation"',
]

# Repository-search queries that github_docx_urls() sends to the GitHub API.
GITHUB_REPO_QUERIES = [
    "docx equation math",
    "docx equations latex",
    "docx omml latex",
    "word equation docx",
    "markdown docx equation",
    "pandoc docx equation",
    "math docx converter",
    "equation-heavy docx",
    "ooxml math docx",
    "Cambria Math docx",
]

# Headers attached to every outgoing HTTP request made by this script.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

# Content types that download() accepts without also requiring ".docx" in the
# final URL.
DOCX_CT_HINTS = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/octet-stream",
    "application/zip",
    "binary/octet-stream",
)
61
+
62
+
63
def extract_urls(html: str) -> list[str]:
    """Pull candidate links out of a search-result page.

    Every ``http(s)://`` run found in *html* is returned in document order,
    without de-duplication. Bing ``/ck/a`` and DuckDuckGo ``/l/?`` redirect
    wrappers are replaced by the target carried in their ``u``/``r`` or
    ``uddg`` query parameter.

    Args:
        html: Raw HTML text of a result page.

    Returns:
        Percent-decoded URLs with trailing ``.``, ``,`` and ``;`` removed.
    """
    urls: list[str] = []
    for raw in re.findall(r'https?://[^"\'<>\s)]+', html):
        # Undo only the HTML entity here. Percent-decoding the whole link
        # before parsing would expose the "?" and "&" of an embedded target,
        # and parse_qs would then cut that target at its first "&".
        url = raw.replace("&amp;", "&")
        parsed = urlparse(url)
        if parsed.netloc.endswith("bing.com") and parsed.path == "/ck/a":
            params = parse_qs(parsed.query)
            for key in ("u", "r"):
                if key in params:
                    url = params[key][0]  # parse_qs has already decoded it once
                    break
        if "duckduckgo.com/l/?" in url:
            params = parse_qs(urlparse(url).query)
            if "uddg" in params:
                url = params["uddg"][0]
        urls.append(unquote(url).rstrip(".,;"))
    return urls
81
+
82
+
83
def search(query: str, pages: int = 2) -> Iterable[str]:
    """Yield links scraped from Bing and DuckDuckGo result pages for *query*.

    Each engine is asked for *pages* result pages; the offset sent with a
    request is ``page * 10 + 1``. A page that cannot be fetched, or that
    answers with an HTTP status of 400 or above, is skipped.

    Args:
        query: Search expression; it is URL-encoded before being sent.
        pages: Number of result pages requested from each engine.

    Yields:
        Every URL that extract_urls() finds in each fetched page.
    """
    endpoints = (
        "https://www.bing.com/search?q={query}&first={offset}",
        "https://html.duckduckgo.com/html/?q={query}&s={offset}",
    )
    encoded = quote_plus(query)
    for endpoint in endpoints:
        for page in range(pages):
            url = endpoint.format(query=encoded, offset=page * 10 + 1)
            # Keep the try body down to the one call that can raise.
            try:
                response = requests.get(url, headers=HEADERS, timeout=20)
            except requests.RequestException:
                continue
            if response.status_code >= 400:
                continue
            yield from extract_urls(response.text)
            # Short pause after each page that was actually processed.
            time.sleep(0.4)
100
+
101
+
102
def github_json(url: str) -> dict | None:
    """Fetch *url* from the GitHub API and return the decoded JSON object.

    Args:
        url: Fully formed API URL.

    Returns:
        The decoded payload when it is a JSON object. ``None`` when the
        request fails, the status is 400 or above, the body is not valid
        JSON, or the payload is not an object.
    """
    try:
        response = requests.get(
            url,
            headers={**HEADERS, "Accept": "application/vnd.github+json"},
            timeout=25,
        )
        if response.status_code >= 400:
            return None
        payload = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a body that cannot be decoded as JSON.
        return None
    # Callers use .get() on the result, so anything but a dict is unusable.
    return payload if isinstance(payload, dict) else None
110
+
111
+
112
def github_docx_urls(max_repos_per_query: int = 8) -> Iterable[str]:
    """Yield raw-download URLs for ``.docx`` files found in GitHub repositories.

    Runs every query in GITHUB_REPO_QUERIES against the repository search
    API, lists each newly seen repository's default-branch tree recursively,
    and yields a ``raw.githubusercontent.com`` URL for each ``.docx`` blob.

    Args:
        max_repos_per_query: Value sent as ``per_page`` with each search.

    Yields:
        One URL per ``.docx`` blob; a repository is listed at most once.
    """
    seen_repos: set[str] = set()
    for query in GITHUB_REPO_QUERIES:
        search_url = (
            "https://api.github.com/search/repositories"
            f"?q={quote_plus(query)}&per_page={max_repos_per_query}"
        )
        payload = github_json(search_url)
        if not payload:
            continue
        for repo in payload.get("items") or []:
            full_name = repo.get("full_name")
            if not full_name or full_name in seen_repos:
                continue
            seen_repos.add(full_name)
            # Escape the branch so unusual characters in its name cannot
            # change the URL; "/" separators are kept by quote_path().
            branch = quote_path(repo.get("default_branch") or "main")
            tree = github_json(
                f"https://api.github.com/repos/{full_name}/git/trees/{branch}?recursive=1"
            )
            if not tree:
                continue
            for item in tree.get("tree") or []:
                path = item.get("path", "")
                if item.get("type") == "blob" and path.lower().endswith(".docx"):
                    yield f"https://raw.githubusercontent.com/{full_name}/{branch}/{quote_path(path)}"
            # NOTE(review): nesting was reconstructed from a de-indented
            # listing; this pause is assumed to run once per listed
            # repository - confirm against the packaged file.
            time.sleep(0.25)
137
+
138
+
139
def quote_path(path: str) -> str:
    """Percent-encode each ``/``-separated segment of *path*.

    Spaces become ``%20`` rather than ``+``, and the ``/`` separators
    themselves are left in place.
    """
    return "/".join(quote_plus(part).replace("+", "%20") for part in path.split("/"))
141
+
142
+
143
def looks_like_docx_url(url: str) -> bool:
    """Return True when *url* plausibly points at a downloadable DOCX file.

    The test is case-insensitive: the URL must contain ``.docx`` and must not
    contain any of ``?format=pdf``, ``/view?`` or ``webcache``.
    """
    low = url.lower()
    if ".docx" not in low:
        return False
    return not any(bad in low for bad in ("?format=pdf", "/view?", "webcache"))
146
+
147
+
148
def safe_name(index: int, url: str, content: bytes) -> str:
    """Build a filesystem-safe file name for a downloaded document.

    The name is ``<index, 3 digits>-<first 10 hex digits of the SHA-256 of
    content>-<sanitised last path segment of url>`` and always ends in
    ``.docx``.

    Args:
        index: Running number of the download.
        url: URL the document came from; only its path is used.
        content: Downloaded bytes, hashed for the digest part.

    Returns:
        The file name (no directory component).
    """
    max_base = 120  # cap so a very long URL basename cannot exceed filename limits

    stem = Path(unquote(urlparse(url).path)).name or f"online-{index:03d}.docx"
    # Collapse every run of characters outside the allowed set into one "_".
    stem = re.sub(r"[^A-Za-z0-9._ -]+", "_", stem)
    if not stem.lower().endswith(".docx"):
        stem += ".docx"
    base, ext = stem[:-5], stem[-5:]
    if len(base) > max_base:
        stem = base[:max_base] + ext
    digest = hashlib.sha256(content).hexdigest()[:10]
    return f"{index:03d}-{digest}-{stem}"
156
+
157
+
158
def validate_docx(path: Path) -> dict:
    """Check that *path* is a Word DOCX package and collect basic statistics.

    Args:
        path: Location of the candidate file on disk.

    Returns:
        A dict with ``omml_nodes`` (count of ``<m:oMath`` and
        ``<m:oMathPara`` start tags in ``word/document.xml``),
        ``media_parts`` (number of archive members under ``word/media/``)
        and ``document_xml_bytes`` (size of ``word/document.xml``).

    Raises:
        BadZipFile: If the file is not a ZIP archive, lacks
            ``[Content_Types].xml`` or ``word/document.xml``, or the main
            document part cannot be extracted.
    """
    with ZipFile(path) as zf:
        names = set(zf.namelist())
        if "[Content_Types].xml" not in names or "word/document.xml" not in names:
            raise BadZipFile("not a Word DOCX package")
        try:
            document_xml = zf.read("word/document.xml")
        except RuntimeError as exc:
            # zipfile raises RuntimeError (or its subclass NotImplementedError)
            # for encrypted members and unsupported compression methods.
            # Re-raise as BadZipFile, the "invalid package" signal callers expect.
            raise BadZipFile("word/document.xml cannot be extracted") from exc
    return {
        "omml_nodes": len(re.findall(rb"<m:oMath\b|<m:oMathPara\b", document_xml)),
        "media_parts": len([name for name in names if name.startswith("word/media/")]),
        "document_xml_bytes": len(document_xml),
    }
169
+
170
+
171
def download(url: str, out_dir: Path, index: int, max_mb: int) -> dict | None:
    """Download *url*, keep it only if it is a valid DOCX, and describe it.

    Args:
        url: Address to fetch; redirects are followed.
        out_dir: Directory the file is written into.
        index: Running number passed on to safe_name().
        max_mb: Size limit in MiB; larger bodies are rejected.

    Returns:
        A manifest entry with ``file``, ``source_url`` (URL after
        redirects), ``bytes``, ``content_type`` and the statistics from
        validate_docx(). ``None`` when the request fails, the status is 400
        or above, the response does not look like a DOCX, the body exceeds
        the limit, or the package fails validation.
    """
    limit = max_mb * 1024 * 1024
    try:
        with requests.get(url, headers=HEADERS, timeout=30, stream=True, allow_redirects=True) as response:
            if response.status_code >= 400:
                return None
            final_url = response.url
            content_type = response.headers.get("content-type", "").split(";")[0].strip().lower()
            # An unexpected content type is tolerated only when the final URL
            # itself mentions ".docx".
            if content_type and content_type not in DOCX_CT_HINTS and "wordprocessingml" not in content_type:
                if ".docx" not in final_url.lower():
                    return None
            # Reject on the declared size first, so an oversized file is not
            # streamed up to the limit before being discarded.
            declared = response.headers.get("content-length", "").strip()
            if declared.isdecimal() and int(declared) > limit:
                return None
            chunks: list[bytes] = []
            total = 0
            for chunk in response.iter_content(chunk_size=1024 * 256):
                if not chunk:
                    continue
                total += len(chunk)
                if total > limit:
                    return None
                chunks.append(chunk)
            content = b"".join(chunks)
    except requests.RequestException:
        return None

    # Every ZIP container starts with "PK"; this rejects HTML error pages cheaply.
    if not content.startswith(b"PK"):
        return None

    out_path = out_dir / safe_name(index, final_url, content)
    out_path.write_bytes(content)
    try:
        stats = validate_docx(out_path)
    except (BadZipFile, KeyError):
        # Do not leave a rejected file behind in the corpus directory.
        out_path.unlink(missing_ok=True)
        return None

    return {
        "file": str(out_path),
        "source_url": final_url,
        "bytes": len(content),
        "content_type": content_type,
        **stats,
    }
212
+
213
+
214
def main() -> int:
    """Collect DOCX files from GitHub and web search into ``--out``.

    Command-line options:
        --out: Output directory (required; created if missing).
        --target: Number of documents to collect (default 50).
        --max-mb: Per-file size limit in MiB (default 40).
        --prefer-equations: For web-search hits only, discard documents
            without OMML nodes while fewer than ``target // 2`` documents
            have been kept.

    GitHub repositories are tried first, then each query in DEFAULT_QUERIES.
    A manifest of the kept files is written to
    ``<out>/online-docx-manifest.json``.

    Returns:
        0 when at least ``--target`` documents were kept, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--target", type=int, default=50)
    parser.add_argument("--max-mb", type=int, default=40)
    parser.add_argument("--prefer-equations", action="store_true")
    args = parser.parse_args()

    args.out.mkdir(parents=True, exist_ok=True)
    seen: set[str] = set()
    manifest: list[dict] = []
    # Continue numbering after any .docx files already in the directory.
    index = sum(1 for _ in args.out.glob("*.docx")) + 1

    print("source: github repositories", flush=True)
    for url in github_docx_urls():
        if len(manifest) >= args.target:
            break
        normalized = url.split("#", 1)[0]  # the fragment never reaches the server
        if normalized in seen:
            continue
        seen.add(normalized)
        item = download(normalized, args.out, index, args.max_mb)
        if not item:
            continue
        manifest.append(item)
        index += 1
        print(f"kept {len(manifest):02d}: {Path(item['file']).name} omml={item['omml_nodes']}", flush=True)

    for query in DEFAULT_QUERIES:
        # Checked before announcing the query, so no search is reported or
        # started once the target has been met.
        if len(manifest) >= args.target:
            break
        print(f"search: {query}", flush=True)
        for url in search(query):
            if len(manifest) >= args.target:
                break
            if not looks_like_docx_url(url):
                continue
            normalized = url.split("#", 1)[0]
            if normalized in seen:
                continue
            seen.add(normalized)
            item = download(normalized, args.out, index, args.max_mb)
            if not item:
                continue
            if args.prefer_equations and item["omml_nodes"] == 0 and len(manifest) < args.target // 2:
                # download() has already written the file; remove it again.
                Path(item["file"]).unlink(missing_ok=True)
                continue
            manifest.append(item)
            index += 1
            print(f"kept {len(manifest):02d}: {Path(item['file']).name} omml={item['omml_nodes']}", flush=True)

    manifest_path = args.out / "online-docx-manifest.json"
    manifest_path.write_text(json.dumps({"count": len(manifest), "items": manifest}, indent=2), encoding="utf-8")
    print(f"saved manifest: {manifest_path}", flush=True)
    return 0 if len(manifest) >= args.target else 1
269
+
270
+
271
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())