regen.mde 0.8.0 → 0.8.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (45)
  1. package/LICENSE +16 -16
  2. package/README.md +2 -1
  3. package/bin/build-corpus-editor.js +83 -83
  4. package/bin/build-corpus.js +41 -41
  5. package/bin/regen-mdeditor-install.js +27 -27
  6. package/bin/regen-mdeditor-uninstall.js +19 -19
  7. package/bin/validate-katex.js +93 -93
  8. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
  9. package/desktop/BuildCorpusEditor/EditorForm.cs +48 -0
  10. package/desktop/BuildCorpusEditor/app.manifest +16 -16
  11. package/dist/release/{regen-mde-0.8.0-win-x64.zip → regen-mde-0.6.1-win-x64.zip} +0 -0
  12. package/dist/release/regen-mde-0.8.2-win-x64.zip +0 -0
  13. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  14. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  15. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  16. package/dist/windows-editor/wwwroot/assets/{index-C_VxJk4k.js → index-BB0sbZaD.js} +107 -107
  17. package/dist/windows-editor/wwwroot/assets/index-CtOv7qsC.css +1 -0
  18. package/dist/windows-editor/wwwroot/index.html +22 -22
  19. package/editor-web/index.html +21 -21
  20. package/editor-web/src/main.jsx +91 -53
  21. package/editor-web/src/styles.css +65 -1
  22. package/editor-web/vite.config.js +13 -13
  23. package/examples/build-corpus.config.example.json +21 -21
  24. package/installer/install-regen-mde.ps1 +214 -214
  25. package/installer/regen-mde.nsi +81 -81
  26. package/package.json +1 -1
  27. package/pyproject.toml +1 -1
  28. package/scripts/build-windows-editor.ps1 +47 -47
  29. package/scripts/package-windows-editor.ps1 +90 -90
  30. package/scripts/run-corpus.ps1 +28 -28
  31. package/scripts/run-editor-implementation-plane.ps1 +226 -226
  32. package/scripts/run-required-tests.ps1 +98 -98
  33. package/scripts/run-smoke.ps1 +28 -28
  34. package/src/build_corpus/__init__.py +3 -3
  35. package/src/build_corpus/docx_exporter.py +10 -4
  36. package/src/build_corpus/equations.py +1345 -1345
  37. package/src/build_corpus/templates/__init__.py +1 -1
  38. package/src/build_corpus/validate_assets.py +46 -46
  39. package/tools/audit_corpus.py +203 -203
  40. package/tools/collect_microsoft_word_templates.py +228 -228
  41. package/tools/collect_online_docx_corpus.py +272 -272
  42. package/tools/collect_online_pptx_corpus.py +252 -252
  43. package/tools/compare_pptx_inputs_outputs.py +87 -87
  44. package/tools/roundtrip_docx_corpus.py +171 -171
  45. package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +0 -1
@@ -1,228 +1,228 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import hashlib
5
- import json
6
- import re
7
- import time
8
- from collections import deque
9
- from pathlib import Path
10
- from typing import Iterable
11
- from urllib.parse import urljoin, urlparse
12
- from zipfile import BadZipFile, ZipFile
13
-
14
- import requests
15
-
16
-
17
- BASE_URL = "https://word.cloud.microsoft"
18
- START_URLS = [
19
- "https://word.cloud.microsoft/create/en/templates/",
20
- "https://word.cloud.microsoft/create/en/resume-templates/",
21
- ]
22
-
23
- HEADERS = {
24
- "User-Agent": (
25
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
26
- "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
27
- )
28
- }
29
-
30
- PAGE_LINK_RE = re.compile(r'(?:"|href=")(/create/en/[^"#? ]+/)')
31
- FILE_URL_RE = re.compile(r'"fileUrl":"(https://cdn\.create\.microsoft\.com/[^"]+\.docx)"')
32
- TITLE_RE = re.compile(
33
- r'"fileUrl":"(?P<url>https://cdn\.create\.microsoft\.com/[^"]+\.docx)","title":"(?P<title>[^"]+)"'
34
- )
35
-
36
-
37
- def normalize_page(url: str) -> str:
38
- parsed = urlparse(url)
39
- return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
40
-
41
-
42
- def should_visit(url: str) -> bool:
43
- parsed = urlparse(url)
44
- if parsed.netloc != "word.cloud.microsoft":
45
- return False
46
- if not parsed.path.startswith("/create/en/"):
47
- return False
48
- path = parsed.path.lower()
49
- if "/blog/" in path:
50
- return False
51
- if any(token in path for token in ("/copilot-", "/document-editor/", "/grammar-checker/", "/ai-", "/new/")):
52
- return False
53
- return True
54
-
55
-
56
- def fetch(session: requests.Session, url: str) -> str | None:
57
- try:
58
- response = session.get(url, headers=HEADERS, timeout=30)
59
- if response.status_code >= 400:
60
- return None
61
- return response.text
62
- except requests.RequestException:
63
- return None
64
-
65
-
66
- def extract_links(html: str) -> Iterable[str]:
67
- for match in PAGE_LINK_RE.finditer(html):
68
- yield urljoin(BASE_URL, match.group(1))
69
-
70
-
71
- def extract_templates(html: str) -> list[dict[str, str]]:
72
- titles_by_url = {match.group("url"): match.group("title") for match in TITLE_RE.finditer(html)}
73
- found = []
74
- for match in FILE_URL_RE.finditer(html):
75
- url = match.group(1).replace("\\u0026", "&")
76
- found.append({
77
- "file_url": url,
78
- "title": titles_by_url.get(url, ""),
79
- })
80
- return found
81
-
82
-
83
- def validate_docx(path: Path) -> dict:
84
- with ZipFile(path) as zf:
85
- names = set(zf.namelist())
86
- if "[Content_Types].xml" not in names or "word/document.xml" not in names:
87
- raise BadZipFile("not a Word DOCX package")
88
- document_xml = zf.read("word/document.xml")
89
- return {
90
- "omml_nodes": len(re.findall(rb"<m:oMath\b|<m:oMathPara\b", document_xml)),
91
- "media_parts": len([name for name in names if name.startswith("word/media/")]),
92
- "document_xml_bytes": len(document_xml),
93
- }
94
-
95
-
96
- def safe_name(index: int, title: str, content: bytes, file_url: str) -> str:
97
- digest = hashlib.sha256(content).hexdigest()[:10]
98
- stem = title.strip() or Path(urlparse(file_url).path).name
99
- stem = re.sub(r"[^A-Za-z0-9._ -]+", "_", stem).strip(" ._")
100
- if not stem.lower().endswith(".docx"):
101
- stem += ".docx"
102
- return f"{index:03d}-{digest}-{stem}"
103
-
104
-
105
- def download_template(
106
- session: requests.Session,
107
- item: dict[str, str],
108
- out_dir: Path,
109
- index: int,
110
- max_mb: int,
111
- ) -> dict | None:
112
- try:
113
- with session.get(item["file_url"], headers=HEADERS, timeout=60, stream=True) as response:
114
- if response.status_code >= 400:
115
- return None
116
- chunks: list[bytes] = []
117
- total = 0
118
- limit = max_mb * 1024 * 1024
119
- for chunk in response.iter_content(chunk_size=1024 * 256):
120
- if not chunk:
121
- continue
122
- total += len(chunk)
123
- if total > limit:
124
- return None
125
- chunks.append(chunk)
126
- content = b"".join(chunks)
127
- except requests.RequestException:
128
- return None
129
-
130
- if not content.startswith(b"PK"):
131
- return None
132
-
133
- filename = safe_name(index, item.get("title", ""), content, item["file_url"])
134
- out_path = out_dir / filename
135
- out_path.write_bytes(content)
136
- try:
137
- stats = validate_docx(out_path)
138
- except (BadZipFile, KeyError):
139
- out_path.unlink(missing_ok=True)
140
- return None
141
-
142
- return {
143
- "file": str(out_path),
144
- "title": item.get("title", ""),
145
- "source_url": item["source_url"],
146
- "file_url": item["file_url"],
147
- "bytes": len(content),
148
- **stats,
149
- }
150
-
151
-
152
- def crawl_template_pages(session: requests.Session, max_pages: int) -> list[dict[str, str]]:
153
- queue = deque(START_URLS)
154
- seen_pages: set[str] = set()
155
- seen_file_urls: set[str] = set()
156
- template_items: list[dict[str, str]] = []
157
-
158
- while queue and len(seen_pages) < max_pages:
159
- current = normalize_page(queue.popleft())
160
- if current in seen_pages or not should_visit(current):
161
- continue
162
- seen_pages.add(current)
163
- html = fetch(session, current)
164
- if not html:
165
- continue
166
-
167
- for template in extract_templates(html):
168
- file_url = template["file_url"]
169
- if file_url in seen_file_urls:
170
- continue
171
- seen_file_urls.add(file_url)
172
- template_items.append({
173
- **template,
174
- "source_url": current,
175
- })
176
-
177
- for link in extract_links(html):
178
- normalized = normalize_page(link)
179
- if normalized not in seen_pages and should_visit(normalized):
180
- queue.append(normalized)
181
-
182
- time.sleep(0.15)
183
-
184
- return template_items
185
-
186
-
187
- def main() -> int:
188
- parser = argparse.ArgumentParser()
189
- parser.add_argument("--out", type=Path, required=True)
190
- parser.add_argument("--max-pages", type=int, default=80)
191
- parser.add_argument("--max-mb", type=int, default=40)
192
- parser.add_argument("--limit", type=int, default=0, help="0 means no explicit template limit")
193
- args = parser.parse_args()
194
-
195
- args.out.mkdir(parents=True, exist_ok=True)
196
- session = requests.Session()
197
-
198
- template_items = crawl_template_pages(session, max_pages=args.max_pages)
199
- if args.limit > 0:
200
- template_items = template_items[: args.limit]
201
-
202
- manifest: list[dict] = []
203
- for index, item in enumerate(template_items, 1):
204
- downloaded = download_template(session, item, args.out, index, args.max_mb)
205
- if not downloaded:
206
- continue
207
- manifest.append(downloaded)
208
- print(f"kept {len(manifest):03d}: {Path(downloaded['file']).name}", flush=True)
209
- time.sleep(0.1)
210
-
211
- manifest_path = args.out / "microsoft-word-templates-manifest.json"
212
- manifest_path.write_text(
213
- json.dumps(
214
- {
215
- "count": len(manifest),
216
- "items": manifest,
217
- },
218
- indent=2,
219
- ensure_ascii=False,
220
- ),
221
- encoding="utf-8",
222
- )
223
- print(f"saved manifest: {manifest_path}", flush=True)
224
- return 0 if manifest else 1
225
-
226
-
227
- if __name__ == "__main__":
228
- raise SystemExit(main())
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import hashlib
5
+ import json
6
+ import re
7
+ import time
8
+ from collections import deque
9
+ from pathlib import Path
10
+ from typing import Iterable
11
+ from urllib.parse import urljoin, urlparse
12
+ from zipfile import BadZipFile, ZipFile
13
+
14
+ import requests
15
+
16
+
17
+ BASE_URL = "https://word.cloud.microsoft"
18
+ START_URLS = [
19
+ "https://word.cloud.microsoft/create/en/templates/",
20
+ "https://word.cloud.microsoft/create/en/resume-templates/",
21
+ ]
22
+
23
+ HEADERS = {
24
+ "User-Agent": (
25
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
26
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
27
+ )
28
+ }
29
+
30
+ PAGE_LINK_RE = re.compile(r'(?:"|href=")(/create/en/[^"#? ]+/)')
31
+ FILE_URL_RE = re.compile(r'"fileUrl":"(https://cdn\.create\.microsoft\.com/[^"]+\.docx)"')
32
+ TITLE_RE = re.compile(
33
+ r'"fileUrl":"(?P<url>https://cdn\.create\.microsoft\.com/[^"]+\.docx)","title":"(?P<title>[^"]+)"'
34
+ )
35
+
36
+
37
+ def normalize_page(url: str) -> str:
38
+ parsed = urlparse(url)
39
+ return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
40
+
41
+
42
+ def should_visit(url: str) -> bool:
43
+ parsed = urlparse(url)
44
+ if parsed.netloc != "word.cloud.microsoft":
45
+ return False
46
+ if not parsed.path.startswith("/create/en/"):
47
+ return False
48
+ path = parsed.path.lower()
49
+ if "/blog/" in path:
50
+ return False
51
+ if any(token in path for token in ("/copilot-", "/document-editor/", "/grammar-checker/", "/ai-", "/new/")):
52
+ return False
53
+ return True
54
+
55
+
56
+ def fetch(session: requests.Session, url: str) -> str | None:
57
+ try:
58
+ response = session.get(url, headers=HEADERS, timeout=30)
59
+ if response.status_code >= 400:
60
+ return None
61
+ return response.text
62
+ except requests.RequestException:
63
+ return None
64
+
65
+
66
+ def extract_links(html: str) -> Iterable[str]:
67
+ for match in PAGE_LINK_RE.finditer(html):
68
+ yield urljoin(BASE_URL, match.group(1))
69
+
70
+
71
+ def extract_templates(html: str) -> list[dict[str, str]]:
72
+ titles_by_url = {match.group("url"): match.group("title") for match in TITLE_RE.finditer(html)}
73
+ found = []
74
+ for match in FILE_URL_RE.finditer(html):
75
+ url = match.group(1).replace("\\u0026", "&")
76
+ found.append({
77
+ "file_url": url,
78
+ "title": titles_by_url.get(url, ""),
79
+ })
80
+ return found
81
+
82
+
83
+ def validate_docx(path: Path) -> dict:
84
+ with ZipFile(path) as zf:
85
+ names = set(zf.namelist())
86
+ if "[Content_Types].xml" not in names or "word/document.xml" not in names:
87
+ raise BadZipFile("not a Word DOCX package")
88
+ document_xml = zf.read("word/document.xml")
89
+ return {
90
+ "omml_nodes": len(re.findall(rb"<m:oMath\b|<m:oMathPara\b", document_xml)),
91
+ "media_parts": len([name for name in names if name.startswith("word/media/")]),
92
+ "document_xml_bytes": len(document_xml),
93
+ }
94
+
95
+
96
+ def safe_name(index: int, title: str, content: bytes, file_url: str) -> str:
97
+ digest = hashlib.sha256(content).hexdigest()[:10]
98
+ stem = title.strip() or Path(urlparse(file_url).path).name
99
+ stem = re.sub(r"[^A-Za-z0-9._ -]+", "_", stem).strip(" ._")
100
+ if not stem.lower().endswith(".docx"):
101
+ stem += ".docx"
102
+ return f"{index:03d}-{digest}-{stem}"
103
+
104
+
105
+ def download_template(
106
+ session: requests.Session,
107
+ item: dict[str, str],
108
+ out_dir: Path,
109
+ index: int,
110
+ max_mb: int,
111
+ ) -> dict | None:
112
+ try:
113
+ with session.get(item["file_url"], headers=HEADERS, timeout=60, stream=True) as response:
114
+ if response.status_code >= 400:
115
+ return None
116
+ chunks: list[bytes] = []
117
+ total = 0
118
+ limit = max_mb * 1024 * 1024
119
+ for chunk in response.iter_content(chunk_size=1024 * 256):
120
+ if not chunk:
121
+ continue
122
+ total += len(chunk)
123
+ if total > limit:
124
+ return None
125
+ chunks.append(chunk)
126
+ content = b"".join(chunks)
127
+ except requests.RequestException:
128
+ return None
129
+
130
+ if not content.startswith(b"PK"):
131
+ return None
132
+
133
+ filename = safe_name(index, item.get("title", ""), content, item["file_url"])
134
+ out_path = out_dir / filename
135
+ out_path.write_bytes(content)
136
+ try:
137
+ stats = validate_docx(out_path)
138
+ except (BadZipFile, KeyError):
139
+ out_path.unlink(missing_ok=True)
140
+ return None
141
+
142
+ return {
143
+ "file": str(out_path),
144
+ "title": item.get("title", ""),
145
+ "source_url": item["source_url"],
146
+ "file_url": item["file_url"],
147
+ "bytes": len(content),
148
+ **stats,
149
+ }
150
+
151
+
152
+ def crawl_template_pages(session: requests.Session, max_pages: int) -> list[dict[str, str]]:
153
+ queue = deque(START_URLS)
154
+ seen_pages: set[str] = set()
155
+ seen_file_urls: set[str] = set()
156
+ template_items: list[dict[str, str]] = []
157
+
158
+ while queue and len(seen_pages) < max_pages:
159
+ current = normalize_page(queue.popleft())
160
+ if current in seen_pages or not should_visit(current):
161
+ continue
162
+ seen_pages.add(current)
163
+ html = fetch(session, current)
164
+ if not html:
165
+ continue
166
+
167
+ for template in extract_templates(html):
168
+ file_url = template["file_url"]
169
+ if file_url in seen_file_urls:
170
+ continue
171
+ seen_file_urls.add(file_url)
172
+ template_items.append({
173
+ **template,
174
+ "source_url": current,
175
+ })
176
+
177
+ for link in extract_links(html):
178
+ normalized = normalize_page(link)
179
+ if normalized not in seen_pages and should_visit(normalized):
180
+ queue.append(normalized)
181
+
182
+ time.sleep(0.15)
183
+
184
+ return template_items
185
+
186
+
187
+ def main() -> int:
188
+ parser = argparse.ArgumentParser()
189
+ parser.add_argument("--out", type=Path, required=True)
190
+ parser.add_argument("--max-pages", type=int, default=80)
191
+ parser.add_argument("--max-mb", type=int, default=40)
192
+ parser.add_argument("--limit", type=int, default=0, help="0 means no explicit template limit")
193
+ args = parser.parse_args()
194
+
195
+ args.out.mkdir(parents=True, exist_ok=True)
196
+ session = requests.Session()
197
+
198
+ template_items = crawl_template_pages(session, max_pages=args.max_pages)
199
+ if args.limit > 0:
200
+ template_items = template_items[: args.limit]
201
+
202
+ manifest: list[dict] = []
203
+ for index, item in enumerate(template_items, 1):
204
+ downloaded = download_template(session, item, args.out, index, args.max_mb)
205
+ if not downloaded:
206
+ continue
207
+ manifest.append(downloaded)
208
+ print(f"kept {len(manifest):03d}: {Path(downloaded['file']).name}", flush=True)
209
+ time.sleep(0.1)
210
+
211
+ manifest_path = args.out / "microsoft-word-templates-manifest.json"
212
+ manifest_path.write_text(
213
+ json.dumps(
214
+ {
215
+ "count": len(manifest),
216
+ "items": manifest,
217
+ },
218
+ indent=2,
219
+ ensure_ascii=False,
220
+ ),
221
+ encoding="utf-8",
222
+ )
223
+ print(f"saved manifest: {manifest_path}", flush=True)
224
+ return 0 if manifest else 1
225
+
226
+
227
+ if __name__ == "__main__":
228
+ raise SystemExit(main())