regen.mde 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/LICENSE +16 -0
  2. package/README.md +295 -0
  3. package/bin/build-corpus-editor.js +81 -0
  4. package/bin/build-corpus.js +41 -0
  5. package/bin/postinstall.js +187 -0
  6. package/bin/regen-mdeditor-install.js +27 -0
  7. package/bin/regen-mdeditor-uninstall.js +19 -0
  8. package/bin/validate-katex.js +93 -0
  9. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
  10. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
  11. package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
  12. package/desktop/BuildCorpusEditor/Program.cs +81 -0
  13. package/desktop/BuildCorpusEditor/app.manifest +16 -0
  14. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  15. package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
  16. package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
  17. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  18. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  19. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  20. package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
  21. package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
  22. package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
  23. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
  24. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
  25. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
  26. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
  27. package/dist/windows-editor/WebView2Loader.dll +0 -0
  28. package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
  29. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
  30. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
  31. package/dist/windows-editor/wwwroot/index.html +22 -0
  32. package/editor-web/index.html +21 -0
  33. package/editor-web/src/main.jsx +399 -0
  34. package/editor-web/src/styles.css +602 -0
  35. package/editor-web/vite.config.js +13 -0
  36. package/examples/build-corpus.config.example.json +21 -0
  37. package/installer/install-regen-mde.ps1 +175 -0
  38. package/installer/regen-mde.nsi +81 -0
  39. package/package.json +86 -0
  40. package/pyproject.toml +33 -0
  41. package/requirements.txt +4 -0
  42. package/scripts/build-windows-editor.ps1 +47 -0
  43. package/scripts/package-windows-editor.ps1 +90 -0
  44. package/scripts/run-corpus.ps1 +28 -0
  45. package/scripts/run-editor-implementation-plane.ps1 +203 -0
  46. package/scripts/run-required-tests.ps1 +98 -0
  47. package/scripts/run-smoke.ps1 +28 -0
  48. package/src/build_corpus/__init__.py +3 -0
  49. package/src/build_corpus/docx_exporter.py +798 -0
  50. package/src/build_corpus/exporter.py +1195 -0
  51. package/src/build_corpus/ppt_exporter.py +532 -0
  52. package/src/build_corpus/templates/__init__.py +1 -0
  53. package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
  54. package/src/build_corpus/validate_assets.py +46 -0
  55. package/tools/audit_corpus.py +203 -0
  56. package/tools/collect_microsoft_word_templates.py +228 -0
  57. package/tools/collect_online_docx_corpus.py +272 -0
  58. package/tools/collect_online_pptx_corpus.py +252 -0
  59. package/tools/compare_pptx_inputs_outputs.py +87 -0
  60. package/tools/roundtrip_docx_corpus.py +171 -0
@@ -0,0 +1,203 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from zipfile import ZipFile
9
+ from xml.etree import ElementTree as ET
10
+
11
+
12
# WordprocessingML namespace, plus its "{uri}" prefix as used in ElementTree tag names.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
W = f"{{{W_NS}}}"

# Office Math (OMML) namespace. Equation elements (m:oMath / m:oMathPara) live
# here, not in the WordprocessingML namespace.
M_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
M = f"{{{M_NS}}}"

# Elements whose text payload decides whether a paragraph carries visible text.
TEXT_TAGS = {
    f"{W}t",
    f"{W}delText",
    f"{W}instrText",
}

# Elements that make a paragraph non-empty. Text elements only count when their
# text is non-blank; every other entry counts by mere presence.
CONTENT_TAGS = TEXT_TAGS | {
    f"{W}drawing",
    f"{W}object",
    f"{W}pict",
    # Equations: match them in the math namespace so math-only paragraphs count.
    f"{M}oMath",
    f"{M}oMathPara",
    # Original (WordprocessingML-qualified) spellings, retained so the set
    # remains a superset of the previous value.
    f"{W}oMath",
    f"{W}oMathPara",
    f"{W}noBreakHyphen",
    f"{W}softHyphen",
    f"{W}tab",
    f"{W}br",
    f"{W}cr",
}

# Markdown inline image: ![alt](target)
IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# An image directly followed or directly preceded by a non-space,
# non-punctuation character (i.e. glued to neighbouring text).
IMAGE_GLUE_RE = re.compile(
    r"!\[[^\]]*\]\([^)]+\)(?=[^\s<>)\].,;:!?])|(?<=[^\s<(\[.,;:!?])!\[[^\]]*\]\([^)]+\)"
)
# A run of four or more asterisks that neither starts at a line start nor ends at a line end.
FOUR_PLUS_STARS_RE = re.compile(r"(?<!^)\*{4,}(?!$)", re.M)
# A single backtick character.
ODD_BACKTICK_LINE_RE = re.compile(r"`")
41
+
42
@dataclass
class SourceStats:
    """Paragraph tallies collected from a source document."""

    # Number of paragraph elements seen, empty or not.
    total_paragraphs: int = 0
    # Number of those paragraphs that were judged to carry content.
    nonempty_paragraphs: int = 0
46
+
47
+
48
def local_name(tag: str) -> str:
    """Return *tag* without its ``{namespace}`` prefix; tags without one are returned as-is."""
    # rpartition yields ("", "", tag) when no "}" is present, so the tail is
    # always the right answer.
    _, _, bare = tag.rpartition("}")
    return bare
50
+
51
+
52
def paragraph_has_content(p: ET.Element) -> bool:
    """Tell whether paragraph element *p* (or any descendant) carries content.

    A text element counts only when its text is non-blank; any other tag listed
    in ``CONTENT_TAGS`` counts simply by being present.
    """
    presence_only_tags = CONTENT_TAGS - TEXT_TAGS
    for element in p.iter():
        tag = element.tag
        if tag in presence_only_tags:
            return True
        if tag in TEXT_TAGS and (element.text or "").strip():
            return True
    return False
59
+
60
+
61
def source_stats(docx_path: Path) -> SourceStats:
    """Count all paragraphs, and the non-empty ones, in a .docx file's main document part.

    Reads ``word/document.xml`` from the archive at *docx_path* and visits every
    paragraph element at any depth.
    """
    with ZipFile(docx_path) as archive:
        document_xml = archive.read("word/document.xml")
    root = ET.fromstring(document_xml)

    # One boolean per paragraph: True when the paragraph has content.
    content_flags = [paragraph_has_content(paragraph) for paragraph in root.iter(f"{W}p")]
    return SourceStats(
        total_paragraphs=len(content_flags),
        nonempty_paragraphs=sum(content_flags),
    )
70
+
71
+
72
def load_json(path: Path) -> dict | list:
    """Read *path* as UTF-8 text and return the decoded JSON value."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
74
+
75
+
76
def count_odd_backtick_lines(text: str) -> int:
    """Return how many lines of *text* contain an odd number of backtick characters."""
    return sum(1 for line in text.splitlines() if line.count("`") % 2 == 1)
82
+
83
+
84
def scan_markdown(md_path: Path) -> dict[str, int]:
    """Scan a Markdown file for the artefacts the audit reports on.

    Args:
        md_path: Markdown file to read (UTF-8; undecodable bytes are replaced).

    Returns:
        A dict with the keys ``image_count``, ``image_glue_count``,
        ``four_plus_stars_count`` and ``odd_backtick_line_count``.
    """
    text = md_path.read_text(encoding="utf-8", errors="replace")

    four_plus = 0
    for match in FOUR_PLUS_STARS_RE.finditer(text):
        # The match itself consists only of asterisks, so testing the match for
        # non-asterisk characters (as the previous code did) could never
        # succeed and the counter stayed at zero. Inspect the enclosing line
        # instead: a run is counted unless the whole line is just asterisks
        # and whitespace.
        line_start = text.rfind("\n", 0, match.start()) + 1
        line_end = text.find("\n", match.end())
        if line_end == -1:
            line_end = len(text)
        if text[line_start:line_end].strip("* \t\r"):
            four_plus += 1

    return {
        "image_count": len(IMAGE_RE.findall(text)),
        "image_glue_count": len(IMAGE_GLUE_RE.findall(text)),
        "four_plus_stars_count": four_plus,
        "odd_backtick_line_count": count_odd_backtick_lines(text),
    }
96
+
97
+
98
def audit_entry(entry: dict) -> dict:
    """Audit one batch-report entry against its source .docx and exported Markdown.

    Args:
        entry: Batch-report item. ``input`` and ``output`` are required paths;
            ``stats`` is optional and compared with the export report's stats.

    Returns:
        A result dict that always holds ``input``, ``output`` and ``problems``.
        When the source document, the Markdown output and ``export-report.json``
        (looked up next to the output) are all usable, the dict also carries the
        collected counts and the report's warnings.

    Raises:
        KeyError: If *entry* lacks ``input`` or ``output``.
    """
    # Local import: this module's top-level imports only bring in ZipFile.
    from zipfile import BadZipFile

    input_path = Path(entry["input"])
    output_path = Path(entry["output"])
    export_report_path = output_path.parent / "export-report.json"

    def rejected(problem: str) -> dict:
        # Minimal result used when the entry cannot be audited any further.
        return {
            "input": str(input_path),
            "output": str(output_path),
            "problems": [problem],
        }

    # A missing or corrupt source document is reported as a problem for this
    # entry instead of aborting the audit of every other entry.
    try:
        source = source_stats(input_path)
    except (OSError, BadZipFile, KeyError, ET.ParseError):
        return rejected("unreadable_source_docx")

    if not output_path.exists():
        return rejected("missing_markdown_output")
    if not export_report_path.exists():
        return rejected("missing_export_report")

    export_report = load_json(export_report_path)
    report_stats = export_report.get("stats", {})
    batch_stats = entry.get("stats", {})
    markdown = scan_markdown(output_path)

    rendered_block_count = sum(
        int(report_stats.get(key, 0))
        for key in ("paragraphs", "headings", "lists", "code_blocks")
    )
    report_images = int(report_stats.get("images", 0))

    problems: list[str] = []
    if report_stats != batch_stats:
        problems.append("batch_report_mismatch")
    if rendered_block_count != source.nonempty_paragraphs:
        problems.append("paragraph_count_mismatch")
    if markdown["image_count"] != report_images:
        problems.append("image_count_mismatch")
    if markdown["image_glue_count"]:
        problems.append("image_glue")
    if markdown["four_plus_stars_count"]:
        problems.append("four_plus_stars")
    if markdown["odd_backtick_line_count"]:
        problems.append("odd_backtick_lines")

    return {
        "input": str(input_path),
        "output": str(output_path),
        "source_total_paragraphs": source.total_paragraphs,
        "source_nonempty_paragraphs": source.nonempty_paragraphs,
        "report_rendered_blocks": rendered_block_count,
        "report_images": report_images,
        "markdown_images": markdown["image_count"],
        "image_glue_count": markdown["image_glue_count"],
        "four_plus_stars_count": markdown["four_plus_stars_count"],
        "odd_backtick_line_count": markdown["odd_backtick_line_count"],
        "warnings": list(report_stats.get("warnings", [])),
        "problems": problems,
    }
161
+
162
+
163
def summarize(results: list[dict]) -> dict:
    """Aggregate per-entry audit results into overall counts.

    Returns a dict with the number of audited entries, the number of entries
    that reported at least one problem, a per-problem tally, and up to 25 of
    the problematic entries as examples.
    """
    tally: dict[str, int] = {}
    for item in results:
        for name in item.get("problems", []):
            tally[name] = tally.get(name, 0) + 1

    flagged = [item for item in results if item.get("problems")]
    return {
        "files_audited": len(results),
        "files_with_problems": len(flagged),
        "problem_counts": tally,
        "problem_examples": flagged[:25],
    }
174
+
175
+
176
def main() -> int:
    """Command-line entry point: audit every entry of a batch report and write a JSON report.

    The report is written to ``--out`` when given, otherwise to
    ``corpus-audit-report.json`` next to the batch report. The summary is also
    printed to stdout. Returns 0; exits with a message when the batch report is
    not a JSON array.
    """
    parser = argparse.ArgumentParser(description="Audit build-corpus batch output against source .docx files.")
    parser.add_argument("--batch-report", required=True, help="Path to build-corpus-batch-report.json")
    parser.add_argument("--out", default="", help="Optional path for the audit JSON report")
    args = parser.parse_args()

    batch_report_path = Path(args.batch_report).resolve()
    entries = load_json(batch_report_path)
    if not isinstance(entries, list):
        raise SystemExit("Batch report must be a JSON array.")

    results = list(map(audit_entry, entries))
    summary = summarize(results)
    report_text = json.dumps(
        {
            "batch_report": str(batch_report_path),
            "summary": summary,
            "results": results,
        },
        indent=2,
    )

    if args.out:
        out_path = Path(args.out).resolve()
    else:
        out_path = batch_report_path.with_name("corpus-audit-report.json")
    out_path.write_text(report_text, encoding="utf-8")

    print(json.dumps(summary, indent=2))
    print(f"WROTE {out_path}")
    return 0
200
+
201
+
202
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
@@ -0,0 +1,228 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import hashlib
5
+ import json
6
+ import re
7
+ import time
8
+ from collections import deque
9
+ from pathlib import Path
10
+ from typing import Iterable
11
+ from urllib.parse import urljoin, urlparse
12
+ from zipfile import BadZipFile, ZipFile
13
+
14
+ import requests
15
+
16
+
17
# Site whose template pages are crawled, and the pages the crawl starts from.
BASE_URL = "https://word.cloud.microsoft"
START_URLS = [
    f"{BASE_URL}/create/en/{section}/"
    for section in ("templates", "resume-templates")
]

# Request headers sent with every HTTP request made by this script.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
}

# Site-relative /create/en/.../ links, found either after a bare quote or in an href.
PAGE_LINK_RE = re.compile(r'(?:"|href=")(/create/en/[^"#? ]+/)')
# CDN .docx download URLs embedded in the page as "fileUrl" values.
FILE_URL_RE = re.compile(r'"fileUrl":"(https://cdn\.create\.microsoft\.com/[^"]+\.docx)"')
# Same as above, but only where a "title" value follows immediately.
TITLE_RE = re.compile(
    r'"fileUrl":"(?P<url>https://cdn\.create\.microsoft\.com/[^"]+\.docx)","title":"(?P<title>[^"]+)"'
)
35
+
36
+
37
def normalize_page(url: str) -> str:
    """Reduce *url* to ``scheme://netloc/path``, dropping params, query and fragment."""
    scheme, netloc, path = urlparse(url)[:3]
    return scheme + "://" + netloc + path
40
+
41
+
42
def should_visit(url: str) -> bool:
    """Decide whether the crawler may fetch *url*.

    Only ``word.cloud.microsoft`` pages under ``/create/en/`` qualify, and
    paths containing any of the excluded section markers are rejected.
    """
    parsed = urlparse(url)
    in_scope = parsed.netloc == "word.cloud.microsoft" and parsed.path.startswith("/create/en/")
    if not in_scope:
        return False

    # The exclusion markers are compared case-insensitively.
    lowered_path = parsed.path.lower()
    excluded_markers = ("/blog/", "/copilot-", "/document-editor/", "/grammar-checker/", "/ai-", "/new/")
    return not any(marker in lowered_path for marker in excluded_markers)
54
+
55
+
56
def fetch(session: requests.Session, url: str) -> str | None:
    """GET *url* through *session* and return the response text.

    Returns ``None`` when the request raises a ``requests`` error or the
    status code is 400 or above.
    """
    try:
        response = session.get(url, headers=HEADERS, timeout=30)
        return response.text if response.status_code < 400 else None
    except requests.RequestException:
        return None
64
+
65
+
66
def extract_links(html: str) -> Iterable[str]:
    """Yield an absolute URL for every ``/create/en/.../`` link found in *html*."""
    relative_paths = (found.group(1) for found in PAGE_LINK_RE.finditer(html))
    yield from (urljoin(BASE_URL, relative) for relative in relative_paths)
69
+
70
+
71
def extract_templates(html: str) -> list[dict[str, str]]:
    """Collect the template download URLs (and titles, when present) embedded in *html*.

    Returns:
        One ``{"file_url", "title"}`` dict per ``fileUrl`` occurrence, in page
        order. ``title`` is ``""`` when no title directly follows the URL.
        Duplicates are not removed here.
    """

    def unescape(raw_url: str) -> str:
        # The page embeds URLs in JSON, where "&" may appear as the \u0026 escape.
        return raw_url.replace("\\u0026", "&")

    # Key the titles by the *unescaped* URL so the lookup below, which also
    # uses the unescaped form, finds them for URLs containing an ampersand.
    titles_by_url = {
        unescape(match.group("url")): match.group("title")
        for match in TITLE_RE.finditer(html)
    }

    found = []
    for match in FILE_URL_RE.finditer(html):
        url = unescape(match.group(1))
        found.append({
            "file_url": url,
            "title": titles_by_url.get(url, ""),
        })
    return found
81
+
82
+
83
def validate_docx(path: Path) -> dict:
    """Check that *path* is a Word package and return a few size statistics.

    Returns:
        ``omml_nodes`` (count of ``<m:oMath`` / ``<m:oMathPara`` start tags in
        the main document part), ``media_parts`` (archive members under
        ``word/media/``) and ``document_xml_bytes``.

    Raises:
        BadZipFile: If the file is not a zip archive, or lacks
            ``[Content_Types].xml`` or ``word/document.xml``.
    """
    required_parts = ("[Content_Types].xml", "word/document.xml")
    with ZipFile(path) as archive:
        members = set(archive.namelist())
        if any(part not in members for part in required_parts):
            raise BadZipFile("not a Word DOCX package")
        body = archive.read("word/document.xml")

    math_hits = re.findall(rb"<m:oMath\b|<m:oMathPara\b", body)
    media_count = sum(1 for member in members if member.startswith("word/media/"))
    return {
        "omml_nodes": len(math_hits),
        "media_parts": media_count,
        "document_xml_bytes": len(body),
    }
94
+
95
+
96
def safe_name(index: int, title: str, content: bytes, file_url: str) -> str:
    """Build the output file name ``NNN-<sha256 prefix>-<label>.docx``.

    The label is the stripped *title*, or the last path segment of *file_url*
    when the title is blank. Runs of characters outside ``A-Za-z0-9._ -``
    become ``_`` and leading/trailing spaces, dots and underscores are removed.
    """
    label = title.strip()
    if not label:
        label = Path(urlparse(file_url).path).name
    label = re.sub(r"[^A-Za-z0-9._ -]+", "_", label).strip(" ._")
    extension = "" if label.lower().endswith(".docx") else ".docx"
    fingerprint = hashlib.sha256(content).hexdigest()[:10]
    return f"{index:03d}-{fingerprint}-{label}{extension}"
103
+
104
+
105
def download_template(
    session: requests.Session,
    item: dict[str, str],
    out_dir: Path,
    index: int,
    max_mb: int,
) -> dict | None:
    """Download one template into *out_dir* and describe it for the manifest.

    Returns ``None`` when the request fails, the status is 400 or above, the
    body exceeds *max_mb* mebibytes, the body does not start with the zip
    signature, or the saved file fails ``validate_docx`` (in which case the
    file is removed again). Otherwise returns the manifest record.
    """
    file_url = item["file_url"]
    byte_limit = max_mb * 1024 * 1024
    try:
        with session.get(file_url, headers=HEADERS, timeout=60, stream=True) as response:
            if response.status_code >= 400:
                return None
            buffer = bytearray()
            for piece in response.iter_content(chunk_size=1024 * 256):
                buffer += piece
                # Give up as soon as the size cap is crossed.
                if len(buffer) > byte_limit:
                    return None
            content = bytes(buffer)
    except requests.RequestException:
        return None

    if content[:2] != b"PK":
        return None

    title = item.get("title", "")
    out_path = out_dir / safe_name(index, title, content, file_url)
    out_path.write_bytes(content)
    try:
        stats = validate_docx(out_path)
    except (BadZipFile, KeyError):
        out_path.unlink(missing_ok=True)
        return None

    record = {
        "file": str(out_path),
        "title": title,
        "source_url": item["source_url"],
        "file_url": file_url,
        "bytes": len(content),
    }
    record.update(stats)
    return record
150
+
151
+
152
def crawl_template_pages(session: requests.Session, max_pages: int) -> list[dict[str, str]]:
    """Breadth-first crawl from ``START_URLS`` collecting template download entries.

    Stops when the queue is empty or *max_pages* distinct pages have been
    visited. Each returned item is a template dict extended with the
    ``source_url`` of the page it was first seen on; file URLs are unique.
    """
    pending = deque(START_URLS)
    visited: set[str] = set()
    known_file_urls: set[str] = set()
    collected: list[dict[str, str]] = []

    while pending and len(visited) < max_pages:
        page_url = normalize_page(pending.popleft())
        if page_url in visited or not should_visit(page_url):
            continue
        visited.add(page_url)

        html = fetch(session, page_url)
        if not html:
            continue

        for template in extract_templates(html):
            if template["file_url"] not in known_file_urls:
                known_file_urls.add(template["file_url"])
                collected.append({**template, "source_url": page_url})

        pending.extend(
            candidate
            for candidate in map(normalize_page, extract_links(html))
            if candidate not in visited and should_visit(candidate)
        )

        # Short pause between successfully fetched pages.
        time.sleep(0.15)

    return collected
185
+
186
+
187
def main() -> int:
    """Command-line entry point: crawl template pages, download the files, write a manifest.

    Returns 0 when at least one template was kept, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--max-pages", type=int, default=80)
    parser.add_argument("--max-mb", type=int, default=40)
    parser.add_argument("--limit", type=int, default=0, help="0 means no explicit template limit")
    args = parser.parse_args()

    out_dir = args.out
    out_dir.mkdir(parents=True, exist_ok=True)
    session = requests.Session()

    template_items = crawl_template_pages(session, max_pages=args.max_pages)
    if args.limit > 0:
        template_items = template_items[: args.limit]

    manifest: list[dict] = []
    for index, item in enumerate(template_items, 1):
        downloaded = download_template(session, item, out_dir, index, args.max_mb)
        if downloaded:
            manifest.append(downloaded)
            print(f"kept {len(manifest):03d}: {Path(downloaded['file']).name}", flush=True)
            time.sleep(0.1)

    manifest_path = out_dir / "microsoft-word-templates-manifest.json"
    manifest_text = json.dumps(
        {
            "count": len(manifest),
            "items": manifest,
        },
        indent=2,
        ensure_ascii=False,
    )
    manifest_path.write_text(manifest_text, encoding="utf-8")
    print(f"saved manifest: {manifest_path}", flush=True)

    if manifest:
        return 0
    return 1
225
+
226
+
227
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
@@ -0,0 +1,272 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import hashlib
5
+ import json
6
+ import re
7
+ import sys
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Iterable
11
+ from urllib.parse import parse_qs, quote_plus, unquote, urlparse
12
+ from zipfile import BadZipFile, ZipFile
13
+
14
+ import requests
15
+
16
+
17
# Phrases searched on their own, and topics searched together with "equation".
_SINGLE_PHRASES = (
    "equation",
    "equations",
    "Cambria Math",
    "math equations",
    "Microsoft Equation",
    "OMML",
    "quadratic equation",
    "integral",
    "matrix",
    "calculus",
)
_EQUATION_TOPICS = ("physics", "engineering", "statistics", "algebra", "geometry")

# Web-search queries, each restricted to .docx results.
DEFAULT_QUERIES = [f'filetype:docx "{phrase}"' for phrase in _SINGLE_PHRASES] + [
    f'filetype:docx "{topic}" "equation"' for topic in _EQUATION_TOPICS
]

# Repository-search queries sent to the GitHub API.
GITHUB_REPO_QUERIES = [
    "docx equation math",
    "docx equations latex",
    "docx omml latex",
    "word equation docx",
    "markdown docx equation",
    "pandoc docx equation",
    "math docx converter",
    "equation-heavy docx",
    "ooxml math docx",
    "Cambria Math docx",
]

# Request headers sent with every HTTP request made by this script.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
}

# Content types accepted for a download without inspecting the URL.
DOCX_CT_HINTS = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/octet-stream",
    "application/zip",
    "binary/octet-stream",
)
61
+
62
+
63
def _unwrap_search_redirect(url: str) -> str | None:
    """Return the target carried by a Bing ``/ck/a`` or DuckDuckGo ``/l/`` link, else ``None``."""
    target = None
    parsed = urlparse(url)
    if parsed.netloc.endswith("bing.com") and parsed.path == "/ck/a":
        params = parse_qs(parsed.query)
        for key in ("u", "r"):
            if key in params:
                target = params[key][0]
                break

    # A DuckDuckGo redirect may be the link itself or the target just unwrapped.
    candidate = url if target is None else target
    if "duckduckgo.com/l/?" in candidate:
        params = parse_qs(urlparse(candidate).query)
        if "uddg" in params:
            target = params["uddg"][0]
    return target


def extract_urls(html: str) -> list[str]:
    """Extract every absolute http(s) URL from *html*, unwrapping search redirects.

    Redirect links have their query string parsed *before* any percent-decoding,
    so an encoded ``&`` or ``?`` inside the target survives intact and the
    target is decoded exactly once. All other URLs are percent-decoded and
    HTML-unescaped (``&amp;``) as before. Trailing ``.``, ``,`` and ``;`` are
    stripped from every result.

    Returns:
        The URLs in document order; duplicates are kept.
    """
    urls: list[str] = []
    for raw in re.findall(r'https?://[^"\'<>\s)]+', html):
        # Only undo the HTML entity here; parse_qs performs the single
        # percent-decoding pass on the redirect target.
        target = _unwrap_search_redirect(raw.replace("&amp;", "&"))
        if target is None:
            target = unquote(raw).replace("&amp;", "&")
        urls.append(target.rstrip(".,;"))
    return urls
81
+
82
+
83
def search(query: str, pages: int = 2) -> Iterable[str]:
    """Yield URLs found on Bing and DuckDuckGo result pages for *query*.

    For each engine, *pages* result pages are requested with offsets 1, 11,
    21, ... Pages that fail or return a status of 400 or above are skipped.
    """
    url_templates = (
        "https://www.bing.com/search?q={query}&first={offset}",
        "https://html.duckduckgo.com/html/?q={query}&s={offset}",
    )
    for url_template in url_templates:
        for page_number in range(pages):
            request_url = url_template.format(
                query=quote_plus(query),
                offset=page_number * 10 + 1,
            )
            try:
                response = requests.get(request_url, headers=HEADERS, timeout=20)
                if response.status_code < 400:
                    yield from extract_urls(response.text)
                    # Pause after each successfully processed results page.
                    time.sleep(0.4)
            except requests.RequestException:
                continue
100
+
101
+
102
def github_json(url: str) -> dict | None:
    """GET *url* from the GitHub API and return the decoded JSON object.

    Returns ``None`` when the request fails, the status is 400 or above, the
    body is not valid JSON, or the decoded value is not a JSON object (callers
    use ``.get`` on the result).
    """
    request_headers = {**HEADERS, "Accept": "application/vnd.github+json"}
    try:
        response = requests.get(url, headers=request_headers, timeout=25)
        if response.status_code >= 400:
            return None
        payload = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error does not derive from RequestException.
        return None
    return payload if isinstance(payload, dict) else None
110
+
111
+
112
def github_docx_urls(max_repos_per_query: int = 8) -> Iterable[str]:
    """Yield raw-download URLs for .docx files found in GitHub repositories.

    Runs each query in ``GITHUB_REPO_QUERIES`` against the repository search
    API, lists the full tree of every repository not seen before (on its
    default branch, falling back to ``main``) and yields one
    ``raw.githubusercontent.com`` URL per ``.docx`` blob.
    """
    visited_repos: set[str] = set()
    for query in GITHUB_REPO_QUERIES:
        listing = github_json(
            "https://api.github.com/search/repositories"
            f"?q={quote_plus(query)}&per_page={max_repos_per_query}"
        )
        if not listing:
            continue

        for repo in listing.get("items", []):
            full_name = repo.get("full_name")
            branch = repo.get("default_branch") or "main"
            if not full_name or full_name in visited_repos:
                continue
            visited_repos.add(full_name)

            tree = github_json(f"https://api.github.com/repos/{full_name}/git/trees/{branch}?recursive=1")
            if not tree:
                continue

            for node in tree.get("tree", []):
                path = node.get("path", "")
                if node.get("type") != "blob" or not path.lower().endswith(".docx"):
                    continue
                yield f"https://raw.githubusercontent.com/{full_name}/{branch}/{quote_path(path)}"

            # Pause after each repository whose tree was listed.
            time.sleep(0.25)
137
+
138
+
139
def quote_path(path: str) -> str:
    """Percent-encode each ``/``-separated segment of *path*, writing spaces as ``%20``."""
    encoded_segments = []
    for segment in path.split("/"):
        # quote_plus turns spaces into "+" (and literal "+" into %2B), so every
        # remaining "+" stands for a space.
        encoded_segments.append(quote_plus(segment).replace("+", "%20"))
    return "/".join(encoded_segments)
141
+
142
+
143
def looks_like_docx_url(url: str) -> bool:
    """Return True when *url* mentions ``.docx`` and none of the known non-download markers."""
    lowered = url.lower()
    if ".docx" not in lowered:
        return False
    for marker in ("?format=pdf", "/view?", "webcache"):
        if marker in lowered:
            return False
    return True
146
+
147
+
148
def safe_name(index: int, url: str, content: bytes) -> str:
    """Build the output file name ``NNN-<sha256 prefix>-<label>.docx``.

    The label is the percent-decoded last path segment of *url*, or
    ``online-NNN.docx`` when the path has none. Runs of characters outside
    ``A-Za-z0-9._ -`` become ``_``.
    """
    decoded_path = unquote(urlparse(url).path)
    label = Path(decoded_path).name
    if not label:
        label = f"online-{index:03d}.docx"
    label = re.sub(r"[^A-Za-z0-9._ -]+", "_", label)
    extension = "" if label.lower().endswith(".docx") else ".docx"
    fingerprint = hashlib.sha256(content).hexdigest()[:10]
    return f"{index:03d}-{fingerprint}-{label}{extension}"
156
+
157
+
158
def validate_docx(path: Path) -> dict:
    """Check that *path* is a Word package and return a few size statistics.

    Returns:
        ``omml_nodes`` (count of ``<m:oMath`` / ``<m:oMathPara`` start tags in
        the main document part), ``media_parts`` (archive members under
        ``word/media/``) and ``document_xml_bytes``.

    Raises:
        BadZipFile: If the file is not a zip archive, or lacks
            ``[Content_Types].xml`` or ``word/document.xml``.
    """
    with ZipFile(path) as package:
        part_names = set(package.namelist())
        has_required_parts = {"[Content_Types].xml", "word/document.xml"} <= part_names
        if not has_required_parts:
            raise BadZipFile("not a Word DOCX package")
        document_bytes = package.read("word/document.xml")

    equation_tags = re.findall(rb"<m:oMath\b|<m:oMathPara\b", document_bytes)
    media_names = [name for name in part_names if name.startswith("word/media/")]
    return {
        "omml_nodes": len(equation_tags),
        "media_parts": len(media_names),
        "document_xml_bytes": len(document_bytes),
    }
169
+
170
+
171
def download(url: str, out_dir: Path, index: int, max_mb: int) -> dict | None:
    """Download *url* into *out_dir* if it turns out to be a Word document.

    Returns ``None`` when the request fails, the status is 400 or above, the
    content type is unrecognised and the final URL does not mention ``.docx``,
    the body exceeds *max_mb* mebibytes, the body does not start with the zip
    signature, or the saved file fails ``validate_docx`` (in which case the
    file is removed again). Otherwise returns the manifest record.
    """
    byte_limit = max_mb * 1024 * 1024
    try:
        with requests.get(url, headers=HEADERS, timeout=30, stream=True, allow_redirects=True) as response:
            if response.status_code >= 400:
                return None

            content_type = response.headers.get("content-type", "").split(";")[0].strip().lower()
            type_is_acceptable = (
                not content_type
                or content_type in DOCX_CT_HINTS
                or "wordprocessingml" in content_type
            )
            # An unexpected content type is tolerated only when the final URL
            # still mentions .docx.
            if not type_is_acceptable and ".docx" not in response.url.lower():
                return None

            buffer = bytearray()
            for piece in response.iter_content(chunk_size=1024 * 256):
                buffer += piece
                if len(buffer) > byte_limit:
                    return None
            content = bytes(buffer)
            final_url = response.url
    except requests.RequestException:
        return None

    if content[:2] != b"PK":
        return None

    out_path = out_dir / safe_name(index, final_url, content)
    out_path.write_bytes(content)
    try:
        stats = validate_docx(out_path)
    except (BadZipFile, KeyError):
        out_path.unlink(missing_ok=True)
        return None

    record = {
        "file": str(out_path),
        "source_url": final_url,
        "bytes": len(content),
        "content_type": content_type,
    }
    record.update(stats)
    return record
212
+
213
+
214
def main() -> int:
    """Command-line entry point: collect .docx files from GitHub and web searches.

    GitHub repositories are tried first, then each query in
    ``DEFAULT_QUERIES``, until ``--target`` files have been kept. With
    ``--prefer-equations``, search results without any equation are discarded
    while fewer than half the target has been collected. A manifest of the
    kept files is written to the output directory.

    Returns:
        0 when the target number of files was reached, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--target", type=int, default=50)
    parser.add_argument("--max-mb", type=int, default=40)
    parser.add_argument("--prefer-equations", action="store_true")
    args = parser.parse_args()

    args.out.mkdir(parents=True, exist_ok=True)
    seen: set[str] = set()
    manifest: list[dict] = []
    # Continue numbering after any .docx files already present in the output directory.
    index = len(list(args.out.glob("*.docx"))) + 1

    print("source: github repositories", flush=True)
    for url in github_docx_urls():
        if len(manifest) >= args.target:
            break
        normalized = url.split("#", 1)[0]
        if normalized in seen:
            continue
        seen.add(normalized)
        item = download(normalized, args.out, index, args.max_mb)
        if not item:
            continue
        manifest.append(item)
        index += 1
        print(f"kept {len(manifest):02d}: {Path(item['file']).name} omml={item['omml_nodes']}", flush=True)

    for query in DEFAULT_QUERIES:
        # Check before starting a query: iterating search() issues an HTTP
        # request before the first URL is yielded, which would be wasted when
        # the target has already been met.
        if len(manifest) >= args.target:
            break
        print(f"search: {query}", flush=True)
        for url in search(query):
            if len(manifest) >= args.target:
                break
            if not looks_like_docx_url(url):
                continue
            normalized = url.split("#", 1)[0]
            if normalized in seen:
                continue
            seen.add(normalized)
            item = download(normalized, args.out, index, args.max_mb)
            if not item:
                continue
            if args.prefer_equations and item["omml_nodes"] == 0 and len(manifest) < args.target // 2:
                # Equation-free file while still filling the first half: drop it.
                Path(item["file"]).unlink(missing_ok=True)
                continue
            manifest.append(item)
            index += 1
            print(f"kept {len(manifest):02d}: {Path(item['file']).name} omml={item['omml_nodes']}", flush=True)

    manifest_path = args.out / "online-docx-manifest.json"
    manifest_path.write_text(json.dumps({"count": len(manifest), "items": manifest}, indent=2), encoding="utf-8")
    print(f"saved manifest: {manifest_path}", flush=True)
    return 0 if len(manifest) >= args.target else 1
269
+
270
+
271
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)