regen.mde 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +16 -0
- package/README.md +295 -0
- package/bin/build-corpus-editor.js +81 -0
- package/bin/build-corpus.js +41 -0
- package/bin/postinstall.js +187 -0
- package/bin/regen-mdeditor-install.js +27 -0
- package/bin/regen-mdeditor-uninstall.js +19 -0
- package/bin/validate-katex.js +93 -0
- package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
- package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
- package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
- package/desktop/BuildCorpusEditor/Program.cs +81 -0
- package/desktop/BuildCorpusEditor/app.manifest +16 -0
- package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
- package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
- package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
- package/dist/windows-editor/WebView2Loader.dll +0 -0
- package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
- package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
- package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
- package/dist/windows-editor/wwwroot/index.html +22 -0
- package/editor-web/index.html +21 -0
- package/editor-web/src/main.jsx +399 -0
- package/editor-web/src/styles.css +602 -0
- package/editor-web/vite.config.js +13 -0
- package/examples/build-corpus.config.example.json +21 -0
- package/installer/install-regen-mde.ps1 +175 -0
- package/installer/regen-mde.nsi +81 -0
- package/package.json +86 -0
- package/pyproject.toml +33 -0
- package/requirements.txt +4 -0
- package/scripts/build-windows-editor.ps1 +47 -0
- package/scripts/package-windows-editor.ps1 +90 -0
- package/scripts/run-corpus.ps1 +28 -0
- package/scripts/run-editor-implementation-plane.ps1 +203 -0
- package/scripts/run-required-tests.ps1 +98 -0
- package/scripts/run-smoke.ps1 +28 -0
- package/src/build_corpus/__init__.py +3 -0
- package/src/build_corpus/docx_exporter.py +798 -0
- package/src/build_corpus/exporter.py +1195 -0
- package/src/build_corpus/ppt_exporter.py +532 -0
- package/src/build_corpus/templates/__init__.py +1 -0
- package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
- package/src/build_corpus/validate_assets.py +46 -0
- package/tools/audit_corpus.py +203 -0
- package/tools/collect_microsoft_word_templates.py +228 -0
- package/tools/collect_online_docx_corpus.py +272 -0
- package/tools/collect_online_pptx_corpus.py +252 -0
- package/tools/compare_pptx_inputs_outputs.py +87 -0
- package/tools/roundtrip_docx_corpus.py +171 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from zipfile import ZipFile
|
|
9
|
+
from xml.etree import ElementTree as ET
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# XML namespaces used inside word/document.xml.  WordprocessingML elements
# (w:p, w:t, ...) and Office Math elements (m:oMath, ...) are declared in
# different namespaces, so each gets its own ElementTree "{uri}" prefix.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
M_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
W = f"{{{W_NS}}}"
M = f"{{{M_NS}}}"

# Elements whose character data counts as paragraph text.
TEXT_TAGS = {
    f"{W}t",
    f"{W}delText",
    f"{W}instrText",
}

# Elements that make a paragraph non-empty: the text carriers above plus
# elements whose mere presence is content.
CONTENT_TAGS = TEXT_TAGS | {
    f"{W}drawing",
    f"{W}object",
    f"{W}pict",
    # Equations belong to the Office Math namespace.  Qualifying them with
    # the WordprocessingML namespace (as before) can never match a parsed
    # element, which made equation-only paragraphs look empty.
    f"{M}oMath",
    f"{M}oMathPara",
    f"{W}noBreakHyphen",
    f"{W}softHyphen",
    f"{W}tab",
    f"{W}br",
    f"{W}cr",
}
|
|
33
|
+
|
|
34
|
+
# A Markdown inline image: ![alt](target).
IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# An inline image that touches neighbouring text: either directly followed by
# a character outside the whitespace/bracket/punctuation set of the lookahead,
# or directly preceded by a character outside the set of the lookbehind.
IMAGE_GLUE_RE = re.compile(
    r"!\[[^\]]*\]\([^)]+\)(?=[^\s<>)\].,;:!?])|(?<=[^\s<(\[.,;:!?])!\[[^\]]*\]\([^)]+\)"
)
# A run of four or more asterisks that does not begin at a line start and is
# not immediately followed by a line end (re.M makes ^ and $ line-based).
FOUR_PLUS_STARS_RE = re.compile(r"(?<!^)\*{4,}(?!$)", re.M)
# Matches a single backtick.
# NOTE(review): count_odd_backtick_lines() counts backticks with str.count and
# no visible code references this pattern -- confirm whether it is still needed.
ODD_BACKTICK_LINE_RE = re.compile(r"`")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class SourceStats:
    """Paragraph counters gathered from one source .docx document."""

    # Number of w:p elements seen in word/document.xml.
    total_paragraphs: int = 0
    # Number of those paragraphs for which paragraph_has_content() was true.
    nonempty_paragraphs: int = 0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def local_name(tag: str) -> str:
    """Return *tag* without its ``{namespace}`` prefix.

    ElementTree writes qualified names as ``{uri}local``; the local part is
    whatever follows the last ``}``.  A tag without a brace is returned as is.
    """
    _, _, tail = tag.rpartition("}")
    return tail
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def paragraph_has_content(p: ET.Element) -> bool:
    """Tell whether paragraph element *p* carries any content.

    A paragraph counts as non-empty when it or any descendant is either a
    text element (TEXT_TAGS) with non-whitespace text, or one of the other
    CONTENT_TAGS elements, whose presence alone is enough.
    """
    presence_only_tags = CONTENT_TAGS - TEXT_TAGS
    return any(
        element.tag in presence_only_tags
        or (element.tag in TEXT_TAGS and bool((element.text or "").strip()))
        for element in p.iter()
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def source_stats(docx_path: Path) -> SourceStats:
    """Count all paragraphs and the non-empty paragraphs of a .docx file.

    Reads ``word/document.xml`` from the package at *docx_path*, parses it and
    visits every ``w:p`` element at any depth.
    """
    with ZipFile(docx_path) as archive:
        document_xml = archive.read("word/document.xml")
    paragraphs = list(ET.fromstring(document_xml).iter(f"{W}p"))
    return SourceStats(
        total_paragraphs=len(paragraphs),
        nonempty_paragraphs=sum(1 for paragraph in paragraphs if paragraph_has_content(paragraph)),
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def load_json(path: Path) -> dict | list:
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def count_odd_backtick_lines(text: str) -> int:
    """Return how many lines of *text* contain an odd number of backticks."""
    return sum(1 for row in text.splitlines() if row.count("`") % 2 == 1)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def scan_markdown(md_path: Path) -> dict[str, int]:
    """Scan an exported Markdown file for known formatting defects.

    The file is read as UTF-8 with undecodable bytes replaced.

    Returns:
        A dict with four counters:
        ``image_count`` - number of inline images,
        ``image_glue_count`` - inline images touching neighbouring text,
        ``four_plus_stars_count`` - runs of four or more asterisks that sit
        on a line which also holds other content,
        ``odd_backtick_line_count`` - lines with an odd number of backticks.
    """
    text = md_path.read_text(encoding="utf-8", errors="replace")
    four_plus = 0
    for match in FOUR_PLUS_STARS_RE.finditer(text):
        # The matched run is made only of asterisks, so stripping asterisks
        # from the match itself (as before) always gave "" and nothing was
        # ever counted.  Look at the whole line instead: a line holding only
        # asterisks and whitespace is a thematic break, anything else is a
        # real emphasis pile-up.
        line_start = text.rfind("\n", 0, match.start()) + 1
        line_end = text.find("\n", match.end())
        if line_end == -1:
            line_end = len(text)
        if text[line_start:line_end].strip("* \t\r"):
            four_plus += 1
    return {
        "image_count": len(IMAGE_RE.findall(text)),
        "image_glue_count": len(IMAGE_GLUE_RE.findall(text)),
        "four_plus_stars_count": four_plus,
        "odd_backtick_line_count": count_odd_backtick_lines(text),
    }
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def audit_entry(entry: dict) -> dict:
    """Audit one batch-report entry against its source document.

    *entry* must provide ``"input"`` (source .docx) and ``"output"`` (exported
    Markdown); its optional ``"stats"`` is compared with the ``"stats"`` of
    the ``export-report.json`` stored next to the output.

    Returns:
        A record with the two paths and a ``"problems"`` list.  When the
        Markdown output or the export report is missing, the record holds
        only those three keys; otherwise it also carries the counters that
        were compared and the report's warnings.
    """
    input_path = Path(entry["input"])
    output_path = Path(entry["output"])
    export_report_path = output_path.parent / "export-report.json"

    problems: list[str] = []
    # The source is read before anything else, so an unreadable input raises
    # here regardless of the state of the output files.
    source = source_stats(input_path)

    # Either missing file ends the audit with the same minimal record.
    missing_code = None
    if not output_path.exists():
        missing_code = "missing_markdown_output"
    elif not export_report_path.exists():
        missing_code = "missing_export_report"
    if missing_code is not None:
        problems.append(missing_code)
        return {
            "input": str(input_path),
            "output": str(output_path),
            "problems": problems,
        }

    report_stats = load_json(export_report_path).get("stats", {})
    batch_stats = entry.get("stats", {})
    markdown = scan_markdown(output_path)

    block_keys = ("paragraphs", "headings", "lists", "code_blocks")
    rendered_block_count = sum(int(report_stats.get(key, 0)) for key in block_keys)
    report_images = int(report_stats.get("images", 0))

    # (problem code, condition that triggers it) in reporting order.
    checks = (
        ("batch_report_mismatch", report_stats != batch_stats),
        ("paragraph_count_mismatch", rendered_block_count != source.nonempty_paragraphs),
        ("image_count_mismatch", markdown["image_count"] != report_images),
        ("image_glue", bool(markdown["image_glue_count"])),
        ("four_plus_stars", bool(markdown["four_plus_stars_count"])),
        ("odd_backtick_lines", bool(markdown["odd_backtick_line_count"])),
    )
    problems.extend(code for code, triggered in checks if triggered)

    return {
        "input": str(input_path),
        "output": str(output_path),
        "source_total_paragraphs": source.total_paragraphs,
        "source_nonempty_paragraphs": source.nonempty_paragraphs,
        "report_rendered_blocks": rendered_block_count,
        "report_images": report_images,
        "markdown_images": markdown["image_count"],
        "image_glue_count": markdown["image_glue_count"],
        "four_plus_stars_count": markdown["four_plus_stars_count"],
        "odd_backtick_line_count": markdown["odd_backtick_line_count"],
        "warnings": list(report_stats.get("warnings", [])),
        "problems": problems,
    }
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def summarize(results: list[dict]) -> dict:
    """Aggregate per-file audit records into one summary.

    Returns:
        A dict with the number of audited files, the number of files that
        have at least one problem, a per-problem-code occurrence count, and
        up to 25 of the problematic records as examples.
    """
    tally: dict[str, int] = {}
    for record in results:
        for code in record.get("problems", []):
            tally[code] = tally.get(code, 0) + 1

    flagged = [record for record in results if record.get("problems")]
    return {
        "files_audited": len(results),
        "files_with_problems": len(flagged),
        "problem_counts": tally,
        "problem_examples": flagged[:25],
    }
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def main() -> int:
    """Command-line entry point: audit every entry of a batch report.

    Writes the full audit (summary plus per-file results) as JSON to ``--out``
    or, when that is not given, to ``corpus-audit-report.json`` next to the
    batch report, prints the summary, and returns 0.  Exits with a message
    when the batch report is not a JSON array.
    """
    parser = argparse.ArgumentParser(description="Audit build-corpus batch output against source .docx files.")
    parser.add_argument("--batch-report", required=True, help="Path to build-corpus-batch-report.json")
    parser.add_argument("--out", default="", help="Optional path for the audit JSON report")
    args = parser.parse_args()

    batch_report_path = Path(args.batch_report).resolve()
    entries = load_json(batch_report_path)
    if not isinstance(entries, list):
        raise SystemExit("Batch report must be a JSON array.")

    results = list(map(audit_entry, entries))
    summary = summarize(results)
    report_text = json.dumps(
        {
            "batch_report": str(batch_report_path),
            "summary": summary,
            "results": results,
        },
        indent=2,
    )

    if args.out:
        out_path = Path(args.out).resolve()
    else:
        out_path = batch_report_path.with_name("corpus-audit-report.json")
    out_path.write_text(report_text, encoding="utf-8")

    print(json.dumps(summary, indent=2))
    print(f"WROTE {out_path}")
    return 0
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
from collections import deque
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterable
|
|
11
|
+
from urllib.parse import urljoin, urlparse
|
|
12
|
+
from zipfile import BadZipFile, ZipFile
|
|
13
|
+
|
|
14
|
+
import requests
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Site root that relative page links are resolved against.
BASE_URL = "https://word.cloud.microsoft"
# Pages the crawl starts from.
START_URLS = [
    "https://word.cloud.microsoft/create/en/templates/",
    "https://word.cloud.microsoft/create/en/resume-templates/",
]

# Browser-like User-Agent sent with every request of this tool.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

# Captures a site-relative path under /create/en/ that ends with a slash and
# is preceded by a double quote or by href=".
PAGE_LINK_RE = re.compile(r'(?:"|href=")(/create/en/[^"#? ]+/)')
# Captures the .docx URL on cdn.create.microsoft.com from a "fileUrl" field.
FILE_URL_RE = re.compile(r'"fileUrl":"(https://cdn\.create\.microsoft\.com/[^"]+\.docx)"')
# Same URL together with the "title" field that immediately follows it.
TITLE_RE = re.compile(
    r'"fileUrl":"(?P<url>https://cdn\.create\.microsoft\.com/[^"]+\.docx)","title":"(?P<title>[^"]+)"'
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def normalize_page(url: str) -> str:
    """Return *url* reduced to scheme, host and path.

    Parameters, query string and fragment are dropped so that the same page
    reached through different links compares equal.
    """
    parts = urlparse(url)
    return "".join((parts.scheme, "://", parts.netloc, parts.path))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def should_visit(url: str) -> bool:
    """Decide whether the crawler should fetch *url*.

    Only pages on ``word.cloud.microsoft`` below ``/create/en/`` qualify, and
    paths containing one of the excluded markers (compared in lower case) are
    rejected.
    """
    parts = urlparse(url)
    on_site = parts.netloc == "word.cloud.microsoft" and parts.path.startswith("/create/en/")
    if not on_site:
        return False

    lowered_path = parts.path.lower()
    excluded_markers = ("/blog/", "/copilot-", "/document-editor/", "/grammar-checker/", "/ai-", "/new/")
    return not any(marker in lowered_path for marker in excluded_markers)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def fetch(session: requests.Session, url: str) -> str | None:
    """GET *url* through *session* and return the response body as text.

    Returns ``None`` when the request raises a ``requests`` error or the
    status code is 400 or above.
    """
    try:
        reply = session.get(url, headers=HEADERS, timeout=30)
        return reply.text if reply.status_code < 400 else None
    except requests.RequestException:
        return None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def extract_links(html: str) -> Iterable[str]:
    """Yield the absolute URL of every page link PAGE_LINK_RE finds in *html*."""
    yield from (urljoin(BASE_URL, hit.group(1)) for hit in PAGE_LINK_RE.finditer(html))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def extract_templates(html: str) -> list[dict[str, str]]:
    """Collect the template download entries embedded in a page.

    Returns:
        One ``{"file_url": ..., "title": ...}`` dict per FILE_URL_RE match, in
        page order.  ``title`` is the title TITLE_RE found directly after the
        same URL, or ``""`` when there is none.
    """

    def unescape(raw_url: str) -> str:
        # The page embeds URLs as JSON text, where "&" appears as \u0026.
        return raw_url.replace("\\u0026", "&")

    # Keys must be unescaped exactly like the URLs they are looked up with;
    # keying on the raw text loses the title of any URL containing \u0026.
    titles_by_url = {
        unescape(match.group("url")): match.group("title")
        for match in TITLE_RE.finditer(html)
    }
    found = []
    for match in FILE_URL_RE.finditer(html):
        url = unescape(match.group(1))
        found.append({
            "file_url": url,
            "title": titles_by_url.get(url, ""),
        })
    return found
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def validate_docx(path: Path) -> dict:
    """Check that *path* is a Word package and return simple statistics.

    Returns:
        ``omml_nodes`` (number of ``<m:oMath`` / ``<m:oMathPara`` start tags
        in the main document part), ``media_parts`` (entries below
        ``word/media/``) and ``document_xml_bytes`` (size of the main part).

    Raises:
        BadZipFile: when the archive lacks ``[Content_Types].xml`` or
            ``word/document.xml`` (or is not a zip file at all).
    """
    required_parts = {"[Content_Types].xml", "word/document.xml"}
    with ZipFile(path) as archive:
        names = set(archive.namelist())
        if not required_parts <= names:
            raise BadZipFile("not a Word DOCX package")
        document_xml = archive.read("word/document.xml")

    equation_tags = re.findall(rb"<m:oMath\b|<m:oMathPara\b", document_xml)
    return {
        "omml_nodes": len(equation_tags),
        "media_parts": sum(1 for name in names if name.startswith("word/media/")),
        "document_xml_bytes": len(document_xml),
    }
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def safe_name(index: int, title: str, content: bytes, file_url: str) -> str:
    """Build the file name ``NNN-<hash>-<label>.docx`` for a download.

    The label is the stripped *title*, or the last path segment of *file_url*
    when the title is blank.  Runs of characters outside ``A-Za-z0-9._ -``
    become one underscore and leading/trailing spaces, dots and underscores
    are removed.  ``<hash>`` is the first 10 hex digits of the SHA-256 of
    *content*; ``NNN`` is *index* zero-padded to three digits.
    """
    label = title.strip()
    if not label:
        label = Path(urlparse(file_url).path).name
    label = re.sub(r"[^A-Za-z0-9._ -]+", "_", label).strip(" ._")
    if not label.lower().endswith(".docx"):
        label = f"{label}.docx"

    short_hash = hashlib.sha256(content).hexdigest()[:10]
    return "-".join((f"{index:03d}", short_hash, label))
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def download_template(
    session: requests.Session,
    item: dict[str, str],
    out_dir: Path,
    index: int,
    max_mb: int,
) -> dict | None:
    """Download one template, store it in *out_dir* and describe it.

    Args:
        session: session used for the streamed GET request.
        item: entry with ``"file_url"`` and ``"source_url"``; ``"title"`` is
            optional.
        out_dir: directory the file is written to.
        index: running number passed on to safe_name().
        max_mb: size limit in MiB; larger downloads are abandoned.

    Returns:
        A manifest record (file path, title, URLs, size and the statistics
        from validate_docx()), or ``None`` when the request fails, the status
        is 400 or above, the size limit is exceeded, the payload does not
        start with the zip signature, or the stored file fails validation
        (in which case it is deleted again).
    """
    byte_limit = max_mb * 1024 * 1024
    received = bytearray()
    try:
        with session.get(item["file_url"], headers=HEADERS, timeout=60, stream=True) as response:
            if response.status_code >= 400:
                return None
            for piece in response.iter_content(chunk_size=1024 * 256):
                if not piece:
                    continue
                if len(received) + len(piece) > byte_limit:
                    return None
                received += piece
    except requests.RequestException:
        return None

    content = bytes(received)
    if not content.startswith(b"PK"):
        return None

    title = item.get("title", "")
    out_path = out_dir / safe_name(index, title, content, item["file_url"])
    out_path.write_bytes(content)
    try:
        stats = validate_docx(out_path)
    except (BadZipFile, KeyError):
        out_path.unlink(missing_ok=True)
        return None

    record = {
        "file": str(out_path),
        "title": title,
        "source_url": item["source_url"],
        "file_url": item["file_url"],
        "bytes": len(content),
    }
    record.update(stats)
    return record
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def crawl_template_pages(session: requests.Session, max_pages: int) -> list[dict[str, str]]:
    """Crawl template pages breadth-first and collect their download entries.

    Starts at START_URLS and follows links accepted by should_visit() until
    the queue is empty or *max_pages* distinct pages have been visited.

    Returns:
        The entries from extract_templates(), each extended with the
        ``"source_url"`` of the page it was first seen on; a file URL is
        reported only once.
    """
    pending = deque(START_URLS)
    visited: set[str] = set()
    known_files: set[str] = set()
    collected: list[dict[str, str]] = []

    while pending and len(visited) < max_pages:
        page_url = normalize_page(pending.popleft())
        if page_url in visited or not should_visit(page_url):
            continue
        visited.add(page_url)

        html = fetch(session, page_url)
        if not html:
            continue

        for template in extract_templates(html):
            if template["file_url"] not in known_files:
                known_files.add(template["file_url"])
                collected.append({**template, "source_url": page_url})

        pending.extend(
            candidate
            for candidate in map(normalize_page, extract_links(html))
            if candidate not in visited and should_visit(candidate)
        )

        # Short pause between successfully fetched pages.
        time.sleep(0.15)

    return collected
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def main() -> int:
    """Command-line entry point: crawl, download and write the manifest.

    Options: ``--out`` (target directory, required), ``--max-pages``,
    ``--max-mb`` and ``--limit`` (0 keeps every template found).  The
    manifest is written to ``microsoft-word-templates-manifest.json`` in the
    target directory.

    Returns:
        0 when at least one template was kept, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--max-pages", type=int, default=80)
    parser.add_argument("--max-mb", type=int, default=40)
    parser.add_argument("--limit", type=int, default=0, help="0 means no explicit template limit")
    args = parser.parse_args()

    args.out.mkdir(parents=True, exist_ok=True)
    session = requests.Session()

    template_items = crawl_template_pages(session, max_pages=args.max_pages)
    if args.limit > 0:
        template_items = template_items[: args.limit]

    manifest: list[dict] = []
    for index, item in enumerate(template_items, 1):
        downloaded = download_template(session, item, args.out, index, args.max_mb)
        if downloaded:
            manifest.append(downloaded)
            print(f"kept {len(manifest):03d}: {Path(downloaded['file']).name}", flush=True)
            time.sleep(0.1)

    manifest_path = args.out / "microsoft-word-templates-manifest.json"
    payload = {
        "count": len(manifest),
        "items": manifest,
    }
    manifest_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"saved manifest: {manifest_path}", flush=True)
    return 1 if not manifest else 0
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import sys
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterable
|
|
11
|
+
from urllib.parse import parse_qs, quote_plus, unquote, urlparse
|
|
12
|
+
from zipfile import BadZipFile, ZipFile
|
|
13
|
+
|
|
14
|
+
import requests
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Web-search queries passed to search(), one after another.
DEFAULT_QUERIES = [
    'filetype:docx "equation"',
    'filetype:docx "equations"',
    'filetype:docx "Cambria Math"',
    'filetype:docx "math equations"',
    'filetype:docx "Microsoft Equation"',
    'filetype:docx "OMML"',
    'filetype:docx "quadratic equation"',
    'filetype:docx "integral"',
    'filetype:docx "matrix"',
    'filetype:docx "calculus"',
    'filetype:docx "physics" "equation"',
    'filetype:docx "engineering" "equation"',
    'filetype:docx "statistics" "equation"',
    'filetype:docx "algebra" "equation"',
    'filetype:docx "geometry" "equation"',
]

# Repository-search queries sent to the GitHub API by github_docx_urls().
GITHUB_REPO_QUERIES = [
    "docx equation math",
    "docx equations latex",
    "docx omml latex",
    "word equation docx",
    "markdown docx equation",
    "pandoc docx equation",
    "math docx converter",
    "equation-heavy docx",
    "ooxml math docx",
    "Cambria Math docx",
]

# Browser-like User-Agent sent with every request of this tool.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

# Response content types that download() accepts without also requiring
# ".docx" in the final URL.
DOCX_CT_HINTS = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/octet-stream",
    "application/zip",
    "binary/octet-stream",
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def extract_urls(html: str) -> list[str]:
    """Extract every http(s) URL from a search-result page.

    Each URL is percent-decoded and HTML-unescaped for ampersands.  Bing
    click-tracking links (``/ck/a`` with a ``u`` or ``r`` parameter) and
    DuckDuckGo redirect links (``/l/?uddg=``) are replaced by the target they
    carry.  Trailing ``.``, ``,`` and ``;`` are removed.

    Returns:
        The URLs in page order; duplicates are kept.
    """
    urls: list[str] = []
    for url in re.findall(r'https?://[^"\'<>\s)]+', html):
        # URLs inside HTML attributes carry "&" as the entity "&amp;".  The
        # previous replace("&", "&") was a no-op and left the entity in place,
        # which corrupts every query parameter after the first one.
        url = unquote(url).replace("&amp;", "&")
        parsed = urlparse(url)
        if parsed.netloc.endswith("bing.com") and parsed.path == "/ck/a":
            qs = parse_qs(parsed.query)
            for key in ("u", "r"):
                if key in qs:
                    url = unquote(qs[key][0])
                    break
        if "duckduckgo.com/l/?" in url:
            qs = parse_qs(urlparse(url).query)
            if "uddg" in qs:
                url = unquote(qs["uddg"][0])
        urls.append(url.rstrip(".,;"))
    return urls
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def search(query: str, pages: int = 2) -> Iterable[str]:
    """Yield the URLs found on Bing and DuckDuckGo result pages for *query*.

    For each engine, *pages* result pages are requested with offsets 1, 11,
    21, ...  Pages that fail (request error or status 400 and above) are
    skipped; after each page that was processed the generator pauses briefly.
    """
    endpoints = (
        "https://www.bing.com/search?q={query}&first={offset}",
        "https://html.duckduckgo.com/html/?q={query}&s={offset}",
    )
    for endpoint in endpoints:
        for offset in (page * 10 + 1 for page in range(pages)):
            url = endpoint.format(query=quote_plus(query), offset=offset)
            try:
                response = requests.get(url, headers=HEADERS, timeout=20)
                if response.status_code < 400:
                    yield from extract_urls(response.text)
                    time.sleep(0.4)
            except requests.RequestException:
                continue
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def github_json(url: str) -> dict | None:
    """GET *url* from the GitHub API and return the decoded JSON body.

    Returns ``None`` when the request raises a ``requests`` error or the
    status code is 400 or above.
    """
    request_headers = dict(HEADERS, Accept="application/vnd.github+json")
    try:
        response = requests.get(url, headers=request_headers, timeout=25)
        return response.json() if response.status_code < 400 else None
    except requests.RequestException:
        return None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def github_docx_urls(max_repos_per_query: int = 8) -> Iterable[str]:
    """Yield raw-download URLs of .docx files found in GitHub repositories.

    Every query in GITHUB_REPO_QUERIES is sent to the repository search API.
    For each repository not seen before, the recursive tree of its default
    branch ("main" when the field is empty) is listed and one
    raw.githubusercontent.com URL is yielded per blob whose path ends in
    ".docx" (case-insensitive).  Failed API calls are skipped silently.

    Args:
        max_repos_per_query: value sent as ``per_page`` with each search.
    """
    seen_repos: set[str] = set()
    for query in GITHUB_REPO_QUERIES:
        search_url = (
            "https://api.github.com/search/repositories"
            f"?q={quote_plus(query)}&per_page={max_repos_per_query}"
        )
        payload = github_json(search_url)
        if not payload:
            continue
        for repo in payload.get("items", []):
            full_name = repo.get("full_name")
            branch = repo.get("default_branch") or "main"
            # A repository can be returned by several queries; list it once.
            if not full_name or full_name in seen_repos:
                continue
            seen_repos.add(full_name)
            tree_url = f"https://api.github.com/repos/{full_name}/git/trees/{branch}?recursive=1"
            tree = github_json(tree_url)
            if not tree:
                continue
            for item in tree.get("tree", []):
                path = item.get("path", "")
                if item.get("type") == "blob" and path.lower().endswith(".docx"):
                    yield f"https://raw.githubusercontent.com/{full_name}/{branch}/{quote_path(path)}"
            # NOTE(review): the nesting level of this pause could not be
            # recovered from the flattened source text; one pause per listed
            # repository is an assumption -- confirm against the original file.
            time.sleep(0.25)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def quote_path(path: str) -> str:
    """Percent-encode each ``/``-separated segment of *path*.

    The separators themselves are kept and spaces are written as ``%20``
    rather than ``+``.
    """

    def encode_segment(segment: str) -> str:
        return quote_plus(segment).replace("+", "%20")

    return "/".join(map(encode_segment, path.split("/")))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def looks_like_docx_url(url: str) -> bool:
    """Tell whether *url* plausibly points at a downloadable .docx file.

    The check is case-insensitive: the URL must contain ``.docx`` and none of
    the markers ``?format=pdf``, ``/view?`` or ``webcache``.
    """
    lowered = url.lower()
    if ".docx" not in lowered:
        return False
    for marker in ("?format=pdf", "/view?", "webcache"):
        if marker in lowered:
            return False
    return True
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def safe_name(index: int, url: str, content: bytes) -> str:
    """Build the file name ``NNN-<hash>-<name>.docx`` for a download.

    ``<name>`` is the percent-decoded last path segment of *url*, or
    ``online-NNN.docx`` when the path has none.  Runs of characters outside
    ``A-Za-z0-9._ -`` become one underscore and ``.docx`` is appended when
    missing.  ``<hash>`` is the first 10 hex digits of the SHA-256 of
    *content*; ``NNN`` is *index* zero-padded to three digits.
    """
    prefix = f"{index:03d}"
    basename = Path(unquote(urlparse(url).path)).name
    if not basename:
        basename = f"online-{prefix}.docx"
    basename = re.sub(r"[^A-Za-z0-9._ -]+", "_", basename)
    if not basename.lower().endswith(".docx"):
        basename = f"{basename}.docx"
    short_hash = hashlib.sha256(content).hexdigest()[:10]
    return f"{prefix}-{short_hash}-{basename}"
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def validate_docx(path: Path) -> dict:
    """Check that *path* is a Word package and return simple statistics.

    Returns:
        ``omml_nodes`` (number of ``<m:oMath`` / ``<m:oMathPara`` start tags
        in the main document part), ``media_parts`` (entries below
        ``word/media/``) and ``document_xml_bytes`` (size of the main part).

    Raises:
        BadZipFile: when the archive lacks ``[Content_Types].xml`` or
            ``word/document.xml`` (or is not a zip file at all).
    """
    required_parts = {"[Content_Types].xml", "word/document.xml"}
    with ZipFile(path) as archive:
        names = set(archive.namelist())
        if not required_parts <= names:
            raise BadZipFile("not a Word DOCX package")
        document_xml = archive.read("word/document.xml")

    equation_tags = re.findall(rb"<m:oMath\b|<m:oMathPara\b", document_xml)
    return {
        "omml_nodes": len(equation_tags),
        "media_parts": sum(1 for name in names if name.startswith("word/media/")),
        "document_xml_bytes": len(document_xml),
    }
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def download(url: str, out_dir: Path, index: int, max_mb: int) -> dict | None:
    """Download one candidate document, store it in *out_dir* and describe it.

    Redirects are followed.  A response is rejected when its content type is
    neither empty, nor listed in DOCX_CT_HINTS, nor contains
    "wordprocessingml", unless the final URL contains ".docx".

    Args:
        url: address to fetch.
        out_dir: directory the file is written to.
        index: running number passed on to safe_name().
        max_mb: size limit in MiB; larger downloads are abandoned.

    Returns:
        A manifest record (file path, final URL, size, content type and the
        statistics from validate_docx()), or ``None`` when the request fails,
        the status is 400 or above, the content type is rejected, the size
        limit is exceeded, the payload does not start with the zip signature,
        or the stored file fails validation (in which case it is deleted).
    """
    byte_limit = max_mb * 1024 * 1024
    received = bytearray()
    try:
        with requests.get(url, headers=HEADERS, timeout=30, stream=True, allow_redirects=True) as response:
            if response.status_code >= 400:
                return None

            content_type = response.headers.get("content-type", "").split(";")[0].strip().lower()
            type_is_plausible = (
                not content_type
                or content_type in DOCX_CT_HINTS
                or "wordprocessingml" in content_type
            )
            if not type_is_plausible and ".docx" not in response.url.lower():
                return None

            for piece in response.iter_content(chunk_size=1024 * 256):
                if not piece:
                    continue
                if len(received) + len(piece) > byte_limit:
                    return None
                received += piece
    except requests.RequestException:
        return None

    content = bytes(received)
    if not content.startswith(b"PK"):
        return None

    final_url = response.url
    out_path = out_dir / safe_name(index, final_url, content)
    out_path.write_bytes(content)
    try:
        stats = validate_docx(out_path)
    except (BadZipFile, KeyError):
        out_path.unlink(missing_ok=True)
        return None

    record = {
        "file": str(out_path),
        "source_url": final_url,
        "bytes": len(content),
        "content_type": content_type,
    }
    record.update(stats)
    return record
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def main() -> int:
    """Command-line entry point: collect .docx files until the target is met.

    Candidates come first from github_docx_urls() and then from search() for
    each query in DEFAULT_QUERIES.  Kept files are listed in
    ``online-docx-manifest.json`` inside the ``--out`` directory.

    Returns:
        0 when at least ``--target`` files were kept in this run, otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--target", type=int, default=50)
    parser.add_argument("--max-mb", type=int, default=40)
    parser.add_argument("--prefer-equations", action="store_true")
    args = parser.parse_args()

    args.out.mkdir(parents=True, exist_ok=True)
    seen: set[str] = set()
    manifest: list[dict] = []
    # Numbering continues after the .docx files already present in --out.
    index = len(list(args.out.glob("*.docx"))) + 1

    print("source: github repositories", flush=True)
    for url in github_docx_urls():
        if len(manifest) >= args.target:
            break
        # Drop the fragment so the same file is not fetched twice.
        normalized = url.split("#", 1)[0]
        if normalized in seen:
            continue
        seen.add(normalized)
        item = download(normalized, args.out, index, args.max_mb)
        if not item:
            continue
        manifest.append(item)
        index += 1
        print(f"kept {len(manifest):02d}: {Path(item['file']).name} omml={item['omml_nodes']}", flush=True)

    for query in DEFAULT_QUERIES:
        print(f"search: {query}", flush=True)
        for url in search(query):
            if len(manifest) >= args.target:
                break
            if not looks_like_docx_url(url):
                continue
            normalized = url.split("#", 1)[0]
            if normalized in seen:
                continue
            seen.add(normalized)
            item = download(normalized, args.out, index, args.max_mb)
            if not item:
                continue
            # With --prefer-equations, files without equations are deleted
            # again while fewer than half the target has been kept.  This
            # filter applies to search results only, not to the GitHub phase.
            if args.prefer_equations and item["omml_nodes"] == 0 and len(manifest) < args.target // 2:
                Path(item["file"]).unlink(missing_ok=True)
                continue
            manifest.append(item)
            index += 1
            print(f"kept {len(manifest):02d}: {Path(item['file']).name} omml={item['omml_nodes']}", flush=True)
        # Stop issuing further queries once the target has been reached.
        if len(manifest) >= args.target:
            break

    manifest_path = args.out / "online-docx-manifest.json"
    manifest_path.write_text(json.dumps({"count": len(manifest), "items": manifest}, indent=2), encoding="utf-8")
    print(f"saved manifest: {manifest_path}", flush=True)
    return 0 if len(manifest) >= args.target else 1
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|