regen.mde 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +16 -16
- package/bin/build-corpus-editor.js +83 -83
- package/bin/build-corpus.js +41 -41
- package/bin/regen-mdeditor-install.js +27 -27
- package/bin/regen-mdeditor-uninstall.js +19 -19
- package/bin/validate-katex.js +93 -93
- package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
- package/desktop/BuildCorpusEditor/app.manifest +16 -16
- package/dist/release/{regen-mde-0.7.0-win-x64.zip → regen-mde-0.8.0-win-x64.zip} +0 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +1 -1
- package/dist/windows-editor/wwwroot/index.html +20 -20
- package/editor-web/index.html +21 -21
- package/editor-web/vite.config.js +13 -13
- package/examples/build-corpus.config.example.json +21 -21
- package/installer/install-regen-mde.ps1 +214 -214
- package/installer/regen-mde.nsi +81 -81
- package/package.json +90 -90
- package/pyproject.toml +34 -35
- package/requirements.txt +0 -1
- package/scripts/build-windows-editor.ps1 +47 -47
- package/scripts/package-windows-editor.ps1 +90 -90
- package/scripts/run-corpus.ps1 +28 -28
- package/scripts/run-editor-implementation-plane.ps1 +226 -226
- package/scripts/run-required-tests.ps1 +98 -98
- package/scripts/run-smoke.ps1 +28 -28
- package/src/build_corpus/__init__.py +1 -1
- package/src/build_corpus/equations.py +1345 -80
- package/src/build_corpus/templates/__init__.py +1 -1
- package/src/build_corpus/validate_assets.py +46 -46
- package/tools/audit_corpus.py +203 -203
- package/tools/collect_microsoft_word_templates.py +228 -228
- package/tools/collect_online_docx_corpus.py +272 -272
- package/tools/collect_online_pptx_corpus.py +252 -252
- package/tools/compare_pptx_inputs_outputs.py +87 -87
- package/tools/roundtrip_docx_corpus.py +171 -171
- package/dist/release/regen-mde-0.3.0-win-x64-setup.exe +0 -0
- package/dist/release/regen-mde-0.3.0-win-x64.zip +0 -0
- package/dist/release/regen-mde-0.7.0-win-x64-setup.exe +0 -0
|
@@ -1,228 +1,228 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import argparse
|
|
4
|
-
import hashlib
|
|
5
|
-
import json
|
|
6
|
-
import re
|
|
7
|
-
import time
|
|
8
|
-
from collections import deque
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
from typing import Iterable
|
|
11
|
-
from urllib.parse import urljoin, urlparse
|
|
12
|
-
from zipfile import BadZipFile, ZipFile
|
|
13
|
-
|
|
14
|
-
import requests
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
# Origin against which the site-relative links captured by PAGE_LINK_RE are
# resolved.
BASE_URL = "https://word.cloud.microsoft"
# Seed pages from which the crawl starts.
START_URLS = [
    "https://word.cloud.microsoft/create/en/templates/",
    "https://word.cloud.microsoft/create/en/resume-templates/",
]

# Headers sent with every HTTP request; the User-Agent is a desktop Chrome
# string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

# Site-relative "/create/en/.../" paths that follow either a bare double quote
# or an href=" attribute opener; the captured path must end with a slash.
PAGE_LINK_RE = re.compile(r'(?:"|href=")(/create/en/[^"#? ]+/)')
# "fileUrl" values that point at a .docx on cdn.create.microsoft.com.
FILE_URL_RE = re.compile(r'"fileUrl":"(https://cdn\.create\.microsoft\.com/[^"]+\.docx)"')
# The same "fileUrl" value when it is immediately followed by a "title" field;
# exposes the named groups "url" and "title".
TITLE_RE = re.compile(
    r'"fileUrl":"(?P<url>https://cdn\.create\.microsoft\.com/[^"]+\.docx)","title":"(?P<title>[^"]+)"'
)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def normalize_page(url: str) -> str:
    """Return *url* reduced to its scheme, network location and path.

    Query string, fragment and path parameters are dropped so that variants
    of the same page compare equal.
    """
    parts = urlparse(url)
    return "".join((parts.scheme, "://", parts.netloc, parts.path))
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def should_visit(url: str) -> bool:
    """Decide whether *url* is a page the crawler should fetch.

    A URL qualifies only when its host is exactly ``word.cloud.microsoft``,
    its path starts with ``/create/en/`` (case-sensitive) and the lower-cased
    path contains none of the excluded section markers.
    """
    excluded = (
        "/blog/",
        "/copilot-",
        "/document-editor/",
        "/grammar-checker/",
        "/ai-",
        "/new/",
    )
    parts = urlparse(url)
    on_site = parts.netloc == "word.cloud.microsoft" and parts.path.startswith("/create/en/")
    if not on_site:
        return False
    lowered = parts.path.lower()
    return not any(marker in lowered for marker in excluded)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def fetch(session: requests.Session, url: str) -> str | None:
    """Download *url* and return the response body as text.

    Returns ``None`` when the request raises ``requests.RequestException`` or
    the server answers with a status code of 400 or above.
    """
    try:
        reply = session.get(url, headers=HEADERS, timeout=30)
        return reply.text if reply.status_code < 400 else None
    except requests.RequestException:
        return None
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def extract_links(html: str) -> Iterable[str]:
    """Yield an absolute URL for every site-relative page link in *html*.

    Links are the paths captured by ``PAGE_LINK_RE``, resolved against
    ``BASE_URL``. Duplicates are not filtered.
    """
    relative_paths = (hit.group(1) for hit in PAGE_LINK_RE.finditer(html))
    for relative in relative_paths:
        yield urljoin(BASE_URL, relative)
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def extract_templates(html: str) -> list[dict[str, str]]:
    """Collect the DOCX templates referenced in *html*.

    Returns one ``{"file_url": ..., "title": ...}`` dict per ``FILE_URL_RE``
    match, in document order. ``title`` is ``""`` when ``TITLE_RE`` found no
    title for that URL. Duplicates are not removed.
    """

    def unescape(raw_url: str) -> str:
        # URLs are matched inside JSON text, where "&" may be written as the
        # literal escape sequence \u0026.
        return raw_url.replace("\\u0026", "&")

    # Key the title table by the unescaped URL: the lookup below uses the
    # unescaped form, so a raw key would never match a URL that contained an
    # escaped ampersand and its title would be lost.
    titles_by_url = {
        unescape(match.group("url")): match.group("title")
        for match in TITLE_RE.finditer(html)
    }
    found = []
    for match in FILE_URL_RE.finditer(html):
        url = unescape(match.group(1))
        found.append({
            "file_url": url,
            "title": titles_by_url.get(url, ""),
        })
    return found
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def validate_docx(path: Path) -> dict:
    """Check that *path* is a Word DOCX package and return basic statistics.

    Returns a dict with ``omml_nodes`` (number of ``<m:oMath`` and
    ``<m:oMathPara`` opening tags in ``word/document.xml``), ``media_parts``
    (number of archive members under ``word/media/``) and
    ``document_xml_bytes`` (size of ``word/document.xml`` in bytes).

    Raises:
        BadZipFile: if the archive lacks ``[Content_Types].xml`` or
            ``word/document.xml``; ``ZipFile`` raises the same exception
            for a file that is not a ZIP archive.
    """
    required_parts = ("[Content_Types].xml", "word/document.xml")
    with ZipFile(path) as archive:
        members = set(archive.namelist())
        if not all(part in members for part in required_parts):
            raise BadZipFile("not a Word DOCX package")
        body = archive.read("word/document.xml")
    math_tags = re.findall(rb"<m:oMath\b|<m:oMathPara\b", body)
    media_count = sum(1 for member in members if member.startswith("word/media/"))
    return {
        "omml_nodes": len(math_tags),
        "media_parts": media_count,
        "document_xml_bytes": len(body),
    }
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def safe_name(index: int, title: str, content: bytes, file_url: str) -> str:
    """Build the output filename ``NNN-<digest>-<name>`` for a template.

    ``NNN`` is *index* zero-padded to three digits and ``<digest>`` is the
    first ten hex characters of the SHA-256 of *content*. ``<name>`` is the
    stripped *title* (or, if that is empty, the last path component of
    *file_url*) with every run of characters outside ``A-Za-z0-9._ -``
    replaced by ``_`` and surrounding spaces, dots and underscores removed;
    ``.docx`` is appended unless the name already ends with it
    (case-insensitive).
    """
    short_hash = hashlib.sha256(content).hexdigest()[:10]
    label = title.strip()
    if not label:
        label = Path(urlparse(file_url).path).name
    label = re.sub(r"[^A-Za-z0-9._ -]+", "_", label).strip(" ._")
    suffix = "" if label.lower().endswith(".docx") else ".docx"
    return f"{index:03d}-{short_hash}-{label}{suffix}"
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def download_template(
    session: requests.Session,
    item: dict[str, str],
    out_dir: Path,
    index: int,
    max_mb: int,
) -> dict | None:
    """Download one template into *out_dir* and return its manifest entry.

    *item* must provide ``file_url`` and ``source_url``; ``title`` is
    optional. The body is streamed and abandoned as soon as it exceeds
    *max_mb* mebibytes.

    Returns ``None`` when the request fails, the status code is 400 or above,
    the size limit is exceeded, the payload does not start with ``PK``, or
    ``validate_docx`` rejects the saved file (which is then deleted).
    Otherwise returns a dict with ``file``, ``title``, ``source_url``,
    ``file_url``, ``bytes`` and the statistics from ``validate_docx``.
    """
    file_url = item["file_url"]
    title = item.get("title", "")
    try:
        with session.get(file_url, headers=HEADERS, timeout=60, stream=True) as reply:
            if reply.status_code >= 400:
                return None
            byte_limit = max_mb * 1024 * 1024
            received = 0
            pieces: list[bytes] = []
            # Empty chunks are skipped before any byte counting.
            for piece in filter(None, reply.iter_content(chunk_size=1024 * 256)):
                received += len(piece)
                if received > byte_limit:
                    return None
                pieces.append(piece)
            payload = b"".join(pieces)
    except requests.RequestException:
        return None

    # Cheap pre-check: ZIP archives (and so DOCX packages) begin with "PK".
    if not payload.startswith(b"PK"):
        return None

    target = out_dir / safe_name(index, title, payload, file_url)
    target.write_bytes(payload)
    try:
        stats = validate_docx(target)
    except (BadZipFile, KeyError):
        # Do not leave a rejected download behind.
        target.unlink(missing_ok=True)
        return None

    return {
        "file": str(target),
        "title": title,
        "source_url": item["source_url"],
        "file_url": file_url,
        "bytes": len(payload),
        **stats,
    }
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
def crawl_template_pages(session: requests.Session, max_pages: int) -> list[dict[str, str]]:
    """Crawl breadth-first from ``START_URLS`` and collect template entries.

    At most *max_pages* pages are marked as visited; a page counts towards
    that budget even when fetching it fails. Each returned dict is an
    ``extract_templates`` entry plus ``source_url``, the page on which that
    file URL was first seen.
    """
    pending = deque(START_URLS)
    visited: set[str] = set()
    known_files: set[str] = set()
    collected: list[dict[str, str]] = []

    while pending and len(visited) < max_pages:
        page = normalize_page(pending.popleft())
        if page in visited or not should_visit(page):
            continue
        visited.add(page)

        html = fetch(session, page)
        if not html:
            continue

        for entry in extract_templates(html):
            if entry["file_url"] in known_files:
                continue
            known_files.add(entry["file_url"])
            collected.append({**entry, "source_url": page})

        # Queue every not-yet-visited, crawlable link found on this page.
        candidates = map(normalize_page, extract_links(html))
        pending.extend(
            candidate
            for candidate in candidates
            if candidate not in visited and should_visit(candidate)
        )

        # Brief pause after each successfully fetched page.
        time.sleep(0.15)

    return collected
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
def main() -> int:
    """Crawl the template site, download the DOCX files and write a manifest.

    Command-line options:
        --out        output directory (required; created if missing)
        --max-pages  page budget passed to the crawler (default 80)
        --max-mb     per-file size limit in MiB (default 40)
        --limit      maximum number of templates to download; 0 = no limit

    Returns 0 when at least one template was kept, otherwise 1.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", type=Path, required=True)
    cli.add_argument("--max-pages", type=int, default=80)
    cli.add_argument("--max-mb", type=int, default=40)
    cli.add_argument("--limit", type=int, default=0, help="0 means no explicit template limit")
    args = cli.parse_args()

    args.out.mkdir(parents=True, exist_ok=True)
    session = requests.Session()

    candidates = crawl_template_pages(session, max_pages=args.max_pages)
    if args.limit > 0:
        candidates = candidates[: args.limit]

    manifest: list[dict] = []
    for position, candidate in enumerate(candidates, 1):
        entry = download_template(session, candidate, args.out, position, args.max_mb)
        if entry:
            manifest.append(entry)
            print(f"kept {len(manifest):03d}: {Path(entry['file']).name}", flush=True)
            # Pause only after a kept download.
            time.sleep(0.1)

    manifest_path = args.out / "microsoft-word-templates-manifest.json"
    summary = {"count": len(manifest), "items": manifest}
    manifest_path.write_text(
        json.dumps(summary, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"saved manifest: {manifest_path}", flush=True)
    return 0 if manifest else 1


# Run as a script: exit with main()'s return value as the process status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
from collections import deque
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterable
|
|
11
|
+
from urllib.parse import urljoin, urlparse
|
|
12
|
+
from zipfile import BadZipFile, ZipFile
|
|
13
|
+
|
|
14
|
+
import requests
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Origin against which the site-relative links captured by PAGE_LINK_RE are
# resolved.
BASE_URL = "https://word.cloud.microsoft"
# Seed pages from which the crawl starts.
START_URLS = [
    "https://word.cloud.microsoft/create/en/templates/",
    "https://word.cloud.microsoft/create/en/resume-templates/",
]

# Headers sent with every HTTP request; the User-Agent is a desktop Chrome
# string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

# Site-relative "/create/en/.../" paths that follow either a bare double quote
# or an href=" attribute opener; the captured path must end with a slash.
PAGE_LINK_RE = re.compile(r'(?:"|href=")(/create/en/[^"#? ]+/)')
# "fileUrl" values that point at a .docx on cdn.create.microsoft.com.
FILE_URL_RE = re.compile(r'"fileUrl":"(https://cdn\.create\.microsoft\.com/[^"]+\.docx)"')
# The same "fileUrl" value when it is immediately followed by a "title" field;
# exposes the named groups "url" and "title".
TITLE_RE = re.compile(
    r'"fileUrl":"(?P<url>https://cdn\.create\.microsoft\.com/[^"]+\.docx)","title":"(?P<title>[^"]+)"'
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def normalize_page(url: str) -> str:
    """Return *url* reduced to its scheme, network location and path.

    Query string, fragment and path parameters are dropped so that variants
    of the same page compare equal.
    """
    parts = urlparse(url)
    return "".join((parts.scheme, "://", parts.netloc, parts.path))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def should_visit(url: str) -> bool:
    """Decide whether *url* is a page the crawler should fetch.

    A URL qualifies only when its host is exactly ``word.cloud.microsoft``,
    its path starts with ``/create/en/`` (case-sensitive) and the lower-cased
    path contains none of the excluded section markers.
    """
    excluded = (
        "/blog/",
        "/copilot-",
        "/document-editor/",
        "/grammar-checker/",
        "/ai-",
        "/new/",
    )
    parts = urlparse(url)
    on_site = parts.netloc == "word.cloud.microsoft" and parts.path.startswith("/create/en/")
    if not on_site:
        return False
    lowered = parts.path.lower()
    return not any(marker in lowered for marker in excluded)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def fetch(session: requests.Session, url: str) -> str | None:
    """Download *url* and return the response body as text.

    Returns ``None`` when the request raises ``requests.RequestException`` or
    the server answers with a status code of 400 or above.
    """
    try:
        reply = session.get(url, headers=HEADERS, timeout=30)
        return reply.text if reply.status_code < 400 else None
    except requests.RequestException:
        return None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def extract_links(html: str) -> Iterable[str]:
    """Yield an absolute URL for every site-relative page link in *html*.

    Links are the paths captured by ``PAGE_LINK_RE``, resolved against
    ``BASE_URL``. Duplicates are not filtered.
    """
    relative_paths = (hit.group(1) for hit in PAGE_LINK_RE.finditer(html))
    for relative in relative_paths:
        yield urljoin(BASE_URL, relative)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def extract_templates(html: str) -> list[dict[str, str]]:
    """Collect the DOCX templates referenced in *html*.

    Returns one ``{"file_url": ..., "title": ...}`` dict per ``FILE_URL_RE``
    match, in document order. ``title`` is ``""`` when ``TITLE_RE`` found no
    title for that URL. Duplicates are not removed.
    """

    def unescape(raw_url: str) -> str:
        # URLs are matched inside JSON text, where "&" may be written as the
        # literal escape sequence \u0026.
        return raw_url.replace("\\u0026", "&")

    # Key the title table by the unescaped URL: the lookup below uses the
    # unescaped form, so a raw key would never match a URL that contained an
    # escaped ampersand and its title would be lost.
    titles_by_url = {
        unescape(match.group("url")): match.group("title")
        for match in TITLE_RE.finditer(html)
    }
    found = []
    for match in FILE_URL_RE.finditer(html):
        url = unescape(match.group(1))
        found.append({
            "file_url": url,
            "title": titles_by_url.get(url, ""),
        })
    return found
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def validate_docx(path: Path) -> dict:
    """Check that *path* is a Word DOCX package and return basic statistics.

    Returns a dict with ``omml_nodes`` (number of ``<m:oMath`` and
    ``<m:oMathPara`` opening tags in ``word/document.xml``), ``media_parts``
    (number of archive members under ``word/media/``) and
    ``document_xml_bytes`` (size of ``word/document.xml`` in bytes).

    Raises:
        BadZipFile: if the archive lacks ``[Content_Types].xml`` or
            ``word/document.xml``; ``ZipFile`` raises the same exception
            for a file that is not a ZIP archive.
    """
    required_parts = ("[Content_Types].xml", "word/document.xml")
    with ZipFile(path) as archive:
        members = set(archive.namelist())
        if not all(part in members for part in required_parts):
            raise BadZipFile("not a Word DOCX package")
        body = archive.read("word/document.xml")
    math_tags = re.findall(rb"<m:oMath\b|<m:oMathPara\b", body)
    media_count = sum(1 for member in members if member.startswith("word/media/"))
    return {
        "omml_nodes": len(math_tags),
        "media_parts": media_count,
        "document_xml_bytes": len(body),
    }
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def safe_name(index: int, title: str, content: bytes, file_url: str) -> str:
    """Build the output filename ``NNN-<digest>-<name>`` for a template.

    ``NNN`` is *index* zero-padded to three digits and ``<digest>`` is the
    first ten hex characters of the SHA-256 of *content*. ``<name>`` is the
    stripped *title* (or, if that is empty, the last path component of
    *file_url*) with every run of characters outside ``A-Za-z0-9._ -``
    replaced by ``_`` and surrounding spaces, dots and underscores removed;
    ``.docx`` is appended unless the name already ends with it
    (case-insensitive).
    """
    short_hash = hashlib.sha256(content).hexdigest()[:10]
    label = title.strip()
    if not label:
        label = Path(urlparse(file_url).path).name
    label = re.sub(r"[^A-Za-z0-9._ -]+", "_", label).strip(" ._")
    suffix = "" if label.lower().endswith(".docx") else ".docx"
    return f"{index:03d}-{short_hash}-{label}{suffix}"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def download_template(
    session: requests.Session,
    item: dict[str, str],
    out_dir: Path,
    index: int,
    max_mb: int,
) -> dict | None:
    """Download one template into *out_dir* and return its manifest entry.

    *item* must provide ``file_url`` and ``source_url``; ``title`` is
    optional. The body is streamed and abandoned as soon as it exceeds
    *max_mb* mebibytes.

    Returns ``None`` when the request fails, the status code is 400 or above,
    the size limit is exceeded, the payload does not start with ``PK``, or
    ``validate_docx`` rejects the saved file (which is then deleted).
    Otherwise returns a dict with ``file``, ``title``, ``source_url``,
    ``file_url``, ``bytes`` and the statistics from ``validate_docx``.
    """
    file_url = item["file_url"]
    title = item.get("title", "")
    try:
        with session.get(file_url, headers=HEADERS, timeout=60, stream=True) as reply:
            if reply.status_code >= 400:
                return None
            byte_limit = max_mb * 1024 * 1024
            received = 0
            pieces: list[bytes] = []
            # Empty chunks are skipped before any byte counting.
            for piece in filter(None, reply.iter_content(chunk_size=1024 * 256)):
                received += len(piece)
                if received > byte_limit:
                    return None
                pieces.append(piece)
            payload = b"".join(pieces)
    except requests.RequestException:
        return None

    # Cheap pre-check: ZIP archives (and so DOCX packages) begin with "PK".
    if not payload.startswith(b"PK"):
        return None

    target = out_dir / safe_name(index, title, payload, file_url)
    target.write_bytes(payload)
    try:
        stats = validate_docx(target)
    except (BadZipFile, KeyError):
        # Do not leave a rejected download behind.
        target.unlink(missing_ok=True)
        return None

    return {
        "file": str(target),
        "title": title,
        "source_url": item["source_url"],
        "file_url": file_url,
        "bytes": len(payload),
        **stats,
    }
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def crawl_template_pages(session: requests.Session, max_pages: int) -> list[dict[str, str]]:
    """Crawl breadth-first from ``START_URLS`` and collect template entries.

    At most *max_pages* pages are marked as visited; a page counts towards
    that budget even when fetching it fails. Each returned dict is an
    ``extract_templates`` entry plus ``source_url``, the page on which that
    file URL was first seen.
    """
    pending = deque(START_URLS)
    visited: set[str] = set()
    known_files: set[str] = set()
    collected: list[dict[str, str]] = []

    while pending and len(visited) < max_pages:
        page = normalize_page(pending.popleft())
        if page in visited or not should_visit(page):
            continue
        visited.add(page)

        html = fetch(session, page)
        if not html:
            continue

        for entry in extract_templates(html):
            if entry["file_url"] in known_files:
                continue
            known_files.add(entry["file_url"])
            collected.append({**entry, "source_url": page})

        # Queue every not-yet-visited, crawlable link found on this page.
        candidates = map(normalize_page, extract_links(html))
        pending.extend(
            candidate
            for candidate in candidates
            if candidate not in visited and should_visit(candidate)
        )

        # Brief pause after each successfully fetched page.
        time.sleep(0.15)

    return collected
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def main() -> int:
    """Crawl the template site, download the DOCX files and write a manifest.

    Command-line options:
        --out        output directory (required; created if missing)
        --max-pages  page budget passed to the crawler (default 80)
        --max-mb     per-file size limit in MiB (default 40)
        --limit      maximum number of templates to download; 0 = no limit

    Returns 0 when at least one template was kept, otherwise 1.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", type=Path, required=True)
    cli.add_argument("--max-pages", type=int, default=80)
    cli.add_argument("--max-mb", type=int, default=40)
    cli.add_argument("--limit", type=int, default=0, help="0 means no explicit template limit")
    args = cli.parse_args()

    args.out.mkdir(parents=True, exist_ok=True)
    session = requests.Session()

    candidates = crawl_template_pages(session, max_pages=args.max_pages)
    if args.limit > 0:
        candidates = candidates[: args.limit]

    manifest: list[dict] = []
    for position, candidate in enumerate(candidates, 1):
        entry = download_template(session, candidate, args.out, position, args.max_mb)
        if entry:
            manifest.append(entry)
            print(f"kept {len(manifest):03d}: {Path(entry['file']).name}", flush=True)
            # Pause only after a kept download.
            time.sleep(0.1)

    manifest_path = args.out / "microsoft-word-templates-manifest.json"
    summary = {"count": len(manifest), "items": manifest}
    manifest_path.write_text(
        json.dumps(summary, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"saved manifest: {manifest_path}", flush=True)
    return 0 if manifest else 1


# Run as a script: exit with main()'s return value as the process status.
if __name__ == "__main__":
    raise SystemExit(main())
|