jekyll-theme-zer0 1.8.2 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/README.md +98 -7
- data/_data/content_statistics.yml +253 -251
- data/_includes/components/nav-export.html +61 -0
- data/_includes/components/nav-overview.html +54 -0
- data/scripts/bin/install +52 -705
- data/scripts/install/README.md +162 -0
- data/scripts/install/ai/client.sh +164 -0
- data/scripts/install/ai/diagnose.sh +81 -0
- data/scripts/install/ai/prompts/diagnose.system.md +42 -0
- data/scripts/install/ai/prompts/spec.schema.json +129 -0
- data/scripts/install/ai/prompts/suggest.system.md +43 -0
- data/scripts/install/ai/prompts/wizard.system.md +142 -0
- data/scripts/install/ai/suggest.sh +57 -0
- data/scripts/install/ai/wizard.sh +150 -0
- data/scripts/install/apply.sh +156 -0
- data/scripts/install/cli.sh +561 -0
- data/scripts/install/diff.sh +128 -0
- data/scripts/install/doctor.sh +168 -0
- data/scripts/install/fs.sh +138 -0
- data/scripts/install/log.sh +119 -0
- data/scripts/install/plan.sh +299 -0
- data/scripts/install/platform.sh +122 -0
- data/scripts/install/prompt.sh +124 -0
- data/scripts/install/repair.sh +45 -0
- data/scripts/install/scrape.sh +535 -0
- data/scripts/install/scrape_html.py +764 -0
- data/scripts/install/spec.sh +486 -0
- data/scripts/install/tasks/_registry.sh +65 -0
- data/scripts/install/tasks/agents.sh +60 -0
- data/scripts/install/tasks/config.sh +37 -0
- data/scripts/install/tasks/data.sh +18 -0
- data/scripts/install/tasks/deploy_azure-swa.sh +17 -0
- data/scripts/install/tasks/deploy_docker-prod.sh +21 -0
- data/scripts/install/tasks/deploy_github-pages.sh +18 -0
- data/scripts/install/tasks/devcontainer.sh +26 -0
- data/scripts/install/tasks/docker.sh +29 -0
- data/scripts/install/tasks/gemfile.sh +42 -0
- data/scripts/install/tasks/gitignore.sh +26 -0
- data/scripts/install/tasks/marker.sh +46 -0
- data/scripts/install/tasks/nav.sh +18 -0
- data/scripts/install/tasks/pages.sh +61 -0
- data/scripts/install/tasks/readme.sh +27 -0
- data/scripts/install/tasks/scrape.sh +348 -0
- data/scripts/install/template.sh +138 -0
- data/scripts/install/tui.sh +110 -0
- data/scripts/install/upgrade.sh +49 -0
- data/scripts/lib/install/template.sh +1 -0
- metadata +45 -2
|
@@ -0,0 +1,535 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# =============================================================================
|
|
3
|
+
# scripts/install/scrape.sh — BFS site scraper for the installer
|
|
4
|
+
# =============================================================================
|
|
5
|
+
# Crawls a public website with curl, runs each page through scrape_html.py,
|
|
6
|
+
# and writes a structured corpus under ${OUT_DIR}:
|
|
7
|
+
#
|
|
8
|
+
# ${OUT_DIR}/
|
|
9
|
+
# site.json — site-level summary (title, nav, page index)
|
|
10
|
+
# raw/<slug>.html — raw HTML as fetched
|
|
11
|
+
# pages/<slug>.json — per-page extraction result
|
|
12
|
+
# jekyll/<slug>.md — Jekyll-ready Markdown with frontmatter
|
|
13
|
+
#
|
|
14
|
+
# The Jekyll markdown is the artifact consumed by tasks/scrape.sh.
|
|
15
|
+
#
|
|
16
|
+
# Public API:
|
|
17
|
+
#
|
|
18
|
+
# scrape_run URL OUT_DIR [DEPTH] [MAX_PAGES]
|
|
19
|
+
# Crawl starting at URL. Default DEPTH=2, MAX_PAGES=25.
|
|
20
|
+
#
|
|
21
|
+
# scrape_url_to_slug URL
|
|
22
|
+
# Stable filesystem-safe identifier for a URL.
|
|
23
|
+
#
|
|
24
|
+
# Honors:
|
|
25
|
+
# SCRAPE_USER_AGENT (default: zer0-mistakes-scraper/1.0 …)
|
|
26
|
+
# SCRAPE_TIMEOUT per-request curl timeout in seconds (default: 15)
|
|
27
|
+
# SCRAPE_RATE_DELAY seconds to sleep between requests (default: 0)
|
|
28
|
+
# SCRAPE_ALLOW_SUBDOMAINS 1 to allow same-suffix subdomains (default: 0)
|
|
29
|
+
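# SCRAPE_DOWNLOAD_ASSETS 1 to download referenced images locally; any other value skips it (default: 1)
# _FS_DRY_RUN inherited from fs.sh — when 1, do not curl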
|
|
30
|
+
#
|
|
31
|
+
# Bash 3.2 compatible. No set -euo pipefail here.
|
|
32
|
+
# =============================================================================
|
|
33
|
+
# Include guard: sourcing this library a second time is a no-op.
if [[ -n "${_HAS_SCRAPE_LIB:-}" ]]; then
    return 0
fi
_HAS_SCRAPE_LIB=1

# Directory containing this library. A caller may pre-set _SCRAPE_DIR; when it
# is unset or empty, resolve it from the location of this file.
if [[ -z "${_SCRAPE_DIR:-}" ]]; then
    _SCRAPE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" 2>/dev/null && pwd)"
fi

# The Python HTML helper is looked up next to this library.
_SCRAPE_HTML_PY="${_SCRAPE_DIR}/scrape_html.py"
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# scrape_check_deps — verify python3 + curl
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
scrape_check_deps() {
    # Verify the external commands the scraper shells out to, in the order
    # python3 then curl. The first missing one is reported and aborts with 1.
    local tool
    for tool in python3 curl; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            log_error "scrape: $tool is required (not found in PATH)"
            return 1
        fi
    done

    # The HTML helper script must exist as a regular file.
    [[ -f "$_SCRAPE_HTML_PY" ]] && return 0

    log_error "scrape: helper not found: $_SCRAPE_HTML_PY"
    return 1
}
|
|
57
|
+
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
# scrape_url_to_slug URL — print a deterministic slug for the given URL
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
scrape_url_to_slug() {
    # Print a deterministic, filesystem-safe slug for URL (no trailing newline).
    #
    #   $1  URL to convert; only its path and query string are used.
    #
    # Path segments are joined with "--", so "/blog/my-post.html" becomes
    # "blog--my-post". An empty path yields "index". Output is capped at 100
    # characters.
    local url="$1"
    printf '%s' "$url" | python3 -c '
import re
import sys
from urllib.parse import urlparse

u = urlparse(sys.stdin.read().strip())
p = (u.path or "/").strip("/")
# Strip extensions that round-trip badly as Jekyll page slugs.
p = re.sub(r"\.(html?|php|aspx?|jsp)$", "", p, flags=re.I)
if u.query:
    p = (p + "/" + u.query) if p else u.query

# Clean every segment separately and only then join with "--". Collapsing
# dash runs after the join would fold the separator back into a single "-",
# making "a/b" and "a-b" collide and losing the path structure.
segments = []
for seg in p.split("/"):
    seg = re.sub(r"[^A-Za-z0-9._-]+", "-", seg)
    seg = re.sub(r"-+", "-", seg).strip("-")
    # Dot segments carry no content and would produce odd permalinks.
    if seg and seg not in (".", ".."):
        segments.append(seg)

# Truncation may cut through a separator, so trim any dangling dashes.
slug = "--".join(segments)[:100].strip("-")
sys.stdout.write(slug or "index")
'
}
|
|
82
|
+
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
# scrape_normalize_url URL [BASE_URL]
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
scrape_normalize_url() {
    # Resolve URL against BASE_URL (defaults to URL itself) and print the
    # canonical form: fragment removed, default port removed, empty path
    # replaced by "/". Prints nothing for schemes other than http/https.
    local target="$1"
    local reference="${2:-$1}"
    URL="$target" BASE="$reference" python3 -c '
import os
import sys
from urllib.parse import urldefrag, urljoin, urlparse

absolute = urldefrag(urljoin(os.environ["BASE"], os.environ["URL"]))[0]
parts = urlparse(absolute)
if parts.scheme not in ("http", "https"):
    sys.exit(0)

# Drop the port only when it is the default for the scheme.
default_port = {"http": ":80", "https": ":443"}[parts.scheme]
host = parts.netloc
if host.endswith(default_port):
    host = host[: -len(default_port)]

normalized = parts.scheme + "://" + host + (parts.path or "/")
if parts.query:
    normalized += "?" + parts.query
print(normalized)
'
}
|
|
103
|
+
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
# scrape_same_host URL BASE_URL
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
scrape_same_host() {
    # Succeed (status 0) when URL and BASE_URL point at the same host.
    # Comparison is case-insensitive and ignores a leading "www.". When
    # SCRAPE_ALLOW_SUBDOMAINS is 1, a host that is a dot-suffix of the other
    # also counts. A URL without a network location never matches.
    local candidate="$1" reference="$2"
    URL="$candidate" BASE="$reference" ALLOW="${SCRAPE_ALLOW_SUBDOMAINS:-0}" python3 -c '
import os
import sys
from urllib.parse import urlparse


def bare_host(link):
    name = urlparse(link).netloc.lower()
    return name[4:] if name.startswith("www.") else name


left = bare_host(os.environ["URL"])
right = bare_host(os.environ["BASE"])
subdomains_ok = os.environ.get("ALLOW") == "1"

same = bool(left) and bool(right) and (
    left == right
    or (subdomains_ok and (left.endswith("." + right) or right.endswith("." + left)))
)
sys.exit(0 if same else 1)
'
}
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# scrape_fetch URL OUT_FILE — fetch URL to file; print HTTP status to stdout
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
scrape_fetch() {
    # Download URL into OUT_FILE with curl and print the HTTP status code
    # reported by curl (no trailing newline). curl failures are tolerated, so
    # the function itself always succeeds; callers inspect the printed code.
    #
    #   $1  URL to fetch
    #   $2  destination file
    #
    # Reads SCRAPE_TIMEOUT (default 15) and SCRAPE_USER_AGENT.
    local target="$1" dest="$2"
    local -a curl_args
    curl_args=(
        -fsSL
        --max-time "${SCRAPE_TIMEOUT:-15}"
        --retry 1 --retry-delay 1
        -A "${SCRAPE_USER_AGENT:-zer0-mistakes-scraper/1.0 (+https://github.com/bamr87/zer0-mistakes)}"
        -H "Accept: text/html,application/xhtml+xml"
        -H "Accept-Language: en-US,en;q=0.9"
        -o "$dest"
        -w "%{http_code}"
    )

    local status
    status=$(curl "${curl_args[@]}" "$target" 2>/dev/null) || true
    printf '%s' "$status"
}
|
|
147
|
+
|
|
148
|
+
# ---------------------------------------------------------------------------
|
|
149
|
+
# scrape_run URL OUT_DIR [DEPTH] [MAX_PAGES]
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
scrape_run() {
    # Breadth-first crawl starting at URL, writing the corpus under OUT_DIR.
    #
    #   $1  start URL (required)
    #   $2  output directory (required)
    #   $3  maximum link depth, non-negative integer (default 2)
    #   $4  maximum number of pages to fetch, non-negative integer (default 25)
    #
    # Returns 1 when a dependency is missing, 2 on invalid arguments, and 0
    # otherwise (including dry runs).
    local start_url="$1"
    local out_dir="$2"
    local max_depth="${3:-2}"
    local max_pages="${4:-25}"

    scrape_check_deps || return 1

    if [[ -z "$start_url" || -z "$out_dir" ]]; then
        log_error "scrape_run: URL and OUT_DIR required"
        return 2
    fi

    # Both limits are used in arithmetic tests below; reject anything that is
    # not a plain non-negative integer instead of failing mid-crawl.
    case "$max_depth" in
        *[!0-9]*)
            log_error "scrape_run: DEPTH must be a non-negative integer (got: $max_depth)"
            return 2 ;;
    esac
    case "$max_pages" in
        *[!0-9]*)
            log_error "scrape_run: MAX_PAGES must be a non-negative integer (got: $max_pages)"
            return 2 ;;
    esac
    # Force base 10 so values such as "08" are not parsed as invalid octal.
    max_depth=$((10#$max_depth))
    max_pages=$((10#$max_pages))

    # Normalize the seed URL.
    local base
    base=$(scrape_normalize_url "$start_url") || base="$start_url"
    [[ -n "$base" ]] || { log_error "scrape: invalid URL: $start_url"; return 2; }

    mkdir -p "$out_dir/raw" "$out_dir/pages" "$out_dir/jekyll"

    log_info "Scraping site: $base"
    log_info " → output: $out_dir"
    log_info " → depth: $max_depth max-pages: $max_pages"

    if [[ "${_FS_DRY_RUN:-0}" == "1" ]]; then
        log_warning "DRY RUN — no network requests will be issued"
        return 0
    fi

    # Queue is a pair of parallel arrays (URL + depth). The visited set is
    # kept in a file for bash 3.2 portability.
    local visited_file="$out_dir/.visited"
    : > "$visited_file"

    local -a queue_url queue_depth
    queue_url=("$base")
    queue_depth=(0)

    local fetched=0
    local site_pages_json="$out_dir/.site_pages.tmp"
    : > "$site_pages_json"
    local site_nav_json="$out_dir/.site_nav.tmp"
    : > "$site_nav_json"
    local site_title="" site_description="" site_lang="" site_image=""
    local url depth slug raw page_json code next_url

    while [[ ${#queue_url[@]} -gt 0 && $fetched -lt $max_pages ]]; do
        # Pop the head of the queue.
        url="${queue_url[0]}"
        depth="${queue_depth[0]}"
        queue_url=("${queue_url[@]:1}")
        queue_depth=("${queue_depth[@]:1}")

        # Visited check.
        if grep -Fxq "$url" "$visited_file" 2>/dev/null; then
            continue
        fi
        echo "$url" >> "$visited_file"

        slug=$(scrape_url_to_slug "$url")
        raw="$out_dir/raw/${slug}.html"
        page_json="$out_dir/pages/${slug}.json"

        log_info " [${fetched}/$max_pages] depth=$depth GET $url"

        code=$(scrape_fetch "$url" "$raw")
        if [[ "$code" != "200" && "$code" != "203" ]]; then
            log_warning " HTTP $code — skipping"
            rm -f "$raw"
            continue
        fi
        # Counts successful fetches; this is the budget checked by the loop.
        fetched=$((fetched + 1))

        # Extract metadata + markdown.
        if ! python3 "$_SCRAPE_HTML_PY" extract \
                --url "$url" --base-url "$base" "$raw" > "$page_json" 2>/dev/null; then
            log_warning " extraction failed — skipping"
            # The redirection above already created the file; remove the empty
            # or partial JSON so the post-loop steps do not try to parse it.
            rm -f "$page_json"
            continue
        fi

        # Seed site-level metadata from the first page that yields a title.
        # NOTE(review): while no title has been found, every later page
        # re-seeds description/lang/image/nav as well — confirm that is intended.
        if [[ -z "$site_title" ]]; then
            site_title=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1], encoding="utf-8")); print(d.get("site_name") or d.get("title") or "")' "$page_json")
            site_description=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1], encoding="utf-8")); print(d.get("description") or "")' "$page_json")
            site_lang=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1], encoding="utf-8")); print(d.get("lang") or "en")' "$page_json")
            site_image=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1], encoding="utf-8")); print(d.get("image") or "")' "$page_json")
            python3 -c 'import json,sys; d=json.load(open(sys.argv[1], encoding="utf-8")); print(json.dumps(d.get("nav") or []))' "$page_json" > "$site_nav_json"
        fi

        # Jekyll markdown is rendered after assets are downloaded + rewritten
        # (see post-loop step below).

        # Record in the site index.
        printf '%s\t%s\n' "$slug" "$url" >> "$site_pages_json"

        # Enqueue same-host links if we have depth budget.
        if [[ $depth -lt $max_depth ]]; then
            while IFS= read -r next_url; do
                [[ -z "$next_url" ]] && continue
                # Skip commerce/auth paths that almost never make good content.
                case "$next_url" in
                    */cart|*/cart/|*/cart\?*|*/checkout*|*/login*|*/signin*|*/signup*|*/account*)
                        continue ;;
                esac
                if scrape_same_host "$next_url" "$base"; then
                    if ! grep -Fxq "$next_url" "$visited_file" 2>/dev/null; then
                        queue_url+=("$next_url")
                        queue_depth+=($((depth + 1)))
                    fi
                fi
            done < <(python3 "$_SCRAPE_HTML_PY" crawl-links \
                --base-url "$base" --url "$url" "$raw" 2>/dev/null)
        fi

        # Optional rate limiting.
        if [[ -n "${SCRAPE_RATE_DELAY:-}" && "${SCRAPE_RATE_DELAY}" != "0" ]]; then
            sleep "$SCRAPE_RATE_DELAY" 2>/dev/null || true
        fi
    done

    # Download referenced images locally and rewrite page markdown to point
    # at the new local paths (unless caller opted out).
    if [[ "${SCRAPE_DOWNLOAD_ASSETS:-1}" == "1" ]]; then
        scrape_download_assets "$out_dir" || \
            log_warning " asset download had errors (continuing)"
    fi

    # Emit Jekyll markdown from the (now asset-rewritten) page JSON files.
    local pj slug2
    for pj in "$out_dir"/pages/*.json; do
        # Guards against the unexpanded glob when no page was extracted.
        [[ -f "$pj" ]] || continue
        slug2=$(basename "$pj" .json)
        scrape_emit_jekyll_page "$pj" "$out_dir/jekyll/${slug2}.md" "$slug2"
    done

    # Build site.json
    scrape_emit_site_json \
        "$base" "$site_title" "$site_description" "$site_lang" "$site_image" \
        "$site_nav_json" "$site_pages_json" "$out_dir/site.json"

    rm -f "$site_pages_json" "$site_nav_json" "$visited_file"
    log_success "Scrape complete: $fetched page(s) → $out_dir"
    return 0
}
|
|
298
|
+
|
|
299
|
+
# ---------------------------------------------------------------------------
|
|
300
|
+
# scrape_download_assets OUT_DIR
|
|
301
|
+
# Collects every unique image URL referenced in pages/*.json, downloads to
|
|
302
|
+
# ${OUT_DIR}/assets/<hash>.<ext>, then rewrites the markdown + image field
|
|
303
|
+
# of each page JSON to use the local path "/assets/scraped/<file>".
|
|
304
|
+
# ---------------------------------------------------------------------------
|
|
305
|
+
scrape_download_assets() {
    # Download every remote image referenced by OUT_DIR/pages/*.json into
    # OUT_DIR/assets/<hash>.<ext>, then rewrite each page JSON so its markdown
    # and "image" field use "/assets/scraped/<file>" and each images[] entry
    # gains a "local" key.
    #
    #   $1  output directory of the scrape
    #
    # Individual download failures are counted and logged but do not fail the
    # function; the return status is that of the final rewrite step.
    local out_dir="$1"
    local assets_dir="$out_dir/assets"
    local map_file="$out_dir/.assets_map.tsv"
    mkdir -p "$assets_dir"
    : > "$map_file"

    # Collect unique image URLs across all pages, in first-seen order.
    local urls_file="$out_dir/.asset_urls.tmp"
    python3 - "$out_dir" > "$urls_file" <<'PY'
import glob
import json
import os
import sys

REMOTE = ("http://", "https://")
seen = set()
ordered = []


def remember(candidate):
    """Record candidate once if it is an absolute http(s) URL."""
    if isinstance(candidate, str) and candidate.startswith(REMOTE) and candidate not in seen:
        seen.add(candidate)
        ordered.append(candidate)


for page in sorted(glob.glob(os.path.join(sys.argv[1], "pages", "*.json"))):
    try:
        with open(page, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # Unreadable or malformed page JSON: skip it, keep the rest.
        continue
    if not isinstance(data, dict):
        continue
    for item in data.get("images") or []:
        if isinstance(item, dict):
            remember(item.get("url"))
    remember(data.get("image"))
print("\n".join(ordered))
PY

    local count=0 failed=0
    local url ext hash local_name local_path
    while IFS= read -r url; do
        [[ -z "$url" ]] && continue
        # Derive a stable local filename: <hash>.<ext>. Try md5sum (GNU),
        # then md5 (BSD/macOS), then shasum; skip the URL if none is available.
        hash=$(printf '%s' "$url" | md5sum 2>/dev/null | awk '{print $1}')
        [[ -z "$hash" ]] && hash=$(printf '%s' "$url" | md5 2>/dev/null | awk '{print $NF}')
        [[ -z "$hash" ]] && hash=$(printf '%s' "$url" | shasum 2>/dev/null | awk '{print substr($1,1,16)}')
        [[ -z "$hash" ]] && continue
        # Use the URL path extension when it looks sane, otherwise "img".
        ext=$(python3 -c '
import sys, os
from urllib.parse import urlparse, unquote
p = urlparse(sys.argv[1]).path
e = os.path.splitext(unquote(p))[1].lower().strip(".")
if e and len(e) <= 5 and e.isalnum():
    print(e)
else:
    print("img")
' "$url" 2>/dev/null)
        [[ -z "$ext" ]] && ext="img"
        local_name="${hash}.${ext}"
        local_path="$assets_dir/$local_name"
        # A non-empty file from an earlier run is reused as-is.
        if [[ ! -s "$local_path" ]]; then
            if ! curl -fsSL --max-time "${SCRAPE_TIMEOUT:-15}" \
                    -A "${SCRAPE_USER_AGENT:-zer0-mistakes-scraper/1.0}" \
                    -o "$local_path" "$url" 2>/dev/null; then
                rm -f "$local_path"
                failed=$((failed + 1))
                continue
            fi
        fi
        printf '%s\t%s\n' "$url" "/assets/scraped/$local_name" >> "$map_file"
        count=$((count + 1))
    done < "$urls_file"
    rm -f "$urls_file"

    log_info " assets: downloaded $count (failed: $failed)"

    # Rewrite each page JSON's markdown + image fields using the map.
    python3 - "$out_dir" "$map_file" <<'PY'
import glob
import json
import os
import sys

out_dir, map_file = sys.argv[1], sys.argv[2]
mapping = {}
if os.path.exists(map_file):
    with open(map_file, encoding="utf-8") as fh:
        for line in fh:
            line = line.rstrip("\n")
            if not line or "\t" not in line:
                continue
            remote, local = line.split("\t", 1)
            mapping[remote] = local
if not mapping:
    sys.exit(0)

# Replace the longest URLs first. Otherwise a URL that is a prefix of another
# (".../x.png" vs ".../x.png?w=200") would be substituted inside the longer
# one and leave a corrupted link behind.
longest_first = sorted(mapping.items(), key=lambda kv: len(kv[0]), reverse=True)

for page in sorted(glob.glob(os.path.join(out_dir, "pages", "*.json"))):
    try:
        with open(page, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        continue
    if not isinstance(data, dict):
        continue

    changed = False

    markdown = data.get("markdown") or ""
    if isinstance(markdown, str):
        for remote, local in longest_first:
            if remote in markdown:
                markdown = markdown.replace(remote, local)
                changed = True
        if changed:
            data["markdown"] = markdown

    image = data.get("image") or ""
    if isinstance(image, str) and image in mapping:
        data["image"] = mapping[image]
        changed = True

    for item in data.get("images") or []:
        if not isinstance(item, dict):
            continue
        remote = item.get("url") or ""
        if isinstance(remote, str) and remote in mapping and item.get("local") != mapping[remote]:
            item["local"] = mapping[remote]
            # Persist the annotation even when nothing else on the page changed.
            changed = True

    if changed:
        with open(page, "w", encoding="utf-8") as fh:
            json.dump(data, fh, ensure_ascii=False, indent=2)
PY
}
|
|
411
|
+
|
|
412
|
+
# ---------------------------------------------------------------------------
|
|
413
|
+
# scrape_emit_jekyll_page PAGE_JSON OUT_MD SLUG
|
|
414
|
+
# Writes Jekyll-friendly markdown with frontmatter.
|
|
415
|
+
# ---------------------------------------------------------------------------
|
|
416
|
+
scrape_emit_jekyll_page() {
    # Write a Jekyll markdown file (YAML frontmatter + body) for one page.
    #
    #   $1  per-page JSON produced by the extraction step
    #   $2  destination markdown file; its parent directory is created
    #   $3  page slug, used for the fallback title and the permalink
    #       ("--" in the slug becomes "/" in the permalink)
    #
    # Returns the exit status of the embedded Python script.
    local page_json="$1" out_md="$2" slug="$3"
    SLUG="$slug" python3 - "$page_json" "$out_md" <<'PY'
import json
import os
import re
import sys

slug = os.environ["SLUG"]
src, dst = sys.argv[1], sys.argv[2]
with open(src, "r", encoding="utf-8") as f:
    d = json.load(f)


def y(v):
    """Render v as a single-line YAML double-quoted scalar."""
    # A JSON string literal is also a valid YAML double-quoted scalar.
    # json.dumps escapes quotes and backslashes like the hand-rolled version
    # did, and additionally escapes newlines, tabs and control characters
    # that would otherwise break the frontmatter block.
    return json.dumps("" if v is None else str(v), ensure_ascii=False)


title = d.get("title") or slug.replace("-", " ").replace("/", " ").title()
description = d.get("description") or ""
canonical = d.get("canonical") or d.get("url") or ""
image = d.get("image") or ""
kind = d.get("kind") or "page"

# Permalink: home wins; otherwise turn slug back into a URL-ish path.
if kind == "home" or slug in ("index", "home"):
    permalink = "/"
else:
    permalink = "/" + slug.replace("--", "/")
    if not permalink.endswith("/"):
        permalink = permalink + "/"

# Layout: post-like kinds use the article layout, everything else default.
layout = "article" if kind in ("post", "event") else "default"

frontmatter = [
    "---",
    f"title: {y(title)}",
    f"description: {y(description)}",
    f"permalink: {y(permalink)}",
    f"layout: {y(layout)}",
    f"kind: {y(kind)}",
    f"source_url: {y(canonical)}",
]
if image:
    frontmatter.append(f"preview: {y(image)}")
frontmatter.append("scraped: true")
frontmatter.append("---")
# Trailing empty element so the joined frontmatter ends with a newline.
frontmatter.append("")

body = d.get("markdown") or ""
# Strip the leading H1 if it duplicates the title. The heading may also be
# the very last line of the body, hence the end-of-string alternative.
m = re.match(r"\s*#\s+(.+?)\s*(?:\n|$)", body)
if m and m.group(1).strip().lower() == str(title).strip().lower():
    body = body[m.end():]

# dirname is empty for a bare filename; makedirs("") would raise.
parent = os.path.dirname(dst)
if parent:
    os.makedirs(parent, exist_ok=True)
with open(dst, "w", encoding="utf-8") as f:
    f.write("\n".join(frontmatter))
    f.write(body.lstrip("\n"))
PY
}
|
|
474
|
+
|
|
475
|
+
# ---------------------------------------------------------------------------
|
|
476
|
+
# scrape_emit_site_json …
|
|
477
|
+
# Combines site metadata + nav + page index into a single JSON file.
|
|
478
|
+
# ---------------------------------------------------------------------------
|
|
479
|
+
scrape_emit_site_json() {
    # Combine site metadata, navigation and the page index into one JSON file.
    #
    #   $1  base URL            $5  site image
    #   $2  site title          $6  file holding the nav as a JSON array
    #   $3  site description    $7  TSV file of "<slug>\t<url>" lines
    #   $4  site language       $8  output file (site.json)
    #
    # Per-page kind/title/description are read from "pages/<slug>.json" next
    # to the output file. Missing or malformed inputs degrade to empty values.
    local base="$1" title="$2" desc="$3" lang="$4" image="$5"
    local nav_file="$6" pages_tsv="$7" out_file="$8"

    # Values travel through SCRAPE_SITE_* variables. In particular the site
    # language must not be exported as LANG, which would override the locale
    # of the Python process.
    SCRAPE_SITE_BASE="$base" SCRAPE_SITE_TITLE="$title" SCRAPE_SITE_DESC="$desc" \
    SCRAPE_SITE_LANG="$lang" SCRAPE_SITE_IMAGE="$image" \
    SCRAPE_SITE_NAV="$nav_file" SCRAPE_SITE_PAGES="$pages_tsv" SCRAPE_SITE_OUT="$out_file" \
    python3 <<'PY'
import json
import os

env = os.environ
out_path = env["SCRAPE_SITE_OUT"]
pages_dir = os.path.join(os.path.dirname(out_path), "pages")


def load_json(path, fallback):
    """Return the parsed JSON at path, or fallback if it cannot be read."""
    try:
        with open(path, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except (OSError, ValueError):
        # ValueError covers both JSON syntax errors and decoding errors.
        return fallback


# The nav file is empty when no page seeded it; treat that as "no nav".
nav = load_json(env["SCRAPE_SITE_NAV"], []) or []

pages = []
try:
    with open(env["SCRAPE_SITE_PAGES"], "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.rstrip("\n")
            if not line:
                continue
            parts = line.split("\t", 1)
            if len(parts) != 2:
                continue
            slug, url = parts
            # Read the per-page JSON to pick up kind/title for the index.
            page = load_json(os.path.join(pages_dir, slug + ".json"), {})
            if not isinstance(page, dict):
                page = {}
            pages.append({
                "slug": slug,
                "url": url,
                "kind": page.get("kind") or "page",
                "title": page.get("title") or "",
                "description": page.get("description") or "",
            })
except (OSError, ValueError):
    # An unreadable index keeps whatever entries were collected so far.
    pass

site = {
    "base_url": env["SCRAPE_SITE_BASE"],
    "title": env["SCRAPE_SITE_TITLE"],
    "description": env["SCRAPE_SITE_DESC"],
    "lang": env["SCRAPE_SITE_LANG"] or "en",
    "image": env["SCRAPE_SITE_IMAGE"],
    "nav": nav,
    "pages": pages,
    "page_count": len(pages),
}
with open(out_path, "w", encoding="utf-8") as fh:
    json.dump(site, fh, ensure_ascii=False, indent=2)
PY
}
|