jekyll-theme-zer0 1.8.2 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/README.md +98 -7
  4. data/_data/content_statistics.yml +253 -251
  5. data/_includes/components/nav-export.html +61 -0
  6. data/_includes/components/nav-overview.html +54 -0
  7. data/scripts/bin/install +52 -705
  8. data/scripts/install/README.md +162 -0
  9. data/scripts/install/ai/client.sh +164 -0
  10. data/scripts/install/ai/diagnose.sh +81 -0
  11. data/scripts/install/ai/prompts/diagnose.system.md +42 -0
  12. data/scripts/install/ai/prompts/spec.schema.json +129 -0
  13. data/scripts/install/ai/prompts/suggest.system.md +43 -0
  14. data/scripts/install/ai/prompts/wizard.system.md +142 -0
  15. data/scripts/install/ai/suggest.sh +57 -0
  16. data/scripts/install/ai/wizard.sh +150 -0
  17. data/scripts/install/apply.sh +156 -0
  18. data/scripts/install/cli.sh +561 -0
  19. data/scripts/install/diff.sh +128 -0
  20. data/scripts/install/doctor.sh +168 -0
  21. data/scripts/install/fs.sh +138 -0
  22. data/scripts/install/log.sh +119 -0
  23. data/scripts/install/plan.sh +299 -0
  24. data/scripts/install/platform.sh +122 -0
  25. data/scripts/install/prompt.sh +124 -0
  26. data/scripts/install/repair.sh +45 -0
  27. data/scripts/install/scrape.sh +535 -0
  28. data/scripts/install/scrape_html.py +764 -0
  29. data/scripts/install/spec.sh +486 -0
  30. data/scripts/install/tasks/_registry.sh +65 -0
  31. data/scripts/install/tasks/agents.sh +60 -0
  32. data/scripts/install/tasks/config.sh +37 -0
  33. data/scripts/install/tasks/data.sh +18 -0
  34. data/scripts/install/tasks/deploy_azure-swa.sh +17 -0
  35. data/scripts/install/tasks/deploy_docker-prod.sh +21 -0
  36. data/scripts/install/tasks/deploy_github-pages.sh +18 -0
  37. data/scripts/install/tasks/devcontainer.sh +26 -0
  38. data/scripts/install/tasks/docker.sh +29 -0
  39. data/scripts/install/tasks/gemfile.sh +42 -0
  40. data/scripts/install/tasks/gitignore.sh +26 -0
  41. data/scripts/install/tasks/marker.sh +46 -0
  42. data/scripts/install/tasks/nav.sh +18 -0
  43. data/scripts/install/tasks/pages.sh +61 -0
  44. data/scripts/install/tasks/readme.sh +27 -0
  45. data/scripts/install/tasks/scrape.sh +348 -0
  46. data/scripts/install/template.sh +138 -0
  47. data/scripts/install/tui.sh +110 -0
  48. data/scripts/install/upgrade.sh +49 -0
  49. data/scripts/lib/install/template.sh +1 -0
  50. metadata +45 -2
@@ -0,0 +1,535 @@
1
+ #!/bin/bash
2
+ # =============================================================================
3
+ # scripts/install/scrape.sh — BFS site scraper for the installer
4
+ # =============================================================================
5
+ # Crawls a public website with curl, runs each page through scrape_html.py,
6
+ # and writes a structured corpus under ${OUT_DIR}:
7
+ #
8
+ # ${OUT_DIR}/
9
+ # site.json — site-level summary (title, nav, page index)
10
+ # raw/<slug>.html — raw HTML as fetched
11
+ # pages/<slug>.json — per-page extraction result
12
+ # jekyll/<slug>.md — Jekyll-ready Markdown with frontmatter
13
+ #
14
+ # The Jekyll markdown is the artifact consumed by tasks/scrape.sh.
15
+ #
16
+ # Public API:
17
+ #
18
+ # scrape_run URL OUT_DIR [DEPTH] [MAX_PAGES]
19
+ # Crawl starting at URL. Default DEPTH=2, MAX_PAGES=25.
20
+ #
21
+ # scrape_url_to_slug URL
22
+ # Stable filesystem-safe identifier for a URL.
23
+ #
24
+ # Honors:
25
+ # SCRAPE_USER_AGENT (default: zer0-mistakes-scraper/1.0 …)
26
+ # SCRAPE_TIMEOUT per-request curl timeout in seconds (default: 15)
27
+ # SCRAPE_RATE_DELAY seconds to sleep between requests (default: 0)
28
+ # SCRAPE_ALLOW_SUBDOMAINS 1 to allow same-suffix subdomains (default: 0)
+ # SCRAPE_DOWNLOAD_ASSETS 1 to download referenced images locally (default: 1)
29
+ # _FS_DRY_RUN inherited from fs.sh — when 1, do not curl
30
+ #
31
+ # Bash 3.2 compatible. No set -euo pipefail here.
32
+ # =============================================================================
33
# Include guard: sourcing this library a second time is a no-op.
if [[ -n "${_HAS_SCRAPE_LIB:-}" ]]; then
    return 0
fi
_HAS_SCRAPE_LIB=1

# Directory containing this file, unless the caller pre-set _SCRAPE_DIR.
if [[ -z "${_SCRAPE_DIR:-}" ]]; then
    _SCRAPE_DIR=$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" 2>/dev/null && pwd)
fi
# The HTML extraction helper is expected to sit next to this script.
_SCRAPE_HTML_PY="${_SCRAPE_DIR}/scrape_html.py"
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # scrape_check_deps — verify python3 + curl
41
+ # ---------------------------------------------------------------------------
42
scrape_check_deps() {
    # Verify the scraper's prerequisites: python3 and curl on PATH, plus the
    # scrape_html.py helper on disk. Logs the first missing item via log_error
    # and returns 1; returns 0 when everything is present.
    command -v python3 >/dev/null 2>&1 || {
        log_error "scrape: python3 is required (not found in PATH)"
        return 1
    }
    command -v curl >/dev/null 2>&1 || {
        log_error "scrape: curl is required (not found in PATH)"
        return 1
    }
    if [[ -f "$_SCRAPE_HTML_PY" ]]; then
        return 0
    fi
    log_error "scrape: helper not found: $_SCRAPE_HTML_PY"
    return 1
}
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # scrape_url_to_slug URL — print a deterministic slug for the given URL
60
+ # ---------------------------------------------------------------------------
61
scrape_url_to_slug() {
    # Print a deterministic, filesystem-safe slug for URL (no trailing newline).
    #
    #   $1  URL to convert.
    #
    # The slug is built from the URL path (minus a trailing .html/.php/.asp(x)/
    # .jsp extension) followed by the query string, if any. Each path segment
    # is sanitised on its own and the segments are then joined with "--", so
    # "--" appears in a slug only where the URL had a "/". An empty result
    # becomes "index"; output is capped at 100 characters.
    #
    # Fix: the previous implementation converted "/" to "--" and THEN collapsed
    # every run of dashes to a single "-", which erased the separator again.
    # "/blog/post" and "/blog-post" therefore produced the same slug, and code
    # that maps "--" back to "/" never found anything to map.
    local url="$1"
    local path
    path=$(printf '%s' "$url" | python3 -c '
import re
import sys
from urllib.parse import urlparse

u = urlparse(sys.stdin.read().strip())
p = (u.path or "/").strip("/")
# Strip extensions that round-trip badly as Jekyll page slugs.
p = re.sub(r"\.(html?|php|aspx?|jsp)$", "", p, flags=re.I)
if u.query:
    p = (p + "/" + u.query) if p else u.query

# Clean every segment separately so dash-collapsing cannot touch the
# "--" hierarchy separator that is inserted afterwards.
segments = []
for seg in p.split("/"):
    seg = re.sub(r"[^A-Za-z0-9._-]+", "-", seg)
    seg = re.sub(r"-+", "-", seg).strip("-")
    if seg:
        segments.append(seg)

# Truncation may cut through a separator; trim any dangling dash.
slug = "--".join(segments)[:100].strip("-")
print(slug or "index")
')
    printf '%s' "$path"
}
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # scrape_normalize_url URL [BASE_URL]
85
+ # ---------------------------------------------------------------------------
86
scrape_normalize_url() {
    # Resolve URL against BASE_URL (defaults to URL itself), drop the fragment
    # and any default port, and print "scheme://netloc/path[?query]".
    # Prints nothing (exit status 0) when the scheme is not http or https.
    #
    #   $1  URL, absolute or relative.
    #   $2  optional base URL used to resolve a relative $1.
    local url="$1"
    local base="${2:-$1}"
    URL="$url" BASE="$base" python3 -c '
import os
from urllib.parse import urljoin, urldefrag, urlparse

DEFAULT_PORT_SUFFIX = {"http": ":80", "https": ":443"}

absolute = urldefrag(urljoin(os.environ["BASE"], os.environ["URL"]))[0]
parts = urlparse(absolute)
if parts.scheme in DEFAULT_PORT_SUFFIX:
    host = parts.netloc
    suffix = DEFAULT_PORT_SUFFIX[parts.scheme]
    if host.endswith(suffix):
        host = host[: -len(suffix)]
    query = ("?" + parts.query) if parts.query else ""
    print(parts.scheme + "://" + host + (parts.path or "/") + query)
'
}
103
+
104
+ # ---------------------------------------------------------------------------
105
+ # scrape_same_host URL BASE_URL
106
+ # ---------------------------------------------------------------------------
107
scrape_same_host() {
    # Succeed (status 0) when URL and BASE_URL share a host, ignoring case and
    # a leading "www."; fail (status 1) otherwise or when either host is empty.
    # With SCRAPE_ALLOW_SUBDOMAINS=1, a host that is a dot-separated suffix of
    # the other also counts as the same site.
    #
    #   $1  candidate URL.
    #   $2  base URL of the crawl.
    local url="$1" base="$2"
    URL="$url" BASE="$base" ALLOW="${SCRAPE_ALLOW_SUBDOMAINS:-0}" python3 -c '
import os
import sys
from urllib.parse import urlparse


def bare_host(address):
    name = urlparse(address).netloc.lower()
    return name[4:] if name.startswith("www.") else name


candidate = bare_host(os.environ["URL"])
origin = bare_host(os.environ["BASE"])

if not candidate or not origin:
    matched = False
elif candidate == origin:
    matched = True
else:
    matched = os.environ.get("ALLOW") == "1" and (
        candidate.endswith("." + origin) or origin.endswith("." + candidate)
    )
sys.exit(0 if matched else 1)
'
}
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # scrape_fetch URL OUT_FILE — fetch URL to file; print HTTP status to stdout
130
+ # ---------------------------------------------------------------------------
131
scrape_fetch() {
    # Download URL into OUT_FILE with curl and print whatever curl reports for
    # %{http_code} on stdout (no trailing newline). curl failures are
    # swallowed so the caller decides based on the printed status alone.
    #
    #   $1  URL to fetch.
    #   $2  destination file.
    #
    # Honors SCRAPE_USER_AGENT and SCRAPE_TIMEOUT (seconds, default 15).
    local url="$1" out="$2"
    local -a curl_args
    curl_args=(
        -fsSL
        --max-time "${SCRAPE_TIMEOUT:-15}"
        --retry 1 --retry-delay 1
        -A "${SCRAPE_USER_AGENT:-zer0-mistakes-scraper/1.0 (+https://github.com/bamr87/zer0-mistakes)}"
        -H "Accept: text/html,application/xhtml+xml"
        -H "Accept-Language: en-US,en;q=0.9"
        -o "$out"
        -w "%{http_code}"
    )
    local status
    status=$(curl "${curl_args[@]}" "$url" 2>/dev/null) || true
    printf '%s' "$status"
}
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # scrape_run URL OUT_DIR [DEPTH] [MAX_PAGES]
150
+ # ---------------------------------------------------------------------------
151
scrape_run() {
    # Breadth-first crawl starting at URL, writing the corpus under OUT_DIR
    # (raw/, pages/, jekyll/, site.json).
    #
    #   $1  start URL (required).
    #   $2  output directory (required).
    #   $3  maximum link depth from the start URL (default 2).
    #   $4  maximum number of pages to fetch (default 25).
    #
    # Returns 0 on success (including dry run), 1 when dependencies are
    # missing, 2 on invalid arguments.
    #
    # Changes from the previous version:
    #   * DEPTH and MAX_PAGES are validated; a non-numeric value used to reach
    #     the "-lt" tests unchecked.
    #   * When extraction fails, the partially written pages/<slug>.json is
    #     removed. It used to be left behind, and the post-loop steps then
    #     tried to parse it as a valid page.
    #   * grep patterns are preceded by "--" so a value starting with "-" can
    #     never be taken as an option.
    local start_url="$1"
    local out_dir="$2"
    local max_depth="${3:-2}"
    local max_pages="${4:-25}"

    scrape_check_deps || return 1

    if [[ -z "$start_url" || -z "$out_dir" ]]; then
        log_error "scrape_run: URL and OUT_DIR required"
        return 2
    fi

    case "$max_depth" in
        *[!0-9]*)
            log_error "scrape_run: DEPTH must be a non-negative integer: $max_depth"
            return 2 ;;
    esac
    case "$max_pages" in
        *[!0-9]*)
            log_error "scrape_run: MAX_PAGES must be a non-negative integer: $max_pages"
            return 2 ;;
    esac

    # Normalize the seed URL.
    local base
    base=$(scrape_normalize_url "$start_url") || base="$start_url"
    [[ -n "$base" ]] || { log_error "scrape: invalid URL: $start_url"; return 2; }

    mkdir -p "$out_dir/raw" "$out_dir/pages" "$out_dir/jekyll"

    log_info "Scraping site: $base"
    log_info " → output: $out_dir"
    log_info " → depth: $max_depth max-pages: $max_pages"

    if [[ "${_FS_DRY_RUN:-0}" == "1" ]]; then
        log_warning "DRY RUN — no network requests will be issued"
        return 0
    fi

    # Queue parallel arrays (URL + depth). Visited set kept in a file for
    # bash 3.2 portability.
    local visited_file
    visited_file="$out_dir/.visited"
    : > "$visited_file"

    local -a queue_url queue_depth
    queue_url=("$base")
    queue_depth=(0)

    local fetched=0
    local site_pages_json="$out_dir/.site_pages.tmp"
    : > "$site_pages_json"
    local site_nav_json="$out_dir/.site_nav.tmp"
    : > "$site_nav_json"
    local site_title="" site_description="" site_lang="" site_image=""

    local url depth slug raw page_json code next_url
    while [[ ${#queue_url[@]} -gt 0 && $fetched -lt $max_pages ]]; do
        # Pop the head of the queue.
        url="${queue_url[0]}"
        depth="${queue_depth[0]}"
        queue_url=("${queue_url[@]:1}")
        queue_depth=("${queue_depth[@]:1}")

        # Visited check.
        if grep -Fxq -- "$url" "$visited_file" 2>/dev/null; then
            continue
        fi
        printf '%s\n' "$url" >> "$visited_file"

        slug=$(scrape_url_to_slug "$url")
        raw="$out_dir/raw/${slug}.html"
        page_json="$out_dir/pages/${slug}.json"

        log_info " [${fetched}/$max_pages] depth=$depth GET $url"

        code=$(scrape_fetch "$url" "$raw")
        if [[ "$code" != "200" && "$code" != "203" ]]; then
            log_warning " HTTP $code — skipping"
            rm -f "$raw"
            continue
        fi
        fetched=$((fetched + 1))

        # Extract metadata + markdown.
        if ! python3 "$_SCRAPE_HTML_PY" extract \
            --url "$url" --base-url "$base" "$raw" > "$page_json" 2>/dev/null; then
            log_warning " extraction failed — skipping"
            # The redirect above already created (and possibly partly filled)
            # the file; drop it so later steps never see a broken page.
            rm -f "$page_json"
            continue
        fi

        # Site-level metadata is seeded from the first page that yields a
        # non-empty title.
        if [[ -z "$site_title" ]]; then
            site_title=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1])); print(d.get("site_name") or d.get("title") or "")' "$page_json")
            site_description=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1])); print(d.get("description") or "")' "$page_json")
            site_lang=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1])); print(d.get("lang") or "en")' "$page_json")
            site_image=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1])); print(d.get("image") or "")' "$page_json")
            # Capture nav alongside the other site-level fields.
            python3 -c 'import json,sys; d=json.load(open(sys.argv[1])); print(json.dumps(d.get("nav") or []))' "$page_json" > "$site_nav_json"
        fi

        # Jekyll markdown is rendered after assets are downloaded + rewritten
        # (see post-loop step below).

        # Record in the site index.
        printf '%s\t%s\n' "$slug" "$url" >> "$site_pages_json"

        # Enqueue same-host links if we have depth budget.
        if [[ $depth -lt $max_depth ]]; then
            while IFS= read -r next_url; do
                [[ -z "$next_url" ]] && continue
                # Skip commerce/auth paths that almost never make good content.
                case "$next_url" in
                    */cart|*/cart/|*/cart\?*|*/checkout*|*/login*|*/signin*|*/signup*|*/account*)
                        continue ;;
                esac
                if scrape_same_host "$next_url" "$base"; then
                    if ! grep -Fxq -- "$next_url" "$visited_file" 2>/dev/null; then
                        queue_url+=("$next_url")
                        queue_depth+=($((depth + 1)))
                    fi
                fi
            done < <(python3 "$_SCRAPE_HTML_PY" crawl-links \
                --base-url "$base" --url "$url" "$raw" 2>/dev/null)
        fi

        # Optional rate limiting.
        if [[ -n "${SCRAPE_RATE_DELAY:-}" && "${SCRAPE_RATE_DELAY}" != "0" ]]; then
            sleep "$SCRAPE_RATE_DELAY" 2>/dev/null || true
        fi
    done

    # Download referenced images locally and rewrite page markdown to point
    # at the new local paths (unless caller opted out).
    if [[ "${SCRAPE_DOWNLOAD_ASSETS:-1}" == "1" ]]; then
        scrape_download_assets "$out_dir" || \
            log_warning " asset download had errors (continuing)"
    fi

    # Emit Jekyll markdown from the (now asset-rewritten) page JSON files.
    local pj slug2
    for pj in "$out_dir"/pages/*.json; do
        [[ -f "$pj" ]] || continue
        slug2=$(basename "$pj" .json)
        scrape_emit_jekyll_page "$pj" "$out_dir/jekyll/${slug2}.md" "$slug2"
    done

    # Build site.json
    scrape_emit_site_json \
        "$base" "$site_title" "$site_description" "$site_lang" "$site_image" \
        "$site_nav_json" "$site_pages_json" "$out_dir/site.json"

    rm -f "$site_pages_json" "$site_nav_json" "$visited_file"
    log_success "Scrape complete: $fetched page(s) → $out_dir"
    return 0
}
298
+
299
+ # ---------------------------------------------------------------------------
300
+ # scrape_download_assets OUT_DIR
301
+ # Collects every unique image URL referenced in pages/*.json, downloads to
302
+ # ${OUT_DIR}/assets/<hash>.<ext>, then rewrites the markdown + image field
303
+ # of each page JSON to use the local path "/assets/scraped/<file>".
304
+ # ---------------------------------------------------------------------------
305
scrape_download_assets() {
    # Download every unique http(s) image referenced by pages/*.json into
    # OUT_DIR/assets/<hash>.<ext>, record "remote<TAB>/assets/scraped/<file>"
    # in OUT_DIR/.assets_map.tsv, then rewrite each page JSON: remote URLs in
    # "markdown" and "image" are replaced by the local path and each entry of
    # "images" gains a "local" field.
    #
    #   $1  output directory of the scrape.
    #
    # Honors SCRAPE_TIMEOUT and SCRAPE_USER_AGENT. Individual download
    # failures are counted and skipped. The return status is that of the
    # final rewrite step.
    #
    # Changes from the previous version:
    #   * Markdown is rewritten in a single pass with the longest URLs tried
    #     first. Sequential str.replace corrupted a URL when another mapped
    #     URL was its prefix (e.g. ".../a.png" vs ".../a.png?w=200").
    #   * Adding "local" to an image entry now marks the page as changed, so
    #     it is written even when markdown/image were untouched.
    local out_dir="$1"
    local assets_dir="$out_dir/assets"
    local map_file="$out_dir/.assets_map.tsv"
    mkdir -p "$assets_dir"
    : > "$map_file"

    # Collect unique image URLs across all pages.
    local urls_file="$out_dir/.asset_urls.tmp"
    python3 - "$out_dir" > "$urls_file" <<'PY'
import glob
import json
import os
import sys

seen = set()
out = []


def remember(u):
    if u and u.startswith(("http://", "https://")) and u not in seen:
        seen.add(u)
        out.append(u)


for p in sorted(glob.glob(os.path.join(sys.argv[1], "pages", "*.json"))):
    try:
        with open(p, encoding="utf-8") as fh:
            d = json.load(fh)
    except Exception:
        # Best effort: an unreadable page simply contributes no assets.
        continue
    for img in d.get("images") or []:
        remember(img.get("url") or "")
    remember(d.get("image") or "")
print("\n".join(out))
PY

    local count=0 failed=0
    local url ext hash local_name local_path
    while IFS= read -r url; do
        [[ -z "$url" ]] && continue
        # Derive a stable local filename: <md5>.<ext>. Try md5sum (GNU),
        # then md5 (BSD/macOS), then shasum as a last resort.
        hash=$(printf '%s' "$url" | md5sum 2>/dev/null | awk '{print $1}')
        [[ -z "$hash" ]] && hash=$(printf '%s' "$url" | md5 2>/dev/null | awk '{print $NF}')
        [[ -z "$hash" ]] && hash=$(printf '%s' "$url" | shasum 2>/dev/null | awk '{print substr($1,1,16)}')
        [[ -z "$hash" ]] && continue
        ext=$(python3 -c '
import sys, os
from urllib.parse import urlparse, unquote
p = urlparse(sys.argv[1]).path
e = os.path.splitext(unquote(p))[1].lower().strip(".")
if e and len(e) <= 5 and e.isalnum():
    print(e)
else:
    print("img")
' "$url" 2>/dev/null)
        [[ -z "$ext" ]] && ext="img"
        local_name="${hash}.${ext}"
        local_path="$assets_dir/$local_name"
        # Skip the download when a non-empty copy already exists.
        if [[ ! -s "$local_path" ]]; then
            if ! curl -fsSL --max-time "${SCRAPE_TIMEOUT:-15}" \
                -A "${SCRAPE_USER_AGENT:-zer0-mistakes-scraper/1.0}" \
                -o "$local_path" "$url" 2>/dev/null; then
                rm -f "$local_path"
                failed=$((failed + 1))
                continue
            fi
        fi
        printf '%s\t%s\n' "$url" "/assets/scraped/$local_name" >> "$map_file"
        count=$((count + 1))
    done < "$urls_file"
    rm -f "$urls_file"

    log_info " assets: downloaded $count (failed: $failed)"

    # Rewrite each page JSON's markdown + image fields using the map.
    python3 - "$out_dir" "$map_file" <<'PY'
import glob
import json
import os
import re
import sys

out_dir, map_file = sys.argv[1], sys.argv[2]
mp = {}
if os.path.exists(map_file):
    with open(map_file, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line or "\t" not in line:
                continue
            k, v = line.split("\t", 1)
            mp[k] = v
if not mp:
    sys.exit(0)

# One alternation, longest URL first, so a URL that is a prefix of another
# mapped URL can never match inside the longer one.
pattern = re.compile("|".join(re.escape(k) for k in sorted(mp, key=len, reverse=True)))

for p in sorted(glob.glob(os.path.join(out_dir, "pages", "*.json"))):
    try:
        with open(p, encoding="utf-8") as fh:
            d = json.load(fh)
    except Exception:
        continue
    changed = False

    md = d.get("markdown") or ""
    new_md = pattern.sub(lambda m: mp[m.group(0)], md)
    if new_md != md:
        d["markdown"] = new_md
        changed = True

    img = d.get("image") or ""
    if img in mp:
        d["image"] = mp[img]
        changed = True

    for it in d.get("images") or []:
        u = it.get("url") or ""
        if u in mp and it.get("local") != mp[u]:
            it["local"] = mp[u]
            changed = True

    if changed:
        with open(p, "w", encoding="utf-8") as f:
            json.dump(d, f, ensure_ascii=False, indent=2)
PY
}
411
+
412
+ # ---------------------------------------------------------------------------
413
+ # scrape_emit_jekyll_page PAGE_JSON OUT_MD SLUG
414
+ # Writes Jekyll-friendly markdown with frontmatter.
415
+ # ---------------------------------------------------------------------------
416
scrape_emit_jekyll_page() {
    # Render one extracted page as Jekyll markdown with YAML front matter.
    #
    #   $1  path to the page JSON (reads title, description, canonical/url,
    #       image, kind, markdown).
    #   $2  path of the markdown file to write (parent directory is created).
    #   $3  slug; used for the fallback title and the permalink, where "--"
    #       is turned back into "/".
    #
    # Returns the Python helper's exit status (non-zero if the JSON cannot
    # be read).
    #
    # Changes from the previous version:
    #   * Front matter values are quoted with json.dumps, so embedded
    #     newlines, tabs and control characters are escaped. Only backslash
    #     and double quote were escaped before, so a multi-line description
    #     broke the front matter.
    #   * A destination without a directory component no longer fails in
    #     os.makedirs("").
    local page_json="$1" out_md="$2" slug="$3"
    SLUG="$slug" python3 - "$page_json" "$out_md" <<'PY'
import json
import os
import re
import sys

slug = os.environ["SLUG"]
src, dst = sys.argv[1], sys.argv[2]
with open(src, "r", encoding="utf-8") as f:
    d = json.load(f)


def y(v):
    """Return v as a double-quoted scalar with JSON-style escapes."""
    return json.dumps("" if v is None else str(v), ensure_ascii=False)


title = d.get("title") or slug.replace("-", " ").replace("/", " ").title()
description = d.get("description") or ""
canonical = d.get("canonical") or d.get("url") or ""
image = d.get("image") or ""
kind = d.get("kind") or "page"

# Permalink: home wins; otherwise turn slug back into a URL-ish path.
if kind == "home" or slug in ("index", "home"):
    permalink = "/"
else:
    permalink = "/" + slug.replace("--", "/")
    if not permalink.endswith("/"):
        permalink = permalink + "/"

# Layout: post-like kinds use the article layout, everything else default.
layout = "article" if kind in ("post", "event") else "default"

frontmatter = [
    "---",
    f"title: {y(title)}",
    f"description: {y(description)}",
    f"permalink: {y(permalink)}",
    f"layout: {y(layout)}",
    f"kind: {y(kind)}",
    f"source_url: {y(canonical)}",
]
if image:
    frontmatter.append(f"preview: {y(image)}")
frontmatter.append("scraped: true")
frontmatter.append("---")
frontmatter.append("")

body = d.get("markdown") or ""
# Strip the leading H1 if it duplicates the title.
m = re.match(r"\s*#\s+(.+?)\s*\n", body)
if m and m.group(1).strip().lower() == str(title).strip().lower():
    body = body[m.end():]

parent = os.path.dirname(dst)
if parent:
    os.makedirs(parent, exist_ok=True)
with open(dst, "w", encoding="utf-8") as f:
    f.write("\n".join(frontmatter))
    f.write(body.lstrip("\n"))
PY
}
474
+
475
+ # ---------------------------------------------------------------------------
476
+ # scrape_emit_site_json …
477
+ # Combines site metadata + nav + page index into a single JSON file.
478
+ # ---------------------------------------------------------------------------
479
scrape_emit_site_json() {
    # Combine site metadata, the navigation captured during the crawl and the
    # page index into one JSON document.
    #
    #   $1  base URL            $5  site image URL/path
    #   $2  site title          $6  file containing the nav JSON array
    #   $3  site description    $7  TSV page index: "<slug><TAB><url>" per line
    #   $4  site language       $8  output file (site.json)
    #
    # For every indexed page, kind/title/description are read from
    # "<dir of $8>/pages/<slug>.json" when that file is readable. Missing or
    # malformed inputs degrade to empty values instead of failing.
    #
    # Changes from the previous version:
    #   * The language was exported to the helper as LANG, which is the POSIX
    #     locale variable, so the Python process ran with a locale such as
    #     "en". All values now travel in SCRAPE_SITE_* variables.
    #   * Blanket "except Exception" handlers are narrowed to I/O and parse
    #     errors; the redundant inner "import json as _j" is gone.
    local base="$1" title="$2" desc="$3" lang="$4" image="$5"
    local nav_file="$6" pages_tsv="$7" out_file="$8"

    SCRAPE_SITE_BASE="$base" SCRAPE_SITE_TITLE="$title" SCRAPE_SITE_DESC="$desc" \
    SCRAPE_SITE_LANG="$lang" SCRAPE_SITE_IMAGE="$image" \
    SCRAPE_SITE_NAV="$nav_file" SCRAPE_SITE_PAGES="$pages_tsv" SCRAPE_SITE_OUT="$out_file" \
    python3 <<'PY'
import json
import os

env = os.environ
out_path = env["SCRAPE_SITE_OUT"]
pages_dir = os.path.join(os.path.dirname(out_path), "pages")

try:
    with open(env["SCRAPE_SITE_NAV"], "r", encoding="utf-8") as f:
        nav = json.load(f) or []
except (OSError, ValueError):
    nav = []


def page_summary(slug):
    """Return (kind, title, description) from the per-page JSON, best effort."""
    try:
        with open(os.path.join(pages_dir, slug + ".json"), encoding="utf-8") as pf:
            pd = json.load(pf)
    except (OSError, ValueError):
        return "page", "", ""
    if not isinstance(pd, dict):
        return "page", "", ""
    return (
        pd.get("kind") or "page",
        pd.get("title") or "",
        pd.get("description") or "",
    )


pages = []
try:
    with open(env["SCRAPE_SITE_PAGES"], "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            parts = line.split("\t", 1)
            if len(parts) != 2:
                continue
            slug, url = parts
            kind, title, description = page_summary(slug)
            pages.append({
                "slug": slug, "url": url, "kind": kind,
                "title": title, "description": description,
            })
except OSError:
    # No readable index: emit whatever was collected so far.
    pass

out = {
    "base_url": env["SCRAPE_SITE_BASE"],
    "title": env["SCRAPE_SITE_TITLE"],
    "description": env["SCRAPE_SITE_DESC"],
    "lang": env["SCRAPE_SITE_LANG"] or "en",
    "image": env["SCRAPE_SITE_IMAGE"],
    "nav": nav,
    "pages": pages,
    "page_count": len(pages),
}
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)
PY
}