jekyll-theme-zer0 1.8.2 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +33 -3
  3. data/README.md +98 -7
  4. data/_data/content_statistics.yml +253 -251
  5. data/_includes/components/nav-export.html +61 -0
  6. data/_includes/components/nav-overview.html +54 -0
  7. data/scripts/bin/install +52 -705
  8. data/scripts/github-setup.sh +0 -0
  9. data/scripts/install/README.md +162 -0
  10. data/scripts/install/ai/client.sh +164 -0
  11. data/scripts/install/ai/diagnose.sh +81 -0
  12. data/scripts/install/ai/prompts/diagnose.system.md +42 -0
  13. data/scripts/install/ai/prompts/spec.schema.json +129 -0
  14. data/scripts/install/ai/prompts/suggest.system.md +43 -0
  15. data/scripts/install/ai/prompts/wizard.system.md +142 -0
  16. data/scripts/install/ai/suggest.sh +57 -0
  17. data/scripts/install/ai/wizard.sh +150 -0
  18. data/scripts/install/apply.sh +156 -0
  19. data/scripts/install/cli.sh +561 -0
  20. data/scripts/install/diff.sh +128 -0
  21. data/scripts/install/doctor.sh +168 -0
  22. data/scripts/install/fs.sh +138 -0
  23. data/scripts/install/log.sh +119 -0
  24. data/scripts/install/plan.sh +299 -0
  25. data/scripts/install/platform.sh +122 -0
  26. data/scripts/install/prompt.sh +124 -0
  27. data/scripts/install/repair.sh +45 -0
  28. data/scripts/install/scrape.sh +535 -0
  29. data/scripts/install/scrape_html.py +764 -0
  30. data/scripts/install/spec.sh +486 -0
  31. data/scripts/install/tasks/_registry.sh +65 -0
  32. data/scripts/install/tasks/agents.sh +60 -0
  33. data/scripts/install/tasks/config.sh +37 -0
  34. data/scripts/install/tasks/data.sh +18 -0
  35. data/scripts/install/tasks/deploy_azure-swa.sh +17 -0
  36. data/scripts/install/tasks/deploy_docker-prod.sh +21 -0
  37. data/scripts/install/tasks/deploy_github-pages.sh +18 -0
  38. data/scripts/install/tasks/devcontainer.sh +26 -0
  39. data/scripts/install/tasks/docker.sh +29 -0
  40. data/scripts/install/tasks/gemfile.sh +42 -0
  41. data/scripts/install/tasks/gitignore.sh +26 -0
  42. data/scripts/install/tasks/marker.sh +46 -0
  43. data/scripts/install/tasks/nav.sh +18 -0
  44. data/scripts/install/tasks/pages.sh +61 -0
  45. data/scripts/install/tasks/readme.sh +27 -0
  46. data/scripts/install/tasks/scrape.sh +348 -0
  47. data/scripts/install/template.sh +138 -0
  48. data/scripts/install/tui.sh +110 -0
  49. data/scripts/install/upgrade.sh +49 -0
  50. data/scripts/lib/install/template.sh +1 -0
  51. metadata +49 -6
@@ -0,0 +1,764 @@
1
+ #!/usr/bin/env python3
2
+ # =============================================================================
3
+ # scripts/install/scrape_html.py — HTML extractor for the zer0-mistakes
4
+ # installer's site-scraping pipeline.
5
+ # =============================================================================
6
+ # Stdlib only (Python >= 3.7). Reads HTML from a file or stdin, extracts:
7
+ #
8
+ # - title, description, language, canonical URL
9
+ # - Open Graph + Twitter card metadata
10
+ # - main content as a heuristic-selected subtree, rendered to Markdown
11
+ # - all internal links (same host as --base-url, normalized + deduped)
12
+ # - top-level navigation links (from <nav>, <header>, role="navigation")
13
+ # - images found in the main content (the page-level og:image, when present, is listed first and serves as preview)
14
+ #
15
+ # Two subcommands:
16
+ # extract --url URL [--base-url BASE] [HTML_FILE]
17
+ # Emit a JSON document describing the page.
18
+ #
19
+ # crawl-links --base-url BASE [--url URL] [HTML_FILE]
20
+ # Emit a newline-delimited list of in-scope links discovered in the page
21
+ # (used by the bash crawler to enqueue further pages).
22
+ #
23
+ # Why a Python helper? Pure bash + sed is too brittle for real-world HTML and
24
+ # pandoc isn't guaranteed to be installed; html.parser is in every supported
25
+ # Python install and gives us deterministic, cross-platform behaviour.
26
+ # =============================================================================
27
+ from __future__ import annotations
28
+
29
+ import argparse
30
+ import json
31
+ import re
32
+ import sys
33
+ from html import unescape
34
+ from html.parser import HTMLParser
35
+ from typing import Dict, List, Optional, Tuple
36
+ from urllib.parse import urldefrag, urljoin, urlparse
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # DOM model — minimal tree we can reason about.
41
+ # ---------------------------------------------------------------------------
42
# Elements the DOM builder never pushes onto its open-element stack and whose
# end tags it ignores.
VOID_TAGS = {
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
}

# Tags that never contribute meaningful content (stripped wholesale).
DROP_TAGS = {
    "script",
    "style",
    "noscript",
    "template",
    "iframe",
    "svg",
}

# Tags that typically wrap page chrome. They act as heuristic anti-signals
# when the main content subtree is selected.
CHROME_TAGS = {
    "header",
    "footer",
    "nav",
    "aside",
}
53
+
54
+
55
class Node:
    """Minimal DOM element: a tag name, its attributes and ordered children.

    ``children`` holds nested ``Node`` objects and plain ``str`` text chunks in
    the order they were added. ``parent`` is set by :meth:`add` for node
    children and stays ``None`` for a root.
    """

    __slots__ = ("tag", "attrs", "children", "parent")

    def __init__(self, tag: str, attrs: Optional[Dict[str, str]] = None):
        self.tag = tag
        self.attrs = attrs or {}
        self.children: List["Node | str"] = []
        self.parent: Optional["Node"] = None

    def add(self, child):
        """Append *child* (a ``Node`` or a text string) to this node.

        Node children get their ``parent`` attribute pointed at this node.
        """
        if isinstance(child, Node):
            child.parent = self
        self.children.append(child)

    def find_all(self, tag: str) -> List["Node"]:
        """Return every node named *tag* in this subtree, in document order.

        The search includes this node itself and is iterative, so deeply
        nested markup cannot exhaust the recursion limit.
        """
        out: List[Node] = []
        stack: List[Node] = [self]
        while stack:
            n = stack.pop()
            if n.tag == tag:
                out.append(n)
            # Push children right-to-left so the left-most child is popped
            # first; pushing them in order would yield matches in reverse
            # sibling order (last link first, last table row first, ...).
            stack.extend(
                c for c in reversed(n.children) if isinstance(c, Node)
            )
        return out

    def text(self) -> str:
        """Return the concatenated text of this subtree.

        Child elements whose tag is in ``DROP_TAGS`` are skipped entirely.
        Whitespace is returned as-is; callers normalise it themselves.
        """
        parts: List[str] = []
        for c in self.children:
            if isinstance(c, str):
                parts.append(c)
            elif c.tag not in DROP_TAGS:
                parts.append(c.text())
        return "".join(parts)
89
+
90
+
91
class DOMBuilder(HTMLParser):
    """Build a tolerant DOM from real-world HTML.

    The parser keeps a stack of currently open elements (``self.stack``, whose
    first entry is the synthetic ``__root__`` node) and attaches every new
    element or text chunk to the element on top of that stack. Text found
    while at least one ``DROP_TAGS`` element is open is discarded.
    """

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.root = Node("__root__")
        self.stack: List[Node] = [self.root]
        # Number of DROP_TAGS elements currently open on the stack.
        self._dropping = 0

    @staticmethod
    def _make_node(tag, attrs):
        """Create a node with lower-cased attribute names and '' for bare attributes."""
        return Node(tag, {k.lower(): (v or "") for k, v in attrs})

    def handle_starttag(self, tag, attrs):
        """Attach a new element and, unless it is void, open it."""
        tag = tag.lower()
        if tag in DROP_TAGS:
            # Track depth so nested drops work, but still push a node so
            # the matching end tag is consumed.
            self._dropping += 1
        node = self._make_node(tag, attrs)
        self.stack[-1].add(node)
        if tag not in VOID_TAGS:
            self.stack.append(node)

    def handle_startendtag(self, tag, attrs):
        """Attach a self-closing element; it is never opened, whatever its name."""
        self.stack[-1].add(self._make_node(tag.lower(), attrs))

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*, plus everything above it."""
        tag = tag.lower()
        if tag in VOID_TAGS:
            return
        # Pop until we find a matching tag (HTML is forgiving). Index 0 is the
        # synthetic root and is never closed.
        for i in range(len(self.stack) - 1, 0, -1):
            if self.stack[i].tag != tag:
                continue
            # Every element from i upwards is closed by this end tag. Any
            # DROP_TAGS element among them (e.g. an unclosed <svg> inside the
            # <div> being closed) must release its share of the drop counter;
            # otherwise all text after it would be discarded.
            closed_drops = sum(
                1 for n in self.stack[i:] if n.tag in DROP_TAGS
            )
            self._dropping = max(0, self._dropping - closed_drops)
            del self.stack[i:]
            return
        # No match: ignore the stray end tag.

    def handle_data(self, data):
        """Attach text to the innermost open element unless inside a dropped subtree."""
        if self._dropping:
            return
        if data:
            self.stack[-1].add(data)
135
+
136
+
137
def parse_html(html: str) -> Node:
    """Parse *html* into a DOM tree and return its synthetic root node.

    Parsing is best-effort: if the parser raises, whatever was built up to
    that point is returned instead of propagating the error.
    """
    builder = DOMBuilder()
    try:
        builder.feed(html)
        # feed() may hold back trailing input (for example text ending in a
        # possibly incomplete character reference, or an unterminated tag)
        # while waiting for more data. close() tells the parser the input is
        # complete so that the remainder is processed too.
        builder.close()
    except Exception:
        # Be forgiving: return whatever was parsed so far.
        pass
    return builder.root
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # Metadata extraction
149
+ # ---------------------------------------------------------------------------
150
def first(seq, default=None):
    """Return the first item produced by iterating *seq*, or *default* if there is none."""
    return next(iter(seq), default)
154
+
155
+
156
def _meta(root: Node) -> Dict[str, str]:
    """Collect ``<meta>`` tags into a ``{key: content}`` mapping.

    The key is the tag's ``name``, ``property`` or ``itemprop`` attribute
    (first non-empty one, lower-cased); the value is the stripped ``content``
    attribute. Tags lacking a key or content are ignored, and a later tag
    overwrites an earlier one with the same key.
    """
    collected: Dict[str, str] = {}
    for tag in root.find_all("meta"):
        attrs = tag.attrs
        name = attrs.get("name") or attrs.get("property") or attrs.get("itemprop")
        content = attrs.get("content")
        if not name or not content:
            continue
        collected[name.lower()] = content.strip()
    return collected
165
+
166
+
167
def extract_metadata(root: Node, url: str) -> Dict[str, str]:
    """Extract page-level metadata from the parsed document.

    Args:
        root: Parsed DOM root.
        url: URL the page was fetched from; used to absolutise the image URL.

    Returns:
        A dict with the string keys ``title``, ``description``, ``canonical``,
        ``lang``, ``image`` and ``site_name``. Missing values are empty
        strings, except ``lang`` which falls back to ``"en"``.
    """
    meta = _meta(root)

    title_node = first(root.find_all("title"))
    # Collapse internal whitespace so that a <title> spread over several
    # source lines yields a single-line title.
    title = " ".join(title_node.text().split()) if title_node else ""
    if not title:
        title = meta.get("og:title") or meta.get("twitter:title") or ""

    description = (
        meta.get("description")
        or meta.get("og:description")
        or meta.get("twitter:description")
        or ""
    ).strip()

    canonical = ""
    for link in root.find_all("link"):
        # rel holds a space-separated list of link types, so
        # rel="canonical nofollow" must match as well as rel="canonical".
        if "canonical" in link.attrs.get("rel", "").lower().split():
            canonical = link.attrs.get("href", "").strip()
            break
    if not canonical:
        canonical = meta.get("og:url", "")

    html_nodes = root.find_all("html")
    lang = html_nodes[0].attrs.get("lang", "").strip() if html_nodes else ""

    image = (meta.get("og:image") or meta.get("twitter:image") or "").strip()
    if image:
        image = urljoin(url, image)

    site_name = (
        meta.get("og:site_name")
        or meta.get("application-name")
        or ""
    ).strip()

    return {
        "title": title,
        "description": description,
        "canonical": canonical,
        "lang": lang or "en",
        "image": image,
        "site_name": site_name,
    }
210
+
211
+
212
+ # ---------------------------------------------------------------------------
213
+ # Link extraction
214
+ # ---------------------------------------------------------------------------
215
# href prefixes that never lead to a crawlable page.
SKIP_LINK_PREFIXES = (
    "mailto:",
    "tel:",
    "javascript:",
    "data:",
    "#",
)

# Path suffixes of downloads, media and assets that are not HTML pages.
SKIP_LINK_EXTS = (
    # archives and documents
    ".pdf", ".zip", ".tar", ".tgz", ".gz", ".rar", ".7z",
    # images
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".ico",
    # audio / video
    ".mp4", ".webm", ".mp3", ".wav", ".ogg",
    # assets and feeds
    ".css", ".js", ".xml", ".json", ".rss", ".atom",
    # office files
    ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
)

# Values of the ``format`` query parameter that mark the URL as an export/feed
# variant of a page we likely already crawled (e.g. ?format=ical,
# ?format=json-pretty).
SKIP_QUERY_FORMATS = {
    "ical", "json", "json-pretty", "rss", "atom", "feed", "xml", "pdf", "csv",
}

# Nav labels we never want in the rendered navbar (compared lower-cased).
NAV_LABEL_BLOCKLIST = {
    "back",
    "cart",
    "checkout",
    "login",
    "log in",
    "sign in",
    "signup",
    "sign up",
    "menu",
    "toggle navigation",
    "skip to main content",
    "skip to content",
    "search",
    "close",
    "open menu",
}

# Nav labels starting with one of these prefixes are dropped as well.
NAV_LABEL_PREFIX_BLOCKLIST = ("folder:",)
233
+
234
+
235
def normalize_link(href: str, base_url: str) -> Optional[str]:
    """Resolve *href* against *base_url* and return a canonical crawl URL.

    Returns ``None`` when the link should not be followed: empty or malformed
    hrefs, non-page schemes/prefixes (``SKIP_LINK_PREFIXES``), anything that is
    not http(s), asset/download extensions (``SKIP_LINK_EXTS``) and
    export-format variants (``?format=`` in ``SKIP_QUERY_FORMATS``).

    The returned URL has its fragment removed, its host lower-cased, the
    default port stripped and an empty path replaced by ``/``.
    """
    if not href:
        return None
    h = href.strip()
    if not h or h.lower().startswith(SKIP_LINK_PREFIXES):
        return None
    try:
        full, _ = urldefrag(urljoin(base_url, h))
        if not full:
            return None
        parsed = urlparse(full)
    except ValueError:
        # urllib rejects some malformed URLs (e.g. "http://[" raises "Invalid
        # IPv6 URL"). One broken href in scraped HTML must not abort the
        # whole extraction, so treat it as an unusable link.
        return None
    if parsed.scheme not in ("http", "https"):
        return None
    if parsed.path.lower().endswith(SKIP_LINK_EXTS):
        return None
    # Drop export-format variants (e.g. ?format=ical, ?format=json-pretty).
    if parsed.query:
        from urllib.parse import parse_qs
        try:
            formats = parse_qs(parsed.query, keep_blank_values=False).get("format", [])
        except ValueError:
            # An unparsable query string is kept as-is rather than rejected.
            formats = []
        if any(v.lower() in SKIP_QUERY_FORMATS for v in formats):
            return None
    # Host names are case-insensitive: lower-case the host[:port] part (but
    # not any userinfo before "@") so the same page is not enqueued twice.
    userinfo, at, hostport = parsed.netloc.rpartition("@")
    netloc = userinfo + at + hostport.lower()
    # Strip default ports for stable dedup.
    if netloc.endswith(":80") and parsed.scheme == "http":
        netloc = netloc[:-3]
    if netloc.endswith(":443") and parsed.scheme == "https":
        netloc = netloc[:-4]
    path = parsed.path or "/"
    query = ("?" + parsed.query) if parsed.query else ""
    return f"{parsed.scheme}://{netloc}{path}{query}"
269
+
270
+
271
def same_host(url: str, base_url: str) -> bool:
    """Return True when both URLs share a non-empty host, ignoring case and a leading ``www.``."""

    def _host(u: str) -> str:
        host = urlparse(u).netloc.lower()
        # Remove the literal "www." prefix only. str.lstrip("www.") would
        # strip any leading run of "w" and "." characters, turning
        # "wiki.example.org" into "iki.example.org".
        return host[4:] if host.startswith("www.") else host

    a = _host(url)
    return a != "" and a == _host(base_url)
275
+
276
+
277
def extract_links(root: Node, url: str, base_url: str) -> List[Tuple[str, str]]:
    """Return the unique in-scope links of the document as ``(url, label)`` pairs.

    Each ``<a href>`` is normalised relative to *url*; links that do not
    normalise or whose host differs from *base_url* are skipped. The label is
    the anchor text with whitespace collapsed, cut to 120 characters. Pairs
    keep the order in which ``root.find_all("a")`` yields the anchors, and
    only the first occurrence of a URL is kept.
    """
    found: List[Tuple[str, str]] = []
    visited = set()
    for anchor in root.find_all("a"):
        target = normalize_link(anchor.attrs.get("href", ""), url)
        if not target or not same_host(target, base_url) or target in visited:
            continue
        visited.add(target)
        caption = " ".join(anchor.text().split())
        found.append((target, caption[:120]))
    return found
291
+
292
+
293
def extract_nav_links(root: Node, url: str, base_url: str) -> List[Tuple[str, str]]:
    """Return unique in-scope links found inside navigation containers.

    Containers are searched in this order: every ``<nav>``, every
    ``<header>``, then every element with ``role="navigation"``. Links are
    normalised relative to *url* and must share the host of *base_url*.
    Anchors with an empty label or a label longer than 80 characters are
    skipped without being marked as seen, so a later anchor to the same URL
    with a usable label is still accepted.
    """
    containers: List[Node] = list(root.find_all("nav"))
    containers += root.find_all("header")

    # role="navigation" anywhere in the tree.
    pending: List[Node] = [root]
    while pending:
        current = pending.pop()
        if current.attrs.get("role", "").lower() == "navigation":
            containers.append(current)
        pending.extend(ch for ch in current.children if isinstance(ch, Node))

    collected: List[Tuple[str, str]] = []
    visited = set()
    for container in containers:
        for anchor in container.find_all("a"):
            target = normalize_link(anchor.attrs.get("href", ""), url)
            if not target or not same_host(target, base_url):
                continue
            if target in visited:
                continue
            caption = " ".join(anchor.text().split())
            if caption and len(caption) <= 80:
                visited.add(target)
                collected.append((target, caption))
    return collected
324
+
325
+
326
def filter_nav_links(nav: List[Tuple[str, str]], base_url: str) -> List[Tuple[str, str]]:
    """Drop blocked labels, duplicates by path, and the home link itself.

    Labels are compared stripped and lower-cased; paths are compared without a
    trailing slash. At most the first 10 surviving links are returned.
    """

    def _path_key(u: str) -> str:
        return (urlparse(u).path or "/").rstrip("/") or "/"

    home_path = _path_key(base_url)
    home_labels = {"home", "back", home_path}

    kept: List[Tuple[str, str]] = []
    used_paths = set()
    for link_url, label in nav:
        key = label.strip().lower()
        if not key or key in NAV_LABEL_BLOCKLIST:
            continue
        if key.startswith(NAV_LABEL_PREFIX_BLOCKLIST):
            continue
        path = _path_key(link_url)
        if path in used_paths:
            continue
        if path == home_path and key in home_labels:
            continue
        used_paths.add(path)
        kept.append((link_url, label))
    return kept[:10]  # cap nav size
345
+
346
+
347
def extract_images(root: Node, url: str) -> List[Dict[str, str]]:
    """Collect every ``<img src=...>`` as ``{"url": absolute URL, "alt": text}``.

    Images without a ``src`` and inline ``data:`` images are skipped, as are
    repeated URLs. Entries keep the order in which ``root.find_all("img")``
    yields them.
    """
    images: List[Dict[str, str]] = []
    known = set()
    for tag in root.find_all("img"):
        source = tag.attrs.get("src", "").strip()
        if not source or source.startswith("data:"):
            continue
        try:
            absolute = urljoin(url, source)
        except Exception:
            continue
        if absolute not in known:
            known.add(absolute)
            alt_text = (tag.attrs.get("alt") or "").strip()
            images.append({"url": absolute, "alt": alt_text})
    return images
366
+
367
+
368
# URL path patterns mapped to a page "kind". The first matching entry wins,
# so the order of this list is significant.
_KIND_PATTERNS = [
    (re.compile(r"^/?(events?|calendar)(/|$)", re.I), "event"),
    (re.compile(r"^/?(blog|news|posts?|articles?)(/|$)", re.I), "post"),
    (re.compile(r"^/?(about|who-we-are|mission|team|associates|people)(/|$)", re.I), "about"),
    (re.compile(r"^/?(contact|reach-us|get-in-touch)(/|$)", re.I), "contact"),
    (re.compile(r"^/?(services|products?|programs?|activities|offerings)(/|$)", re.I), "service"),
    (re.compile(r"^/?(faq|help|support)(/|$)", re.I), "faq"),
]


def classify_page(url: str, base_url: str, title: str = "") -> str:
    """Return a kind for the URL: home/event/post/about/contact/service/faq/page.

    The decision is based on the URL path only; *title* is accepted but not
    used. A path equal to the base URL's path, an empty path, ``index`` and
    ``home`` all classify as ``"home"``; anything unmatched is ``"page"``.
    """
    page_path = (urlparse(url).path or "/").strip("/")
    site_path = (urlparse(base_url).path or "/").strip("/")
    if page_path in (site_path, "", "index", "home"):
        return "home"
    rooted = "/" + page_path
    matches = (kind for pattern, kind in _KIND_PATTERNS if pattern.match(rooted))
    return next(matches, "page")
392
+
393
+
394
+
395
+ # ---------------------------------------------------------------------------
396
+ # Main-content selection
397
+ # ---------------------------------------------------------------------------
398
def _text_length(node: Node) -> int:
    """Return the length of the node's text after collapsing whitespace runs.

    Nodes whose own tag is in ``DROP_TAGS`` always score 0.
    """
    return 0 if node.tag in DROP_TAGS else len(" ".join(node.text().split()))
402
+
403
+
404
def select_main(root: Node) -> Node:
    """Pick the subtree most likely to contain the page's primary content.

    Selection order:

    1. the ``<main>`` element with the most text;
    2. an element with ``role="main"``;
    3. the ``<article>`` element with the most text;
    4. otherwise, starting from ``<body>`` (or *root* when there is none),
       descend into container elements that hold more than 90% of the current
       candidate's text, never entering chrome or dropped elements.
    """
    # Explicit markers, in order of trust.
    mains = root.find_all("main")
    if mains:
        return max(mains, key=_text_length)

    # role="main"
    stack: List[Node] = [root]
    while stack:
        n = stack.pop()
        if n.attrs.get("role", "").lower() == "main":
            return n
        stack.extend(c for c in n.children if isinstance(c, Node))

    articles = root.find_all("article")
    if articles:
        return max(articles, key=_text_length)

    # Heuristic: follow the chain of containers that dominate the text.
    body = first(root.find_all("body")) or root
    best = body
    best_score = _text_length(body)
    stack = [body]
    while stack:
        n = stack.pop()
        for c in n.children:
            if not isinstance(c, Node):
                continue
            if c.tag in CHROME_TAGS or c.tag in DROP_TAGS:
                continue
            score = _text_length(c)
            # A subtree at or below the threshold cannot contain a dominant
            # container either (a descendant never has more text than its
            # ancestor), so prune it. This also prevents a small side branch
            # from winning later, once best_score has shrunk.
            if score <= best_score * 0.9:
                continue
            # Favor deeper rich containers (sections/divs with most text).
            # The threshold alone decides: additionally requiring
            # score > best_score could never hold for a descendant, which
            # would make this heuristic always return <body>.
            if c.tag in ("section", "div", "article", "main"):
                best = c
                best_score = score
            stack.append(c)
    return best
452
+
453
+
454
+ # ---------------------------------------------------------------------------
455
+ # Markdown rendering
456
+ # ---------------------------------------------------------------------------
457
# Elements rendered as part of the surrounding line of text rather than as a
# block of their own.
INLINE_TAGS = {
    "a",
    "b", "strong",
    "i", "em",
    "u",
    "code", "kbd", "var", "samp",
    "span", "small", "sub", "sup", "abbr", "mark",
}
459
+
460
+
461
def _normalize_ws(s: str) -> str:
    """Collapse runs of spaces, tabs, CR, LF and form feeds to one space, then strip both ends."""
    collapsed = re.sub(r"[ \t\r\n\f]+", " ", s)
    return collapsed.strip()
463
+
464
+
465
def _md_escape(s: str) -> str:
    """Backslash-escape backslashes and backticks in *s*.

    Deliberately conservative: no other Markdown metacharacter is touched.
    """
    # One pass over the input: a backslash added in front of a backtick is
    # never itself escaped again.
    return s.translate({ord("\\"): "\\\\", ord("`"): "\\`"})
469
+
470
+
471
def _render_inline(node: Node, base_url: str) -> str:
    """Render the children of *node* as one run of inline Markdown.

    Handles links, bold, italics, inline code, images and ``<br>`` (emitted as
    a newline). Any other element contributes its rendered inline content
    unchanged. Subtrees in ``DROP_TAGS`` are skipped. Relative link and image
    targets are resolved against *base_url*. Whitespace is not normalised
    here; callers do that.
    """
    parts: List[str] = []
    for c in node.children:
        if isinstance(c, str):
            parts.append(c)
            continue
        if c.tag in DROP_TAGS:
            continue
        if c.tag == "br":
            parts.append("\n")
            continue
        inner = _render_inline(c, base_url)
        if c.tag == "a":
            href = c.attrs.get("href", "").strip()
            full = urljoin(base_url, href) if href else ""
            label = _normalize_ws(inner) or full
            if full:
                parts.append(f"[{label}]({full})")
            else:
                parts.append(label)
        elif c.tag in ("strong", "b"):
            t = _normalize_ws(inner)
            parts.append(f"**{t}**" if t else "")
        elif c.tag in ("em", "i"):
            t = _normalize_ws(inner)
            parts.append(f"*{t}*" if t else "")
        elif c.tag == "code":
            t = inner.strip()
            parts.append(f"`{t}`" if t else "")
        elif c.tag == "img":
            # Test the raw attribute, not the joined URL: urljoin(base, "")
            # returns the base URL, so an <img> without src would otherwise
            # be rendered as an image pointing at the page itself.
            raw_src = c.attrs.get("src", "").strip()
            if raw_src:
                alt = c.attrs.get("alt", "").strip()
                parts.append(f"![{alt}]({urljoin(base_url, raw_src)})")
        else:
            parts.append(inner)
    return "".join(parts)
508
+
509
+
510
def _li_inline(li: Node, base_url: str) -> str:
    """Render the inline text of one ``<li>``, leaving out its directly nested lists."""
    own = Node(li.tag, li.attrs)
    # Assign children directly instead of calling add(): add() would re-parent
    # the original child nodes onto this throw-away copy.
    own.children = [
        ch for ch in li.children
        if not (isinstance(ch, Node) and ch.tag in ("ul", "ol"))
    ]
    return _normalize_ws(_render_inline(own, base_url))


def _render_block(node: Node, base_url: str, lines: List[str], list_depth: int = 0):
    """Append the Markdown rendering of *node* to *lines*.

    Args:
        node: Element to render.
        base_url: URL used to resolve relative link and image targets.
        lines: Output list, mutated in place. Blocks are surrounded by empty
            strings; redundant blank lines are collapsed later by the caller.
        list_depth: Nesting level of the enclosing list (0 at top level).
    """
    tag = node.tag
    if tag in DROP_TAGS:
        return
    if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
        level = int(tag[1])
        text = _normalize_ws(_render_inline(node, base_url))
        if text:
            lines.append("")
            lines.append("#" * level + " " + text)
            lines.append("")
        return
    if tag == "p":
        text = _normalize_ws(_render_inline(node, base_url))
        if text:
            lines.append("")
            lines.append(text)
            lines.append("")
        return
    if tag == "br":
        return
    if tag in ("ul", "ol"):
        ordered = tag == "ol"
        # Four spaces per level is enough indentation for a nested list to
        # attach to its parent item for both "-" and "1." markers; with a
        # single space the nested items are read as siblings.
        indent = "    " * list_depth
        idx = 1
        for c in node.children:
            if isinstance(c, Node) and c.tag == "li":
                bullet = f"{idx}." if ordered else "-"
                # The item's own text excludes its nested lists: they are
                # rendered as indented sub-lists just below, and including
                # them here would print their text twice.
                inline = _li_inline(c, base_url)
                if inline:
                    lines.append(f"{indent}{bullet} {inline}")
                # Walk nested lists.
                for cc in c.children:
                    if isinstance(cc, Node) and cc.tag in ("ul", "ol"):
                        _render_block(cc, base_url, lines, list_depth + 1)
                idx += 1
        lines.append("")
        return
    if tag == "blockquote":
        inner: List[str] = []
        for c in node.children:
            if isinstance(c, Node):
                _render_block(c, base_url, inner, list_depth)
            elif isinstance(c, str):
                t = _normalize_ws(c)
                if t:
                    inner.append(t)
        if inner:
            lines.append("")
            for ln in inner:
                lines.append("> " + ln if ln else ">")
            lines.append("")
        return
    if tag == "pre":
        code = node.text()
        lines.append("")
        lines.append("```")
        lines.extend(code.rstrip().splitlines())
        lines.append("```")
        lines.append("")
        return
    if tag == "hr":
        lines.append("")
        lines.append("---")
        lines.append("")
        return
    if tag in ("table", "thead", "tbody", "tr", "td", "th"):
        # Tables: render as a simplified GFM table when we see <table>.
        # Table parts met outside a <table> are ignored.
        if tag == "table":
            _render_table(node, base_url, lines)
        return
    if tag == "img":
        # Test the raw attribute, not the joined URL: urljoin(base, "")
        # returns the base URL, so an <img> without src would otherwise be
        # rendered as an image pointing at the page itself.
        raw_src = node.attrs.get("src", "").strip()
        if raw_src:
            alt = node.attrs.get("alt", "").strip()
            lines.append("")
            lines.append(f"![{alt}]({urljoin(base_url, raw_src)})")
            lines.append("")
        return
    if tag == "figure":
        for c in node.children:
            if isinstance(c, Node):
                _render_block(c, base_url, lines, list_depth)
        return
    # Container: recurse.
    if tag in INLINE_TAGS or tag == "__root__" or tag in (
        "div", "section", "article", "main", "body", "html",
        "figcaption", "details", "summary",
    ):
        # If this node is purely inline, emit a paragraph.
        only_inline = all(
            isinstance(c, str) or c.tag in INLINE_TAGS or c.tag == "br"
            for c in node.children
        )
        if only_inline and tag != "__root__":
            text = _normalize_ws(_render_inline(node, base_url))
            if text:
                lines.append("")
                lines.append(text)
                lines.append("")
            return
        for c in node.children:
            if isinstance(c, Node):
                _render_block(c, base_url, lines, list_depth)
            elif isinstance(c, str):
                t = _normalize_ws(c)
                if t:
                    lines.append("")
                    lines.append(t)
                    lines.append("")
        return
    # Unknown tag: recurse blindly (loose text directly inside it is not
    # emitted; only element children are rendered).
    for c in node.children:
        if isinstance(c, Node):
            _render_block(c, base_url, lines, list_depth)
630
+
631
+
632
def _render_table(table: Node, base_url: str, lines: List[str]):
    """Append *table* to *lines* as a simplified GFM pipe table.

    Rows are taken in the order ``table.find_all("tr")`` returns them. The
    first row containing a ``<th>`` becomes the header; if there is none, the
    first row is promoted to header. Body rows are padded or truncated to the
    header width. Nothing is emitted when the table has no cells.
    """
    rows: List[List[str]] = []
    header: Optional[List[str]] = None
    for tr in table.find_all("tr"):
        cells: List[str] = []
        is_header_row = False
        for c in tr.children:
            if not isinstance(c, Node):
                continue
            if c.tag in ("td", "th"):
                if c.tag == "th":
                    is_header_row = True
                text = _normalize_ws(_render_inline(c, base_url))
                # A literal "|" would be read as a column separator and shift
                # every following cell, so escape it.
                cells.append(text.replace("|", "\\|"))
        if not cells:
            continue
        if header is None and is_header_row:
            header = cells
        else:
            rows.append(cells)
    if not header and rows:
        header = rows.pop(0)
    if not header:
        return
    width = len(header)
    lines.append("")
    lines.append("| " + " | ".join(header) + " |")
    lines.append("| " + " | ".join(["---"] * width) + " |")
    for r in rows:
        # Pad/truncate to header width.
        if len(r) < width:
            r = r + [""] * (width - len(r))
        else:
            r = r[:width]
        lines.append("| " + " | ".join(r) + " |")
    lines.append("")
666
+
667
+
668
def to_markdown(node: Node, base_url: str) -> str:
    """Render *node* to a Markdown document ending in exactly one newline.

    Every run of consecutive blank lines is reduced to a single blank line,
    trailing whitespace is removed from each line, and leading/trailing
    whitespace of the whole document is stripped.
    """
    rendered: List[str] = []
    _render_block(node, base_url, rendered)

    compact: List[str] = []
    previous_blank = False
    for line in rendered:
        if line == "":
            if not previous_blank:
                compact.append("")
            previous_blank = True
        else:
            compact.append(line.rstrip())
            previous_blank = False
    return "\n".join(compact).strip() + "\n"
683
+
684
+
685
+ # ---------------------------------------------------------------------------
686
+ # Entry points
687
+ # ---------------------------------------------------------------------------
688
def cmd_extract(args) -> int:
    """Handle the ``extract`` subcommand: parse one page and print it as JSON.

    Reads ``args.html_file``, uses ``args.url`` as the page URL and
    ``args.base_url`` (falling back to the page URL) as the site scope, then
    writes a single indented JSON document followed by a newline to stdout.
    Always returns 0.
    """
    page_url = args.url
    site_base = args.base_url or page_url
    root = parse_html(_read_input(args.html_file))

    meta = extract_metadata(root, page_url)
    content = select_main(root)
    markdown = to_markdown(content, page_url)
    links = extract_links(root, page_url, site_base)
    nav = filter_nav_links(extract_nav_links(root, page_url, site_base), site_base)
    images = extract_images(content, page_url)

    # Include page-level og:image as the first asset if it isn't already.
    og_image = meta.get("image")
    if og_image and all(img["url"] != og_image for img in images):
        images.insert(0, {"url": og_image, "alt": meta.get("title") or ""})

    document = {
        "url": page_url,
        "base_url": site_base,
        "kind": classify_page(page_url, site_base, meta.get("title", "")),
        "title": meta["title"],
        "description": meta["description"],
        "canonical": meta["canonical"],
        "lang": meta["lang"],
        "image": meta["image"],
        "site_name": meta["site_name"],
        "word_count": len(markdown.split()),
        "markdown": markdown,
        "links": [{"url": link_url, "label": text} for link_url, text in links],
        "nav": [{"url": link_url, "label": text} for link_url, text in nav],
        "images": images,
    }
    json.dump(document, sys.stdout, ensure_ascii=False, indent=2)
    sys.stdout.write("\n")
    return 0
724
+
725
+
726
def cmd_crawl_links(args) -> int:
    """Handle the ``crawl-links`` subcommand: print in-scope links, one per line.

    Links are resolved against ``args.url`` when given, otherwise against
    ``args.base_url``, which also defines the host scope. Always returns 0.
    """
    root = parse_html(_read_input(args.html_file))
    scope = args.base_url
    page_url = args.url or scope
    for link_url, _label in extract_links(root, page_url, scope):
        sys.stdout.write(f"{link_url}\n")
    return 0
734
+
735
+
736
def _read_input(path: Optional[str]) -> str:
    """Return the HTML text from *path*, or from stdin when *path* is empty or ``"-"``.

    Both sources are decoded as UTF-8 with undecodable bytes replaced by
    U+FFFD, so malformed input never raises a decoding error.
    """
    if not path or path == "-":
        # Decode stdin the same way as files. By default stdin uses the
        # locale encoding, which can raise on piped UTF-8 pages or decode
        # them incorrectly. reconfigure() exists on the standard text-mode
        # stdin; replacement streams without it are read as they are.
        reconfigure = getattr(sys.stdin, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8", errors="replace")
        return sys.stdin.read()
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()
741
+
742
+
743
def main(argv: List[str]) -> int:
    """Parse *argv*, dispatch to the selected subcommand and return its exit code.

    Two subcommands are registered, ``extract`` and ``crawl-links``; a
    subcommand is mandatory. ``HTML_FILE`` defaults to ``-`` for both.
    """
    parser = argparse.ArgumentParser(description="zer0-mistakes site scraper helper")
    commands = parser.add_subparsers(dest="command", required=True)

    extract = commands.add_parser("extract", help="Parse one HTML page → JSON")
    extract.add_argument("--url", required=True, help="URL the HTML was fetched from")
    extract.add_argument("--base-url", default="", help="Site base URL for same-host filter")
    extract.add_argument("html_file", nargs="?", default="-")
    extract.set_defaults(func=cmd_extract)

    crawl = commands.add_parser("crawl-links", help="List in-scope links in HTML")
    crawl.add_argument("--base-url", required=True)
    crawl.add_argument("--url", default="")
    crawl.add_argument("html_file", nargs="?", default="-")
    crawl.set_defaults(func=cmd_crawl_links)

    parsed = parser.parse_args(argv)
    handler = parsed.func
    return handler(parsed)
761
+
762
+
763
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)