jekyll-theme-zer0 1.8.2 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +33 -3
- data/README.md +98 -7
- data/_data/content_statistics.yml +253 -251
- data/_includes/components/nav-export.html +61 -0
- data/_includes/components/nav-overview.html +54 -0
- data/scripts/bin/install +52 -705
- data/scripts/github-setup.sh +0 -0
- data/scripts/install/README.md +162 -0
- data/scripts/install/ai/client.sh +164 -0
- data/scripts/install/ai/diagnose.sh +81 -0
- data/scripts/install/ai/prompts/diagnose.system.md +42 -0
- data/scripts/install/ai/prompts/spec.schema.json +129 -0
- data/scripts/install/ai/prompts/suggest.system.md +43 -0
- data/scripts/install/ai/prompts/wizard.system.md +142 -0
- data/scripts/install/ai/suggest.sh +57 -0
- data/scripts/install/ai/wizard.sh +150 -0
- data/scripts/install/apply.sh +156 -0
- data/scripts/install/cli.sh +561 -0
- data/scripts/install/diff.sh +128 -0
- data/scripts/install/doctor.sh +168 -0
- data/scripts/install/fs.sh +138 -0
- data/scripts/install/log.sh +119 -0
- data/scripts/install/plan.sh +299 -0
- data/scripts/install/platform.sh +122 -0
- data/scripts/install/prompt.sh +124 -0
- data/scripts/install/repair.sh +45 -0
- data/scripts/install/scrape.sh +535 -0
- data/scripts/install/scrape_html.py +764 -0
- data/scripts/install/spec.sh +486 -0
- data/scripts/install/tasks/_registry.sh +65 -0
- data/scripts/install/tasks/agents.sh +60 -0
- data/scripts/install/tasks/config.sh +37 -0
- data/scripts/install/tasks/data.sh +18 -0
- data/scripts/install/tasks/deploy_azure-swa.sh +17 -0
- data/scripts/install/tasks/deploy_docker-prod.sh +21 -0
- data/scripts/install/tasks/deploy_github-pages.sh +18 -0
- data/scripts/install/tasks/devcontainer.sh +26 -0
- data/scripts/install/tasks/docker.sh +29 -0
- data/scripts/install/tasks/gemfile.sh +42 -0
- data/scripts/install/tasks/gitignore.sh +26 -0
- data/scripts/install/tasks/marker.sh +46 -0
- data/scripts/install/tasks/nav.sh +18 -0
- data/scripts/install/tasks/pages.sh +61 -0
- data/scripts/install/tasks/readme.sh +27 -0
- data/scripts/install/tasks/scrape.sh +348 -0
- data/scripts/install/template.sh +138 -0
- data/scripts/install/tui.sh +110 -0
- data/scripts/install/upgrade.sh +49 -0
- data/scripts/lib/install/template.sh +1 -0
- metadata +49 -6
|
@@ -0,0 +1,764 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# =============================================================================
|
|
3
|
+
# scripts/install/scrape_html.py — HTML extractor for the zer0-mistakes
|
|
4
|
+
# installer's site-scraping pipeline.
|
|
5
|
+
# =============================================================================
|
|
6
|
+
# Stdlib only (Python >= 3.7; needs `from __future__ import annotations`). Reads HTML from a file or stdin, extracts:
|
|
7
|
+
#
|
|
8
|
+
# - title, description, language, canonical URL
|
|
9
|
+
# - Open Graph + Twitter card metadata
|
|
10
|
+
# - main content as a heuristic-selected subtree, rendered to Markdown
|
|
11
|
+
# - all internal links (same host as --base-url, normalized + deduped)
|
|
12
|
+
# - top-level navigation links (from <nav>, <header>, role="navigation")
|
|
13
|
+
# - first image (used as preview)
|
|
14
|
+
#
|
|
15
|
+
# Two subcommands:
|
|
16
|
+
# extract --url URL [--base-url BASE] [HTML_FILE]
|
|
17
|
+
# Emit a JSON document describing the page.
|
|
18
|
+
#
|
|
19
|
+
# crawl-links --base-url BASE [--url URL] [HTML_FILE]
|
|
20
|
+
# Emit a newline-delimited list of in-scope links discovered in the page
|
|
21
|
+
# (used by the bash crawler to enqueue further pages).
|
|
22
|
+
#
|
|
23
|
+
# Why a Python helper? Pure bash + sed is too brittle for real-world HTML and
|
|
24
|
+
# pandoc isn't guaranteed to be installed; html.parser is in every supported
|
|
25
|
+
# Python install and gives us deterministic, cross-platform behaviour.
|
|
26
|
+
# =============================================================================
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import argparse
|
|
30
|
+
import json
|
|
31
|
+
import re
|
|
32
|
+
import sys
|
|
33
|
+
from html import unescape
|
|
34
|
+
from html.parser import HTMLParser
|
|
35
|
+
from typing import Dict, List, Optional, Tuple
|
|
36
|
+
from urllib.parse import urldefrag, urljoin, urlparse
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# DOM model — minimal tree we can reason about.
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
VOID_TAGS = {
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
}

# Tags that never contribute meaningful content (stripped wholesale).
DROP_TAGS = {"script", "style", "noscript", "template", "iframe", "svg"}

# Tags that typically wrap chrome (header/footer/nav/aside). We use these
# as heuristic anti-signals when selecting main content.
CHROME_TAGS = {"header", "footer", "nav", "aside"}


class Node:
    """One element of the minimal DOM tree.

    ``children`` holds child ``Node`` objects and raw ``str`` text runs in
    the order they were added; ``parent`` is set by :meth:`add`.
    """

    __slots__ = ("tag", "attrs", "children", "parent")

    def __init__(self, tag: str, attrs: Optional[Dict[str, str]] = None):
        self.tag = tag
        self.attrs = attrs or {}
        self.children: List["Node | str"] = []
        self.parent: Optional["Node"] = None

    def add(self, child):
        """Append *child* (a ``Node`` or a text string) to this node."""
        if isinstance(child, Node):
            child.parent = self
        self.children.append(child)

    def find_all(self, tag: str) -> List["Node"]:
        """Return every node named *tag* in this subtree, in document order.

        The search includes ``self``. Traversal is an iterative pre-order
        walk, so results appear in the same order as in the parsed markup.
        """
        out: List[Node] = []
        stack: List[Node] = [self]
        while stack:
            n = stack.pop()
            if n.tag == tag:
                out.append(n)
            # Push children right-to-left so the leftmost child is popped
            # (and therefore reported) first.
            stack.extend(
                c for c in reversed(n.children) if isinstance(c, Node)
            )
        return out

    def text(self) -> str:
        """Return the concatenated text of this subtree, skipping DROP_TAGS."""
        parts: List[str] = []
        for c in self.children:
            if isinstance(c, str):
                parts.append(c)
            elif c.tag not in DROP_TAGS:
                parts.append(c.text())
        return "".join(parts)


class DOMBuilder(HTMLParser):
    """Build a tolerant DOM from real-world HTML."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.root = Node("__root__")
        self.stack: List[Node] = [self.root]
        self._dropping = 0  # number of DROP_TAGS elements currently open

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag in DROP_TAGS:
            # Track depth so nested drops work, but still push a node so
            # the matching end tag is consumed.
            self._dropping += 1
        node = Node(tag, {k.lower(): (v or "") for k, v in attrs})
        self.stack[-1].add(node)
        if tag not in VOID_TAGS:
            self.stack.append(node)

    def handle_startendtag(self, tag, attrs):
        # Self-closing form — treat as void regardless of tag name.
        tag = tag.lower()
        node = Node(tag, {k.lower(): (v or "") for k, v in attrs})
        self.stack[-1].add(node)

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag in VOID_TAGS:
            return
        # Pop until we find a matching tag (HTML is forgiving).
        for i in range(len(self.stack) - 1, 0, -1):
            if self.stack[i].tag == tag:
                closed = self.stack[i:]
                del self.stack[i:]
                # Every DROP_TAGS element that was just closed — including
                # unclosed ones popped implicitly by an ancestor's end tag —
                # must release its hold on the text filter, otherwise all
                # following text would be discarded.
                released = sum(1 for n in closed if n.tag in DROP_TAGS)
                self._dropping = max(0, self._dropping - released)
                return
        # No match — ignore the stray end tag.

    def handle_data(self, data):
        if self._dropping:
            return
        if data:
            self.stack[-1].add(data)


def parse_html(html: str) -> Node:
    """Parse *html* and return the synthetic ``__root__`` node of the tree."""
    b = DOMBuilder()
    try:
        b.feed(html)
        # Flush text the parser is still buffering (e.g. a trailing "AT&T"
        # that looks like an unfinished character reference).
        b.close()
    except Exception:
        # Deliberately best-effort: return whatever was parsed so far.
        pass
    return b.root
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
# Metadata extraction
|
|
149
|
+
# ---------------------------------------------------------------------------
|
|
150
|
+
def first(seq, default=None):
    """Return the first item produced by iterating *seq*, else *default*."""
    return next(iter(seq), default)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _meta(root: Node) -> Dict[str, str]:
    """Map lower-cased ``<meta>`` keys to their stripped ``content`` values.

    The key is taken from ``name``, then ``property``, then ``itemprop``.
    Tags lacking a key or a non-empty ``content`` are skipped; when a key
    repeats, the tag visited later by ``find_all`` overwrites the earlier one.
    """
    collected: Dict[str, str] = {}
    for tag in root.find_all("meta"):
        attributes = tag.attrs
        name = (
            attributes.get("name")
            or attributes.get("property")
            or attributes.get("itemprop")
        )
        content = attributes.get("content")
        if not name or not content:
            continue
        collected[name.lower()] = content.strip()
    return collected
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def extract_metadata(root: Node, url: str) -> Dict[str, str]:
    """Collect page-level metadata from the parsed document.

    Returns a dict with the keys ``title``, ``description``, ``canonical``,
    ``lang`` (defaults to ``"en"``), ``image`` (resolved against *url*) and
    ``site_name``. Missing values are empty strings.
    """
    meta = _meta(root)

    def _first_of(*keys: str) -> str:
        # First truthy meta value among *keys*, or "".
        for key in keys:
            value = meta.get(key)
            if value:
                return value
        return ""

    # <title> wins; Open Graph / Twitter titles are fallbacks.
    title_node = first(root.find_all("title"))
    title = title_node.text().strip() if title_node else ""
    if not title:
        title = _first_of("og:title", "twitter:title")

    description = _first_of(
        "description", "og:description", "twitter:description"
    ).strip()

    # First <link rel="canonical"> encountered, else og:url.
    canonical = next(
        (
            link.attrs.get("href", "").strip()
            for link in root.find_all("link")
            if link.attrs.get("rel", "").lower() == "canonical"
        ),
        "",
    )
    if not canonical:
        canonical = meta.get("og:url", "")

    html_nodes = root.find_all("html")
    lang = ""
    if html_nodes:
        lang = html_nodes[0].attrs.get("lang", "").strip()

    image = _first_of("og:image", "twitter:image").strip()
    if image:
        image = urljoin(url, image)

    site_name = _first_of("og:site_name", "application-name").strip()

    return {
        "title": title,
        "description": description,
        "canonical": canonical,
        "lang": lang or "en",
        "image": image,
        "site_name": site_name,
    }
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
# Link extraction
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
SKIP_LINK_PREFIXES = ("mailto:", "tel:", "javascript:", "data:", "#")
SKIP_LINK_EXTS = (
    ".pdf", ".zip", ".tar", ".tgz", ".gz", ".rar", ".7z",
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".ico",
    ".mp4", ".webm", ".mp3", ".wav", ".ogg",
    ".css", ".js", ".xml", ".json", ".rss", ".atom",
    ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
)
# Values of the `format` query parameter that mark the URL as an export/feed
# variant of a page we likely already crawled (e.g. ?format=ical,
# ?format=json-pretty).
SKIP_QUERY_FORMATS = {"ical", "json", "json-pretty", "rss", "atom", "feed", "xml", "pdf", "csv"}
# Nav labels we never want in the rendered navbar.
NAV_LABEL_BLOCKLIST = {
    "back", "cart", "checkout", "login", "log in", "sign in", "signup", "sign up",
    "menu", "toggle navigation", "skip to main content", "skip to content",
    "search", "close", "open menu",
}
NAV_LABEL_PREFIX_BLOCKLIST = ("folder:",)


def normalize_link(href: str, base_url: str) -> Optional[str]:
    """Resolve *href* against *base_url* into a canonical crawlable URL.

    Returns ``None`` when the link should be skipped: empty or malformed
    hrefs, non-HTTP(S) schemes, skip-listed prefixes and file extensions, and
    ``?format=`` export variants. Otherwise returns
    ``scheme://netloc/path[?query]`` with the fragment removed, the default
    port stripped and an empty path replaced by ``/``.
    """
    from urllib.parse import parse_qs

    if not href:
        return None
    h = href.strip()
    if not h or h.lower().startswith(SKIP_LINK_PREFIXES):
        return None
    try:
        full, _ = urldefrag(urljoin(base_url, h))
        parsed = urlparse(full)
    except ValueError:
        # urllib rejects some malformed hrefs (e.g. "http://[::1"); one bad
        # link must not abort extraction for the whole page.
        return None
    if not full:
        return None
    if parsed.scheme not in ("http", "https"):
        return None
    if parsed.path.lower().endswith(SKIP_LINK_EXTS):
        return None
    # Drop export-format variants (e.g. ?format=ical, ?format=json-pretty).
    if parsed.query:
        try:
            formats = parse_qs(parsed.query, keep_blank_values=False).get("format", [])
        except ValueError:
            # Unparseable query string: keep the link rather than guess.
            formats = []
        if any(v.lower() in SKIP_QUERY_FORMATS for v in formats):
            return None
    # Strip default ports for stable dedup
    netloc = parsed.netloc
    if netloc.endswith(":80") and parsed.scheme == "http":
        netloc = netloc[:-3]
    if netloc.endswith(":443") and parsed.scheme == "https":
        netloc = netloc[:-4]
    path = parsed.path or "/"
    query = ("?" + parsed.query) if parsed.query else ""
    return f"{parsed.scheme}://{netloc}{path}{query}"
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def same_host(url: str, base_url: str) -> bool:
    """Return True when both URLs share a host, ignoring case and a ``www.`` prefix.

    URLs without a network location (e.g. relative paths) never match.
    """
    def _host(value: str) -> str:
        host = urlparse(value).netloc.lower()
        # Remove the literal "www." prefix only. str.lstrip("www.") would
        # strip any leading run of 'w' and '.' characters, making
        # "web.example.com" compare equal to "eb.example.com".
        return host[4:] if host.startswith("www.") else host

    a = _host(url)
    b = _host(base_url)
    return a == b and a != ""
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def extract_links(root: Node, url: str, base_url: str) -> List[Tuple[str, str]]:
    """Return deduplicated ``(url, label)`` pairs for same-host ``<a>`` links.

    Hrefs are resolved against *url* via ``normalize_link`` and kept only if
    ``same_host`` accepts them relative to *base_url*. Labels are the
    whitespace-collapsed anchor text, truncated to 120 characters.
    """
    collected: List[Tuple[str, str]] = []
    visited = set()
    for anchor in root.find_all("a"):
        target = normalize_link(anchor.attrs.get("href", ""), url)
        if not target:
            continue
        if not same_host(target, base_url):
            continue
        if target in visited:
            continue
        visited.add(target)
        words = anchor.text().split()
        collected.append((target, " ".join(words)[:120]))
    return collected
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def extract_nav_links(root: Node, url: str, base_url: str) -> List[Tuple[str, str]]:
    """Return ``(url, label)`` pairs for links found in navigation containers.

    Containers are every ``<nav>``, every ``<header>`` and every element with
    ``role="navigation"``. Links are normalized against *url*, restricted to
    the host of *base_url*, deduplicated by URL, and skipped when the label
    is empty or longer than 80 characters.
    """
    containers: List[Node] = list(root.find_all("nav"))
    containers += root.find_all("header")

    # role="navigation" anywhere in the tree.
    pending: List[Node] = [root]
    while pending:
        current = pending.pop()
        if current.attrs.get("role", "").lower() == "navigation":
            containers.append(current)
        pending.extend(
            child for child in current.children if isinstance(child, Node)
        )

    results: List[Tuple[str, str]] = []
    visited = set()
    for container in containers:
        for anchor in container.find_all("a"):
            target = normalize_link(anchor.attrs.get("href", ""), url)
            if not target or not same_host(target, base_url):
                continue
            if target in visited:
                continue
            label = " ".join(anchor.text().split())
            if not label or len(label) > 80:
                # Unlabelled or overly long anchors do not count as seen,
                # so a later anchor to the same URL may still be used.
                continue
            visited.add(target)
            results.append((target, label))
    return results
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def filter_nav_links(nav: List[Tuple[str, str]], base_url: str) -> List[Tuple[str, str]]:
    """Drop blocked labels, duplicates by path, and the home link itself.

    At most the first 10 surviving entries are returned, in input order.
    """
    def _path_key(value: str) -> str:
        # URL path without trailing slashes; the root stays "/".
        return (urlparse(value).path or "/").rstrip("/") or "/"

    home_path = _path_key(base_url)
    home_labels = {"home", "back", home_path}

    kept: List[Tuple[str, str]] = []
    used_paths = set()
    for link_url, label in nav:
        lowered = label.strip().lower()
        if not lowered or lowered in NAV_LABEL_BLOCKLIST:
            continue
        if any(lowered.startswith(prefix) for prefix in NAV_LABEL_PREFIX_BLOCKLIST):
            continue
        path = _path_key(link_url)
        if path in used_paths:
            continue
        if path == home_path and lowered in home_labels:
            continue
        used_paths.add(path)
        kept.append((link_url, label))
    return kept[:10]  # cap nav size
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def extract_images(root: Node, url: str) -> List[Dict[str, str]]:
    """Collect each distinct ``<img src=...>`` as ``{"url", "alt"}``.

    ``src`` values are resolved against *url*; empty and ``data:`` sources
    are skipped. Entries follow the order in which ``root.find_all("img")``
    yields the elements.
    """
    images: List[Dict[str, str]] = []
    known = set()
    for element in root.find_all("img"):
        source = element.attrs.get("src", "").strip()
        if not source or source.startswith("data:"):
            continue
        try:
            absolute = urljoin(url, source)
        except Exception:
            continue
        if absolute in known:
            continue
        known.add(absolute)
        alt_text = (element.attrs.get("alt") or "").strip()
        images.append({"url": absolute, "alt": alt_text})
    return images
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# URL patterns → page "kind" (drives layout + destination dir in tasks/scrape.sh).
# Checked in order; the first pattern matching the start of the path wins.
_KIND_PATTERNS = [
    (re.compile(r"^/?(events?|calendar)(/|$)", re.I), "event"),
    (re.compile(r"^/?(blog|news|posts?|articles?)(/|$)", re.I), "post"),
    (re.compile(r"^/?(about|who-we-are|mission|team|associates|people)(/|$)", re.I), "about"),
    (re.compile(r"^/?(contact|reach-us|get-in-touch)(/|$)", re.I), "contact"),
    (re.compile(r"^/?(services|products?|programs?|activities|offerings)(/|$)", re.I), "service"),
    (re.compile(r"^/?(faq|help|support)(/|$)", re.I), "faq"),
]


def classify_page(url: str, base_url: str, title: str = "") -> str:
    """Return a kind for the URL: home/event/post/about/contact/service/faq/page.

    Only the URL paths are inspected; *title* is accepted but not used.
    """
    page_path = (urlparse(url).path or "/").strip("/")
    site_path = (urlparse(base_url).path or "/").strip("/")

    # Home: same path as the base URL (after stripping), or a well-known
    # index name.
    if page_path in (site_path, "", "index", "home"):
        return "home"

    candidate = "/" + page_path
    return next(
        (kind for pattern, kind in _KIND_PATTERNS if pattern.match(candidate)),
        "page",
    )
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
# ---------------------------------------------------------------------------
|
|
396
|
+
# Main-content selection
|
|
397
|
+
# ---------------------------------------------------------------------------
|
|
398
|
+
def _text_length(node: Node) -> int:
    """Length of the node's whitespace-collapsed text; 0 for DROP_TAGS nodes."""
    if node.tag not in DROP_TAGS:
        collapsed = " ".join(node.text().split())
        return len(collapsed)
    return 0
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def select_main(root: Node) -> Node:
    """Pick the subtree most likely to contain the page's primary content.

    Preference order: the ``<main>`` with the most text, the first element
    found with ``role="main"``, the ``<article>`` with the most text, and
    finally a heuristic: the deepest ``section``/``div``/``article``/``main``
    outside chrome that still holds more than 90% of the body's text. Falls
    back to ``<body>`` (or *root* when there is no body).
    """
    # Explicit markers, in order of trust.
    mains = root.find_all("main")
    if mains:
        return max(mains, key=_text_length)

    # role="main"
    pending: List[Node] = [root]
    while pending:
        n = pending.pop()
        if n.attrs.get("role", "").lower() == "main":
            return n
        pending.extend(c for c in n.children if isinstance(c, Node))

    articles = root.find_all("article")
    if articles:
        return max(articles, key=_text_length)

    # Heuristic. The previous version required a descendant to have *more*
    # text than <body>, which can never happen, so <body> was always
    # returned. Instead, keep descending while a single rich container
    # retains more than 90% of the body's text.
    body = first(root.find_all("body")) or root
    best = body
    threshold = _text_length(body) * 0.9
    pending = [body]
    while pending:
        n = pending.pop()
        for c in n.children:
            if not isinstance(c, Node):
                continue
            if c.tag in CHROME_TAGS or c.tag in DROP_TAGS:
                continue
            if _text_length(c) <= threshold:
                # Nothing below this child can exceed the threshold either.
                continue
            # Parents are always examined before their children, so the last
            # qualifying container seen is the deepest one.
            if c.tag in ("section", "div", "article", "main"):
                best = c
            pending.append(c)
    return best
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
# ---------------------------------------------------------------------------
|
|
455
|
+
# Markdown rendering
|
|
456
|
+
# ---------------------------------------------------------------------------
|
|
457
|
+
# Tags rendered inline (within a paragraph) rather than as separate blocks.
INLINE_TAGS = {
    "a", "abbr", "b", "code", "em", "i", "kbd", "mark",
    "samp", "small", "span", "strong", "sub", "sup", "u", "var",
}
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def _normalize_ws(s: str) -> str:
|
|
462
|
+
return re.sub(r"[ \t\r\n\f]+", " ", s).strip()
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def _md_escape(s: str) -> str:
|
|
466
|
+
# Escape characters that have markdown meaning at the start of a line
|
|
467
|
+
# or inline. We keep this conservative.
|
|
468
|
+
return s.replace("\\", "\\\\").replace("`", "\\`")
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def _render_inline(node: Node, base_url: str) -> str:
    """Render the children of *node* as one inline Markdown string.

    Text runs are copied as-is (callers normalize whitespace). ``<br>``
    becomes a newline, links/bold/italic/code/images get Markdown syntax,
    DROP_TAGS are skipped, and any other element contributes only its
    rendered content. Relative URLs are resolved against *base_url*.
    """
    parts: List[str] = []
    for c in node.children:
        if isinstance(c, str):
            parts.append(c)
            continue
        if c.tag in DROP_TAGS:
            continue
        if c.tag == "br":
            parts.append("\n")
            continue
        inner = _render_inline(c, base_url)
        if c.tag == "a":
            href = c.attrs.get("href", "").strip()
            full = urljoin(base_url, href) if href else ""
            label = _normalize_ws(inner) or full
            if full:
                parts.append(f"[{label}]({full})")
            else:
                parts.append(label)
        elif c.tag in ("strong", "b"):
            t = _normalize_ws(inner)
            parts.append(f"**{t}**" if t else "")
        elif c.tag in ("em", "i"):
            t = _normalize_ws(inner)
            parts.append(f"*{t}*" if t else "")
        elif c.tag == "code":
            t = inner.strip()
            parts.append(f"`{t}`" if t else "")
        elif c.tag == "img":
            alt = c.attrs.get("alt", "").strip()
            raw_src = c.attrs.get("src", "").strip()
            # Test the raw attribute: urljoin(base_url, "") returns base_url,
            # which would turn a src-less <img> into an "image" of the page.
            if raw_src:
                src = urljoin(base_url, raw_src)
                parts.append(f"![{alt}]({src})")
        else:
            parts.append(inner)
    return "".join(parts)
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def _render_block(node: Node, base_url: str, lines: List[str], list_depth: int = 0):
    """Append the block-level Markdown rendering of *node* to *lines*.

    Handles headings, paragraphs, lists (nested via *list_depth*),
    blockquotes, ``<pre>``, ``<hr>``, tables, images and generic containers.
    Blocks are surrounded by empty strings; ``to_markdown`` collapses the
    resulting blank-line runs. Relative URLs are resolved against *base_url*.
    """
    tag = node.tag
    if tag in DROP_TAGS:
        return

    def _is_list(item) -> bool:
        return isinstance(item, Node) and item.tag in ("ul", "ol")

    def _paragraph(text: str) -> None:
        # Emit *text* as a standalone block when it is non-empty.
        if text:
            lines.append("")
            lines.append(text)
            lines.append("")

    if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
        level = int(tag[1])
        text = _normalize_ws(_render_inline(node, base_url))
        _paragraph("#" * level + " " + text if text else "")
        return
    if tag == "p":
        _paragraph(_normalize_ws(_render_inline(node, base_url)))
        return
    if tag == "br":
        return
    if tag in ("ul", "ol"):
        ordered = tag == "ol"
        # Four spaces per level reach the content column of both "- " and
        # "1. " markers, so child items parse as a nested list rather than
        # as siblings.
        indent = "    " * list_depth
        idx = 1
        for c in node.children:
            if not (isinstance(c, Node) and c.tag == "li"):
                continue
            bullet = f"{idx}." if ordered else "-"
            # Render the item's own inline content WITHOUT its nested lists;
            # those are rendered below as real sub-lists, and including them
            # here would duplicate their text on the parent line.
            own = Node(c.tag, c.attrs)
            own.children = [cc for cc in c.children if not _is_list(cc)]
            inline = _normalize_ws(_render_inline(own, base_url))
            if inline:
                lines.append(f"{indent}{bullet} {inline}")
            # Walk nested lists.
            for cc in c.children:
                if _is_list(cc):
                    _render_block(cc, base_url, lines, list_depth + 1)
            idx += 1
        lines.append("")
        return
    if tag == "blockquote":
        inner: List[str] = []
        for c in node.children:
            if isinstance(c, Node):
                _render_block(c, base_url, inner, list_depth)
            elif isinstance(c, str):
                t = _normalize_ws(c)
                if t:
                    inner.append(t)
        if inner:
            lines.append("")
            for ln in inner:
                lines.append("> " + ln if ln else ">")
            lines.append("")
        return
    if tag == "pre":
        code = node.text()
        lines.append("")
        lines.append("```")
        lines.extend(code.rstrip().splitlines())
        lines.append("```")
        lines.append("")
        return
    if tag == "hr":
        lines.append("")
        lines.append("---")
        lines.append("")
        return
    if tag in ("table", "thead", "tbody", "tr", "td", "th"):
        # Tables: render as a simplified GFM table when we see <table>.
        # Table parts met outside a <table> are ignored.
        if tag == "table":
            _render_table(node, base_url, lines)
        return
    if tag == "img":
        alt = node.attrs.get("alt", "").strip()
        raw_src = node.attrs.get("src", "").strip()
        # Test the raw attribute: urljoin(base_url, "") returns base_url,
        # which would turn a src-less <img> into an "image" of the page.
        if raw_src:
            src = urljoin(base_url, raw_src)
            lines.append("")
            lines.append(f"![{alt}]({src})")
            lines.append("")
        return
    if tag == "figure":
        for c in node.children:
            if isinstance(c, Node):
                _render_block(c, base_url, lines, list_depth)
        return
    # Container — recurse.
    if tag in INLINE_TAGS or tag == "__root__" or tag in (
        "div", "section", "article", "main", "body", "html",
        "figure", "figcaption", "details", "summary",
    ):
        # If this node is purely inline, emit a paragraph.
        only_inline = all(
            isinstance(c, str) or c.tag in INLINE_TAGS or c.tag == "br"
            for c in node.children
        )
        if only_inline and tag != "__root__":
            _paragraph(_normalize_ws(_render_inline(node, base_url)))
            return
        for c in node.children:
            if isinstance(c, Node):
                _render_block(c, base_url, lines, list_depth)
            elif isinstance(c, str):
                _paragraph(_normalize_ws(c))
        return
    # Unknown tag — recurse blindly.
    for c in node.children:
        if isinstance(c, Node):
            _render_block(c, base_url, lines, list_depth)
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def _render_table(table: Node, base_url: str, lines: List[str]):
    """Append a simplified GFM table for *table* to *lines*.

    The first row containing a ``<th>`` becomes the header; if none does,
    the first collected row is promoted. Body rows are padded or truncated
    to the header width. Nothing is emitted for a table without cells.
    """
    def _cell(cell: Node) -> str:
        text = _normalize_ws(_render_inline(cell, base_url))
        # A literal pipe would be read as a column separator in GFM.
        return text.replace("|", "\\|")

    rows: List[List[str]] = []
    header: Optional[List[str]] = None
    for tr in table.find_all("tr"):
        cells: List[str] = []
        is_header_row = False
        for c in tr.children:
            if not isinstance(c, Node):
                continue
            if c.tag in ("td", "th"):
                if c.tag == "th":
                    is_header_row = True
                cells.append(_cell(c))
        if not cells:
            continue
        if header is None and is_header_row:
            header = cells
        else:
            rows.append(cells)
    if not header and rows:
        header = rows.pop(0)
    if not header:
        return
    width = len(header)
    lines.append("")
    lines.append("| " + " | ".join(header) + " |")
    lines.append("| " + " | ".join(["---"] * width) + " |")
    for r in rows:
        # Pad/truncate to header width
        r = (r + [""] * (width - len(r)))[:width]
        lines.append("| " + " | ".join(r) + " |")
    lines.append("")
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
def to_markdown(node: Node, base_url: str) -> str:
    """Render *node* to Markdown text ending in exactly one newline.

    Runs of consecutive empty lines are collapsed to a single empty line,
    trailing whitespace is removed from each line, and the document is
    stripped at both ends.
    """
    rendered: List[str] = []
    _render_block(node, base_url, rendered)

    compact: List[str] = []
    previous_blank = False
    for line in rendered:
        if line == "":
            # Keep only the first empty line of a run.
            if not previous_blank:
                compact.append("")
            previous_blank = True
        else:
            compact.append(line.rstrip())
            previous_blank = False
    return "\n".join(compact).strip() + "\n"
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
# ---------------------------------------------------------------------------
|
|
686
|
+
# Entry points
|
|
687
|
+
# ---------------------------------------------------------------------------
|
|
688
|
+
def cmd_extract(args) -> int:
    """``extract`` subcommand: print a JSON description of one page.

    Reads HTML from ``args.html_file`` (or stdin), then writes metadata,
    Markdown, links, nav entries and images as indented JSON to stdout.
    ``args.base_url`` falls back to ``args.url`` when empty. Returns 0.
    """
    root = parse_html(_read_input(args.html_file))
    page_url = args.url
    base = args.base_url or page_url

    meta = extract_metadata(root, page_url)
    content = select_main(root)
    markdown = to_markdown(content, page_url)
    links = extract_links(root, page_url, base)
    nav = filter_nav_links(extract_nav_links(root, page_url, base), base)
    images = extract_images(content, page_url)

    # Include page-level og:image as the first asset if it isn't already.
    page_image = meta.get("image")
    if page_image and all(entry["url"] != page_image for entry in images):
        images.insert(0, {"url": page_image, "alt": meta.get("title") or ""})

    document = {
        "url": page_url,
        "base_url": base,
        "kind": classify_page(page_url, base, meta.get("title", "")),
        "title": meta["title"],
        "description": meta["description"],
        "canonical": meta["canonical"],
        "lang": meta["lang"],
        "image": meta["image"],
        "site_name": meta["site_name"],
        "word_count": len(markdown.split()),
        "markdown": markdown,
        "links": [{"url": href, "label": text} for href, text in links],
        "nav": [{"url": href, "label": text} for href, text in nav],
        "images": images,
    }
    json.dump(document, sys.stdout, ensure_ascii=False, indent=2)
    sys.stdout.write("\n")
    return 0
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def cmd_crawl_links(args) -> int:
    """``crawl-links`` subcommand: print one in-scope link per line.

    Links are resolved against ``args.url`` (falling back to
    ``args.base_url``) and filtered to the host of ``args.base_url``.
    Returns 0.
    """
    root = parse_html(_read_input(args.html_file))
    site = args.base_url
    page_url = args.url or site
    for link, _label in extract_links(root, page_url, site):
        sys.stdout.write(link + "\n")
    return 0
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
def _read_input(path: Optional[str]) -> str:
|
|
737
|
+
if not path or path == "-":
|
|
738
|
+
return sys.stdin.read()
|
|
739
|
+
with open(path, "r", encoding="utf-8", errors="replace") as f:
|
|
740
|
+
return f.read()
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
def main(argv: List[str]) -> int:
    """Parse *argv*, dispatch to the chosen subcommand, return its exit code."""
    parser = argparse.ArgumentParser(description="zer0-mistakes site scraper helper")
    commands = parser.add_subparsers(dest="command", required=True)

    # extract: one page in, one JSON document out.
    extract = commands.add_parser("extract", help="Parse one HTML page → JSON")
    extract.add_argument("--url", required=True, help="URL the HTML was fetched from")
    extract.add_argument("--base-url", default="", help="Site base URL for same-host filter")
    extract.add_argument("html_file", nargs="?", default="-")
    extract.set_defaults(func=cmd_extract)

    # crawl-links: newline-delimited in-scope links.
    crawl = commands.add_parser("crawl-links", help="List in-scope links in HTML")
    crawl.add_argument("--base-url", required=True)
    crawl.add_argument("--url", default="")
    crawl.add_argument("html_file", nargs="?", default="-")
    crawl.set_defaults(func=cmd_crawl_links)

    parsed = parser.parse_args(argv)
    return parsed.func(parsed)


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
|