getdocs 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- getdocs/__init__.py +0 -0
- getdocs/__main__.py +3 -0
- getdocs/api.py +95 -0
- getdocs/cli.py +220 -0
- getdocs/config.py +36 -0
- getdocs/engine.py +418 -0
- getdocs/extract.py +190 -0
- getdocs/identity.py +32 -0
- getdocs/jobs.py +204 -0
- getdocs/navharvest.py +242 -0
- getdocs/output.py +191 -0
- getdocs/scope.py +84 -0
- getdocs/sitemap.py +35 -0
- getdocs/source.py +238 -0
- getdocs/urlnorm.py +34 -0
- getdocs-0.1.0.dist-info/METADATA +169 -0
- getdocs-0.1.0.dist-info/RECORD +21 -0
- getdocs-0.1.0.dist-info/WHEEL +5 -0
- getdocs-0.1.0.dist-info/entry_points.txt +2 -0
- getdocs-0.1.0.dist-info/licenses/LICENSE +21 -0
- getdocs-0.1.0.dist-info/top_level.txt +1 -0
getdocs/jobs.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Jobs: run Crawls as subprocesses and track their state.
|
|
2
|
+
|
|
3
|
+
Per ADR-0002, a job is the getdocs CLI run with --format jsonl; its stdout
|
|
4
|
+
stream is the event protocol. One subprocess per Crawl sidesteps the
|
|
5
|
+
one-reactor-per-process constraint and isolates crashes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import json
|
|
10
|
+
import sys
|
|
11
|
+
import tempfile
|
|
12
|
+
import uuid
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
|
|
15
|
+
# Option keys that translate to bare on/off CLI switches.
_BOOL_FLAGS = {
    "allow_backward": "--allow-backward",
    "allow_subdomains": "--allow-subdomains",
    "ignore_robots": "--ignore-robots",
    "keep_html": "--keep-html",
}
# Option keys whose value is passed, stringified, right after the flag.
_VALUE_FLAGS = {
    "limit": "--limit",
    "depth": "--depth",
    "delay": "--delay",
    "concurrency": "--concurrency",
    "render": "--render",
    "selector": "--selector",
}
# Option keys holding a list; the flag is repeated once per item.
_LIST_FLAGS = {
    "include_paths": "--include-paths",
    "exclude_paths": "--exclude-paths",
}
# Values of options["sitemap"] that map to a switch; any other value adds none.
_SITEMAP_FLAGS = {"off": "--no-sitemap", "only": "--sitemap-only"}

# JSONL lines carry whole pages (and raw HTML with keep_html).
_STREAM_LIMIT = 32 * 1024 * 1024


def build_args(options: dict, output_dir: str) -> list[str]:
    """Translate a job ``options`` dict into ``getdocs`` CLI arguments.

    Args:
        options: Crawl options. Seeds come from ``options["urls"]`` when that
            is non-empty, otherwise from ``options["url"]``. The remaining
            keys are looked up in the flag tables above; unknown keys are
            ignored.
        output_dir: Directory passed to the CLI with ``-o``.

    Returns:
        The argument list that follows ``python -m getdocs``.

    Raises:
        KeyError: If neither ``urls`` nor ``url`` is present.
        ValueError: If a seed starts with ``-``. Seeds are positional
            arguments, so such a value would be parsed by the child CLI as an
            option rather than as a URL.
    """
    seeds = options.get("urls") or [options["url"]]
    for seed in seeds:
        if isinstance(seed, str) and seed.startswith("-"):
            raise ValueError(f"seed URL must not start with '-': {seed!r}")

    args = ["crawl", *seeds, "--format", "jsonl", "-o", output_dir]
    for key, flag in _VALUE_FLAGS.items():
        # `is not None` (not truthiness) so that 0 is still forwarded.
        if options.get(key) is not None:
            args += [flag, str(options[key])]
    for key, flag in _BOOL_FLAGS.items():
        if options.get(key):
            args.append(flag)
    for key, flag in _LIST_FLAGS.items():
        for value in options.get(key) or []:
            # argv entries must be strings, same as for the value flags.
            args += [flag, str(value)]
    sitemap_flag = _SITEMAP_FLAGS.get(options.get("sitemap", ""))
    if sitemap_flag:
        args.append(sitemap_flag)
    return args
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class CrawlJob:
    # In-memory state of one crawl; JobManager._run fills it in as the
    # subprocess emits records.
    id: str  # uuid4 hex assigned by JobManager.start
    seeds: list[str]  # seed URLs the crawl was started with
    status: str = "running"  # running | completed | failed | cancelled
    pages: list[dict] = field(default_factory=list)  # "page" records, without their "type" key
    manifest: dict | None = None  # "manifest" record without its "type" key, once received
    error: str | None = None  # failure description, set when status becomes "failed"
    webhook_failures: int = 0  # webhook deliveries that exhausted their retries
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class JobManager:
|
|
69
|
+
def __init__(self):
|
|
70
|
+
self.jobs: dict[str, CrawlJob] = {}
|
|
71
|
+
self._tasks: dict[str, asyncio.Task] = {}
|
|
72
|
+
self._processes: dict[str, asyncio.subprocess.Process] = {}
|
|
73
|
+
self._subscribers: dict[str, list[asyncio.Queue]] = {}
|
|
74
|
+
|
|
75
|
+
def start(self, options: dict) -> CrawlJob:
|
|
76
|
+
seeds = options.get("urls") or [options["url"]]
|
|
77
|
+
job = CrawlJob(id=uuid.uuid4().hex, seeds=seeds)
|
|
78
|
+
self.jobs[job.id] = job
|
|
79
|
+
output_dir = tempfile.mkdtemp(prefix=f"getdocs-{job.id[:8]}-")
|
|
80
|
+
args = build_args(options, output_dir=output_dir)
|
|
81
|
+
self._tasks[job.id] = asyncio.ensure_future(
|
|
82
|
+
self._run(job, args, webhook=options.get("webhook"))
|
|
83
|
+
)
|
|
84
|
+
return job
|
|
85
|
+
|
|
86
|
+
def get(self, job_id: str) -> CrawlJob | None:
|
|
87
|
+
return self.jobs.get(job_id)
|
|
88
|
+
|
|
89
|
+
def cancel(self, job_id: str) -> CrawlJob | None:
|
|
90
|
+
"""Cancel a running job (terminates its subprocess, keeps partial
|
|
91
|
+
results). A no-op on finished jobs; None for unknown ids."""
|
|
92
|
+
job = self.jobs.get(job_id)
|
|
93
|
+
if job is None:
|
|
94
|
+
return None
|
|
95
|
+
if job.status == "running":
|
|
96
|
+
job.status = "cancelled"
|
|
97
|
+
process = self._processes.get(job_id)
|
|
98
|
+
if process is not None and process.returncode is None:
|
|
99
|
+
process.terminate()
|
|
100
|
+
return job
|
|
101
|
+
|
|
102
|
+
async def wait(self, job_id: str) -> CrawlJob:
|
|
103
|
+
await self._tasks[job_id]
|
|
104
|
+
return self.jobs[job_id]
|
|
105
|
+
|
|
106
|
+
def _publish(self, job_id: str, event: dict) -> None:
|
|
107
|
+
for queue in self._subscribers.get(job_id, []):
|
|
108
|
+
queue.put_nowait(event)
|
|
109
|
+
|
|
110
|
+
async def stream(self, job_id: str):
|
|
111
|
+
"""Yield a job's events: a replay of everything so far, then live
|
|
112
|
+
page events, ending with the manifest (when one was produced).
|
|
113
|
+
|
|
114
|
+
The queue is attached in the same event-loop step as the replay
|
|
115
|
+
snapshot, so no event is missed or duplicated around the boundary.
|
|
116
|
+
"""
|
|
117
|
+
job = self.jobs[job_id]
|
|
118
|
+
queue: asyncio.Queue = asyncio.Queue()
|
|
119
|
+
self._subscribers.setdefault(job_id, []).append(queue)
|
|
120
|
+
try:
|
|
121
|
+
replay = list(job.pages)
|
|
122
|
+
finished = job.status != "running"
|
|
123
|
+
for record in replay:
|
|
124
|
+
yield {"type": "page", **record}
|
|
125
|
+
if finished:
|
|
126
|
+
if job.manifest is not None:
|
|
127
|
+
yield {"type": "manifest", **job.manifest}
|
|
128
|
+
return
|
|
129
|
+
while True:
|
|
130
|
+
event = await queue.get()
|
|
131
|
+
if event["type"] == "end":
|
|
132
|
+
return
|
|
133
|
+
yield event
|
|
134
|
+
if event["type"] == "manifest":
|
|
135
|
+
return
|
|
136
|
+
finally:
|
|
137
|
+
self._subscribers[job_id].remove(queue)
|
|
138
|
+
|
|
139
|
+
async def _deliver(self, job: CrawlJob, url: str, payload: dict) -> None:
|
|
140
|
+
"""Bounded-retry webhook POST; failures are recorded, never raised."""
|
|
141
|
+
import httpx
|
|
142
|
+
|
|
143
|
+
for attempt in range(3):
|
|
144
|
+
try:
|
|
145
|
+
async with httpx.AsyncClient(timeout=5) as client:
|
|
146
|
+
response = await client.post(url, json=payload)
|
|
147
|
+
if response.status_code < 400:
|
|
148
|
+
return
|
|
149
|
+
except httpx.HTTPError:
|
|
150
|
+
pass
|
|
151
|
+
await asyncio.sleep(0.05 * (attempt + 1))
|
|
152
|
+
job.webhook_failures += 1
|
|
153
|
+
|
|
154
|
+
async def _run(self, job: CrawlJob, args: list[str], webhook: str | None = None) -> None:
|
|
155
|
+
if webhook:
|
|
156
|
+
await self._deliver(
|
|
157
|
+
job, webhook, {"event": "started", "id": job.id, "seeds": job.seeds}
|
|
158
|
+
)
|
|
159
|
+
process = await asyncio.create_subprocess_exec(
|
|
160
|
+
sys.executable, "-m", "getdocs", *args,
|
|
161
|
+
stdout=asyncio.subprocess.PIPE,
|
|
162
|
+
stderr=asyncio.subprocess.PIPE,
|
|
163
|
+
limit=_STREAM_LIMIT,
|
|
164
|
+
)
|
|
165
|
+
self._processes[job.id] = process
|
|
166
|
+
async for line in process.stdout:
|
|
167
|
+
try:
|
|
168
|
+
record = json.loads(line)
|
|
169
|
+
except json.JSONDecodeError:
|
|
170
|
+
continue
|
|
171
|
+
if record.get("type") == "page":
|
|
172
|
+
page = {k: v for k, v in record.items() if k != "type"}
|
|
173
|
+
job.pages.append(page)
|
|
174
|
+
self._publish(job.id, record)
|
|
175
|
+
if webhook:
|
|
176
|
+
await self._deliver(
|
|
177
|
+
job, webhook, {"event": "page", "id": job.id, "page": page}
|
|
178
|
+
)
|
|
179
|
+
elif record.get("type") == "manifest":
|
|
180
|
+
job.manifest = {k: v for k, v in record.items() if k != "type"}
|
|
181
|
+
self._publish(job.id, record)
|
|
182
|
+
stderr = await process.stderr.read()
|
|
183
|
+
returncode = await process.wait()
|
|
184
|
+
if job.status == "cancelled":
|
|
185
|
+
pass # keep the cancelled status and partial pages
|
|
186
|
+
elif returncode == 0:
|
|
187
|
+
job.status = "completed"
|
|
188
|
+
else:
|
|
189
|
+
job.status = "failed"
|
|
190
|
+
job.error = stderr.decode(errors="replace").strip()[-2000:] or (
|
|
191
|
+
f"crawl exited with code {returncode}"
|
|
192
|
+
)
|
|
193
|
+
self._publish(job.id, {"type": "end", "status": job.status})
|
|
194
|
+
if webhook:
|
|
195
|
+
await self._deliver(
|
|
196
|
+
job,
|
|
197
|
+
webhook,
|
|
198
|
+
{
|
|
199
|
+
"event": "completed",
|
|
200
|
+
"id": job.id,
|
|
201
|
+
"status": job.status,
|
|
202
|
+
"manifest": job.manifest,
|
|
203
|
+
},
|
|
204
|
+
)
|
getdocs/navharvest.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""Nav harvesting: capture the original site's Nav Order and Reading Order.
|
|
2
|
+
|
|
3
|
+
Per ADR-0004, three signals are harvested from each fetched Page's raw HTML
|
|
4
|
+
and merged at Crawl end: header tabs and sidebar trees build the Nav Order
|
|
5
|
+
(grouping, labels, nesting); prev/next link chains are authoritative for the
|
|
6
|
+
linear Reading Order. Everything is plain dicts so harvests serialize
|
|
7
|
+
directly into the resume state and the Manifest.
|
|
8
|
+
|
|
9
|
+
Node shape: {"title": str, "url": str | None, "children": [node, ...]}
|
|
10
|
+
Harvest shape: {"tree": [node], "tabs": [node], "prev": url?, "next": url?}
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from urllib.parse import urljoin
|
|
14
|
+
|
|
15
|
+
from bs4 import BeautifulSoup
|
|
16
|
+
|
|
17
|
+
from getdocs.urlnorm import normalize
|
|
18
|
+
|
|
19
|
+
# CSS selectors for header-tab links, tried in order; harvest_nav uses the
# first selector that matches anything.
_TAB_SELECTORS = [
    ".md-tabs a.md-tabs__link",  # MkDocs Material tabs
    "nav.navbar a.navbar__item[href]",  # Docusaurus navbar
]
# CSS selectors for the sidebar container, tried in order until one yields a
# non-empty tree. The theme-specific ones come before the generic fallbacks.
_SIDEBAR_SELECTORS = [
    "nav.md-nav--primary",  # MkDocs Material
    "nav.menu",  # Docusaurus
    "div.sphinxsidebarwrapper",  # Sphinx
    "aside nav",
    "aside",
    '[class*="sidebar"] nav',
]
# CSS selectors for the previous/next page links, tried in order.
_PREV_SELECTORS = ['a[rel="prev"]', "a.md-footer__link--prev", "a.pagination-nav__link--prev"]
_NEXT_SELECTORS = ['a[rel="next"]', "a.md-footer__link--next", "a.pagination-nav__link--next"]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _node(title: str, url: str | None, children: list) -> dict:
|
|
36
|
+
return {"title": title, "url": url, "children": children}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _own_link(li, nested_ul):
|
|
40
|
+
for a in li.find_all("a", href=True):
|
|
41
|
+
if nested_ul is None or nested_ul not in a.parents:
|
|
42
|
+
return a
|
|
43
|
+
return None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _parse_list(ul, page_url: str) -> list[dict]:
    """Turn a ``<ul>`` into nav nodes, recursing into nested lists.

    Only the direct ``<li>`` children of ``ul`` are visited. An item with its
    own link takes the link text as title and the absolute link target as
    URL. An item without one takes its title from the first string of its
    ``<label>`` (or, lacking a label, the first direct string of the item)
    and gets no URL. Items with neither a title nor children are dropped.
    """
    parsed = []
    for item in ul.find_all("li", recursive=False):
        sub_list = item.find("ul")
        anchor = _own_link(item, sub_list)
        if anchor is None:
            label = item.find("label")
            text = (label or item).find(string=True, recursive=bool(label))
            heading = (text or "").strip()
            href = None
        else:
            heading = anchor.get_text(strip=True)
            href = urljoin(page_url, anchor["href"])
        kids = _parse_list(sub_list, page_url) if sub_list else []
        if not (heading or kids):
            continue
        parsed.append(_node(heading, href, kids))
    return parsed
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _first_href(soup, selectors: list[str], page_url: str) -> str | None:
|
|
66
|
+
for selector in selectors:
|
|
67
|
+
el = soup.select_one(selector)
|
|
68
|
+
if el is not None and el.get("href"):
|
|
69
|
+
return urljoin(page_url, el["href"])
|
|
70
|
+
return None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def harvest_nav(html: str, page_url: str) -> dict:
    """Harvest the nav signals from one page's raw HTML.

    Returns a dict with keys ``tree`` (sidebar nodes), ``tabs`` (header-tab
    nodes), ``prev`` and ``next`` (absolute URLs or None). Relative links are
    resolved against ``page_url``.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Tabs: the first selector that matches anything decides, even if all of
    # its links end up filtered out for lacking an href or text.
    tabs = []
    for selector in _TAB_SELECTORS:
        anchors = soup.select(selector)
        if not anchors:
            continue
        for anchor in anchors:
            label = anchor.get_text(strip=True)
            if anchor.get("href") and label:
                tabs.append(_node(label, urljoin(page_url, anchor["href"]), []))
        break

    # Sidebar: keep trying selectors until one produces a non-empty tree.
    tree: list[dict] = []
    for selector in _SIDEBAR_SELECTORS:
        container = soup.select_one(selector)
        if container is None:
            continue
        top_list = container if container.name == "ul" else container.find("ul")
        if top_list is None:
            continue
        tree = _parse_list(top_list, page_url)
        if tree:
            break

    prev_url = _first_href(soup, _PREV_SELECTORS, page_url)
    next_url = _first_href(soup, _NEXT_SELECTORS, page_url)
    return {"tree": tree, "tabs": tabs, "prev": prev_url, "next": next_url}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# -- merging ----------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _count_nodes(nodes: list[dict]) -> int:
|
|
109
|
+
return sum(1 + _count_nodes(n["children"]) for n in nodes)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _index_tree(nodes: list[dict], index: dict) -> None:
    """Fill ``index`` with normalized URL -> node, walking the tree pre-order.

    The first node seen for a URL wins; nodes without a URL are not indexed
    but their children still are.
    """
    for node in nodes:
        href = node["url"]
        if href:
            key = normalize(href)
            if key not in index:
                index[key] = node
        _index_tree(node["children"], index)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _first_seen_merge(skeleton: list[dict], index: dict, other: list[dict]) -> None:
    """Attach nodes unseen by the skeleton under their (known) parent.

    ``index`` maps normalized URLs to skeleton nodes and is extended with
    every node attached here. Nodes of ``other`` are copied, never shared.
    A node without a URL is skipped, and its children are attached where the
    node itself would have gone.
    """

    def graft(candidates: list[dict], siblings: list[dict]) -> None:
        for candidate in candidates:
            key = normalize(candidate["url"]) if candidate["url"] else None
            if not key:
                # Nothing to match on: hoist the children to this level.
                graft(candidate["children"], siblings)
                continue
            if key in index:
                # Already known: only its subtree may hold new nodes.
                graft(candidate["children"], index[key]["children"])
                continue
            clone = _node(candidate["title"], candidate["url"], [])
            siblings.append(clone)
            index[key] = clone
            graft(candidate["children"], clone["children"])

    graft(other, skeleton)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _attach_tabs(tabs: list[dict], roots: list[dict]) -> list[dict]:
    """Tabs become the top level; existing roots nest under the tab whose
    URL path is their prefix. A root that IS a tab's page merges into it.

    When several tabs are a prefix of a root, the longest (most specific)
    one wins. Taking the first match would let a leading "Home" tab that
    points at the site root swallow every root. Ties keep the earlier tab.
    Roots without a URL, or matching no tab, are returned after the tabs.
    """
    tab_nodes = [_node(t["title"], t["url"], []) for t in tabs]
    # Loop-invariant: each tab's URL without its trailing slash.
    prefixes = [(tab, (tab["url"] or "").rstrip("/")) for tab in tab_nodes]

    leftovers = []
    for root in roots:
        target = None
        if root["url"]:
            # The trailing slash on both sides stops "/api" matching "/api-v2".
            root_path = root["url"].rstrip("/") + "/"
            best = 0
            for tab, tab_path in prefixes:
                if len(tab_path) > best and root_path.startswith(tab_path + "/"):
                    target, best = tab, len(tab_path)
        if target is None:
            leftovers.append(root)
        elif normalize(root["url"]) == normalize(target["url"]):
            target["children"].extend(root["children"])
        else:
            target["children"].append(root)
    return tab_nodes + leftovers
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _prune(nodes: list[dict], written: set[str]) -> list[dict]:
    """Un-crawled nodes keep their label but lose the link; label-only nodes
    without children are dropped.

    ``written`` holds normalized URLs. The input is left alone; a new tree of
    fresh nodes is returned.
    """
    kept = []
    for node in nodes:
        kept_children = _prune(node["children"], written)
        href = node["url"]
        if not (href and normalize(href) in written):
            href = None
        if href is not None or kept_children:
            kept.append(_node(node["title"], href, kept_children))
    return kept
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _traversal(nodes: list[dict]) -> list[str]:
|
|
174
|
+
urls = []
|
|
175
|
+
for node in nodes:
|
|
176
|
+
if node["url"]:
|
|
177
|
+
urls.append(node["url"])
|
|
178
|
+
urls.extend(_traversal(node["children"]))
|
|
179
|
+
return urls
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _chain_sequences(harvests: list[dict], written: set[str]) -> list[str]:
    """Assemble prev/next links into ordered chains of normalized URLs.

    Each page's ``next`` link and each ``prev`` link (read backwards) give a
    successor edge; the first successor recorded for a page wins. Chains
    start at pages that have a successor but no predecessor and are walked in
    the order those heads were first recorded. Only URLs in ``written`` are
    emitted, but the walk continues through the others.
    """
    successor: dict[str, str] = {}
    pointed_at: set[str] = set()

    def link(src: str, dst: str) -> None:
        successor.setdefault(src, dst)
        # Marked even when the edge lost to an earlier one for `src`.
        pointed_at.add(dst)

    for harvest in harvests:
        here = normalize(harvest["page"])
        if harvest.get("next"):
            link(here, normalize(harvest["next"]))
        if harvest.get("prev"):
            link(normalize(harvest["prev"]), here)

    chain: list[str] = []
    seen: set[str] = set()
    for head in successor:
        if head in pointed_at:
            continue
        cursor: str | None = head
        # `seen` also stops the walk when chains merge or loop.
        while cursor and cursor not in seen:
            seen.add(cursor)
            if cursor in written:
                chain.append(cursor)
            cursor = successor.get(cursor)
    return chain
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def merge_harvests(
    harvests: list[dict], written_urls: list[str]
) -> tuple[list[dict], list[str]]:
    """Merge per-page harvests into (nav tree, reading order).

    harvests: [{"page": url, "tree": [...], "tabs": [...], "prev", "next"}]
    written_urls: Pages actually written, in crawl order (original URLs).

    The largest harvested tree is the skeleton; nodes first seen in other
    trees are merged into it, tabs (from the first harvest that has any)
    become the top level, and the result is pruned to written pages. The
    reading order is the prev/next chains, then any remaining written pages
    in nav order, then the rest in crawl order.

    The harvests are not modified: merging happens on a copy of the skeleton
    tree, so they can still be serialized unchanged afterwards.
    """
    from copy import deepcopy

    original = {normalize(u): u for u in written_urls}
    written = set(original)

    skeleton: list[dict] = []
    trees = [h for h in harvests if h["tree"]]
    if trees:
        base = max(trees, key=lambda h: _count_nodes(h["tree"]))
        # _first_seen_merge appends into the skeleton and its nodes; without
        # the copy that would grow the caller's own harvest in place.
        skeleton = deepcopy(base["tree"])
        index: dict = {}
        _index_tree(skeleton, index)
        for harvest in trees:
            if harvest is not base:
                _first_seen_merge(skeleton, index, harvest["tree"])

    tabs = next((h["tabs"] for h in harvests if h["tabs"]), [])
    if tabs:
        skeleton = _attach_tabs(tabs, skeleton)

    nav = _prune(skeleton, written)

    ordered = _chain_sequences(harvests, written)
    # Set for membership tests; `in ordered` on the list was O(n) per URL.
    placed = set(ordered)
    for url in [*_traversal(nav), *written_urls]:
        norm = normalize(url)
        if norm in written and norm not in placed:
            placed.add(norm)
            ordered.append(norm)
    reading_order = [original[norm] for norm in dict.fromkeys(ordered)]
    return nav, reading_order
|
getdocs/output.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Output: Page records to a .md tree with YAML frontmatter, plus the Manifest."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import posixpath
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import asdict, dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from urllib.parse import unquote, urlsplit
|
|
9
|
+
|
|
10
|
+
import yaml
|
|
11
|
+
|
|
12
|
+
from getdocs.urlnorm import normalize
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class PageRecord:
    # One crawled page. The writers serialize every non-None field;
    # FileTreeWriter keeps `markdown` and `html` out of the frontmatter.
    url: str  # page URL; FileTreeWriter.path_for derives the file path from it
    title: str
    markdown: str  # page body; written below the frontmatter
    status: int  # presumably the HTTP status code of the fetch; confirm against the producer
    crawled_at: str  # timestamp as a string; format not visible in this module
    canonical: str | None = None  # omitted from output when None
    html: str | None = None  # raw HTML; when set, FileTreeWriter also writes a sibling .html file
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FileTreeWriter:
|
|
27
|
+
def __init__(self, output_dir: Path):
|
|
28
|
+
self.output_dir = Path(output_dir)
|
|
29
|
+
self.page_count = 0
|
|
30
|
+
|
|
31
|
+
def path_for(self, url: str) -> Path:
|
|
32
|
+
# Decode percent-escapes per segment so %20 doesn't end up in file
|
|
33
|
+
# names (and %2F can't smuggle in extra directory levels).
|
|
34
|
+
segments = [
|
|
35
|
+
unquote(segment).replace("/", "_")
|
|
36
|
+
for segment in urlsplit(url).path.split("/")
|
|
37
|
+
if segment
|
|
38
|
+
]
|
|
39
|
+
path = "/".join(segments) or "index"
|
|
40
|
+
return self.output_dir / f"{path}.md"
|
|
41
|
+
|
|
42
|
+
def write_page(self, record: PageRecord) -> Path:
|
|
43
|
+
target = self.path_for(record.url)
|
|
44
|
+
target.parent.mkdir(parents=True, exist_ok=True)
|
|
45
|
+
|
|
46
|
+
frontmatter = {
|
|
47
|
+
k: v
|
|
48
|
+
for k, v in asdict(record).items()
|
|
49
|
+
if k not in ("markdown", "html") and v is not None
|
|
50
|
+
}
|
|
51
|
+
target.write_text(
|
|
52
|
+
"---\n"
|
|
53
|
+
+ yaml.safe_dump(frontmatter, sort_keys=False)
|
|
54
|
+
+ "---\n\n"
|
|
55
|
+
+ record.markdown
|
|
56
|
+
+ "\n"
|
|
57
|
+
)
|
|
58
|
+
if record.html is not None:
|
|
59
|
+
target.with_suffix(".html").write_text(record.html)
|
|
60
|
+
self.page_count += 1
|
|
61
|
+
return target
|
|
62
|
+
|
|
63
|
+
def write_manifest(
|
|
64
|
+
self,
|
|
65
|
+
seeds: list[str],
|
|
66
|
+
errors: list[dict] | None = None,
|
|
67
|
+
truncated: bool = False,
|
|
68
|
+
skipped: list[dict] | None = None,
|
|
69
|
+
shells: list[str] | None = None,
|
|
70
|
+
nav: list[dict] | None = None,
|
|
71
|
+
reading_order: list[str] | None = None,
|
|
72
|
+
media_skipped: list[dict] | None = None,
|
|
73
|
+
) -> Path:
|
|
74
|
+
target = self.output_dir / "crawl.json"
|
|
75
|
+
target.parent.mkdir(parents=True, exist_ok=True)
|
|
76
|
+
target.write_text(
|
|
77
|
+
json.dumps(
|
|
78
|
+
{
|
|
79
|
+
"seeds": seeds,
|
|
80
|
+
"page_count": self.page_count,
|
|
81
|
+
"errors": errors or [],
|
|
82
|
+
"skipped": skipped or [],
|
|
83
|
+
"shells": shells or [],
|
|
84
|
+
"truncated": truncated,
|
|
85
|
+
"nav": nav or [],
|
|
86
|
+
"reading_order": reading_order or [],
|
|
87
|
+
"media_skipped": media_skipped or [],
|
|
88
|
+
},
|
|
89
|
+
indent=2,
|
|
90
|
+
)
|
|
91
|
+
+ "\n"
|
|
92
|
+
)
|
|
93
|
+
return target
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# A parenthesized absolute http(s) URL, i.e. the destination of a Markdown link.
_MD_LINK_RE = re.compile(r"\((https?://[^)\s]+)\)")


def relink_pages(writer: FileTreeWriter, written_urls: list[str]) -> None:
    """Rewrite links between crawled Pages into relative .md paths.

    Runs at Crawl end, when the full set of written Pages is known. Links
    to anything else — external sites, un-crawled pages, hotlinked media —
    keep their absolute URLs.

    Rewritten paths are percent-encoded: file names are stored decoded, so
    they can contain spaces or parentheses, which would otherwise end the
    Markdown link destination early. Fragments are carried over unchanged.
    """
    from urllib.parse import quote

    root = writer.output_dir
    targets = {
        normalize(url): writer.path_for(url).relative_to(root).as_posix()
        for url in written_urls
    }

    for url in written_urls:
        page_path = writer.path_for(url)
        page_dir = page_path.relative_to(root).parent.as_posix()

        def rewrite(match):
            link, _, fragment = match.group(1).partition("#")
            target = targets.get(normalize(link))
            if target is None:
                return match.group(0)  # not a crawled page: leave as is
            relative = quote(posixpath.relpath(target, start=page_dir))
            return f"({relative}{'#' + fragment if fragment else ''})"

        text = page_path.read_text()
        rewritten = _MD_LINK_RE.sub(rewrite, text)
        if rewritten != text:
            page_path.write_text(rewritten)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class AssetStore:
    """Downloaded Assets land under _media/<host>/<decoded path>."""

    def __init__(self, output_dir: Path):
        self.output_dir = Path(output_dir)

    @staticmethod
    def _safe_name(name: str) -> str:
        """Make ``name`` usable as exactly one path component."""
        # Separators and NUL must not survive inside a single name.
        for unsafe in ("/", "\\", "\x00"):
            name = name.replace(unsafe, "_")
        # "." and ".." would climb out of _media/<host>/.
        if name in (".", ".."):
            return "_" * len(name)
        return name

    def save(self, url: str, body: bytes) -> str:
        """Write ``body`` for ``url`` and return its path relative to
        ``output_dir`` (POSIX separators).

        A URL without a path is stored as ``asset``. Dot segments and
        encoded separators in the URL, and a dot-only host, are neutralized
        so the file cannot land outside ``_media/<host>/``.
        """
        parts = urlsplit(url)
        segments = [
            self._safe_name(unquote(s)) for s in parts.path.split("/") if s
        ] or ["asset"]
        relpath = "/".join(["_media", self._safe_name(parts.netloc), *segments])
        target = self.output_dir / relpath
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(body)
        return relpath
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class JsonlWriter:
    """One typed JSON record per line; the Manifest is the final record.

    This stream is the process-boundary protocol the future API service
    consumes (ADR-0002) — record shape changes are contract changes.
    """

    def __init__(self, stream):
        self.stream = stream
        self.page_count = 0

    def _emit(self, record: dict) -> None:
        """Serialize ``record`` as one line and flush it right away."""
        line = json.dumps(record, ensure_ascii=False)
        self.stream.write(f"{line}\n")
        self.stream.flush()

    def write_page(self, record: PageRecord) -> None:
        """Emit a "page" record holding the record's non-None fields."""
        payload = {"type": "page"}
        for name, value in asdict(record).items():
            if value is not None:
                payload[name] = value
        self._emit(payload)
        self.page_count += 1

    def write_manifest(
        self,
        seeds: list[str],
        errors: list[dict] | None = None,
        truncated: bool = False,
        skipped: list[dict] | None = None,
        shells: list[str] | None = None,
        nav: list[dict] | None = None,
        reading_order: list[str] | None = None,
        media_skipped: list[dict] | None = None,
    ) -> None:
        """Emit the "manifest" record; omitted lists are written as []."""
        manifest = {
            "type": "manifest",
            "seeds": seeds,
            "page_count": self.page_count,
            "errors": errors or [],
            "skipped": skipped or [],
            "shells": shells or [],
            "truncated": truncated,
            "nav": nav or [],
            "reading_order": reading_order or [],
            "media_skipped": media_skipped or [],
        }
        self._emit(manifest)
|