getdocs 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- getdocs/__init__.py +0 -0
- getdocs/__main__.py +3 -0
- getdocs/api.py +95 -0
- getdocs/cli.py +220 -0
- getdocs/config.py +36 -0
- getdocs/engine.py +418 -0
- getdocs/extract.py +190 -0
- getdocs/identity.py +32 -0
- getdocs/jobs.py +204 -0
- getdocs/navharvest.py +242 -0
- getdocs/output.py +191 -0
- getdocs/scope.py +84 -0
- getdocs/sitemap.py +35 -0
- getdocs/source.py +238 -0
- getdocs/urlnorm.py +34 -0
- getdocs-0.1.0.dist-info/METADATA +169 -0
- getdocs-0.1.0.dist-info/RECORD +21 -0
- getdocs-0.1.0.dist-info/WHEEL +5 -0
- getdocs-0.1.0.dist-info/entry_points.txt +2 -0
- getdocs-0.1.0.dist-info/licenses/LICENSE +21 -0
- getdocs-0.1.0.dist-info/top_level.txt +1 -0
getdocs/engine.py
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
"""Engine: shallow Scrapy glue around the deep modules.
|
|
2
|
+
|
|
3
|
+
Runs one Crawl per process via CrawlerProcess — the Twisted reactor starts
|
|
4
|
+
once and never restarts (ADR-0002), which is why the future API service
|
|
5
|
+
spawns this as a subprocess per Crawl.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import json
|
|
10
|
+
import posixpath
|
|
11
|
+
import sys
|
|
12
|
+
from dataclasses import replace
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from urllib.parse import urljoin, urlsplit
|
|
16
|
+
|
|
17
|
+
import scrapy
|
|
18
|
+
from scrapy.crawler import CrawlerProcess
|
|
19
|
+
from scrapy.downloadermiddlewares.retry import get_retry_request
|
|
20
|
+
from scrapy.exceptions import CloseSpider, IgnoreRequest
|
|
21
|
+
from scrapy.http import TextResponse
|
|
22
|
+
from scrapy.spidermiddlewares.httperror import HttpError
|
|
23
|
+
|
|
24
|
+
from getdocs.config import CrawlConfig
|
|
25
|
+
from getdocs.extract import extract_page, is_shell
|
|
26
|
+
from getdocs.identity import build_user_agent
|
|
27
|
+
from getdocs.navharvest import harvest_nav, merge_harvests
|
|
28
|
+
from getdocs.output import AssetStore, FileTreeWriter, JsonlWriter, PageRecord, relink_pages
|
|
29
|
+
from getdocs.scope import Scope
|
|
30
|
+
from getdocs.sitemap import parse_robots_sitemaps, parse_sitemap_xml
|
|
31
|
+
from getdocs.urlnorm import normalize
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def state_file_for(config: CrawlConfig) -> Path:
    """Return the path of the crawl-state file kept under the output directory."""
    state_dir = config.output_dir / ".getdocs"
    return state_dir / "crawl-state.json"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def playwright_available() -> bool:
    """Report whether the ``scrapy_playwright`` module can be located."""
    from importlib.util import find_spec

    spec = find_spec("scrapy_playwright")
    return spec is not None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
_SHELL_LATCH_THRESHOLD = 2 # Shells on one host before it renders everything
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class RetryAfterMiddleware:
    """Retry 429 responses no sooner than the server's Retry-After asks.

    Scrapy's stock RetryMiddleware retries 429 immediately, which is exactly
    what a rate-limiting server is telling us not to do — so 429 is removed
    from its codes and handled here with an async sleep (asyncio reactor).
    """

    @staticmethod
    def _retry_after_seconds(raw, default: float = 1.0) -> float:
        """Convert a Retry-After header value into a wait in seconds.

        Accepts both forms a server may send: a number of seconds, or an
        HTTP-date (waited for relative to now; a date already in the past
        yields 0.0).

        Args:
            raw: Header value as ``bytes`` or ``str``, or ``None`` if absent.
            default: Wait used when the header is absent or unusable.

        Returns:
            A finite, non-negative number of seconds.
        """
        import math
        from email.utils import parsedate_to_datetime

        if raw is None:
            return default
        text = raw.decode("latin-1") if isinstance(raw, bytes) else str(raw)
        text = text.strip()
        try:
            seconds = float(text)
        except ValueError:
            # Not a number: try the HTTP-date form.
            try:
                when = parsedate_to_datetime(text)
            except (TypeError, ValueError):
                return default
            if when.tzinfo is None:
                when = when.replace(tzinfo=timezone.utc)
            remaining = (when - datetime.now(timezone.utc)).total_seconds()
            return max(remaining, 0.0)
        # "inf", "nan" and negative numbers parse as floats but are not
        # usable sleep durations.
        if not math.isfinite(seconds) or seconds < 0:
            return default
        return seconds

    async def process_response(self, request, response, spider):
        """Pass non-429 responses through; otherwise wait, then return a retry.

        Returns the original response when ``get_retry_request`` yields
        ``None`` (retries exhausted).
        """
        if response.status != 429:
            return response
        retry = get_retry_request(request, spider=spider, reason="429 Too Many Requests")
        if retry is None:
            return response  # retries exhausted; falls through to the errback
        delay = self._retry_after_seconds(response.headers.get("Retry-After"))
        await asyncio.sleep(delay)
        return retry
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class _CrawlSpider(scrapy.Spider):
|
|
70
|
+
name = "getdocs"
|
|
71
|
+
|
|
72
|
+
def __init__(
    self,
    config: CrawlConfig,
    writer,
    outcome: dict,
    resume_state: dict | None = None,
    render_enabled: bool = False,
    **kwargs,
):
    """Bind one Crawl's config, writer and shared outcome dict to the spider.

    When ``resume_state`` is given, its ``enqueued`` and ``written`` entries
    seed the dedup sets; otherwise both start empty.
    """
    super().__init__(**kwargs)
    self.config = config
    self.writer = writer
    self.outcome = outcome
    self.resume_state = resume_state
    self.render_enabled = render_enabled
    # Shell responses counted per host.
    self.shell_hosts: dict[str, int] = {}

    # --download-media bookkeeping. Writing a Page waits for its Assets so
    # that failed or oversized ones keep their absolute URLs in the markdown.
    self.asset_store = AssetStore(config.output_dir)
    self.asset_results: dict[str, str | None] = {}
    self.asset_inflight: set[str] = set()
    self.pending_pages: list[dict] = []
    self.media_cap_bytes = int(config.media_max_size * 1024 * 1024)

    # Frontier state saved by closed(): yielded URL -> hop count. Entries
    # leave only when the URL is written or errored, so an interrupted run
    # keeps its in-flight work.
    self.pending: dict[str, int] = {}
    self.scope = Scope.from_seeds(
        config.seeds,
        allow_backward=config.allow_backward,
        allow_subdomains=config.allow_subdomains,
        include_paths=config.include_paths,
        exclude_paths=config.exclude_paths,
    )
    self.follow_links = config.sitemap != "only"
    if resume_state:
        self.enqueued: set[str] = set(resume_state["enqueued"])
        self.written: set[str] = set(resume_state["written"])
    else:
        self.enqueued: set[str] = set()
        self.written: set[str] = set()
    self.sitemaps_fetched: set[str] = set()
    self.crawl_sequence: list[str] = outcome["crawl_sequence"]
|
|
108
|
+
|
|
109
|
+
# -- discovery ---------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
async def start(self):
    """Yield the Crawl's first requests.

    Unless sitemaps are "off", each seed origin gets a robots.txt and a
    /sitemap.xml request. A resumed Crawl then re-issues its saved pending
    URLs; a fresh one requests the seeds, except in sitemap "only" mode.
    """
    if self.config.sitemap != "off":
        origins = set()
        for seed in self.config.seeds:
            origins.add(f"{urlsplit(seed).scheme}://{urlsplit(seed).netloc}")
        for origin in sorted(origins):
            yield scrapy.Request(urljoin(origin, "/robots.txt"), callback=self.parse_robots)
            for sitemap_request in self._sitemap_requests([urljoin(origin, "/sitemap.xml")]):
                yield sitemap_request

    if self.resume_state:
        for url, hops in self.resume_state["pending"].items():
            self.pending[url] = hops
            yield self._page_request(url, hops=hops)
        return

    if self.config.sitemap == "only":
        return
    for seed in self.config.seeds:
        self.enqueued.add(normalize(seed))
        self.pending[seed] = 0
        yield self._page_request(seed, hops=0)
|
|
127
|
+
|
|
128
|
+
def parse_robots(self, response):
    """Request every sitemap listed in the fetched robots.txt."""
    sitemap_urls = parse_robots_sitemaps(response.text)
    yield from self._sitemap_requests(sitemap_urls)
|
|
130
|
+
|
|
131
|
+
def parse_sitemap(self, response):
    """Request nested sitemaps, then enqueue the Pages this sitemap lists."""
    listed_pages, child_sitemaps = parse_sitemap_xml(response.text)
    yield from self._sitemap_requests(child_sitemaps)
    # Pages found through a sitemap count as depth-0 seeds: Scope still
    # filters them, but --depth never rules them out.
    for page_url in listed_pages:
        yield from self._enqueue_page(page_url, hops=0)
|
|
138
|
+
|
|
139
|
+
# -- fetching ----------------------------------------------------------
|
|
140
|
+
|
|
141
|
+
def parse_page(self, response):
    """Write one fetched Page and enqueue the links it contains.

    Steps: skip non-text bodies; escalate an unrendered Shell to the browser
    when rendering is enabled in "auto" mode; otherwise extract the Page,
    request its Assets (the write is deferred until they resolve), settle the
    ``pending`` entry, and follow links within the depth budget.

    Raises:
        CloseSpider: when ``config.limit`` Pages have already been written.
    """
    # NOTE(review): block nesting was reconstructed from a listing with its
    # indentation stripped — confirm that the whole write path is guarded by
    # the ``written`` check and that the pending entry is settled after it.
    if not isinstance(response, TextResponse):
        # A binary body has no ``.text``: extracting would raise inside the
        # callback after the URL was already marked written, and its pending
        # entry would never be settled. Record the skip instead.
        self.pending.pop(response.request.url, None)
        self.outcome["skipped"].append(
            {"url": response.url, "reason": "non-text response"}
        )
        self._progress()
        return

    rendered = response.meta.get("playwright", False)
    shellish = not rendered and is_shell(response.text)
    if shellish and self.config.render == "auto" and self.render_enabled:
        # Escalate: re-fetch through the browser. The pending entry
        # survives until the rendered version is written.
        host = urlsplit(response.url).netloc
        self.shell_hosts[host] = self.shell_hosts.get(host, 0) + 1
        yield scrapy.Request(
            response.request.url,
            callback=self.parse_page,
            errback=self.on_page_error,
            meta={"hops": response.meta["hops"], "playwright": True},
            dont_filter=True,
        )
        return

    norm = normalize(response.url)
    if norm not in self.written:
        if self.config.limit and len(self.written) >= self.config.limit:
            self.outcome["truncated"] = True
            raise CloseSpider("page limit reached")
        self.written.add(norm)
        extracted = extract_page(response.text, response.url, selector=self.config.selector)
        record = PageRecord(
            url=response.url,
            title=extracted.title,
            markdown=extracted.markdown,
            status=response.status,
            crawled_at=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            canonical=extracted.canonical,
            html=response.text if self.config.keep_html else None,
        )
        if shellish:
            # Rendering is off or unavailable: written as-is, flagged.
            self.outcome["shells"].append(response.url)
        self.crawl_sequence.append(response.url)
        self.outcome["harvests"].append(
            {"page": response.url, **harvest_nav(response.text, response.url)}
        )

        # dict.fromkeys de-duplicates while keeping first-seen order.
        assets = list(dict.fromkeys(extracted.assets)) if self.config.download_media else []
        waiting = set()
        for asset_url in assets:
            if asset_url in self.asset_results:
                continue  # already resolved by an earlier Page
            waiting.add(asset_url)
            if asset_url not in self.asset_inflight:
                self.asset_inflight.add(asset_url)
                yield scrapy.Request(
                    asset_url,
                    callback=self.on_asset,
                    errback=self.on_asset_error,
                    meta={"download_maxsize": self.media_cap_bytes},
                    dont_filter=True,
                )
        if waiting:
            self.pending_pages.append(
                {"record": record, "assets": assets, "waiting": waiting}
            )
        else:
            self._finalize_page(record, assets)
    self.pending.pop(response.request.url, None)

    if not self.follow_links:
        return
    hops = response.meta["hops"]
    if self.config.depth and hops + 1 > self.config.depth:
        return
    for href in response.css("a::attr(href)").getall():
        try:
            target = response.urljoin(href.strip())
        except ValueError:
            # A malformed href must not drop the Page's remaining links.
            continue
        yield from self._enqueue_page(target, hops=hops + 1)
|
|
214
|
+
|
|
215
|
+
# -- assets (--download-media; Scope never applies, ADR-0005) -----------
|
|
216
|
+
|
|
217
|
+
def on_asset(self, response):
    """Save a downloaded Asset and resolve it for the Pages waiting on it."""
    asset_url = response.request.url
    stored_at = self.asset_store.save(asset_url, response.body)
    self._resolve_asset(asset_url, stored_at)
|
|
220
|
+
|
|
221
|
+
def on_asset_error(self, failure):
    """Record a failed Asset download and resolve it with no local path."""
    asset_url = failure.request.url
    message = failure.getErrorMessage()
    if not message:
        message = failure.type.__name__
    # The recorded reason is capped at 200 characters.
    entry = {"url": asset_url, "reason": message[:200]}
    self.outcome["media_skipped"].append(entry)
    self._resolve_asset(asset_url, None)
|
|
226
|
+
|
|
227
|
+
def _resolve_asset(self, url: str, relpath: str | None) -> None:
|
|
228
|
+
self.asset_results[url] = relpath
|
|
229
|
+
self.asset_inflight.discard(url)
|
|
230
|
+
still_waiting = []
|
|
231
|
+
for page in self.pending_pages:
|
|
232
|
+
page["waiting"].discard(url)
|
|
233
|
+
if page["waiting"]:
|
|
234
|
+
still_waiting.append(page)
|
|
235
|
+
else:
|
|
236
|
+
self._finalize_page(page["record"], page["assets"])
|
|
237
|
+
self.pending_pages = still_waiting
|
|
238
|
+
|
|
239
|
+
def _finalize_page(self, record: PageRecord, assets: list[str]) -> None:
|
|
240
|
+
markdown = record.markdown
|
|
241
|
+
for url in assets:
|
|
242
|
+
relpath = self.asset_results.get(url)
|
|
243
|
+
if relpath:
|
|
244
|
+
markdown = markdown.replace(url, self._asset_link(record.url, relpath))
|
|
245
|
+
if markdown is not record.markdown:
|
|
246
|
+
record = replace(record, markdown=markdown)
|
|
247
|
+
self.writer.write_page(record)
|
|
248
|
+
self._progress()
|
|
249
|
+
|
|
250
|
+
def _asset_link(self, page_url: str, asset_relpath: str) -> str:
    """Return the link a Page's markdown should use for a saved Asset.

    files mode: relative to the directory of the Page's .md file; jsonl
    mode: the root-relative Asset path as-is (records have no on-disk
    location).
    """
    writer = self.writer
    if not isinstance(writer, FileTreeWriter):
        return asset_relpath
    page_path = writer.path_for(page_url).relative_to(writer.output_dir)
    page_dir = page_path.parent.as_posix()
    return posixpath.relpath(asset_relpath, start=page_dir)
|
|
257
|
+
|
|
258
|
+
def on_page_error(self, failure):
    """Settle a failed Page request and file it under errors or skipped."""
    request_url = failure.request.url
    self.pending.pop(request_url, None)

    is_http_error = failure.check(HttpError)
    if not is_http_error and failure.check(IgnoreRequest):
        # HttpError subclasses IgnoreRequest, so only true filtering gets
        # here — robots.txt telling us not to fetch.
        self.outcome["skipped"].append({"url": request_url, "reason": "robots.txt"})
        self._progress()
        return

    if is_http_error:
        bad_response = failure.value.response
        entry = {
            "url": bad_response.url,
            "status": bad_response.status,
            "reason": f"HTTP {bad_response.status}",
        }
    else:
        entry = {"url": request_url, "status": None, "reason": failure.type.__name__}
    self.outcome["errors"].append(entry)
    self._progress()
|
|
275
|
+
|
|
276
|
+
# -- helpers -----------------------------------------------------------
|
|
277
|
+
|
|
278
|
+
def _progress(self):
|
|
279
|
+
done = len(self.written) + len(self.outcome["errors"])
|
|
280
|
+
print(
|
|
281
|
+
f"[getdocs] pages={len(self.written)} "
|
|
282
|
+
f"pending={max(len(self.enqueued) - done, 0)} "
|
|
283
|
+
f"errors={len(self.outcome['errors'])}",
|
|
284
|
+
file=sys.stderr,
|
|
285
|
+
flush=True,
|
|
286
|
+
)
|
|
287
|
+
|
|
288
|
+
def _page_request(self, url: str, hops: int) -> scrapy.Request:
    """Build the request for one Page, flagged for the browser when it should render."""
    request_meta = {"hops": hops}
    wants_browser = self._should_render(url)
    if wants_browser:
        request_meta["playwright"] = True
    return scrapy.Request(
        url,
        callback=self.parse_page,
        errback=self.on_page_error,
        meta=request_meta,
    )
|
|
295
|
+
|
|
296
|
+
def _should_render(self, url: str) -> bool:
|
|
297
|
+
if not self.render_enabled:
|
|
298
|
+
return False
|
|
299
|
+
if self.config.render == "always":
|
|
300
|
+
return True
|
|
301
|
+
host = urlsplit(url).netloc
|
|
302
|
+
return self.shell_hosts.get(host, 0) >= _SHELL_LATCH_THRESHOLD
|
|
303
|
+
|
|
304
|
+
def _enqueue_page(self, url: str, hops: int):
|
|
305
|
+
if not self.scope.allows(url):
|
|
306
|
+
return
|
|
307
|
+
norm = normalize(url)
|
|
308
|
+
if norm in self.enqueued:
|
|
309
|
+
return
|
|
310
|
+
self.enqueued.add(norm)
|
|
311
|
+
self.pending[url] = hops
|
|
312
|
+
yield self._page_request(url, hops=hops)
|
|
313
|
+
|
|
314
|
+
def _sitemap_requests(self, urls: list[str]):
|
|
315
|
+
for url in urls:
|
|
316
|
+
if url not in self.sitemaps_fetched:
|
|
317
|
+
self.sitemaps_fetched.add(url)
|
|
318
|
+
yield scrapy.Request(url, callback=self.parse_sitemap)
|
|
319
|
+
|
|
320
|
+
def closed(self, reason):
    """Write any still-waiting Pages, then save the crawl state as JSON.

    ``reason`` is not used.
    """
    # Pages blocked on Assets are written with whatever resolved so far;
    # Assets that never resolved keep their absolute URLs.
    for entry in self.pending_pages:
        self._finalize_page(entry["record"], entry["assets"])
    self.pending_pages = []

    target = state_file_for(self.config)
    target.parent.mkdir(parents=True, exist_ok=True)
    outcome = self.outcome
    snapshot = {
        "seeds": self.config.seeds,
        "enqueued": sorted(self.enqueued),
        "written": sorted(self.written),
        "pending": self.pending,
        "errors": outcome["errors"],
        "skipped": outcome["skipped"],
        "shells": outcome["shells"],
        "crawl_sequence": self.crawl_sequence,
        "harvests": outcome["harvests"],
        "media_skipped": outcome["media_skipped"],
    }
    target.write_text(json.dumps(snapshot))
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def run_crawl(config: CrawlConfig) -> int:
|
|
347
|
+
"""Run a Crawl to completion; returns the number of Pages produced."""
|
|
348
|
+
if config.format == "jsonl":
|
|
349
|
+
writer = JsonlWriter(sys.stdout)
|
|
350
|
+
else:
|
|
351
|
+
writer = FileTreeWriter(config.output_dir)
|
|
352
|
+
resume_state = None
|
|
353
|
+
if config.resume:
|
|
354
|
+
resume_state = json.loads(state_file_for(config).read_text())
|
|
355
|
+
writer.page_count = len(resume_state["written"])
|
|
356
|
+
outcome = {
|
|
357
|
+
"errors": list(resume_state["errors"]) if resume_state else [],
|
|
358
|
+
"skipped": list(resume_state["skipped"]) if resume_state else [],
|
|
359
|
+
"shells": list(resume_state.get("shells", [])) if resume_state else [],
|
|
360
|
+
"harvests": list(resume_state.get("harvests", [])) if resume_state else [],
|
|
361
|
+
"media_skipped": list(resume_state.get("media_skipped", [])) if resume_state else [],
|
|
362
|
+
# Shared with the spider, which appends as Pages are written.
|
|
363
|
+
"crawl_sequence": list(resume_state.get("crawl_sequence", [])) if resume_state else [],
|
|
364
|
+
"truncated": False, # recomputed by this run: a resumed Crawl may finish
|
|
365
|
+
}
|
|
366
|
+
render_enabled = config.render != "never" and playwright_available()
|
|
367
|
+
if config.render != "never" and not render_enabled:
|
|
368
|
+
print(
|
|
369
|
+
"note: scrapy-playwright is not installed — JS rendering disabled; "
|
|
370
|
+
"Shell pages will be written as-is and flagged in the Manifest",
|
|
371
|
+
file=sys.stderr,
|
|
372
|
+
)
|
|
373
|
+
settings = {
|
|
374
|
+
"LOG_LEVEL": "ERROR",
|
|
375
|
+
# Identify honestly as getdocs (not generic Scrapy); robots.txt matching
|
|
376
|
+
# uses this UA too. --contact appends the user's email/URL.
|
|
377
|
+
"USER_AGENT": build_user_agent(config.contact, config.user_agent),
|
|
378
|
+
"REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
|
|
379
|
+
"RETRY_TIMES": 2,
|
|
380
|
+
# 429 is handled by RetryAfterMiddleware, which honors Retry-After.
|
|
381
|
+
"RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408],
|
|
382
|
+
"DOWNLOADER_MIDDLEWARES": {RetryAfterMiddleware: 560},
|
|
383
|
+
"ROBOTSTXT_OBEY": not config.ignore_robots,
|
|
384
|
+
"DOWNLOAD_DELAY": config.delay,
|
|
385
|
+
"AUTOTHROTTLE_ENABLED": config.delay > 0,
|
|
386
|
+
"AUTOTHROTTLE_START_DELAY": config.delay or 1.0,
|
|
387
|
+
"CONCURRENT_REQUESTS_PER_DOMAIN": config.concurrency,
|
|
388
|
+
}
|
|
389
|
+
if render_enabled:
|
|
390
|
+
settings["DOWNLOAD_HANDLERS"] = {
|
|
391
|
+
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
|
|
392
|
+
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
|
|
393
|
+
}
|
|
394
|
+
settings["TWISTED_REACTOR"] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
|
|
395
|
+
process = CrawlerProcess(settings=settings, install_root_handler=False)
|
|
396
|
+
process.crawl(
|
|
397
|
+
_CrawlSpider,
|
|
398
|
+
config=config,
|
|
399
|
+
writer=writer,
|
|
400
|
+
outcome=outcome,
|
|
401
|
+
resume_state=resume_state,
|
|
402
|
+
render_enabled=render_enabled,
|
|
403
|
+
)
|
|
404
|
+
process.start()
|
|
405
|
+
if isinstance(writer, FileTreeWriter):
|
|
406
|
+
relink_pages(writer, outcome["crawl_sequence"])
|
|
407
|
+
nav, reading_order = merge_harvests(outcome["harvests"], outcome["crawl_sequence"])
|
|
408
|
+
writer.write_manifest(
|
|
409
|
+
seeds=config.seeds,
|
|
410
|
+
errors=outcome["errors"],
|
|
411
|
+
truncated=outcome["truncated"],
|
|
412
|
+
skipped=outcome["skipped"],
|
|
413
|
+
shells=outcome["shells"],
|
|
414
|
+
nav=nav,
|
|
415
|
+
reading_order=reading_order,
|
|
416
|
+
media_skipped=outcome["media_skipped"],
|
|
417
|
+
)
|
|
418
|
+
return writer.page_count
|
getdocs/extract.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Extract: HTML in, (title, markdown, canonical) out.
|
|
2
|
+
|
|
3
|
+
Selector-first pipeline: a user-supplied selector wins, then the known
|
|
4
|
+
content containers of common docs generators, then semantic candidates.
|
|
5
|
+
Readability extraction (trafilatura) is a last resort for pages with no
|
|
6
|
+
recognizable content root — docs sites are structured, so exploiting
|
|
7
|
+
that structure beats statistical extraction (and never eats code blocks).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from urllib.parse import urljoin, urlsplit
|
|
13
|
+
|
|
14
|
+
from bs4 import BeautifulSoup
|
|
15
|
+
from markdownify import MarkdownConverter, markdownify
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
|
|
19
|
+
class ExtractedPage:
|
|
20
|
+
title: str
|
|
21
|
+
markdown: str
|
|
22
|
+
canonical: str | None = None
|
|
23
|
+
assets: tuple[str, ...] = () # absolute URLs of referenced images/documents
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
_DOC_EXTENSIONS = (
|
|
27
|
+
".pdf", ".zip", ".tar.gz", ".tgz", ".7z", ".dmg", ".pkg", ".msi", ".exe", ".whl",
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
_GENERATOR_SELECTORS = [
|
|
32
|
+
"div.theme-doc-markdown", # Docusaurus
|
|
33
|
+
"article.md-content__inner", # MkDocs Material
|
|
34
|
+
'div.body[role="main"]', # Sphinx
|
|
35
|
+
"#content-area", # Mintlify
|
|
36
|
+
]
|
|
37
|
+
_SEMANTIC_SELECTORS = ["main", "article", '[role="main"]']
|
|
38
|
+
|
|
39
|
+
_NOISE_TAGS = ("nav", "aside", "header", "footer", "script", "style", "noscript")
|
|
40
|
+
_NOISE_CLASS_RE = re.compile(r"breadcrumb|table-of-contents|(^|[-_])toc($|[-_])")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _code_language(el) -> str:
|
|
44
|
+
"""Language hint for a <pre> block: language-x on it or its <code>
|
|
45
|
+
children, or Sphinx-style highlight-x on an ancestor."""
|
|
46
|
+
candidates = [el, *el.find_all("code"), *el.parents]
|
|
47
|
+
for node in candidates:
|
|
48
|
+
for cls in node.get("class") or []:
|
|
49
|
+
if cls.startswith("language-"):
|
|
50
|
+
return cls.removeprefix("language-")
|
|
51
|
+
if cls.startswith("highlight-"):
|
|
52
|
+
lang = cls.removeprefix("highlight-")
|
|
53
|
+
if lang not in ("default", "notranslate"):
|
|
54
|
+
return lang
|
|
55
|
+
return ""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
_converter = MarkdownConverter(heading_style="ATX", code_language_callback=_code_language)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Known icon shapes (Material Design icon path prefixes, as emitted by
|
|
62
|
+
# mkdocs-material twemoji spans). Inline SVGs carry no text, so without
|
|
63
|
+
# this, checkmark columns in comparison tables extract as empty cells.
|
|
64
|
+
_SVG_GLYPHS = {
|
|
65
|
+
"M21 7 9 19l-5.5-5.5": "✓", # mdi-check
|
|
66
|
+
"M9 20.42 2.79 14.21": "✓", # mdi-check-bold
|
|
67
|
+
"M19 6.41 17.59 5 12 10.59": "✗", # mdi-close
|
|
68
|
+
"M20 6.91 17.09 4 12 9.09": "✗", # mdi-close-thick
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _svg_to_text(root) -> None:
|
|
73
|
+
for svg in root.find_all("svg"):
|
|
74
|
+
label = svg.get("aria-label")
|
|
75
|
+
if not label:
|
|
76
|
+
title = svg.find("title")
|
|
77
|
+
label = title.get_text(strip=True) if title else None
|
|
78
|
+
if not label:
|
|
79
|
+
path = svg.find("path")
|
|
80
|
+
d = (path.get("d") or "") if path else ""
|
|
81
|
+
label = next((g for prefix, g in _SVG_GLYPHS.items() if d.startswith(prefix)), None)
|
|
82
|
+
if label:
|
|
83
|
+
svg.replace_with(label)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _absolutize_urls(root, page_url: str) -> list[str]:
|
|
87
|
+
"""Rewrite hrefs/srcs absolute against the page URL — relative values
|
|
88
|
+
would point at nothing in the output tree (hotlink default, ADR-0005).
|
|
89
|
+
Returns the Asset URLs found: images, then document downloads."""
|
|
90
|
+
images, documents = [], []
|
|
91
|
+
for tag in root.find_all(href=True):
|
|
92
|
+
tag["href"] = urljoin(page_url, tag["href"])
|
|
93
|
+
if tag.name == "a" and urlsplit(tag["href"]).path.lower().endswith(_DOC_EXTENSIONS):
|
|
94
|
+
documents.append(tag["href"])
|
|
95
|
+
for tag in root.find_all(src=True):
|
|
96
|
+
tag["src"] = urljoin(page_url, tag["src"])
|
|
97
|
+
if tag.name == "img":
|
|
98
|
+
images.append(tag["src"])
|
|
99
|
+
for tag in root.find_all(srcset=True):
|
|
100
|
+
tag["srcset"] = ", ".join(
|
|
101
|
+
" ".join([urljoin(page_url, part.strip().split()[0]), *part.strip().split()[1:]])
|
|
102
|
+
for part in tag["srcset"].split(",")
|
|
103
|
+
if part.strip()
|
|
104
|
+
)
|
|
105
|
+
return images + documents
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _strip_noise(root) -> None:
    """Remove non-content elements under ``root``, in place.

    Drops every ``_NOISE_TAGS`` element, then every remaining element with
    a class matching ``_NOISE_CLASS_RE``.
    """
    for noisy in root.find_all(_NOISE_TAGS):
        noisy.decompose()
    # Collect all matches before removing any of them.
    flagged = []
    for el in root.find_all(True):
        class_names = el.get("class") or []
        if any(_NOISE_CLASS_RE.search(name) for name in class_names):
            flagged.append(el)
    for el in flagged:
        el.decompose()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _find_content_root(soup, selector: str | None):
|
|
121
|
+
if selector:
|
|
122
|
+
root = soup.select_one(selector)
|
|
123
|
+
if root is not None:
|
|
124
|
+
return root
|
|
125
|
+
for sel in [*_GENERATOR_SELECTORS, *_SEMANTIC_SELECTORS]:
|
|
126
|
+
root = soup.select_one(sel)
|
|
127
|
+
if root is not None:
|
|
128
|
+
return root
|
|
129
|
+
return None
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _readability_markdown(html: str) -> str | None:
    """Return trafilatura's markdown extraction of ``html``, tables included.

    trafilatura is imported inside the function, on each call.
    """
    from trafilatura import extract

    return extract(html, output_format="markdown", include_tables=True)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
_ROOT_DIV_IDS = ["root", "app", "__next", "___gatsby", "__nuxt"]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def is_shell(html: str) -> bool:
    """True when a response is an unhydrated client-side app frame rather
    than real content — the signal that triggers render escalation.

    Returns False when there is no <body> or when more than 200 characters
    of text remain after dropping script/noscript/style/template. Otherwise
    True on any of: an empty ``_ROOT_DIV_IDS`` div, a <noscript> mentioning
    "javascript", or scripts present with under 30 characters of text.
    """
    soup = BeautifulSoup(html, "html.parser")
    body = soup.body
    if body is None:
        return False

    # Read both signals before the tags that carry them are removed.
    warns_about_js = False
    for noscript in body.find_all("noscript"):
        if "javascript" in noscript.get_text().lower():
            warns_about_js = True
            break
    page_has_script = bool(soup.find("script"))

    for junk in body.find_all(["script", "noscript", "style", "template"]):
        junk.decompose()
    visible_len = len(body.get_text(strip=True))
    if visible_len > 200:
        return False  # plenty of real text — not a shell, whatever else it has

    for div in body.find_all("div", id=_ROOT_DIV_IDS):
        if not div.get_text(strip=True):
            return True
    return warns_about_js or (page_has_script and visible_len < 30)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def extract_page(html: str, url: str, selector: str | None = None) -> ExtractedPage:
    """Extract title, markdown, canonical link and Asset URLs from one Page.

    Title: <title>, else og:title, else ``url``. With a content root, it is
    cleaned, its URLs made absolute and converted to markdown. Without one,
    the readability result is used, falling back to markdownify on the
    body; no Assets are reported in that case.
    """
    soup = BeautifulSoup(html, "html.parser")

    title = None
    if soup.title:
        title = soup.title.get_text(strip=True)
    if not title:
        og_tag = soup.find("meta", property="og:title")
        title = og_tag.get("content") if og_tag else None
    if not title:
        title = url

    link_tag = soup.find("link", rel="canonical")
    canonical = link_tag.get("href") if link_tag else None

    content_root = _find_content_root(soup, selector)
    if content_root is None:
        fallback = _readability_markdown(html)
        if not fallback:
            fallback = markdownify(str(soup.body or soup))
        return ExtractedPage(
            title=title, markdown=fallback.strip(), canonical=canonical, assets=()
        )

    _strip_noise(content_root)
    _svg_to_text(content_root)
    found_assets = _absolutize_urls(content_root, url)
    body_markdown = _converter.convert(str(content_root)).strip()
    return ExtractedPage(
        title=title,
        markdown=body_markdown,
        canonical=canonical,
        assets=tuple(found_assets),
    )
|
getdocs/identity.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""How getdocs identifies itself to the sites it fetches.
|
|
2
|
+
|
|
3
|
+
One honest, descriptive User-Agent for every request — both the pre-crawl
|
|
4
|
+
source check (`source.py`) and the Scrapy crawl (`engine.py`) — so a site
|
|
5
|
+
operator reading their logs can tell it's getdocs and, when the user opts in
|
|
6
|
+
with `--contact`, reach whoever is crawling. Identifying yourself is crawling
|
|
7
|
+
etiquette (RFC 9309), not a hard requirement, so `contact` stays optional.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import importlib.metadata
|
|
11
|
+
|
|
12
|
+
PROJECT_URL = "https://github.com/jonbakerfish/getdocs"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _version() -> str:
|
|
16
|
+
try:
|
|
17
|
+
return importlib.metadata.version("getdocs")
|
|
18
|
+
except importlib.metadata.PackageNotFoundError:
|
|
19
|
+
return "0.0.0"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def build_user_agent(contact: str | None = None, override: str | None = None) -> str:
|
|
23
|
+
"""The User-Agent getdocs sends.
|
|
24
|
+
|
|
25
|
+
`override` wins verbatim when given; otherwise the UA names getdocs and its
|
|
26
|
+
version, with the project URL and — when supplied — the user's contact:
|
|
27
|
+
getdocs/0.1.0 (+https://github.com/jonbakerfish/getdocs; you@example.com)
|
|
28
|
+
"""
|
|
29
|
+
if override:
|
|
30
|
+
return override
|
|
31
|
+
detail = PROJECT_URL if not contact else f"{PROJECT_URL}; {contact}"
|
|
32
|
+
return f"getdocs/{_version()} (+{detail})"
|