getdocs 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
getdocs/engine.py ADDED
@@ -0,0 +1,418 @@
1
+ """Engine: shallow Scrapy glue around the deep modules.
2
+
3
+ Runs one Crawl per process via CrawlerProcess — the Twisted reactor starts
4
+ once and never restarts (ADR-0002), which is why the future API service
5
+ spawns this as a subprocess per Crawl.
6
+ """
7
+
8
+ import asyncio
9
+ import json
10
+ import posixpath
11
+ import sys
12
+ from dataclasses import replace
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+ from urllib.parse import urljoin, urlsplit
16
+
17
+ import scrapy
18
+ from scrapy.crawler import CrawlerProcess
19
+ from scrapy.downloadermiddlewares.retry import get_retry_request
20
+ from scrapy.exceptions import CloseSpider, IgnoreRequest
21
+ from scrapy.http import TextResponse
22
+ from scrapy.spidermiddlewares.httperror import HttpError
23
+
24
+ from getdocs.config import CrawlConfig
25
+ from getdocs.extract import extract_page, is_shell
26
+ from getdocs.identity import build_user_agent
27
+ from getdocs.navharvest import harvest_nav, merge_harvests
28
+ from getdocs.output import AssetStore, FileTreeWriter, JsonlWriter, PageRecord, relink_pages
29
+ from getdocs.scope import Scope
30
+ from getdocs.sitemap import parse_robots_sitemaps, parse_sitemap_xml
31
+ from getdocs.urlnorm import normalize
32
+
33
+
34
def state_file_for(config: CrawlConfig) -> Path:
    """Return where the crawl-state JSON for *config* is stored.

    The file sits in a hidden ``.getdocs`` directory directly beneath the
    Crawl's output directory.
    """
    state_dir = config.output_dir / ".getdocs"
    return state_dir / "crawl-state.json"
36
+
37
+
38
def playwright_available() -> bool:
    """Report whether the optional ``scrapy_playwright`` package can be found.

    The check only asks the import system for a module spec, so it stays
    cheap when the package is missing.
    """
    from importlib.util import find_spec

    spec = find_spec("scrapy_playwright")
    return spec is not None
42
+
43
+
44
# Number of Shell responses seen from one host after which every later Page
# request to that host is sent through the browser renderer up front
# (consulted by _CrawlSpider._should_render).
_SHELL_LATCH_THRESHOLD = 2  # Shells on one host before it renders everything
45
+
46
+
47
class RetryAfterMiddleware:
    """Retry 429 responses no sooner than the server's Retry-After asks.

    Scrapy's stock RetryMiddleware retries 429 immediately, which is exactly
    what a rate-limiting server is telling us not to do — so 429 is removed
    from its codes and handled here with an async sleep (asyncio reactor).
    """

    # Wait applied when Retry-After is missing or cannot be interpreted.
    _DEFAULT_DELAY = 1.0

    @staticmethod
    def _retry_after_seconds(raw) -> float:
        """Turn a raw Retry-After header value into a non-negative wait.

        Accepts both forms the header may take: a number of seconds, or an
        HTTP-date (converted to "seconds from now"). Missing, unparseable
        and non-finite values ("inf", "nan") fall back to the default wait;
        negative waits and dates in the past are clamped to zero.

        Args:
            raw: the header value as bytes or str, or None when absent.

        Returns:
            Seconds to wait before retrying, always finite and >= 0.
        """
        import math
        from email.utils import parsedate_to_datetime

        default = RetryAfterMiddleware._DEFAULT_DELAY
        if raw is None:
            return default
        text = raw.decode("latin-1") if isinstance(raw, bytes) else str(raw)
        text = text.strip()
        try:
            delay = float(text)
        except ValueError:
            # Not delta-seconds; try the HTTP-date form.
            try:
                when = parsedate_to_datetime(text)
            except (TypeError, ValueError):
                return default
            if when.tzinfo is None:
                # HTTP-dates are GMT; a zone-less parse is read the same way.
                when = when.replace(tzinfo=timezone.utc)
            delay = (when - datetime.now(timezone.utc)).total_seconds()
        if not math.isfinite(delay):
            # "inf" would sleep forever and "nan" is not a valid delay.
            return default
        return max(delay, 0.0)

    async def process_response(self, request, response, spider):
        """Delay and re-issue a request that was answered with HTTP 429.

        Any other status passes through untouched. When the retry budget is
        exhausted the 429 response itself is returned.
        """
        if response.status != 429:
            return response
        retry = get_retry_request(request, spider=spider, reason="429 Too Many Requests")
        if retry is None:
            return response  # retries exhausted; falls through to the errback
        delay = self._retry_after_seconds(response.headers.get("Retry-After"))
        await asyncio.sleep(delay)
        return retry
67
+
68
+
69
class _CrawlSpider(scrapy.Spider):
    """Spider behind one Crawl: discovers URLs, fetches Pages, hands them to
    the writer, and persists enough bookkeeping in ``closed()`` to resume.

    Results are shared with the caller through the ``outcome`` dict, which
    this spider mutates in place (errors, skipped, shells, harvests,
    media_skipped, crawl_sequence, truncated).

    Requests carry the URL they were originally issued for in ``meta``
    (``pending_key`` for Pages, ``asset_url`` for Assets). After an HTTP
    redirect ``response.request.url`` is the redirect target, so the
    bookkeeping dicts — which are keyed by the originally issued URL — must
    be addressed through those meta entries instead.
    """

    name = "getdocs"

    def __init__(
        self, config: CrawlConfig, writer, outcome: dict, resume_state: dict | None = None,
        render_enabled: bool = False, **kwargs,
    ):
        super().__init__(**kwargs)
        self.config = config
        self.writer = writer
        self.outcome = outcome
        self.resume_state = resume_state
        self.render_enabled = render_enabled
        # host -> number of Shell responses seen (drives _should_render).
        self.shell_hosts: dict[str, int] = {}
        # Asset download bookkeeping (--download-media). A Page's write is
        # deferred until its Assets resolve, so failed/oversized Assets can
        # keep their absolute URLs in the written markdown.
        self.asset_store = AssetStore(config.output_dir)
        self.asset_results: dict[str, str | None] = {}
        self.asset_inflight: set[str] = set()
        self.pending_pages: list[dict] = []
        self.media_cap_bytes = int(config.media_max_size * 1024 * 1024)
        # Frontier bookkeeping, persisted across runs (see closed()).
        # pending maps a yielded URL to its hop count; an entry is removed
        # only once the URL is written or errored, so an interruption never
        # loses in-flight work.
        self.pending: dict[str, int] = {}
        self.scope = Scope.from_seeds(
            config.seeds,
            allow_backward=config.allow_backward,
            allow_subdomains=config.allow_subdomains,
            include_paths=config.include_paths,
            exclude_paths=config.exclude_paths,
        )
        self.follow_links = config.sitemap != "only"
        self.enqueued: set[str] = set(resume_state["enqueued"]) if resume_state else set()
        self.written: set[str] = set(resume_state["written"]) if resume_state else set()
        self.sitemaps_fetched: set[str] = set()
        # Same list object as outcome["crawl_sequence"]; appends are visible
        # to the caller.
        self.crawl_sequence: list[str] = outcome["crawl_sequence"]

    # -- discovery ---------------------------------------------------------

    async def start(self):
        """Yield the initial requests: robots.txt and sitemap.xml per seed
        origin (unless sitemaps are off), then either the resumed frontier
        or the seeds themselves."""
        if self.config.sitemap != "off":
            roots = {f"{urlsplit(s).scheme}://{urlsplit(s).netloc}" for s in self.config.seeds}
            for root in sorted(roots):
                yield scrapy.Request(urljoin(root, "/robots.txt"), callback=self.parse_robots)
                for request in self._sitemap_requests([urljoin(root, "/sitemap.xml")]):
                    yield request
        if self.resume_state:
            for url, hops in self.resume_state["pending"].items():
                self.pending[url] = hops
                yield self._page_request(url, hops=hops)
        elif self.config.sitemap != "only":
            for seed in self.config.seeds:
                self.enqueued.add(normalize(seed))
                self.pending[seed] = 0
                yield self._page_request(seed, hops=0)

    def parse_robots(self, response):
        """Follow the sitemaps advertised in a robots.txt body."""
        yield from self._sitemap_requests(parse_robots_sitemaps(response.text))

    def parse_sitemap(self, response):
        """Follow nested sitemaps and enqueue the Pages a sitemap lists."""
        page_urls, nested = parse_sitemap_xml(response.text)
        yield from self._sitemap_requests(nested)
        for url in page_urls:
            # Sitemap-discovered Pages are depth-0 seeds: Scope still gates
            # them, but --depth never excludes them.
            yield from self._enqueue_page(url, hops=0)

    # -- fetching ----------------------------------------------------------

    def parse_page(self, response):
        """Handle one fetched Page: escalate Shells to the renderer, extract
        and write (or defer until Assets resolve), then follow links."""
        pending_key = self._pending_key(response.request)
        rendered = response.meta.get("playwright", False)
        is_text = isinstance(response, TextResponse)
        shellish = not rendered and is_text and is_shell(response.text)
        if shellish and self.config.render == "auto" and self.render_enabled:
            # Escalate: re-fetch through the browser. The pending entry
            # survives until the rendered version is written.
            host = urlsplit(response.url).netloc
            self.shell_hosts[host] = self.shell_hosts.get(host, 0) + 1
            yield scrapy.Request(
                response.request.url,
                callback=self.parse_page,
                errback=self.on_page_error,
                meta={
                    "hops": response.meta["hops"],
                    "playwright": True,
                    "pending_key": pending_key,
                },
                dont_filter=True,
            )
            return

        if not is_text:
            # A binary payload has no .text to extract. Record it as skipped
            # instead of failing after the URL was already marked written.
            self.outcome["skipped"].append(
                {"url": response.url, "reason": "non-text response"}
            )
            self.pending.pop(pending_key, None)
            self._progress()
            return

        norm = normalize(response.url)
        if norm not in self.written:
            if self.config.limit and len(self.written) >= self.config.limit:
                self.outcome["truncated"] = True
                raise CloseSpider("page limit reached")
            self.written.add(norm)
            extracted = extract_page(response.text, response.url, selector=self.config.selector)
            record = PageRecord(
                url=response.url,
                title=extracted.title,
                markdown=extracted.markdown,
                status=response.status,
                crawled_at=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
                canonical=extracted.canonical,
                html=response.text if self.config.keep_html else None,
            )
            if shellish:
                # Rendering is off or unavailable: written as-is, flagged.
                self.outcome["shells"].append(response.url)
            self.crawl_sequence.append(response.url)
            self.outcome["harvests"].append(
                {"page": response.url, **harvest_nav(response.text, response.url)}
            )

            # dict.fromkeys de-duplicates while keeping first-seen order.
            assets = list(dict.fromkeys(extracted.assets)) if self.config.download_media else []
            waiting = set()
            for asset_url in assets:
                if asset_url in self.asset_results:
                    continue
                waiting.add(asset_url)
                if asset_url not in self.asset_inflight:
                    self.asset_inflight.add(asset_url)
                    yield scrapy.Request(
                        asset_url,
                        callback=self.on_asset,
                        errback=self.on_asset_error,
                        # asset_url survives redirects in meta, so the
                        # callbacks can resolve the URL Pages wait on.
                        meta={
                            "download_maxsize": self.media_cap_bytes,
                            "asset_url": asset_url,
                        },
                        dont_filter=True,
                    )
            if waiting:
                self.pending_pages.append(
                    {"record": record, "assets": assets, "waiting": waiting}
                )
            else:
                self._finalize_page(record, assets)
        self.pending.pop(pending_key, None)

        if not self.follow_links:
            return
        hops = response.meta["hops"]
        if self.config.depth and hops + 1 > self.config.depth:
            return
        for href in response.css("a::attr(href)").getall():
            yield from self._enqueue_page(response.urljoin(href.strip()), hops=hops + 1)

    # -- assets (--download-media; Scope never applies, ADR-0005) -----------

    def on_asset(self, response):
        """Store a downloaded Asset and release the Pages waiting on it."""
        relpath = self.asset_store.save(response.request.url, response.body)
        self._resolve_asset(response.meta.get("asset_url", response.request.url), relpath)

    def on_asset_error(self, failure):
        """Record a failed Asset download and release the Pages waiting on it."""
        url = failure.request.meta.get("asset_url", failure.request.url)
        reason = (failure.getErrorMessage() or failure.type.__name__)[:200]
        self.outcome["media_skipped"].append({"url": url, "reason": reason})
        self._resolve_asset(url, None)

    def _resolve_asset(self, url: str, relpath: str | None) -> None:
        """Mark *url* resolved (``relpath`` is None on failure) and write every
        deferred Page that is no longer waiting on anything."""
        self.asset_results[url] = relpath
        self.asset_inflight.discard(url)
        still_waiting = []
        for page in self.pending_pages:
            page["waiting"].discard(url)
            if page["waiting"]:
                still_waiting.append(page)
            else:
                self._finalize_page(page["record"], page["assets"])
        self.pending_pages = still_waiting

    def _finalize_page(self, record: PageRecord, assets: list[str]) -> None:
        """Point the markdown at downloaded Assets, then write the Page.

        Assets with no stored path (failed, oversized, unresolved) keep
        their absolute URLs.
        """
        markdown = record.markdown
        for url in assets:
            relpath = self.asset_results.get(url)
            if relpath:
                markdown = markdown.replace(url, self._asset_link(record.url, relpath))
        # Compare by value: whether str.replace returns the same object when
        # nothing matched is an implementation detail.
        if markdown != record.markdown:
            record = replace(record, markdown=markdown)
        self.writer.write_page(record)
        self._progress()

    def _asset_link(self, page_url: str, asset_relpath: str) -> str:
        """files mode: relative from the page's .md location; jsonl mode:
        root-relative (records have no on-disk location)."""
        if isinstance(self.writer, FileTreeWriter):
            page_rel = self.writer.path_for(page_url).relative_to(self.writer.output_dir)
            return posixpath.relpath(asset_relpath, start=page_rel.parent.as_posix())
        return asset_relpath

    def on_page_error(self, failure):
        """Errback for Page requests: drop the pending entry and record the
        failure as an error, or as skipped when the request was filtered."""
        self.pending.pop(self._pending_key(failure.request), None)
        if failure.check(HttpError):
            response = failure.value.response
            error = {"url": response.url, "status": response.status, "reason": f"HTTP {response.status}"}
        elif failure.check(IgnoreRequest):
            # HttpError subclasses IgnoreRequest, so this arm only sees true
            # filtering — robots.txt telling us not to fetch.
            self.outcome["skipped"].append(
                {"url": failure.request.url, "reason": "robots.txt"}
            )
            self._progress()
            return
        else:
            error = {"url": failure.request.url, "status": None, "reason": failure.type.__name__}
        self.outcome["errors"].append(error)
        self._progress()

    # -- helpers -----------------------------------------------------------

    @staticmethod
    def _pending_key(request) -> str:
        """Key under which *request* was registered in ``self.pending``.

        Falls back to the request URL for requests created without the
        meta entry.
        """
        return request.meta.get("pending_key", request.url)

    def _progress(self):
        """Print a one-line progress summary to stderr."""
        done = len(self.written) + len(self.outcome["errors"])
        print(
            f"[getdocs] pages={len(self.written)} "
            f"pending={max(len(self.enqueued) - done, 0)} "
            f"errors={len(self.outcome['errors'])}",
            file=sys.stderr,
            flush=True,
        )

    def _page_request(self, url: str, hops: int) -> scrapy.Request:
        """Build the request for a Page, rendered when the host calls for it."""
        # pending_key remembers the URL as issued; see _pending_key.
        meta = {"hops": hops, "pending_key": url}
        if self._should_render(url):
            meta["playwright"] = True
        return scrapy.Request(
            url, callback=self.parse_page, errback=self.on_page_error, meta=meta
        )

    def _should_render(self, url: str) -> bool:
        """True when *url* should be fetched through the browser up front:
        render mode "always", or its host has hit the Shell latch threshold."""
        if not self.render_enabled:
            return False
        if self.config.render == "always":
            return True
        host = urlsplit(url).netloc
        return self.shell_hosts.get(host, 0) >= _SHELL_LATCH_THRESHOLD

    def _enqueue_page(self, url: str, hops: int):
        """Yield a Page request for *url* if it is in Scope and not yet seen."""
        if not self.scope.allows(url):
            return
        norm = normalize(url)
        if norm in self.enqueued:
            return
        self.enqueued.add(norm)
        self.pending[url] = hops
        yield self._page_request(url, hops=hops)

    def _sitemap_requests(self, urls: list[str]):
        """Yield one request per sitemap URL not fetched before in this run."""
        for url in urls:
            if url not in self.sitemaps_fetched:
                self.sitemaps_fetched.add(url)
                yield scrapy.Request(url, callback=self.parse_sitemap)

    def closed(self, reason):
        """Flush deferred Pages and persist the crawl state for a later resume."""
        # Pages still waiting on Assets write now with whatever resolved;
        # unresolved Assets keep their absolute URLs.
        for page in self.pending_pages:
            self._finalize_page(page["record"], page["assets"])
        self.pending_pages = []
        state_file = state_file_for(self.config)
        state_file.parent.mkdir(parents=True, exist_ok=True)
        state_file.write_text(
            json.dumps(
                {
                    "seeds": self.config.seeds,
                    "enqueued": sorted(self.enqueued),
                    "written": sorted(self.written),
                    "pending": self.pending,
                    "errors": self.outcome["errors"],
                    "skipped": self.outcome["skipped"],
                    "shells": self.outcome["shells"],
                    "crawl_sequence": self.crawl_sequence,
                    "harvests": self.outcome["harvests"],
                    "media_skipped": self.outcome["media_skipped"],
                }
            )
        )
344
+
345
+
346
def run_crawl(config: CrawlConfig) -> int:
    """Run a Crawl to completion; returns the number of Pages produced.

    Picks the writer from ``config.format``, optionally reloads the state
    saved by a previous run (``config.resume``), runs the spider inside a
    CrawlerProcess, then relinks Pages (file-tree output only) and writes
    the Manifest.

    With ``config.resume`` set, the state file must exist and be valid
    JSON; nothing here catches the read or parse error.
    """
    if config.format == "jsonl":
        writer = JsonlWriter(sys.stdout)
    else:
        writer = FileTreeWriter(config.output_dir)
    resume_state = None
    if config.resume:
        resume_state = json.loads(state_file_for(config).read_text())
        # Continue counting from the Pages the earlier run already wrote.
        writer.page_count = len(resume_state["written"])
    # Results the spider fills in; seeded from the saved state on resume.
    # Keys read with .get() tolerate state files that lack them.
    outcome = {
        "errors": list(resume_state["errors"]) if resume_state else [],
        "skipped": list(resume_state["skipped"]) if resume_state else [],
        "shells": list(resume_state.get("shells", [])) if resume_state else [],
        "harvests": list(resume_state.get("harvests", [])) if resume_state else [],
        "media_skipped": list(resume_state.get("media_skipped", [])) if resume_state else [],
        # Shared with the spider, which appends as Pages are written.
        "crawl_sequence": list(resume_state.get("crawl_sequence", [])) if resume_state else [],
        "truncated": False,  # recomputed by this run: a resumed Crawl may finish
    }
    # Rendering needs both the user's consent and the optional dependency.
    render_enabled = config.render != "never" and playwright_available()
    if config.render != "never" and not render_enabled:
        print(
            "note: scrapy-playwright is not installed — JS rendering disabled; "
            "Shell pages will be written as-is and flagged in the Manifest",
            file=sys.stderr,
        )
    settings = {
        "LOG_LEVEL": "ERROR",
        # Identify honestly as getdocs (not generic Scrapy); robots.txt matching
        # uses this UA too. --contact appends the user's email/URL.
        "USER_AGENT": build_user_agent(config.contact, config.user_agent),
        "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
        "RETRY_TIMES": 2,
        # 429 is handled by RetryAfterMiddleware, which honors Retry-After.
        "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408],
        "DOWNLOADER_MIDDLEWARES": {RetryAfterMiddleware: 560},
        "ROBOTSTXT_OBEY": not config.ignore_robots,
        "DOWNLOAD_DELAY": config.delay,
        # AutoThrottle is only switched on when a base delay was requested.
        "AUTOTHROTTLE_ENABLED": config.delay > 0,
        "AUTOTHROTTLE_START_DELAY": config.delay or 1.0,
        "CONCURRENT_REQUESTS_PER_DOMAIN": config.concurrency,
    }
    if render_enabled:
        settings["DOWNLOAD_HANDLERS"] = {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
        # NOTE(review): the nesting of this line was not recoverable from the
        # diff rendering — confirm it belongs inside the render branch.
        settings["TWISTED_REACTOR"] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
    process = CrawlerProcess(settings=settings, install_root_handler=False)
    process.crawl(
        _CrawlSpider,
        config=config,
        writer=writer,
        outcome=outcome,
        resume_state=resume_state,
        render_enabled=render_enabled,
    )
    # Blocks until the Crawl finishes; the spider has filled `outcome` by then.
    process.start()
    if isinstance(writer, FileTreeWriter):
        relink_pages(writer, outcome["crawl_sequence"])
    # NOTE(review): presumably the Manifest is written for both writer kinds;
    # the nesting was not recoverable from the diff rendering — confirm.
    nav, reading_order = merge_harvests(outcome["harvests"], outcome["crawl_sequence"])
    writer.write_manifest(
        seeds=config.seeds,
        errors=outcome["errors"],
        truncated=outcome["truncated"],
        skipped=outcome["skipped"],
        shells=outcome["shells"],
        nav=nav,
        reading_order=reading_order,
        media_skipped=outcome["media_skipped"],
    )
    return writer.page_count
getdocs/extract.py ADDED
@@ -0,0 +1,190 @@
1
+ """Extract: HTML in, (title, markdown, canonical) out.
2
+
3
+ Selector-first pipeline: a user-supplied selector wins, then the known
4
+ content containers of common docs generators, then semantic candidates.
5
+ Readability extraction (trafilatura) is a last resort for pages with no
6
+ recognizable content root — docs sites are structured, so exploiting
7
+ that structure beats statistical extraction (and never eats code blocks).
8
+ """
9
+
10
+ import re
11
+ from dataclasses import dataclass
12
+ from urllib.parse import urljoin, urlsplit
13
+
14
+ from bs4 import BeautifulSoup
15
+ from markdownify import MarkdownConverter, markdownify
16
+
17
+
18
@dataclass(frozen=True)
class ExtractedPage:
    """Immutable result of extracting one HTML page (see extract_page)."""

    title: str  # <title> text, else og:title, else the page URL
    markdown: str  # converted page content, stripped of surrounding whitespace
    canonical: str | None = None  # href of <link rel="canonical">, if present
    assets: tuple[str, ...] = ()  # absolute URLs of referenced images/documents
24
+
25
+
26
+ _DOC_EXTENSIONS = (
27
+ ".pdf", ".zip", ".tar.gz", ".tgz", ".7z", ".dmg", ".pkg", ".msi", ".exe", ".whl",
28
+ )
29
+
30
+
31
+ _GENERATOR_SELECTORS = [
32
+ "div.theme-doc-markdown", # Docusaurus
33
+ "article.md-content__inner", # MkDocs Material
34
+ 'div.body[role="main"]', # Sphinx
35
+ "#content-area", # Mintlify
36
+ ]
37
+ _SEMANTIC_SELECTORS = ["main", "article", '[role="main"]']
38
+
39
+ _NOISE_TAGS = ("nav", "aside", "header", "footer", "script", "style", "noscript")
40
+ _NOISE_CLASS_RE = re.compile(r"breadcrumb|table-of-contents|(^|[-_])toc($|[-_])")
41
+
42
+
43
+ def _code_language(el) -> str:
44
+ """Language hint for a <pre> block: language-x on it or its <code>
45
+ children, or Sphinx-style highlight-x on an ancestor."""
46
+ candidates = [el, *el.find_all("code"), *el.parents]
47
+ for node in candidates:
48
+ for cls in node.get("class") or []:
49
+ if cls.startswith("language-"):
50
+ return cls.removeprefix("language-")
51
+ if cls.startswith("highlight-"):
52
+ lang = cls.removeprefix("highlight-")
53
+ if lang not in ("default", "notranslate"):
54
+ return lang
55
+ return ""
56
+
57
+
58
# Shared converter for content roots: ATX ("#") headings, with the fence
# language of code blocks supplied by _code_language.
_converter = MarkdownConverter(heading_style="ATX", code_language_callback=_code_language)
59
+
60
+
61
# Known icon shapes (Material Design icon path prefixes, as emitted by
# mkdocs-material twemoji spans). Inline SVGs carry no text, so without
# this, checkmark columns in comparison tables extract as empty cells.
# Keys are matched as prefixes of the first <path>'s "d" attribute; values
# are the text that replaces the whole <svg> (see _svg_to_text).
_SVG_GLYPHS = {
    "M21 7 9 19l-5.5-5.5": "✓",  # mdi-check
    "M9 20.42 2.79 14.21": "✓",  # mdi-check-bold
    "M19 6.41 17.59 5 12 10.59": "✗",  # mdi-close
    "M20 6.91 17.09 4 12 9.09": "✗",  # mdi-close-thick
}
70
+
71
+
72
+ def _svg_to_text(root) -> None:
73
+ for svg in root.find_all("svg"):
74
+ label = svg.get("aria-label")
75
+ if not label:
76
+ title = svg.find("title")
77
+ label = title.get_text(strip=True) if title else None
78
+ if not label:
79
+ path = svg.find("path")
80
+ d = (path.get("d") or "") if path else ""
81
+ label = next((g for prefix, g in _SVG_GLYPHS.items() if d.startswith(prefix)), None)
82
+ if label:
83
+ svg.replace_with(label)
84
+
85
+
86
+ def _absolutize_urls(root, page_url: str) -> list[str]:
87
+ """Rewrite hrefs/srcs absolute against the page URL — relative values
88
+ would point at nothing in the output tree (hotlink default, ADR-0005).
89
+ Returns the Asset URLs found: images, then document downloads."""
90
+ images, documents = [], []
91
+ for tag in root.find_all(href=True):
92
+ tag["href"] = urljoin(page_url, tag["href"])
93
+ if tag.name == "a" and urlsplit(tag["href"]).path.lower().endswith(_DOC_EXTENSIONS):
94
+ documents.append(tag["href"])
95
+ for tag in root.find_all(src=True):
96
+ tag["src"] = urljoin(page_url, tag["src"])
97
+ if tag.name == "img":
98
+ images.append(tag["src"])
99
+ for tag in root.find_all(srcset=True):
100
+ tag["srcset"] = ", ".join(
101
+ " ".join([urljoin(page_url, part.strip().split()[0]), *part.strip().split()[1:]])
102
+ for part in tag["srcset"].split(",")
103
+ if part.strip()
104
+ )
105
+ return images + documents
106
+
107
+
108
def _strip_noise(root) -> None:
    """Remove non-content elements from *root*, in place.

    Two passes: first every element whose tag is in _NOISE_TAGS, then every
    remaining element with at least one class matching _NOISE_CLASS_RE.
    """
    for el in root.find_all(_NOISE_TAGS):
        el.decompose()

    def has_noise_class(el) -> bool:
        class_names = el.get("class") or []
        return any(_NOISE_CLASS_RE.search(name) for name in class_names)

    # Collect every match before removing any, so the tree is not modified
    # while it is still being searched.
    for el in list(filter(has_noise_class, root.find_all(True))):
        el.decompose()
118
+
119
+
120
+ def _find_content_root(soup, selector: str | None):
121
+ if selector:
122
+ root = soup.select_one(selector)
123
+ if root is not None:
124
+ return root
125
+ for sel in [*_GENERATOR_SELECTORS, *_SEMANTIC_SELECTORS]:
126
+ root = soup.select_one(sel)
127
+ if root is not None:
128
+ return root
129
+ return None
130
+
131
+
132
def _readability_markdown(html: str) -> str | None:
    """Run trafilatura's extractor over *html*, asking for markdown with
    tables included, and return its result unchanged.

    trafilatura is imported here rather than at module level, so it is only
    loaded when this fallback is actually used.
    """
    from trafilatura import extract

    return extract(html, output_format="markdown", include_tables=True)
136
+
137
+
138
# ids of application mount-point <div>s that is_shell looks for; one with no
# text inside counts as a Shell signal.
_ROOT_DIV_IDS = ["root", "app", "__next", "___gatsby", "__nuxt"]
139
+
140
+
141
def is_shell(html: str) -> bool:
    """True when a response is an unhydrated client-side app frame rather
    than real content — the signal that triggers render escalation.

    A document without <body>, or with more than 200 characters of visible
    text, is never a shell. Otherwise it is one when any of these hold: a
    known mount-point <div> is empty, a <noscript> mentions JavaScript, or
    the page has scripts and fewer than 30 characters of visible text.
    """
    soup = BeautifulSoup(html, "html.parser")
    body = soup.body
    if body is None:
        return False

    # Both signals must be read before the tags are removed below.
    noscript_warning = False
    for block in body.find_all("noscript"):
        if "javascript" in block.get_text().lower():
            noscript_warning = True
            break
    has_scripts = bool(soup.find("script"))

    for tag in body.find_all(["script", "noscript", "style", "template"]):
        tag.decompose()
    visible_len = len(body.get_text(strip=True))
    if visible_len > 200:
        return False  # plenty of real text — not a shell, whatever else it has

    for mount in body.find_all("div", id=_ROOT_DIV_IDS):
        if not mount.get_text(strip=True):
            return True
    if noscript_warning:
        return True
    return has_scripts and visible_len < 30
164
+
165
+
166
def extract_page(html: str, url: str, selector: str | None = None) -> ExtractedPage:
    """Extract title, markdown, canonical URL and Asset URLs from a page.

    Args:
        html: the page's HTML source.
        url: the page's URL; used as the title of last resort and as the
            base for resolving relative URLs.
        selector: optional CSS selector for the content root; it wins over
            the built-in selectors when it matches.

    Returns:
        An ExtractedPage. ``canonical`` is absolute (resolved against
        *url*) or None when the page declares none. ``assets`` is empty
        when the readability extractor produced the markdown.
    """
    soup = BeautifulSoup(html, "html.parser")

    title = soup.title.get_text(strip=True) if soup.title else None
    if not title:
        og_title = soup.find("meta", property="og:title")
        # Strip like the <title> path, so a blank attribute falls through
        # to the URL below.
        title = (og_title.get("content") or "").strip() if og_title else None
    title = title or url

    canonical_link = soup.find("link", rel="canonical")
    canonical_href = (canonical_link.get("href") or "").strip() if canonical_link else ""
    # A relative canonical is meaningless once detached from the page, and
    # an empty one is no canonical at all.
    canonical = urljoin(url, canonical_href) if canonical_href else None

    root = _find_content_root(soup, selector)
    assets: list[str] = []
    if root is not None:
        _strip_noise(root)
        _svg_to_text(root)
        assets = _absolutize_urls(root, url)
        markdown = _converter.convert(str(root))
    else:
        markdown = _readability_markdown(html)
        if not markdown:
            # Last resort: convert the whole body. Treat it like a content
            # root — absolute URLs, same converter — so these Pages match
            # the rest of the output.
            body = soup.body or soup
            assets = _absolutize_urls(body, url)
            markdown = _converter.convert(str(body))

    return ExtractedPage(
        title=title, markdown=markdown.strip(), canonical=canonical, assets=tuple(assets)
    )
getdocs/identity.py ADDED
@@ -0,0 +1,32 @@
1
+ """How getdocs identifies itself to the sites it fetches.
2
+
3
+ One honest, descriptive User-Agent for every request — both the pre-crawl
4
+ source check (`source.py`) and the Scrapy crawl (`engine.py`) — so a site
5
+ operator reading their logs can tell it's getdocs and, when the user opts in
6
+ with `--contact`, reach whoever is crawling. Identifying yourself is crawling
7
+ etiquette (RFC 9309), not a hard requirement, so `contact` stays optional.
8
+ """
9
+
10
+ import importlib.metadata
11
+
12
# Project home page, advertised inside the default User-Agent string.
PROJECT_URL = "https://github.com/jonbakerfish/getdocs"
13
+
14
+
15
+ def _version() -> str:
16
+ try:
17
+ return importlib.metadata.version("getdocs")
18
+ except importlib.metadata.PackageNotFoundError:
19
+ return "0.0.0"
20
+
21
+
22
+ def build_user_agent(contact: str | None = None, override: str | None = None) -> str:
23
+ """The User-Agent getdocs sends.
24
+
25
+ `override` wins verbatim when given; otherwise the UA names getdocs and its
26
+ version, with the project URL and — when supplied — the user's contact:
27
+ getdocs/0.1.0 (+https://github.com/jonbakerfish/getdocs; you@example.com)
28
+ """
29
+ if override:
30
+ return override
31
+ detail = PROJECT_URL if not contact else f"{PROJECT_URL}; {contact}"
32
+ return f"getdocs/{_version()} (+{detail})"