docs2epub-0.1.5-py3-none-any.whl → docs2epub-0.1.7-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
docs2epub/docusaurus_next.py
@@ -88,6 +88,27 @@ def _extract_article(soup: BeautifulSoup) -> Tag:
     raise RuntimeError("Could not find <article> in page HTML")


+def _extract_canonical_url(soup: BeautifulSoup, *, base_url: str) -> str | None:
+    for link in soup.find_all("link", href=True, rel=True):
+        rel = link.get("rel")
+        rel_values = []
+        if isinstance(rel, list):
+            rel_values = [str(r).lower() for r in rel]
+        else:
+            rel_values = [str(rel).lower()]
+        if "canonical" not in rel_values:
+            continue
+        href = str(link.get("href") or "").strip()
+        if not href:
+            continue
+        canonical = urljoin(base_url, href)
+        parsed = urlparse(canonical)
+        if parsed.scheme not in ("http", "https"):
+            continue
+        return canonical
+    return None
+
+
 def _canonicalize_url(url: str) -> str:
     parsed = urlparse(url)
     path = parsed.path or "/"
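
The new _extract_canonical_url helper returns the target of the page's <link rel="canonical"> tag, resolved against the page URL and restricted to http(s) schemes. A minimal sketch of that behaviour, assuming the helper is importable from docs2epub.docusaurus_next (the HTML and URLs below are illustrative only):

from bs4 import BeautifulSoup

from docs2epub.docusaurus_next import _extract_canonical_url  # assumed import path

# Illustrative page: the relative canonical link is resolved against base_url;
# links with non-http(s) schemes would be ignored by the helper.
html = '<html><head><link rel="canonical" href="/docs/latest/intro"></head></html>'
soup = BeautifulSoup(html, "lxml")

print(_extract_canonical_url(soup, base_url="https://example.com/docs/intro"))
# -> https://example.com/docs/latest/intro
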
@@ -321,6 +342,17 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
         return BeautifulSoup(resp.text, "lxml")

     initial_soup = fetch_soup(url)
+    canonical = _extract_canonical_url(initial_soup, base_url=url)
+    if options.base_url is None and canonical:
+        start_origin = urlparse(url).netloc.lower()
+        canonical_origin = urlparse(canonical).netloc.lower()
+        if canonical_origin == start_origin:
+            canonical_key = _canonicalize_url(canonical)
+            if canonical_key != _canonicalize_url(url):
+                url = canonical
+                base_url = canonical
+                initial_soup = fetch_soup(url)
+
     sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
     initial_key = _canonicalize_url(url)

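With this change, iter_docusaurus_next restarts the crawl from the page's canonical URL when no explicit base_url option was given, but only if the canonical address stays on the same host and still differs from the start URL after canonicalization. A minimal sketch of that same-origin guard, mirroring the checks in the hunk above rather than calling the package (URLs are illustrative, and a path comparison stands in for _canonicalize_url):

from urllib.parse import urlparse

start = "https://example.com/docs/intro"
canonical = "https://example.com/docs/latest/intro"

# Same host? (the hunk above compares lower-cased netlocs)
same_origin = urlparse(start).netloc.lower() == urlparse(canonical).netloc.lower()
# Actually a different page? (approximated here by comparing paths)
differs = urlparse(start).path != urlparse(canonical).path

if same_origin and differs:
    print("crawl would restart from", canonical)
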
@@ -332,9 +364,22 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
             return None
        visited.add(key)

-        page_soup = soup if soup is not None else fetch_soup(target_url)
-
-        article = _extract_article(page_soup)
+        page_soup = soup
+        if page_soup is None:
+            try:
+                page_soup = fetch_soup(target_url)
+            except requests.HTTPError as exc:
+                status = exc.response.status_code if exc.response is not None else None
+                if status in {404, 410} and key != initial_key:
+                    return None
+                raise
+
+        try:
+            article = _extract_article(page_soup)
+        except RuntimeError:
+            if key != initial_key:
+                return None
+            raise
         title_el = article.find(["h1", "h2"])
         title = (
             " ".join(title_el.get_text(" ", strip=True).split())
docs2epub-0.1.5.dist-info/METADATA → docs2epub-0.1.7.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docs2epub
-Version: 0.1.5
+Version: 0.1.7
 Summary: Turn documentation sites into an EPUB (Kindle-friendly).
 Author: Breno Brito
 License: MIT

docs2epub-0.1.5.dist-info/RECORD → docs2epub-0.1.7.dist-info/RECORD
@@ -1,11 +1,11 @@
 docs2epub/__init__.py,sha256=iccyEu4zlubhvd6pM7Z2Gjwn8tPw9IhZ4ABKhbiFjUY,54
 docs2epub/cli.py,sha256=pt1crvrkr2k1ybf_p0m4xSYyoZVluFsDNGuwJ7CykYM,3863
-docs2epub/docusaurus_next.py,sha256=iOEBxWYhsCw-mJQdV_iNXlQgzGP5ZRTEQ94LJrA45I4,10622
+docs2epub/docusaurus_next.py,sha256=_PTPnObjYrR6Yxz9rFxooiJhzYlBpRutLMTG0SYshUg,12030
 docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
 docs2epub/kindle_html.py,sha256=LN0CGj9ap9b8iC_MlZcQLuhJ7FehZr_VbIfMOz78E5c,2297
 docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
 docs2epub/pandoc_epub2.py,sha256=l22-QAQcCgJyl7HF0_b5weC3qEGVQLwOhxdbAvd8C2o,3610
-docs2epub-0.1.5.dist-info/METADATA,sha256=IKC3voH2Zc6b2rZxUb6I4VUMDpZpUZErDEjwpZjgFIQ,1886
-docs2epub-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-docs2epub-0.1.5.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
-docs2epub-0.1.5.dist-info/RECORD,,
+docs2epub-0.1.7.dist-info/METADATA,sha256=m8ea2WWxNYzeCXHnab2P19oHCQIULTtrTdYMmtnLES8,1886
+docs2epub-0.1.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+docs2epub-0.1.7.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
+docs2epub-0.1.7.dist-info/RECORD,,