docs2epub 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -88,6 +88,27 @@ def _extract_article(soup: BeautifulSoup) -> Tag:
88
88
  raise RuntimeError("Could not find <article> in page HTML")
89
89
 
90
90
 
91
def _extract_canonical_url(soup: BeautifulSoup, *, base_url: str) -> str | None:
    """Return the canonical URL declared by the page, or ``None``.

    Walks every ``<link>`` element that carries both a ``rel`` and an
    ``href`` attribute and returns the first one whose ``rel`` tokens
    include ``canonical`` (compared case-insensitively) and whose href,
    once resolved against ``base_url``, uses the ``http`` or ``https``
    scheme.

    Args:
        soup: Parsed page to search.
        base_url: URL used to resolve a relative ``href``.

    Returns:
        The resolved canonical URL, or ``None`` when no ``<link>`` element
        qualifies.
    """
    for link in soup.find_all("link", href=True, rel=True):
        rel = link.get("rel")
        # ``rel`` is a whitespace-separated token list. It can arrive either
        # already split into a list or as one raw string; split the raw
        # string so that e.g. "canonical nofollow" is still recognised.
        tokens = rel if isinstance(rel, list) else str(rel).split()
        if not any(str(token).lower() == "canonical" for token in tokens):
            continue

        href = str(link.get("href") or "").strip()
        if not href:
            continue

        canonical = urljoin(base_url, href)
        # Skip non-web targets (mailto:, javascript:, data:, ...) and keep
        # looking at later <link> elements.
        if urlparse(canonical).scheme not in ("http", "https"):
            continue
        return canonical
    return None
110
+
111
+
91
112
  def _canonicalize_url(url: str) -> str:
92
113
  parsed = urlparse(url)
93
114
  path = parsed.path or "/"
@@ -321,6 +342,17 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
321
342
  return BeautifulSoup(resp.text, "lxml")
322
343
 
323
344
  initial_soup = fetch_soup(url)
345
+ canonical = _extract_canonical_url(initial_soup, base_url=url)
346
+ if options.base_url is None and canonical:
347
+ start_origin = urlparse(url).netloc.lower()
348
+ canonical_origin = urlparse(canonical).netloc.lower()
349
+ if canonical_origin == start_origin:
350
+ canonical_key = _canonicalize_url(canonical)
351
+ if canonical_key != _canonicalize_url(url):
352
+ url = canonical
353
+ base_url = canonical
354
+ initial_soup = fetch_soup(url)
355
+
324
356
  sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
325
357
  initial_key = _canonicalize_url(url)
326
358
 
@@ -342,7 +374,12 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
342
374
  return None
343
375
  raise
344
376
 
345
- article = _extract_article(page_soup)
377
+ try:
378
+ article = _extract_article(page_soup)
379
+ except RuntimeError:
380
+ if key != initial_key:
381
+ return None
382
+ raise
346
383
  title_el = article.find(["h1", "h2"])
347
384
  title = (
348
385
  " ".join(title_el.get_text(" ", strip=True).split())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docs2epub
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Turn documentation sites into an EPUB (Kindle-friendly).
5
5
  Author: Breno Brito
6
6
  License: MIT
@@ -1,11 +1,11 @@
1
1
  docs2epub/__init__.py,sha256=iccyEu4zlubhvd6pM7Z2Gjwn8tPw9IhZ4ABKhbiFjUY,54
2
2
  docs2epub/cli.py,sha256=pt1crvrkr2k1ybf_p0m4xSYyoZVluFsDNGuwJ7CykYM,3863
3
- docs2epub/docusaurus_next.py,sha256=c4EZMo7E6zeuekbA3n_rF1joT3Km1cPY1HTimzUkMFg,10866
3
+ docs2epub/docusaurus_next.py,sha256=_PTPnObjYrR6Yxz9rFxooiJhzYlBpRutLMTG0SYshUg,12030
4
4
  docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
5
5
  docs2epub/kindle_html.py,sha256=LN0CGj9ap9b8iC_MlZcQLuhJ7FehZr_VbIfMOz78E5c,2297
6
6
  docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
7
7
  docs2epub/pandoc_epub2.py,sha256=l22-QAQcCgJyl7HF0_b5weC3qEGVQLwOhxdbAvd8C2o,3610
8
- docs2epub-0.1.6.dist-info/METADATA,sha256=Qu_OyYWfevaG_y7rCpUR81AT8aYK9Yo9KS4Cz_ZgDg8,1886
9
- docs2epub-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
- docs2epub-0.1.6.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
11
- docs2epub-0.1.6.dist-info/RECORD,,
8
+ docs2epub-0.1.7.dist-info/METADATA,sha256=m8ea2WWxNYzeCXHnab2P19oHCQIULTtrTdYMmtnLES8,1886
9
+ docs2epub-0.1.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ docs2epub-0.1.7.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
11
+ docs2epub-0.1.7.dist-info/RECORD,,