docs2epub 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs2epub/docusaurus_next.py +38 -1
- {docs2epub-0.1.6.dist-info → docs2epub-0.1.7.dist-info}/METADATA +1 -1
- {docs2epub-0.1.6.dist-info → docs2epub-0.1.7.dist-info}/RECORD +5 -5
- {docs2epub-0.1.6.dist-info → docs2epub-0.1.7.dist-info}/WHEEL +0 -0
- {docs2epub-0.1.6.dist-info → docs2epub-0.1.7.dist-info}/entry_points.txt +0 -0
docs2epub/docusaurus_next.py
CHANGED
|
@@ -88,6 +88,27 @@ def _extract_article(soup: BeautifulSoup) -> Tag:
|
|
|
88
88
|
raise RuntimeError("Could not find <article> in page HTML")
|
|
89
89
|
|
|
90
90
|
|
|
91
|
+
def _extract_canonical_url(soup: BeautifulSoup, *, base_url: str) -> str | None:
    """Return the first usable ``<link rel="canonical">`` URL found in *soup*.

    Args:
        soup: Parsed page to search for ``<link>`` elements carrying both
            ``href`` and ``rel`` attributes.
        base_url: URL the page was fetched from; a relative ``href`` is
            resolved against it with ``urljoin``.

    Returns:
        The absolute canonical URL, or ``None`` when no ``<link>`` declares a
        ``canonical`` relation with a non-empty ``http``/``https`` target.
    """
    for link in soup.find_all("link", href=True, rel=True):
        rel = link.get("rel")
        # ``rel`` is a space-separated token list. Depending on how the
        # document was parsed it arrives either already split (a list) or as
        # one raw string, so split the string form to let values such as
        # "canonical nofollow" match as well.
        rel_tokens = rel if isinstance(rel, list) else str(rel).split()
        if "canonical" not in {str(token).lower() for token in rel_tokens}:
            continue

        href = str(link.get("href") or "").strip()
        if not href:
            continue

        canonical = urljoin(base_url, href)
        # Skip targets that are not web URLs (e.g. mailto:, javascript:) and
        # keep looking at any later canonical links.
        if urlparse(canonical).scheme not in ("http", "https"):
            continue
        return canonical
    return None
|
|
110
|
+
|
|
111
|
+
|
|
91
112
|
def _canonicalize_url(url: str) -> str:
|
|
92
113
|
parsed = urlparse(url)
|
|
93
114
|
path = parsed.path or "/"
|
|
@@ -321,6 +342,17 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
|
|
|
321
342
|
return BeautifulSoup(resp.text, "lxml")
|
|
322
343
|
|
|
323
344
|
initial_soup = fetch_soup(url)
|
|
345
|
+
canonical = _extract_canonical_url(initial_soup, base_url=url)
|
|
346
|
+
if options.base_url is None and canonical:
|
|
347
|
+
start_origin = urlparse(url).netloc.lower()
|
|
348
|
+
canonical_origin = urlparse(canonical).netloc.lower()
|
|
349
|
+
if canonical_origin == start_origin:
|
|
350
|
+
canonical_key = _canonicalize_url(canonical)
|
|
351
|
+
if canonical_key != _canonicalize_url(url):
|
|
352
|
+
url = canonical
|
|
353
|
+
base_url = canonical
|
|
354
|
+
initial_soup = fetch_soup(url)
|
|
355
|
+
|
|
324
356
|
sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
|
|
325
357
|
initial_key = _canonicalize_url(url)
|
|
326
358
|
|
|
@@ -342,7 +374,12 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
|
|
|
342
374
|
return None
|
|
343
375
|
raise
|
|
344
376
|
|
|
345
|
-
|
|
377
|
+
try:
|
|
378
|
+
article = _extract_article(page_soup)
|
|
379
|
+
except RuntimeError:
|
|
380
|
+
if key != initial_key:
|
|
381
|
+
return None
|
|
382
|
+
raise
|
|
346
383
|
title_el = article.find(["h1", "h2"])
|
|
347
384
|
title = (
|
|
348
385
|
" ".join(title_el.get_text(" ", strip=True).split())
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
docs2epub/__init__.py,sha256=iccyEu4zlubhvd6pM7Z2Gjwn8tPw9IhZ4ABKhbiFjUY,54
|
|
2
2
|
docs2epub/cli.py,sha256=pt1crvrkr2k1ybf_p0m4xSYyoZVluFsDNGuwJ7CykYM,3863
|
|
3
|
-
docs2epub/docusaurus_next.py,sha256=
|
|
3
|
+
docs2epub/docusaurus_next.py,sha256=_PTPnObjYrR6Yxz9rFxooiJhzYlBpRutLMTG0SYshUg,12030
|
|
4
4
|
docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
|
|
5
5
|
docs2epub/kindle_html.py,sha256=LN0CGj9ap9b8iC_MlZcQLuhJ7FehZr_VbIfMOz78E5c,2297
|
|
6
6
|
docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
|
|
7
7
|
docs2epub/pandoc_epub2.py,sha256=l22-QAQcCgJyl7HF0_b5weC3qEGVQLwOhxdbAvd8C2o,3610
|
|
8
|
-
docs2epub-0.1.
|
|
9
|
-
docs2epub-0.1.
|
|
10
|
-
docs2epub-0.1.
|
|
11
|
-
docs2epub-0.1.
|
|
8
|
+
docs2epub-0.1.7.dist-info/METADATA,sha256=m8ea2WWxNYzeCXHnab2P19oHCQIULTtrTdYMmtnLES8,1886
|
|
9
|
+
docs2epub-0.1.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
10
|
+
docs2epub-0.1.7.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
|
|
11
|
+
docs2epub-0.1.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|