docs2epub 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs2epub/docusaurus_next.py +58 -1
- {docs2epub-0.1.6.dist-info → docs2epub-0.1.8.dist-info}/METADATA +1 -1
- {docs2epub-0.1.6.dist-info → docs2epub-0.1.8.dist-info}/RECORD +5 -5
- {docs2epub-0.1.6.dist-info → docs2epub-0.1.8.dist-info}/WHEEL +0 -0
- {docs2epub-0.1.6.dist-info → docs2epub-0.1.8.dist-info}/entry_points.txt +0 -0
docs2epub/docusaurus_next.py
CHANGED
|
@@ -85,9 +85,46 @@ def _extract_article(soup: BeautifulSoup) -> Tag:
|
|
|
85
85
|
role_main = soup.find(attrs={"role": "main"})
|
|
86
86
|
if role_main:
|
|
87
87
|
return role_main
|
|
88
|
+
for selector in [
|
|
89
|
+
"div#content",
|
|
90
|
+
"div.content",
|
|
91
|
+
"div#main",
|
|
92
|
+
"div.main",
|
|
93
|
+
"div#page",
|
|
94
|
+
"div.page",
|
|
95
|
+
"div.document",
|
|
96
|
+
"div#document",
|
|
97
|
+
]:
|
|
98
|
+
candidate = soup.select_one(selector)
|
|
99
|
+
if candidate:
|
|
100
|
+
return candidate
|
|
101
|
+
body = soup.find("body")
|
|
102
|
+
if body:
|
|
103
|
+
return body
|
|
88
104
|
raise RuntimeError("Could not find <article> in page HTML")
|
|
89
105
|
|
|
90
106
|
|
|
107
|
+
def _extract_canonical_url(soup: BeautifulSoup, *, base_url: str) -> str | None:
    """Return the first usable ``<link rel="canonical">`` URL found in *soup*.

    Every ``<link>`` element that carries both ``href`` and ``rel`` is
    inspected in document order. The first one whose ``rel`` contains the
    ``canonical`` token (case-insensitive) and whose ``href`` resolves to an
    ``http``/``https`` URL wins.

    Args:
        soup: Parsed page to search.
        base_url: URL that relative ``href`` values are resolved against.

    Returns:
        The absolute canonical URL, or ``None`` when no link qualifies.
    """
    for link in soup.find_all("link", href=True, rel=True):
        rel = link.get("rel")
        # ``rel`` is a space-separated token list. It may arrive already
        # split into a list or as one raw string, so split the string form
        # instead of comparing it as a single token.
        tokens = rel if isinstance(rel, list) else str(rel).split()
        if "canonical" not in {str(token).lower() for token in tokens}:
            continue

        href = str(link.get("href") or "").strip()
        if not href:
            continue

        canonical = urljoin(base_url, href)
        # Reject non-web targets such as ``mailto:`` or ``javascript:``.
        if urlparse(canonical).scheme not in ("http", "https"):
            continue
        return canonical
    return None
|
|
126
|
+
|
|
127
|
+
|
|
91
128
|
def _canonicalize_url(url: str) -> str:
|
|
92
129
|
parsed = urlparse(url)
|
|
93
130
|
path = parsed.path or "/"
|
|
@@ -321,6 +358,17 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
|
|
|
321
358
|
return BeautifulSoup(resp.text, "lxml")
|
|
322
359
|
|
|
323
360
|
initial_soup = fetch_soup(url)
|
|
361
|
+
canonical = _extract_canonical_url(initial_soup, base_url=url)
|
|
362
|
+
if options.base_url is None and canonical:
|
|
363
|
+
start_origin = urlparse(url).netloc.lower()
|
|
364
|
+
canonical_origin = urlparse(canonical).netloc.lower()
|
|
365
|
+
if canonical_origin == start_origin:
|
|
366
|
+
canonical_key = _canonicalize_url(canonical)
|
|
367
|
+
if canonical_key != _canonicalize_url(url):
|
|
368
|
+
url = canonical
|
|
369
|
+
base_url = canonical
|
|
370
|
+
initial_soup = fetch_soup(url)
|
|
371
|
+
|
|
324
372
|
sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
|
|
325
373
|
initial_key = _canonicalize_url(url)
|
|
326
374
|
|
|
@@ -342,13 +390,22 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
|
|
|
342
390
|
return None
|
|
343
391
|
raise
|
|
344
392
|
|
|
345
|
-
|
|
393
|
+
try:
|
|
394
|
+
article = _extract_article(page_soup)
|
|
395
|
+
except RuntimeError:
|
|
396
|
+
if key != initial_key:
|
|
397
|
+
return None
|
|
398
|
+
raise
|
|
346
399
|
title_el = article.find(["h1", "h2"])
|
|
347
400
|
title = (
|
|
348
401
|
" ".join(title_el.get_text(" ", strip=True).split())
|
|
349
402
|
if title_el
|
|
350
403
|
else f"Chapter {len(chapters) + 1}"
|
|
351
404
|
)
|
|
405
|
+
if title_el is None and article.name == "body":
|
|
406
|
+
body_text = " ".join(article.get_text(" ", strip=True).split())
|
|
407
|
+
if len(body_text) < 200:
|
|
408
|
+
return None
|
|
352
409
|
|
|
353
410
|
_remove_unwanted(article)
|
|
354
411
|
_absolutize_urls(article, base_url=target_url)
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
docs2epub/__init__.py,sha256=iccyEu4zlubhvd6pM7Z2Gjwn8tPw9IhZ4ABKhbiFjUY,54
|
|
2
2
|
docs2epub/cli.py,sha256=pt1crvrkr2k1ybf_p0m4xSYyoZVluFsDNGuwJ7CykYM,3863
|
|
3
|
-
docs2epub/docusaurus_next.py,sha256=
|
|
3
|
+
docs2epub/docusaurus_next.py,sha256=nQYkNecXgh4TsxaTydoiC1tVmIqjYiLiyEtYlpXGmXg,12507
|
|
4
4
|
docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
|
|
5
5
|
docs2epub/kindle_html.py,sha256=LN0CGj9ap9b8iC_MlZcQLuhJ7FehZr_VbIfMOz78E5c,2297
|
|
6
6
|
docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
|
|
7
7
|
docs2epub/pandoc_epub2.py,sha256=l22-QAQcCgJyl7HF0_b5weC3qEGVQLwOhxdbAvd8C2o,3610
|
|
8
|
-
docs2epub-0.1.
|
|
9
|
-
docs2epub-0.1.
|
|
10
|
-
docs2epub-0.1.
|
|
11
|
-
docs2epub-0.1.
|
|
8
|
+
docs2epub-0.1.8.dist-info/METADATA,sha256=KdwbHGiBRLuXLQKlTypnDH8eOogD5bDoSGDIJgNriZs,1886
|
|
9
|
+
docs2epub-0.1.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
10
|
+
docs2epub-0.1.8.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
|
|
11
|
+
docs2epub-0.1.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|