docs2epub-0.1.6-py3-none-any.whl → docs2epub-0.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -85,9 +85,46 @@ def _extract_article(soup: BeautifulSoup) -> Tag:
85
85
  role_main = soup.find(attrs={"role": "main"})
86
86
  if role_main:
87
87
  return role_main
88
+ for selector in [
89
+ "div#content",
90
+ "div.content",
91
+ "div#main",
92
+ "div.main",
93
+ "div#page",
94
+ "div.page",
95
+ "div.document",
96
+ "div#document",
97
+ ]:
98
+ candidate = soup.select_one(selector)
99
+ if candidate:
100
+ return candidate
101
+ body = soup.find("body")
102
+ if body:
103
+ return body
88
104
  raise RuntimeError("Could not find <article> in page HTML")
89
105
 
90
106
 
107
def _extract_canonical_url(soup: BeautifulSoup, *, base_url: str) -> str | None:
    """Return the page's declared canonical URL as an absolute http(s) URL.

    Scans ``<link>`` elements that carry both ``rel`` and ``href`` and returns
    the first one whose ``rel`` tokens include ``canonical`` (compared
    case-insensitively) and whose resolved target uses an http(s) scheme.

    Args:
        soup: Parsed document to search.
        base_url: URL against which a relative ``href`` is resolved.

    Returns:
        The resolved canonical URL, or ``None`` when no link qualifies.
    """
    for link in soup.find_all("link", href=True, rel=True):
        rel = link.get("rel")
        # ``rel`` is a list when the parser splits multi-valued attributes and
        # a plain string otherwise. Split the string form on whitespace so
        # that rel="canonical nofollow" is still recognised.
        tokens = rel if isinstance(rel, list) else str(rel).split()
        if "canonical" not in {str(token).strip().lower() for token in tokens}:
            continue

        href = str(link.get("href") or "").strip()
        if not href:
            continue

        canonical = urljoin(base_url, href)
        # Ignore targets that are not web pages (mailto:, javascript:, ...).
        if urlparse(canonical).scheme not in ("http", "https"):
            continue
        return canonical
    return None
126
+
127
+
91
128
  def _canonicalize_url(url: str) -> str:
92
129
  parsed = urlparse(url)
93
130
  path = parsed.path or "/"
@@ -321,6 +358,17 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
321
358
  return BeautifulSoup(resp.text, "lxml")
322
359
 
323
360
  initial_soup = fetch_soup(url)
361
+ canonical = _extract_canonical_url(initial_soup, base_url=url)
362
+ if options.base_url is None and canonical:
363
+ start_origin = urlparse(url).netloc.lower()
364
+ canonical_origin = urlparse(canonical).netloc.lower()
365
+ if canonical_origin == start_origin:
366
+ canonical_key = _canonicalize_url(canonical)
367
+ if canonical_key != _canonicalize_url(url):
368
+ url = canonical
369
+ base_url = canonical
370
+ initial_soup = fetch_soup(url)
371
+
324
372
  sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
325
373
  initial_key = _canonicalize_url(url)
326
374
 
@@ -342,13 +390,22 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
342
390
  return None
343
391
  raise
344
392
 
345
- article = _extract_article(page_soup)
393
+ try:
394
+ article = _extract_article(page_soup)
395
+ except RuntimeError:
396
+ if key != initial_key:
397
+ return None
398
+ raise
346
399
  title_el = article.find(["h1", "h2"])
347
400
  title = (
348
401
  " ".join(title_el.get_text(" ", strip=True).split())
349
402
  if title_el
350
403
  else f"Chapter {len(chapters) + 1}"
351
404
  )
405
+ if title_el is None and article.name == "body":
406
+ body_text = " ".join(article.get_text(" ", strip=True).split())
407
+ if len(body_text) < 200:
408
+ return None
352
409
 
353
410
  _remove_unwanted(article)
354
411
  _absolutize_urls(article, base_url=target_url)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docs2epub
3
- Version: 0.1.6
3
+ Version: 0.1.8
4
4
  Summary: Turn documentation sites into an EPUB (Kindle-friendly).
5
5
  Author: Breno Brito
6
6
  License: MIT
@@ -1,11 +1,11 @@
1
1
  docs2epub/__init__.py,sha256=iccyEu4zlubhvd6pM7Z2Gjwn8tPw9IhZ4ABKhbiFjUY,54
2
2
  docs2epub/cli.py,sha256=pt1crvrkr2k1ybf_p0m4xSYyoZVluFsDNGuwJ7CykYM,3863
3
- docs2epub/docusaurus_next.py,sha256=c4EZMo7E6zeuekbA3n_rF1joT3Km1cPY1HTimzUkMFg,10866
3
+ docs2epub/docusaurus_next.py,sha256=nQYkNecXgh4TsxaTydoiC1tVmIqjYiLiyEtYlpXGmXg,12507
4
4
  docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
5
5
  docs2epub/kindle_html.py,sha256=LN0CGj9ap9b8iC_MlZcQLuhJ7FehZr_VbIfMOz78E5c,2297
6
6
  docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
7
7
  docs2epub/pandoc_epub2.py,sha256=l22-QAQcCgJyl7HF0_b5weC3qEGVQLwOhxdbAvd8C2o,3610
8
- docs2epub-0.1.6.dist-info/METADATA,sha256=Qu_OyYWfevaG_y7rCpUR81AT8aYK9Yo9KS4Cz_ZgDg8,1886
9
- docs2epub-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
- docs2epub-0.1.6.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
11
- docs2epub-0.1.6.dist-info/RECORD,,
8
+ docs2epub-0.1.8.dist-info/METADATA,sha256=KdwbHGiBRLuXLQKlTypnDH8eOogD5bDoSGDIJgNriZs,1886
9
+ docs2epub-0.1.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ docs2epub-0.1.8.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
11
+ docs2epub-0.1.8.dist-info/RECORD,,