docs2epub 0.1.2__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

docs2epub/docusaurus_next.py

@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import re
 from dataclasses import dataclass
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 
 import requests
 from bs4 import BeautifulSoup, Tag
@@ -12,6 +12,48 @@ from .model import Chapter
 
 DEFAULT_USER_AGENT = "docs2epub/0.1 (+https://github.com/brenorb/docs2epub)"
 
+_SIDEBAR_SELECTORS = [
+    'aside[data-testid="table-of-contents"]',
+    "aside#table-of-contents",
+    'nav[aria-label="Table of contents"]',
+    'nav[aria-label="Table of Contents"]',
+    'nav[aria-label="Docs sidebar"]',
+    'nav[aria-label="Docs navigation"]',
+    'nav[aria-label="Documentation"]',
+    'nav[aria-label="Docs"]',
+    "aside.theme-doc-sidebar-container",
+    "div.theme-doc-sidebar-container",
+    "nav.theme-doc-sidebar-menu",
+    "nav.menu",
+    'nav[class*="menu"]',
+    'aside[class*="sidebar"]',
+    'nav[class*="sidebar"]',
+]
+
+_NON_DOC_EXTENSIONS = {
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".svg",
+    ".webp",
+    ".css",
+    ".js",
+    ".map",
+    ".json",
+    ".xml",
+    ".rss",
+    ".pdf",
+    ".zip",
+    ".tar",
+    ".gz",
+    ".tgz",
+    ".epub",
+    ".mp4",
+    ".mp3",
+    ".wav",
+}
+
 
 @dataclass(frozen=True)
 class DocusaurusNextOptions:
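
Not part of the diff: a quick sketch of what these two new constants are for. The HTML and paths below are invented, and only _SIDEBAR_SELECTORS and _NON_DOC_EXTENSIONS come from the module.

from bs4 import BeautifulSoup

snippet = BeautifulSoup(
    '<aside class="theme-doc-sidebar-container"><nav class="menu"></nav></aside>',
    "html.parser",
)

# The selectors target sidebar containers emitted by common Docusaurus-style
# themes; here both the <aside> and the <nav> match at least one selector.
assert {el.name for sel in _SIDEBAR_SELECTORS for el in snippet.select(sel)} == {"aside", "nav"}

# The extension set screens asset and feed links out of the crawl.
assert any("/img/logo.png".endswith(ext) for ext in _NON_DOC_EXTENSIONS)
assert not any("/docs/intro".endswith(ext) for ext in _NON_DOC_EXTENSIONS)
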
@@ -39,12 +81,190 @@ def _extract_article(soup: BeautifulSoup) -> Tag:
         article = main.find("article")
         if article:
             return article
+        return main
+    role_main = soup.find(attrs={"role": "main"})
+    if role_main:
+        return role_main
     raise RuntimeError("Could not find <article> in page HTML")
 
 
+def _canonicalize_url(url: str) -> str:
+    parsed = urlparse(url)
+    path = parsed.path or "/"
+    if path != "/" and path.endswith("/"):
+        path = path.rstrip("/")
+    return parsed._replace(
+        scheme=parsed.scheme.lower(),
+        netloc=parsed.netloc.lower(),
+        path=path,
+        query="",
+        fragment="",
+    ).geturl()
+
+
+def _infer_root_path(start_url: str) -> str:
+    parsed = urlparse(start_url)
+    path = (parsed.path or "").rstrip("/")
+    if not path:
+        return ""
+    parts = path.split("/")
+    if len(parts) <= 2:
+        return path
+    return "/".join(parts[:-1])
+
+
+def _path_within_root(path: str, root_path: str) -> bool:
+    if not root_path or root_path == "/":
+        return True
+    if path == root_path:
+        return True
+    root = root_path if root_path.endswith("/") else f"{root_path}/"
+    return path.startswith(root)
+
+
+def _is_probable_doc_link(url: str) -> bool:
+    parsed = urlparse(url)
+    path = (parsed.path or "").lower()
+    for ext in _NON_DOC_EXTENSIONS:
+        if path.endswith(ext):
+            return False
+    return True
+
+
+def _sidebar_candidates(soup: BeautifulSoup) -> list[Tag]:
+    seen: set[int] = set()
+    candidates: list[Tag] = []
+
+    for selector in _SIDEBAR_SELECTORS:
+        for el in soup.select(selector):
+            key = id(el)
+            if key in seen:
+                continue
+            seen.add(key)
+            candidates.append(el)
+
+    keywords = ["sidebar", "toc", "table of contents", "table-of-contents", "docs", "documentation"]
+    for el in soup.find_all(["nav", "aside", "div"]):
+        key = id(el)
+        if key in seen:
+            continue
+        label = str(el.get("aria-label") or "").lower()
+        elem_id = str(el.get("id") or "").lower()
+        data_testid = str(el.get("data-testid") or "").lower()
+        classes = " ".join(el.get("class", [])).lower()
+        haystack = " ".join([label, elem_id, data_testid, classes])
+        if any(k in haystack for k in keywords):
+            seen.add(key)
+            candidates.append(el)
+
+    return candidates
+
+
+def _looks_like_pager(container: Tag, links: list[Tag]) -> bool:
+    label = str(container.get("aria-label") or "").lower()
+    if "docs pages" in label or "breadcrumb" in label:
+        return True
+    if not links:
+        return True
+    texts = []
+    for a in links:
+        text = " ".join(a.get_text(" ", strip=True).split()).lower()
+        if text:
+            texts.append(text)
+    if not texts:
+        return False
+    pager_words = {"next", "previous", "prev", "back"}
+    return all(text in pager_words for text in texts)
+
+
+def _extract_sidebar_urls(
+    soup: BeautifulSoup,
+    *,
+    base_url: str,
+    start_url: str,
+) -> list[str]:
+    candidates = _sidebar_candidates(soup)
+    if not candidates:
+        return []
+
+    origin = urlparse(start_url).netloc.lower()
+    root_path = _infer_root_path(start_url)
+    best: list[str] = []
+    for container in candidates:
+        anchors = list(container.find_all("a", href=True))
+        if _looks_like_pager(container, anchors):
+            continue
+
+        urls: list[str] = []
+        seen: set[str] = set()
+        for a in anchors:
+            href = str(a.get("href") or "").strip()
+            if not href or href.startswith("#"):
+                continue
+            if href.startswith(("mailto:", "tel:", "javascript:")):
+                continue
+            abs_url = urljoin(base_url, href)
+            parsed = urlparse(abs_url)
+            if parsed.scheme not in ("http", "https"):
+                continue
+            if origin and parsed.netloc.lower() != origin:
+                continue
+            if not _is_probable_doc_link(abs_url):
+                continue
+            if not _path_within_root(parsed.path or "", root_path):
+                continue
+            canonical = _canonicalize_url(abs_url)
+            if canonical in seen:
+                continue
+            seen.add(canonical)
+            urls.append(canonical)
+
+        if len(urls) > len(best):
+            best = urls
+
+    return best
+
+
+def _extract_content_urls(
+    container: Tag,
+    *,
+    base_url: str,
+    start_url: str,
+) -> list[str]:
+    origin = urlparse(start_url).netloc.lower()
+    root_path = _infer_root_path(start_url)
+    urls: list[str] = []
+    seen: set[str] = set()
+
+    for a in container.find_all("a", href=True):
+        href = str(a.get("href") or "").strip()
+        if not href or href.startswith("#"):
+            continue
+        if href.startswith(("mailto:", "tel:", "javascript:")):
+            continue
+        abs_url = urljoin(base_url, href)
+        parsed = urlparse(abs_url)
+        if parsed.scheme not in ("http", "https"):
+            continue
+        if origin and parsed.netloc.lower() != origin:
+            continue
+        if not _is_probable_doc_link(abs_url):
+            continue
+        if not _path_within_root(parsed.path or "", root_path):
+            continue
+        canonical = _canonicalize_url(abs_url)
+        if canonical in seen:
+            continue
+        seen.add(canonical)
+        urls.append(canonical)
+
+    return urls
+
+
 def _remove_unwanted(article: Tag) -> None:
     for selector in [
         'nav[aria-label="Breadcrumbs"]',
+        'nav[aria-label="Breadcrumb"]',
         'nav[aria-label="Docs pages"]',
         "div.theme-doc-footer",
         "div.theme-doc-footer-edit-meta-row",
@@ -95,46 +315,98 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
     visited: set[str] = set()
     chapters: list[Chapter] = []
 
-    idx = 1
-    while True:
-        if options.max_pages is not None and idx > options.max_pages:
-            break
-
-        if url in visited:
-            break
-        visited.add(url)
-
-        resp = session.get(url, timeout=30)
+    def fetch_soup(target_url: str) -> BeautifulSoup:
+        resp = session.get(target_url, timeout=30)
         resp.raise_for_status()
-
-        soup = BeautifulSoup(resp.text, "lxml")
-        article = _extract_article(soup)
-
+        return BeautifulSoup(resp.text, "lxml")
+
+    initial_soup = fetch_soup(url)
+    sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
+    initial_key = _canonicalize_url(url)
+
+    def consume_page(target_url: str, *, soup: BeautifulSoup | None = None) -> Tag | None:
+        if options.max_pages is not None and len(chapters) >= options.max_pages:
+            return None
+        key = _canonicalize_url(target_url)
+        if key in visited:
+            return None
+        visited.add(key)
+
+        page_soup = soup
+        if page_soup is None:
+            try:
+                page_soup = fetch_soup(target_url)
+            except requests.HTTPError as exc:
+                status = exc.response.status_code if exc.response is not None else None
+                if status in {404, 410} and key != initial_key:
+                    return None
+                raise
+
+        article = _extract_article(page_soup)
         title_el = article.find(["h1", "h2"])
         title = (
-            " ".join(title_el.get_text(" ", strip=True).split()) if title_el else f"Chapter {idx}"
+            " ".join(title_el.get_text(" ", strip=True).split())
+            if title_el
+            else f"Chapter {len(chapters) + 1}"
         )
 
         _remove_unwanted(article)
-        _absolutize_urls(article, base_url=base_url)
+        _absolutize_urls(article, base_url=target_url)
 
         for a in list(article.select('a.hash-link[href^="#"]')):
            a.decompose()
 
         html = article.decode_contents()
+        chapters.append(Chapter(index=len(chapters) + 1, title=title, url=target_url, html=html))
 
-        chapters.append(Chapter(index=idx, title=title, url=url, html=html))
+        if options.sleep_s > 0 and (options.max_pages is None or len(chapters) < options.max_pages):
+            import time
 
-        next_url = _extract_next_url(soup, base_url=base_url)
-        if not next_url:
+            time.sleep(options.sleep_s)
+
+        return article
+
+    if sidebar_urls:
+        if initial_key not in {_canonicalize_url(u) for u in sidebar_urls}:
+            sidebar_urls.insert(0, url)
+        queue = list(sidebar_urls)
+        discovered = {_canonicalize_url(u) for u in queue}
+        idx = 0
+        while idx < len(queue):
+            if options.max_pages is not None and len(chapters) >= options.max_pages:
+                break
+            target_url = queue[idx]
+            use_soup = initial_soup if _canonicalize_url(target_url) == initial_key else None
+            article = consume_page(target_url, soup=use_soup)
+            if article is None:
+                idx += 1
+                continue
+            extra = _extract_content_urls(article, base_url=target_url, start_url=url)
+            for link in extra:
+                key = _canonicalize_url(link)
+                if key in discovered:
+                    continue
+                discovered.add(key)
+                queue.append(link)
+            idx += 1
+        return chapters
+
+    # Fallback: follow next/previous navigation.
+    current_url = url
+    soup = initial_soup
+    while True:
+        if options.max_pages is not None and len(chapters) >= options.max_pages:
            break
 
-        url = next_url
-        idx += 1
+        article = consume_page(current_url, soup=soup)
+        if article is None:
+            break
 
-        if options.sleep_s > 0:
-            import time
+        next_url = _extract_next_url(soup, base_url=base_url)
+        if not next_url:
+            break
 
-            time.sleep(options.sleep_s)
+        current_url = next_url
+        soup = fetch_soup(current_url)
 
     return chapters
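
Not part of the diff: in 0.1.2 this crawler only followed the next/previous pagination from the start page. In 0.1.6 it first harvests the sidebar, walks those pages in order while appending any newly discovered same-site doc links found in article bodies, and falls back to next/previous pagination only when no sidebar is detected. The skeleton below illustrates that queue walk; the names walk, fetch_page, and page_links are invented, and the real code additionally applies canonicalization, origin/root filtering, max_pages, sleep_s, and the 404/410 tolerance shown above.

def walk(sidebar_links: list[str], fetch_page, page_links) -> list[str]:
    queue = list(sidebar_links)        # sidebar order is preserved
    discovered = set(queue)
    visited: list[str] = []
    i = 0
    while i < len(queue):
        url = queue[i]
        i += 1
        page = fetch_page(url)
        visited.append(url)
        for link in page_links(page):  # in-content links extend the queue
            if link not in discovered:
                discovered.add(link)
                queue.append(link)
    return visited
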

docs2epub-0.1.2.dist-info/METADATA → docs2epub-0.1.6.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docs2epub
-Version: 0.1.2
+Version: 0.1.6
 Summary: Turn documentation sites into an EPUB (Kindle-friendly).
 Author: Breno Brito
 License: MIT

docs2epub-0.1.2.dist-info/RECORD → docs2epub-0.1.6.dist-info/RECORD

@@ -1,11 +1,11 @@
 docs2epub/__init__.py,sha256=iccyEu4zlubhvd6pM7Z2Gjwn8tPw9IhZ4ABKhbiFjUY,54
 docs2epub/cli.py,sha256=pt1crvrkr2k1ybf_p0m4xSYyoZVluFsDNGuwJ7CykYM,3863
-docs2epub/docusaurus_next.py,sha256=OybL3KPwDZvp2sOL3locE374Zb80M1kubx81SH2GxgQ,3378
+docs2epub/docusaurus_next.py,sha256=c4EZMo7E6zeuekbA3n_rF1joT3Km1cPY1HTimzUkMFg,10866
 docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
 docs2epub/kindle_html.py,sha256=LN0CGj9ap9b8iC_MlZcQLuhJ7FehZr_VbIfMOz78E5c,2297
 docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
 docs2epub/pandoc_epub2.py,sha256=l22-QAQcCgJyl7HF0_b5weC3qEGVQLwOhxdbAvd8C2o,3610
-docs2epub-0.1.2.dist-info/METADATA,sha256=UphwnhA-8wtH-DkX1gRRiu4Fu3ukmri0GUGdCFIcU70,1886
-docs2epub-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-docs2epub-0.1.2.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
-docs2epub-0.1.2.dist-info/RECORD,,
+docs2epub-0.1.6.dist-info/METADATA,sha256=Qu_OyYWfevaG_y7rCpUR81AT8aYK9Yo9KS4Cz_ZgDg8,1886
+docs2epub-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+docs2epub-0.1.6.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
+docs2epub-0.1.6.dist-info/RECORD,,