docs2epub 0.1.2__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in its public registry, and is provided for informational purposes only.
- docs2epub/docusaurus_next.py +297 -25
- {docs2epub-0.1.2.dist-info → docs2epub-0.1.6.dist-info}/METADATA +1 -1
- {docs2epub-0.1.2.dist-info → docs2epub-0.1.6.dist-info}/RECORD +5 -5
- {docs2epub-0.1.2.dist-info → docs2epub-0.1.6.dist-info}/WHEEL +0 -0
- {docs2epub-0.1.2.dist-info → docs2epub-0.1.6.dist-info}/entry_points.txt +0 -0
docs2epub/docusaurus_next.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import re
 from dataclasses import dataclass
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 
 import requests
 from bs4 import BeautifulSoup, Tag
@@ -12,6 +12,48 @@ from .model import Chapter
 
 DEFAULT_USER_AGENT = "docs2epub/0.1 (+https://github.com/brenorb/docs2epub)"
 
+_SIDEBAR_SELECTORS = [
+    'aside[data-testid="table-of-contents"]',
+    "aside#table-of-contents",
+    'nav[aria-label="Table of contents"]',
+    'nav[aria-label="Table of Contents"]',
+    'nav[aria-label="Docs sidebar"]',
+    'nav[aria-label="Docs navigation"]',
+    'nav[aria-label="Documentation"]',
+    'nav[aria-label="Docs"]',
+    "aside.theme-doc-sidebar-container",
+    "div.theme-doc-sidebar-container",
+    "nav.theme-doc-sidebar-menu",
+    "nav.menu",
+    'nav[class*="menu"]',
+    'aside[class*="sidebar"]',
+    'nav[class*="sidebar"]',
+]
+
+_NON_DOC_EXTENSIONS = {
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".svg",
+    ".webp",
+    ".css",
+    ".js",
+    ".map",
+    ".json",
+    ".xml",
+    ".rss",
+    ".pdf",
+    ".zip",
+    ".tar",
+    ".gz",
+    ".tgz",
+    ".epub",
+    ".mp4",
+    ".mp3",
+    ".wav",
+}
+
 
 @dataclass(frozen=True)
 class DocusaurusNextOptions:
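Taken together, these two constants drive the new sidebar discovery: each selector is tried against the fetched page to find a navigation container, and any link whose path ends in one of the listed extensions is discarded as a non-documentation asset. A minimal sketch of the selector matching, using an invented HTML fragment (the markup and the selector subset below are illustrative only):

```python
from bs4 import BeautifulSoup

# Hypothetical page fragment: only the <nav class="menu ..."> block should be
# picked up as a sidebar candidate; the breadcrumb nav matches no selector.
html = """
<body>
  <nav aria-label="Breadcrumbs"><a href="/docs">Docs</a></nav>
  <nav class="menu thin-scrollbar">
    <a href="/docs/intro">Introduction</a>
    <a href="/docs/guides/setup">Setup</a>
  </nav>
</body>
"""

soup = BeautifulSoup(html, "html.parser")
for selector in ('nav[aria-label="Docs sidebar"]', "nav.menu", 'nav[class*="menu"]'):
    for el in soup.select(selector):
        print(selector, "->", [a["href"] for a in el.find_all("a", href=True)])
# The same element matches both "nav.menu" and 'nav[class*="menu"]', which is
# why _sidebar_candidates() below de-duplicates candidates by id(el).
```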
@@ -39,12 +81,190 @@ def _extract_article(soup: BeautifulSoup) -> Tag:
         article = main.find("article")
         if article:
             return article
+        return main
+    role_main = soup.find(attrs={"role": "main"})
+    if role_main:
+        return role_main
     raise RuntimeError("Could not find <article> in page HTML")
 
 
+def _canonicalize_url(url: str) -> str:
+    parsed = urlparse(url)
+    path = parsed.path or "/"
+    if path != "/" and path.endswith("/"):
+        path = path.rstrip("/")
+    return parsed._replace(
+        scheme=parsed.scheme.lower(),
+        netloc=parsed.netloc.lower(),
+        path=path,
+        query="",
+        fragment="",
+    ).geturl()
+
+
+def _infer_root_path(start_url: str) -> str:
+    parsed = urlparse(start_url)
+    path = (parsed.path or "").rstrip("/")
+    if not path:
+        return ""
+    parts = path.split("/")
+    if len(parts) <= 2:
+        return path
+    return "/".join(parts[:-1])
+
+
+def _path_within_root(path: str, root_path: str) -> bool:
+    if not root_path or root_path == "/":
+        return True
+    if path == root_path:
+        return True
+    root = root_path if root_path.endswith("/") else f"{root_path}/"
+    return path.startswith(root)
+
+
+def _is_probable_doc_link(url: str) -> bool:
+    parsed = urlparse(url)
+    path = (parsed.path or "").lower()
+    for ext in _NON_DOC_EXTENSIONS:
+        if path.endswith(ext):
+            return False
+    return True
+
+
+def _sidebar_candidates(soup: BeautifulSoup) -> list[Tag]:
+    seen: set[int] = set()
+    candidates: list[Tag] = []
+
+    for selector in _SIDEBAR_SELECTORS:
+        for el in soup.select(selector):
+            key = id(el)
+            if key in seen:
+                continue
+            seen.add(key)
+            candidates.append(el)
+
+    keywords = ["sidebar", "toc", "table of contents", "table-of-contents", "docs", "documentation"]
+    for el in soup.find_all(["nav", "aside", "div"]):
+        key = id(el)
+        if key in seen:
+            continue
+        label = str(el.get("aria-label") or "").lower()
+        elem_id = str(el.get("id") or "").lower()
+        data_testid = str(el.get("data-testid") or "").lower()
+        classes = " ".join(el.get("class", [])).lower()
+        haystack = " ".join([label, elem_id, data_testid, classes])
+        if any(k in haystack for k in keywords):
+            seen.add(key)
+            candidates.append(el)
+
+    return candidates
+
+
+def _looks_like_pager(container: Tag, links: list[Tag]) -> bool:
+    label = str(container.get("aria-label") or "").lower()
+    if "docs pages" in label or "breadcrumb" in label:
+        return True
+    if not links:
+        return True
+    texts = []
+    for a in links:
+        text = " ".join(a.get_text(" ", strip=True).split()).lower()
+        if text:
+            texts.append(text)
+    if not texts:
+        return False
+    pager_words = {"next", "previous", "prev", "back"}
+    return all(text in pager_words for text in texts)
+
+
+def _extract_sidebar_urls(
+    soup: BeautifulSoup,
+    *,
+    base_url: str,
+    start_url: str,
+) -> list[str]:
+    candidates = _sidebar_candidates(soup)
+    if not candidates:
+        return []
+
+    origin = urlparse(start_url).netloc.lower()
+    root_path = _infer_root_path(start_url)
+    best: list[str] = []
+    for container in candidates:
+        anchors = list(container.find_all("a", href=True))
+        if _looks_like_pager(container, anchors):
+            continue
+
+        urls: list[str] = []
+        seen: set[str] = set()
+        for a in anchors:
+            href = str(a.get("href") or "").strip()
+            if not href or href.startswith("#"):
+                continue
+            if href.startswith(("mailto:", "tel:", "javascript:")):
+                continue
+            abs_url = urljoin(base_url, href)
+            parsed = urlparse(abs_url)
+            if parsed.scheme not in ("http", "https"):
+                continue
+            if origin and parsed.netloc.lower() != origin:
+                continue
+            if not _is_probable_doc_link(abs_url):
+                continue
+            if not _path_within_root(parsed.path or "", root_path):
+                continue
+            canonical = _canonicalize_url(abs_url)
+            if canonical in seen:
+                continue
+            seen.add(canonical)
+            urls.append(canonical)
+
+        if len(urls) > len(best):
+            best = urls
+
+    return best
+
+
+def _extract_content_urls(
+    container: Tag,
+    *,
+    base_url: str,
+    start_url: str,
+) -> list[str]:
+    origin = urlparse(start_url).netloc.lower()
+    root_path = _infer_root_path(start_url)
+    urls: list[str] = []
+    seen: set[str] = set()
+
+    for a in container.find_all("a", href=True):
+        href = str(a.get("href") or "").strip()
+        if not href or href.startswith("#"):
+            continue
+        if href.startswith(("mailto:", "tel:", "javascript:")):
+            continue
+        abs_url = urljoin(base_url, href)
+        parsed = urlparse(abs_url)
+        if parsed.scheme not in ("http", "https"):
+            continue
+        if origin and parsed.netloc.lower() != origin:
+            continue
+        if not _is_probable_doc_link(abs_url):
+            continue
+        if not _path_within_root(parsed.path or "", root_path):
+            continue
+        canonical = _canonicalize_url(abs_url)
+        if canonical in seen:
+            continue
+        seen.add(canonical)
+        urls.append(canonical)
+
+    return urls
+
+
 def _remove_unwanted(article: Tag) -> None:
     for selector in [
         'nav[aria-label="Breadcrumbs"]',
+        'nav[aria-label="Breadcrumb"]',
         'nav[aria-label="Docs pages"]',
         "div.theme-doc-footer",
         "div.theme-doc-footer-edit-meta-row",
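These helpers normalize and scope the discovered links: canonicalization lower-cases the scheme and host and drops trailing slashes, query strings and fragments; the crawl root is inferred as the parent of the start page; and only links under that root are kept. A small sketch of their behaviour for a hypothetical site (the URLs are invented, and importing the module-private names like this assumes docs2epub 0.1.6 is installed):

```python
# Illustrative only: the URLs are made up, and reaching into module-private
# helpers is just a convenient way to show what they return.
from docs2epub.docusaurus_next import (
    _canonicalize_url,
    _infer_root_path,
    _path_within_root,
)

start = "https://docs.example.com/docs/intro/"

# Lower-cased scheme/host; trailing slash, query and fragment stripped.
print(_canonicalize_url("HTTPS://Docs.Example.com/docs/intro/?utm=1#setup"))
# -> https://docs.example.com/docs/intro

# The crawl root is the parent of the start page.
root = _infer_root_path(start)
print(root)  # -> /docs

# Sibling documentation pages stay in scope; unrelated sections do not.
print(_path_within_root("/docs/guides/deploy", root))  # -> True
print(_path_within_root("/blog/2024/release", root))   # -> False
```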
@@ -95,46 +315,98 @@ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
     visited: set[str] = set()
     chapters: list[Chapter] = []
 
-
-
-        if options.max_pages is not None and idx > options.max_pages:
-            break
-
-        if url in visited:
-            break
-        visited.add(url)
-
-        resp = session.get(url, timeout=30)
+    def fetch_soup(target_url: str) -> BeautifulSoup:
+        resp = session.get(target_url, timeout=30)
         resp.raise_for_status()
-
-
-
-
+        return BeautifulSoup(resp.text, "lxml")
+
+    initial_soup = fetch_soup(url)
+    sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
+    initial_key = _canonicalize_url(url)
+
+    def consume_page(target_url: str, *, soup: BeautifulSoup | None = None) -> Tag | None:
+        if options.max_pages is not None and len(chapters) >= options.max_pages:
+            return None
+        key = _canonicalize_url(target_url)
+        if key in visited:
+            return None
+        visited.add(key)
+
+        page_soup = soup
+        if page_soup is None:
+            try:
+                page_soup = fetch_soup(target_url)
+            except requests.HTTPError as exc:
+                status = exc.response.status_code if exc.response is not None else None
+                if status in {404, 410} and key != initial_key:
+                    return None
+                raise
+
+        article = _extract_article(page_soup)
         title_el = article.find(["h1", "h2"])
         title = (
-            " ".join(title_el.get_text(" ", strip=True).split())
+            " ".join(title_el.get_text(" ", strip=True).split())
+            if title_el
+            else f"Chapter {len(chapters) + 1}"
         )
 
         _remove_unwanted(article)
-        _absolutize_urls(article, base_url=
+        _absolutize_urls(article, base_url=target_url)
 
         for a in list(article.select('a.hash-link[href^="#"]')):
             a.decompose()
 
         html = article.decode_contents()
+        chapters.append(Chapter(index=len(chapters) + 1, title=title, url=target_url, html=html))
 
-
+        if options.sleep_s > 0 and (options.max_pages is None or len(chapters) < options.max_pages):
+            import time
 
-
-
+            time.sleep(options.sleep_s)
+
+        return article
+
+    if sidebar_urls:
+        if initial_key not in {_canonicalize_url(u) for u in sidebar_urls}:
+            sidebar_urls.insert(0, url)
+        queue = list(sidebar_urls)
+        discovered = {_canonicalize_url(u) for u in queue}
+        idx = 0
+        while idx < len(queue):
+            if options.max_pages is not None and len(chapters) >= options.max_pages:
+                break
+            target_url = queue[idx]
+            use_soup = initial_soup if _canonicalize_url(target_url) == initial_key else None
+            article = consume_page(target_url, soup=use_soup)
+            if article is None:
+                idx += 1
+                continue
+            extra = _extract_content_urls(article, base_url=target_url, start_url=url)
+            for link in extra:
+                key = _canonicalize_url(link)
+                if key in discovered:
+                    continue
+                discovered.add(key)
+                queue.append(link)
+            idx += 1
+        return chapters
+
+    # Fallback: follow next/previous navigation.
+    current_url = url
+    soup = initial_soup
+    while True:
+        if options.max_pages is not None and len(chapters) >= options.max_pages:
             break
 
-
-
+        article = consume_page(current_url, soup=soup)
+        if article is None:
+            break
 
-
-
+        next_url = _extract_next_url(soup, base_url=base_url)
+        if not next_url:
+            break
 
-
+        current_url = next_url
+        soup = fetch_soup(current_url)
 
     return chapters
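The sidebar branch above is effectively a breadth-first crawl: the queue is seeded with the sidebar links (plus the start page), each consumed page may append in-content links the sidebar missed, and every URL is de-duplicated by its canonical form before being queued; the next/previous pager walk remains as a fallback when no sidebar is found. A standalone sketch of that queue discipline, where crawl() and fetch_links() are hypothetical stand-ins for the consume_page()/_extract_content_urls() pair:

```python
from collections.abc import Callable


def crawl(
    seeds: list[str],
    fetch_links: Callable[[str], list[str]],
    canonicalize: Callable[[str], str] = str.lower,
    max_pages: int | None = None,
) -> list[str]:
    """FIFO crawl with canonical-URL de-duplication, mirroring the sidebar branch."""
    queue = list(seeds)
    discovered = {canonicalize(u) for u in queue}
    visited: list[str] = []
    idx = 0
    while idx < len(queue):
        if max_pages is not None and len(visited) >= max_pages:
            break
        url = queue[idx]
        visited.append(url)
        for link in fetch_links(url):  # stand-in for consume_page + _extract_content_urls
            key = canonicalize(link)
            if key not in discovered:
                discovered.add(key)
                queue.append(link)
        idx += 1
    return visited


# Toy link graph: /intro points at a page the sidebar did not list.
graph = {"/intro": ["/guides/hidden", "/intro"], "/guides/setup": []}
print(crawl(["/intro", "/guides/setup"], lambda u: graph.get(u, [])))
# -> ['/intro', '/guides/setup', '/guides/hidden']
```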
{docs2epub-0.1.2.dist-info → docs2epub-0.1.6.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 docs2epub/__init__.py,sha256=iccyEu4zlubhvd6pM7Z2Gjwn8tPw9IhZ4ABKhbiFjUY,54
 docs2epub/cli.py,sha256=pt1crvrkr2k1ybf_p0m4xSYyoZVluFsDNGuwJ7CykYM,3863
-docs2epub/docusaurus_next.py,sha256=
+docs2epub/docusaurus_next.py,sha256=c4EZMo7E6zeuekbA3n_rF1joT3Km1cPY1HTimzUkMFg,10866
 docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
 docs2epub/kindle_html.py,sha256=LN0CGj9ap9b8iC_MlZcQLuhJ7FehZr_VbIfMOz78E5c,2297
 docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
 docs2epub/pandoc_epub2.py,sha256=l22-QAQcCgJyl7HF0_b5weC3qEGVQLwOhxdbAvd8C2o,3610
-docs2epub-0.1.
-docs2epub-0.1.
-docs2epub-0.1.
-docs2epub-0.1.
+docs2epub-0.1.6.dist-info/METADATA,sha256=Qu_OyYWfevaG_y7rCpUR81AT8aYK9Yo9KS4Cz_ZgDg8,1886
+docs2epub-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+docs2epub-0.1.6.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
+docs2epub-0.1.6.dist-info/RECORD,,
{docs2epub-0.1.2.dist-info → docs2epub-0.1.6.dist-info}/WHEEL
File without changes

{docs2epub-0.1.2.dist-info → docs2epub-0.1.6.dist-info}/entry_points.txt
File without changes