docs2epub 0.1.1__tar.gz → 0.1.5__tar.gz

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -0,0 +1,41 @@
1
+ # docs2epub AGENTS
2
+
3
+ This file documents local conventions for working on `docs2epub`.
4
+
5
+ **Focus**
6
+ - Keep the scraper general across doc-site frontends (GitBook, Docusaurus, and similar).
7
+ - Prefer resilient HTML heuristics over site-specific hacks.
8
+ - Fail gracefully when content is missing; surface actionable errors.
9
+
10
+ **Development Workflow**
11
+ - Use TDD for bug fixes and new behaviors. Add a failing test first.
12
+ - Prefer unit tests with `monkeypatch` and deterministic HTML fixtures.
13
+ - Keep tests fast and offline; use real network checks only for manual validation.
14
+ - Run tests with `uv run pytest -q`.
15
+
16
+ **Scraping Heuristics**
17
+ - Primary crawl: sidebar/index extraction.
18
+ - Expand index/category pages by collecting in-page content links.
19
+ - Fallback crawl: “Next” navigation when no sidebar is found.
20
+ - Normalize URLs: strip fragments and queries; lower-case scheme/host.
21
+ - Filter non-doc links by extension; avoid cross-origin URLs by default.
22
+ - Resolve relative URLs against the page URL, not the site root.
23
+
24
+ **Code Layout**
25
+ - Core crawler logic: `src/docs2epub/docusaurus_next.py`.
26
+ - EPUB generation: `src/docs2epub/epub.py` and `src/docs2epub/pandoc_epub2.py`.
27
+ - HTML cleanup: `src/docs2epub/kindle_html.py`.
28
+ - Tests live in `tests/`.
29
+
30
+ **Release Discipline**
31
+ - Bump version in `pyproject.toml` for user-visible changes.
32
+ - Run `uv lock` after bumping the version.
33
+ - Build artifacts with `uv build` before publishing.
34
+ - Publish with `uv publish` when explicitly requested.
35
+ - Do not commit generated EPUBs or other artifacts.
36
+
37
+ **Validation**
38
+ - Quick manual checks (optional):
39
+ - `uvx --from . docs2epub https://midl.gitbook.io/midl out.epub`
40
+ - `uvx --from . docs2epub https://tutorial.docusaurus.io/docs/intro out.epub`
41
+ - Clean up any generated files after validation.
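
The "Scraping Heuristics" above are implemented later in this diff in `src/docs2epub/docusaurus_next.py`; as a rough standalone sketch of the normalization and filtering rules (illustrative names, not the package's API):

```python
from urllib.parse import urljoin, urlparse

# A few extensions that almost never point at documentation pages (see the
# fuller _NON_DOC_EXTENSIONS set added in docusaurus_next.py below).
NON_DOC_EXTENSIONS = {".png", ".jpg", ".svg", ".css", ".js", ".pdf", ".zip"}


def normalize(url: str) -> str:
    """Strip fragment and query; lower-case scheme and host."""
    p = urlparse(url)
    return p._replace(
        scheme=p.scheme.lower(), netloc=p.netloc.lower(), query="", fragment=""
    ).geturl()


def is_probable_doc_link(href: str, page_url: str) -> bool:
    """Resolve relative to the page URL (not the site root), then filter."""
    absolute = urljoin(page_url, href)
    parsed = urlparse(absolute)
    if parsed.netloc.lower() != urlparse(page_url).netloc.lower():
        return False  # avoid cross-origin URLs by default
    return not any(parsed.path.lower().endswith(ext) for ext in NON_DOC_EXTENSIONS)
```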
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docs2epub
3
- Version: 0.1.1
3
+ Version: 0.1.5
4
4
  Summary: Turn documentation sites into an EPUB (Kindle-friendly).
5
5
  Author: Breno Brito
6
6
  License: MIT
@@ -34,6 +34,11 @@ uv run docs2epub --help
34
34
  ### uvx (no install)
35
35
 
36
36
  ```bash
37
+ uvx docs2epub \
38
+ https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
39
+ tech-interview-handbook.epub
40
+
41
+ # Optional (override inferred metadata)
37
42
  uvx docs2epub \
38
43
  https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
39
44
  tech-interview-handbook.epub \
@@ -18,6 +18,11 @@ uv run docs2epub --help
18
18
  ### uvx (no install)
19
19
 
20
20
  ```bash
21
+ uvx docs2epub \
22
+ https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
23
+ tech-interview-handbook.epub
24
+
25
+ # Optional (override inferred metadata)
21
26
  uvx docs2epub \
22
27
  https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
23
28
  tech-interview-handbook.epub \
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docs2epub"
3
- version = "0.1.1"
3
+ version = "0.1.5"
4
4
  description = "Turn documentation sites into an EPUB (Kindle-friendly)."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -2,4 +2,4 @@ __all__ = [
2
2
  "__version__",
3
3
  ]
4
4
 
5
- __version__ = "0.1.1"
5
+ __version__ = "0.1.2"
@@ -2,10 +2,20 @@ from __future__ import annotations
2
2
 
3
3
  import argparse
4
4
  from pathlib import Path
5
+ from urllib.parse import urlparse
5
6
 
6
7
  from .docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
7
8
  from .epub import EpubMetadata, build_epub
8
- from .pandoc_epub2 import build_epub2_with_pandoc
9
+ from .pandoc_epub2 import PandocEpub2Options, build_epub2_with_pandoc
10
+
11
+
12
+ def _infer_defaults(start_url: str) -> tuple[str, str, str]:
13
+ parsed = urlparse(start_url)
14
+ host = parsed.netloc or "docs"
15
+ title = host
16
+ author = host
17
+ language = "en"
18
+ return title, author, language
9
19
 
10
20
 
11
21
  def _build_parser() -> argparse.ArgumentParser:
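
For context, the inferred defaults above are purely host-based: title and author both fall back to the start URL's host, and language falls back to "en". A tiny mirror of `_infer_defaults` (a hypothetical standalone copy, shown only to make the behaviour concrete):

```python
from urllib.parse import urlparse


def infer_defaults(start_url: str) -> tuple[str, str, str]:
    # Same shape as the _infer_defaults added above: (title, author, language).
    host = urlparse(start_url).netloc or "docs"
    return host, host, "en"


assert infer_defaults("https://tutorial.docusaurus.io/docs/intro") == (
    "tutorial.docusaurus.io",
    "tutorial.docusaurus.io",
    "en",
)
```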
@@ -49,9 +59,9 @@ def _build_parser() -> argparse.ArgumentParser:
49
59
  p.add_argument("--max-pages", type=int, default=None)
50
60
  p.add_argument("--sleep-s", type=float, default=0.5)
51
61
 
52
- p.add_argument("--title", required=True)
53
- p.add_argument("--author", required=True)
54
- p.add_argument("--language", default="en")
62
+ p.add_argument("--title", default=None)
63
+ p.add_argument("--author", default=None)
64
+ p.add_argument("--language", default=None)
55
65
  p.add_argument("--identifier", default=None)
56
66
  p.add_argument("--publisher", default=None)
57
67
 
@@ -62,6 +72,19 @@ def _build_parser() -> argparse.ArgumentParser:
62
72
  help="Output format. Default: epub2 (Kindle-friendly).",
63
73
  )
64
74
 
75
+ p.add_argument(
76
+ "--keep-images",
77
+ action="store_true",
78
+ help="Keep and embed remote images (may be slower and can trigger fetch warnings).",
79
+ )
80
+
81
+ p.add_argument(
82
+ "-v",
83
+ "--verbose",
84
+ action="store_true",
85
+ help="Verbose output (shows full pandoc warnings).",
86
+ )
87
+
65
88
  return p
66
89
 
67
90
 
@@ -72,10 +95,13 @@ def main(argv: list[str] | None = None) -> int:
72
95
  out_value = args.out or args.out_pos
73
96
 
74
97
  if not start_url or not out_value:
75
- raise SystemExit(
76
- "Usage: docs2epub <START_URL> <OUT.epub> --title ... --author ...\n"
77
- "(or use --start-url/--out flags)"
78
- )
98
+ raise SystemExit("Usage: docs2epub <START_URL> <OUT.epub> [options]")
99
+
100
+ inferred_title, inferred_author, inferred_language = _infer_defaults(start_url)
101
+
102
+ title = args.title or inferred_title
103
+ author = args.author or inferred_author
104
+ language = args.language or inferred_language
79
105
 
80
106
  options = DocusaurusNextOptions(
81
107
  start_url=start_url,
@@ -86,26 +112,27 @@ def main(argv: list[str] | None = None) -> int:
86
112
 
87
113
  chapters = iter_docusaurus_next(options)
88
114
  if not chapters:
89
- raise SystemExit("No chapters scraped (did not find article content).")
115
+ raise SystemExit("No pages scraped (did not find article content).")
90
116
 
91
- out_path: Path
92
117
  out_path_value = Path(out_value)
93
118
 
94
119
  if args.format == "epub2":
95
120
  out_path = build_epub2_with_pandoc(
96
121
  chapters=chapters,
97
122
  out_file=out_path_value,
98
- title=args.title,
99
- author=args.author,
100
- language=args.language,
123
+ title=title,
124
+ author=author,
125
+ language=language,
101
126
  publisher=args.publisher,
102
127
  identifier=args.identifier,
128
+ verbose=args.verbose,
129
+ options=PandocEpub2Options(keep_images=args.keep_images),
103
130
  )
104
131
  else:
105
132
  meta = EpubMetadata(
106
- title=args.title,
107
- author=args.author,
108
- language=args.language,
133
+ title=title,
134
+ author=author,
135
+ language=language,
109
136
  identifier=args.identifier,
110
137
  publisher=args.publisher,
111
138
  )
@@ -116,6 +143,7 @@ def main(argv: list[str] | None = None) -> int:
116
143
  meta=meta,
117
144
  )
118
145
 
146
+ size_mb = out_path.stat().st_size / (1024 * 1024)
119
147
  print(f"Scraped {len(chapters)} pages")
120
- print(f"EPUB written to: {out_path.resolve()}")
148
+ print(f"EPUB written to: {out_path.resolve()} ({size_mb:.2f} MB)")
121
149
  return 0
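
Put together, the revised `main()` amounts to the flow below. This is a sketch that assumes the keyword signatures visible in this diff (`iter_docusaurus_next`, `build_epub2_with_pandoc`, `PandocEpub2Options`), requires `pandoc` on PATH plus network access, and is not a documented public API:

```python
from pathlib import Path

from docs2epub.docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
from docs2epub.pandoc_epub2 import PandocEpub2Options, build_epub2_with_pandoc

start_url = "https://tutorial.docusaurus.io/docs/intro"  # example start page
chapters = iter_docusaurus_next(DocusaurusNextOptions(start_url=start_url, max_pages=5))

out = build_epub2_with_pandoc(
    chapters=chapters,
    out_file=Path("out.epub"),
    title="tutorial.docusaurus.io",   # same host-based defaults the CLI infers
    author="tutorial.docusaurus.io",
    language="en",
    publisher=None,
    identifier=None,
    verbose=False,
    options=PandocEpub2Options(keep_images=False),
)
print(f"{out} ({out.stat().st_size / (1024 * 1024):.2f} MB)")
```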
@@ -0,0 +1,404 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from urllib.parse import urljoin, urlparse
6
+
7
+ import requests
8
+ from bs4 import BeautifulSoup, Tag
9
+
10
+ from .model import Chapter
11
+
12
+
13
+ DEFAULT_USER_AGENT = "docs2epub/0.1 (+https://github.com/brenorb/docs2epub)"
14
+
15
+ _SIDEBAR_SELECTORS = [
16
+ 'aside[data-testid="table-of-contents"]',
17
+ "aside#table-of-contents",
18
+ 'nav[aria-label="Table of contents"]',
19
+ 'nav[aria-label="Table of Contents"]',
20
+ 'nav[aria-label="Docs sidebar"]',
21
+ 'nav[aria-label="Docs navigation"]',
22
+ 'nav[aria-label="Documentation"]',
23
+ 'nav[aria-label="Docs"]',
24
+ "aside.theme-doc-sidebar-container",
25
+ "div.theme-doc-sidebar-container",
26
+ "nav.theme-doc-sidebar-menu",
27
+ "nav.menu",
28
+ 'nav[class*="menu"]',
29
+ 'aside[class*="sidebar"]',
30
+ 'nav[class*="sidebar"]',
31
+ ]
32
+
33
+ _NON_DOC_EXTENSIONS = {
34
+ ".png",
35
+ ".jpg",
36
+ ".jpeg",
37
+ ".gif",
38
+ ".svg",
39
+ ".webp",
40
+ ".css",
41
+ ".js",
42
+ ".map",
43
+ ".json",
44
+ ".xml",
45
+ ".rss",
46
+ ".pdf",
47
+ ".zip",
48
+ ".tar",
49
+ ".gz",
50
+ ".tgz",
51
+ ".epub",
52
+ ".mp4",
53
+ ".mp3",
54
+ ".wav",
55
+ }
56
+
57
+
58
+ @dataclass(frozen=True)
59
+ class DocusaurusNextOptions:
60
+ start_url: str
61
+ base_url: str | None = None
62
+ max_pages: int | None = None
63
+ sleep_s: float = 0.5
64
+ user_agent: str = DEFAULT_USER_AGENT
65
+
66
+
67
+ def _slugify_filename(text: str) -> str:
68
+ value = text.strip().lower()
69
+ value = re.sub(r"[^\w\s-]", "", value)
70
+ value = re.sub(r"[\s_-]+", "-", value)
71
+ value = value.strip("-")
72
+ return value or "chapter"
73
+
74
+
75
+ def _extract_article(soup: BeautifulSoup) -> Tag:
76
+ article = soup.find("article")
77
+ if article:
78
+ return article
79
+ main = soup.find("main")
80
+ if main:
81
+ article = main.find("article")
82
+ if article:
83
+ return article
84
+ return main
85
+ role_main = soup.find(attrs={"role": "main"})
86
+ if role_main:
87
+ return role_main
88
+ raise RuntimeError("Could not find <article> in page HTML")
89
+
90
+
91
+ def _canonicalize_url(url: str) -> str:
92
+ parsed = urlparse(url)
93
+ path = parsed.path or "/"
94
+ if path != "/" and path.endswith("/"):
95
+ path = path.rstrip("/")
96
+ return parsed._replace(
97
+ scheme=parsed.scheme.lower(),
98
+ netloc=parsed.netloc.lower(),
99
+ path=path,
100
+ query="",
101
+ fragment="",
102
+ ).geturl()
103
+
104
+
105
+ def _infer_root_path(start_url: str) -> str:
106
+ parsed = urlparse(start_url)
107
+ path = (parsed.path or "").rstrip("/")
108
+ if not path:
109
+ return ""
110
+ parts = path.split("/")
111
+ if len(parts) <= 2:
112
+ return path
113
+ return "/".join(parts[:-1])
114
+
115
+
116
+ def _path_within_root(path: str, root_path: str) -> bool:
117
+ if not root_path or root_path == "/":
118
+ return True
119
+ if path == root_path:
120
+ return True
121
+ root = root_path if root_path.endswith("/") else f"{root_path}/"
122
+ return path.startswith(root)
123
+
124
+
125
+ def _is_probable_doc_link(url: str) -> bool:
126
+ parsed = urlparse(url)
127
+ path = (parsed.path or "").lower()
128
+ for ext in _NON_DOC_EXTENSIONS:
129
+ if path.endswith(ext):
130
+ return False
131
+ return True
132
+
133
+
134
+ def _sidebar_candidates(soup: BeautifulSoup) -> list[Tag]:
135
+ seen: set[int] = set()
136
+ candidates: list[Tag] = []
137
+
138
+ for selector in _SIDEBAR_SELECTORS:
139
+ for el in soup.select(selector):
140
+ key = id(el)
141
+ if key in seen:
142
+ continue
143
+ seen.add(key)
144
+ candidates.append(el)
145
+
146
+ keywords = ["sidebar", "toc", "table of contents", "table-of-contents", "docs", "documentation"]
147
+ for el in soup.find_all(["nav", "aside", "div"]):
148
+ key = id(el)
149
+ if key in seen:
150
+ continue
151
+ label = str(el.get("aria-label") or "").lower()
152
+ elem_id = str(el.get("id") or "").lower()
153
+ data_testid = str(el.get("data-testid") or "").lower()
154
+ classes = " ".join(el.get("class", [])).lower()
155
+ haystack = " ".join([label, elem_id, data_testid, classes])
156
+ if any(k in haystack for k in keywords):
157
+ seen.add(key)
158
+ candidates.append(el)
159
+
160
+ return candidates
161
+
162
+
163
+ def _looks_like_pager(container: Tag, links: list[Tag]) -> bool:
164
+ label = str(container.get("aria-label") or "").lower()
165
+ if "docs pages" in label or "breadcrumb" in label:
166
+ return True
167
+ if not links:
168
+ return True
169
+ texts = []
170
+ for a in links:
171
+ text = " ".join(a.get_text(" ", strip=True).split()).lower()
172
+ if text:
173
+ texts.append(text)
174
+ if not texts:
175
+ return False
176
+ pager_words = {"next", "previous", "prev", "back"}
177
+ return all(text in pager_words for text in texts)
178
+
179
+
180
+ def _extract_sidebar_urls(
181
+ soup: BeautifulSoup,
182
+ *,
183
+ base_url: str,
184
+ start_url: str,
185
+ ) -> list[str]:
186
+ candidates = _sidebar_candidates(soup)
187
+ if not candidates:
188
+ return []
189
+
190
+ origin = urlparse(start_url).netloc.lower()
191
+ root_path = _infer_root_path(start_url)
192
+ best: list[str] = []
193
+ for container in candidates:
194
+ anchors = list(container.find_all("a", href=True))
195
+ if _looks_like_pager(container, anchors):
196
+ continue
197
+
198
+ urls: list[str] = []
199
+ seen: set[str] = set()
200
+ for a in anchors:
201
+ href = str(a.get("href") or "").strip()
202
+ if not href or href.startswith("#"):
203
+ continue
204
+ if href.startswith(("mailto:", "tel:", "javascript:")):
205
+ continue
206
+ abs_url = urljoin(base_url, href)
207
+ parsed = urlparse(abs_url)
208
+ if parsed.scheme not in ("http", "https"):
209
+ continue
210
+ if origin and parsed.netloc.lower() != origin:
211
+ continue
212
+ if not _is_probable_doc_link(abs_url):
213
+ continue
214
+ if not _path_within_root(parsed.path or "", root_path):
215
+ continue
216
+ canonical = _canonicalize_url(abs_url)
217
+ if canonical in seen:
218
+ continue
219
+ seen.add(canonical)
220
+ urls.append(canonical)
221
+
222
+ if len(urls) > len(best):
223
+ best = urls
224
+
225
+ return best
226
+
227
+
228
+ def _extract_content_urls(
229
+ container: Tag,
230
+ *,
231
+ base_url: str,
232
+ start_url: str,
233
+ ) -> list[str]:
234
+ origin = urlparse(start_url).netloc.lower()
235
+ root_path = _infer_root_path(start_url)
236
+ urls: list[str] = []
237
+ seen: set[str] = set()
238
+
239
+ for a in container.find_all("a", href=True):
240
+ href = str(a.get("href") or "").strip()
241
+ if not href or href.startswith("#"):
242
+ continue
243
+ if href.startswith(("mailto:", "tel:", "javascript:")):
244
+ continue
245
+ abs_url = urljoin(base_url, href)
246
+ parsed = urlparse(abs_url)
247
+ if parsed.scheme not in ("http", "https"):
248
+ continue
249
+ if origin and parsed.netloc.lower() != origin:
250
+ continue
251
+ if not _is_probable_doc_link(abs_url):
252
+ continue
253
+ if not _path_within_root(parsed.path or "", root_path):
254
+ continue
255
+ canonical = _canonicalize_url(abs_url)
256
+ if canonical in seen:
257
+ continue
258
+ seen.add(canonical)
259
+ urls.append(canonical)
260
+
261
+ return urls
262
+
263
+
264
+ def _remove_unwanted(article: Tag) -> None:
265
+ for selector in [
266
+ 'nav[aria-label="Breadcrumbs"]',
267
+ 'nav[aria-label="Breadcrumb"]',
268
+ 'nav[aria-label="Docs pages"]',
269
+ "div.theme-doc-footer",
270
+ "div.theme-doc-footer-edit-meta-row",
271
+ "div.theme-doc-version-badge",
272
+ "script",
273
+ "style",
274
+ "noscript",
275
+ "iframe",
276
+ "svg",
277
+ "button",
278
+ ]:
279
+ for el in list(article.select(selector)):
280
+ el.decompose()
281
+
282
+
283
+ def _absolutize_urls(container: Tag, base_url: str) -> None:
284
+ for el in container.find_all(True):
285
+ if el.has_attr("href"):
286
+ href = str(el.get("href") or "")
287
+ if href.startswith("/"):
288
+ el["href"] = urljoin(base_url, href)
289
+ if el.has_attr("src"):
290
+ src = str(el.get("src") or "")
291
+ if src.startswith("/"):
292
+ el["src"] = urljoin(base_url, src)
293
+
294
+
295
+ def _extract_next_url(soup: BeautifulSoup, base_url: str) -> str | None:
296
+ nav = soup.select_one('nav[aria-label="Docs pages"]')
297
+ if not nav:
298
+ return None
299
+
300
+ for a in nav.find_all("a", href=True):
301
+ text = " ".join(a.get_text(" ", strip=True).split())
302
+ if text.lower().startswith("next"):
303
+ return urljoin(base_url, a["href"])
304
+
305
+ return None
306
+
307
+
308
+ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
309
+ session = requests.Session()
310
+ session.headers.update({"User-Agent": options.user_agent})
311
+
312
+ url = options.start_url
313
+ base_url = options.base_url or options.start_url
314
+
315
+ visited: set[str] = set()
316
+ chapters: list[Chapter] = []
317
+
318
+ def fetch_soup(target_url: str) -> BeautifulSoup:
319
+ resp = session.get(target_url, timeout=30)
320
+ resp.raise_for_status()
321
+ return BeautifulSoup(resp.text, "lxml")
322
+
323
+ initial_soup = fetch_soup(url)
324
+ sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
325
+ initial_key = _canonicalize_url(url)
326
+
327
+ def consume_page(target_url: str, *, soup: BeautifulSoup | None = None) -> Tag | None:
328
+ if options.max_pages is not None and len(chapters) >= options.max_pages:
329
+ return None
330
+ key = _canonicalize_url(target_url)
331
+ if key in visited:
332
+ return None
333
+ visited.add(key)
334
+
335
+ page_soup = soup if soup is not None else fetch_soup(target_url)
336
+
337
+ article = _extract_article(page_soup)
338
+ title_el = article.find(["h1", "h2"])
339
+ title = (
340
+ " ".join(title_el.get_text(" ", strip=True).split())
341
+ if title_el
342
+ else f"Chapter {len(chapters) + 1}"
343
+ )
344
+
345
+ _remove_unwanted(article)
346
+ _absolutize_urls(article, base_url=target_url)
347
+
348
+ for a in list(article.select('a.hash-link[href^="#"]')):
349
+ a.decompose()
350
+
351
+ html = article.decode_contents()
352
+ chapters.append(Chapter(index=len(chapters) + 1, title=title, url=target_url, html=html))
353
+
354
+ if options.sleep_s > 0 and (options.max_pages is None or len(chapters) < options.max_pages):
355
+ import time
356
+
357
+ time.sleep(options.sleep_s)
358
+
359
+ return article
360
+
361
+ if sidebar_urls:
362
+ if initial_key not in {_canonicalize_url(u) for u in sidebar_urls}:
363
+ sidebar_urls.insert(0, url)
364
+ queue = list(sidebar_urls)
365
+ discovered = {_canonicalize_url(u) for u in queue}
366
+ idx = 0
367
+ while idx < len(queue):
368
+ if options.max_pages is not None and len(chapters) >= options.max_pages:
369
+ break
370
+ target_url = queue[idx]
371
+ use_soup = initial_soup if _canonicalize_url(target_url) == initial_key else None
372
+ article = consume_page(target_url, soup=use_soup)
373
+ if article is None:
374
+ idx += 1
375
+ continue
376
+ extra = _extract_content_urls(article, base_url=target_url, start_url=url)
377
+ for link in extra:
378
+ key = _canonicalize_url(link)
379
+ if key in discovered:
380
+ continue
381
+ discovered.add(key)
382
+ queue.append(link)
383
+ idx += 1
384
+ return chapters
385
+
386
+ # Fallback: follow next/previous navigation.
387
+ current_url = url
388
+ soup = initial_soup
389
+ while True:
390
+ if options.max_pages is not None and len(chapters) >= options.max_pages:
391
+ break
392
+
393
+ article = consume_page(current_url, soup=soup)
394
+ if article is None:
395
+ break
396
+
397
+ next_url = _extract_next_url(soup, base_url=base_url)
398
+ if not next_url:
399
+ break
400
+
401
+ current_url = next_url
402
+ soup = fetch_soup(current_url)
403
+
404
+ return chapters
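
Two of the private helpers above encode the less obvious rules: `_canonicalize_url` lower-cases scheme and host, drops query/fragment, and trims a trailing slash (path case is preserved), while `_infer_root_path` keeps a shallow start path as-is but otherwise scopes the crawl to the parent of the start page. Illustrative checks (these are underscore-prefixed helpers, so treat them as internals):

```python
from docs2epub.docusaurus_next import (
    _canonicalize_url,
    _infer_root_path,
    _path_within_root,
)

assert _canonicalize_url("https://Example.COM/Docs/intro/?x=1#setup") == "https://example.com/Docs/intro"
assert _infer_root_path("https://example.com/docs/intro") == "/docs"   # crawl stays under /docs
assert _infer_root_path("https://example.com/intro") == "/intro"       # shallow path kept as-is
assert _path_within_root("/docs/guides/setup", "/docs")
assert not _path_within_root("/blog/post", "/docs")
```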
@@ -5,37 +5,45 @@ import re
5
5
  from bs4 import BeautifulSoup
6
6
 
7
7
 
8
- def clean_html_for_kindle_epub2(html_fragment: str) -> str:
8
+ def clean_html_for_kindle_epub2(
9
+ html_fragment: str,
10
+ *,
11
+ keep_images: bool,
12
+ ) -> str:
9
13
  """Best-effort HTML cleanup for Kindle-friendly EPUB2.
10
14
 
11
15
  This is intentionally conservative: it strips known-problematic attributes
12
16
  and tags that commonly cause Send-to-Kindle conversion issues.
17
+
18
+ By default we drop remote images to avoid pandoc fetch failures.
13
19
  """
14
20
 
15
21
  soup = BeautifulSoup(html_fragment, "lxml")
16
22
 
23
+ if not keep_images:
24
+ for img in list(soup.find_all("img")):
25
+ src = str(img.get("src") or "")
26
+ if src.startswith("http://") or src.startswith("https://"):
27
+ img.decompose()
28
+
17
29
  # EPUB2: <u> tag isn't consistently supported; convert to a span.
18
30
  for u in list(soup.find_all("u")):
19
31
  span = soup.new_tag("span")
20
32
  span["style"] = "text-decoration: underline;"
21
- span.string = u.get_text() if u.string is None else u.string
22
33
  if u.string is None:
23
- # Keep children by moving them into the span.
24
34
  for child in list(u.contents):
25
35
  span.append(child)
36
+ else:
37
+ span.string = u.string
26
38
  u.replace_with(span)
27
39
 
28
40
  # Remove tabindex attributes (not allowed in EPUB2 XHTML).
29
41
  for el in soup.find_all(attrs={"tabindex": True}):
30
- try:
31
- del el["tabindex"]
32
- except KeyError:
33
- pass
42
+ el.attrs.pop("tabindex", None)
34
43
 
35
44
  # Remove start attribute from ordered lists (not allowed in EPUB2 XHTML).
36
45
  for ol in soup.find_all("ol"):
37
- if ol.has_attr("start"):
38
- del ol["start"]
46
+ ol.attrs.pop("start", None)
39
47
 
40
48
  # Strip duplicate ids in a simple way: if an id repeats, rename it.
41
49
  seen_ids: set[str] = set()
@@ -54,7 +62,6 @@ def clean_html_for_kindle_epub2(html_fragment: str) -> str:
54
62
  el["id"] = new_id
55
63
  seen_ids.add(new_id)
56
64
 
57
- # Remove empty fragment links that point to missing ids (best-effort).
58
65
  # If href="#something" but no element has id="something", drop href.
59
66
  all_ids = {str(el.get("id")) for el in soup.find_all(attrs={"id": True})}
60
67
  for a in soup.find_all("a", href=True):
@@ -62,9 +69,9 @@ def clean_html_for_kindle_epub2(html_fragment: str) -> str:
62
69
  if href.startswith("#") and len(href) > 1:
63
70
  frag = href[1:]
64
71
  if frag not in all_ids:
65
- del a["href"]
72
+ a.attrs.pop("href", None)
66
73
 
67
- # Normalize weird whitespace artifacts.
74
+ # Normalize whitespace a bit (helps keep diffs smaller and reduces odd output).
68
75
  text = str(soup)
69
76
  text = re.sub(r"\s+", " ", text)
70
77
  return text.strip()
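
With the signature change above, callers now choose the image policy explicitly. A quick check of the default-off behaviour (assertions only; the exact serialized markup also goes through the whitespace normalization at the end of the function):

```python
from docs2epub.kindle_html import clean_html_for_kindle_epub2

html = (
    '<p tabindex="0">intro</p>'
    '<img src="https://example.com/fig.png"/>'
    '<ol start="3"><li><u>item</u></li></ol>'
)
cleaned = clean_html_for_kindle_epub2(html, keep_images=False)

assert "img" not in cleaned                       # remote images dropped by default
assert "tabindex" not in cleaned                  # EPUB2-invalid attribute removed
assert "start=" not in cleaned                    # <ol start> stripped
assert "text-decoration: underline" in cleaned    # <u> rewritten as a styled <span>
```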
@@ -15,7 +15,8 @@ from .model import Chapter
15
15
  class PandocEpub2Options:
16
16
  toc: bool = True
17
17
  toc_depth: int = 2
18
- chapter_level: int = 1
18
+ split_level: int = 1
19
+ keep_images: bool = False
19
20
 
20
21
 
21
22
  def _wrap_html(title: str, body_html: str) -> str:
@@ -34,6 +35,26 @@ def _wrap_html(title: str, body_html: str) -> str:
34
35
  """
35
36
 
36
37
 
38
+ def _summarize_pandoc_warnings(stderr: str) -> str:
39
+ warnings = [line for line in stderr.splitlines() if line.startswith("[WARNING]")]
40
+ if not warnings:
41
+ return ""
42
+
43
+ resource = [w for w in warnings if "Could not fetch resource" in w]
44
+ duplicate = [w for w in warnings if "Duplicate identifier" in w]
45
+
46
+ parts: list[str] = []
47
+ parts.append(f"pandoc warnings: {len(warnings)} (use -v to see full output)")
48
+ if duplicate:
49
+ parts.append(f"- Duplicate identifier: {len(duplicate)} (usually safe; affects internal anchors)")
50
+ if resource:
51
+ parts.append(
52
+ f"- Missing resources: {len(resource)} (some images may be dropped; use --keep-images/-v to inspect)"
53
+ )
54
+
55
+ return "\n".join(parts)
56
+
57
+
37
58
  def build_epub2_with_pandoc(
38
59
  *,
39
60
  chapters: Iterable[Chapter],
@@ -43,6 +64,7 @@ def build_epub2_with_pandoc(
43
64
  language: str,
44
65
  publisher: str | None,
45
66
  identifier: str | None,
67
+ verbose: bool,
46
68
  options: PandocEpub2Options | None = None,
47
69
  ) -> Path:
48
70
  pandoc = shutil.which("pandoc")
@@ -61,7 +83,7 @@ def build_epub2_with_pandoc(
61
83
 
62
84
  html_files: list[Path] = []
63
85
  for ch in chapters:
64
- cleaned = clean_html_for_kindle_epub2(ch.html)
86
+ cleaned = clean_html_for_kindle_epub2(ch.html, keep_images=opts.keep_images)
65
87
  html_doc = _wrap_html(ch.title, cleaned)
66
88
  fp = tmp_path / f"chapter_{ch.index:04d}.html"
67
89
  fp.write_text(html_doc, encoding="utf-8")
@@ -81,14 +103,15 @@ def build_epub2_with_pandoc(
81
103
  "encoding=UTF-8",
82
104
  "--standalone",
83
105
  "--split-level",
84
- str(opts.chapter_level),
106
+ str(opts.split_level),
85
107
  ]
86
108
 
87
109
  if publisher:
88
110
  cmd.extend(["--metadata", f"publisher={publisher}"])
89
111
 
90
112
  if identifier:
91
- cmd.extend(["--epub-metadata", str(identifier)])
113
+ # Keep identifier stable for Kindle.
114
+ cmd.extend(["--metadata", f"identifier={identifier}"])
92
115
 
93
116
  if opts.toc:
94
117
  cmd.extend(["--toc", "--toc-depth", str(opts.toc_depth)])
@@ -96,6 +119,22 @@ def build_epub2_with_pandoc(
96
119
  cmd.extend(["-o", str(out_path)])
97
120
  cmd.extend([str(p) for p in html_files])
98
121
 
99
- subprocess.run(cmd, check=True)
122
+ proc = subprocess.run(
123
+ cmd,
124
+ stdout=subprocess.PIPE,
125
+ stderr=subprocess.PIPE,
126
+ text=True,
127
+ )
128
+
129
+ if proc.returncode != 0:
130
+ # On failure, always show stderr.
131
+ raise RuntimeError(f"pandoc failed (exit {proc.returncode}):\n{proc.stderr.strip()}")
132
+
133
+ if verbose and proc.stderr.strip():
134
+ print(proc.stderr.strip())
135
+ elif proc.stderr.strip():
136
+ summary = _summarize_pandoc_warnings(proc.stderr)
137
+ if summary:
138
+ print(summary)
100
139
 
101
140
  return out_path
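
The summary helper above only looks at stderr lines that pandoc prefixes with `[WARNING]` and buckets the two cases the CLI cares about. An illustration with made-up stderr text (the printed output follows the format strings in this diff):

```python
from docs2epub.pandoc_epub2 import _summarize_pandoc_warnings

stderr = "\n".join(
    [
        "[WARNING] Duplicate identifier 'intro' in chapter_0002.html",
        "[WARNING] Could not fetch resource https://example.com/fig.png",
        "other pandoc chatter that is not a warning",
    ]
)
print(_summarize_pandoc_warnings(stderr))
# pandoc warnings: 2 (use -v to see full output)
# - Duplicate identifier: 1 (usually safe; affects internal anchors)
# - Missing resources: 1 (some images may be dropped; use --keep-images/-v to inspect)
```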
@@ -0,0 +1,42 @@
1
+ from docs2epub.docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
2
+
3
+
4
+ def test_iter_docusaurus_next_falls_back_to_main_when_no_article(monkeypatch):
5
+ html = """
6
+ <!doctype html>
7
+ <html>
8
+ <body>
9
+ <main>
10
+ <div>
11
+ <h1>Overview</h1>
12
+ <p>Hello world</p>
13
+ </div>
14
+ </main>
15
+ </body>
16
+ </html>
17
+ """
18
+
19
+ class DummyResponse:
20
+ text = html
21
+
22
+ def raise_for_status(self) -> None:
23
+ return None
24
+
25
+ class DummySession:
26
+ def __init__(self) -> None:
27
+ self.headers = {}
28
+
29
+ def get(self, url: str, timeout: int = 30) -> DummyResponse:
30
+ return DummyResponse()
31
+
32
+ monkeypatch.setattr(
33
+ "docs2epub.docusaurus_next.requests.Session",
34
+ lambda: DummySession(),
35
+ )
36
+
37
+ options = DocusaurusNextOptions(start_url="https://example.com/docs", sleep_s=0)
38
+ chapters = iter_docusaurus_next(options)
39
+
40
+ assert len(chapters) == 1
41
+ assert chapters[0].title == "Overview"
42
+ assert "Hello world" in chapters[0].html
@@ -0,0 +1,102 @@
1
+ from docs2epub.docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
2
+
3
+
4
+ def _make_session(pages: dict[str, str]):
5
+ class DummyResponse:
6
+ def __init__(self, text: str) -> None:
7
+ self.text = text
8
+
9
+ def raise_for_status(self) -> None:
10
+ return None
11
+
12
+ class DummySession:
13
+ def __init__(self) -> None:
14
+ self.headers = {}
15
+
16
+ def get(self, url: str, timeout: int = 30) -> DummyResponse:
17
+ if url not in pages:
18
+ raise AssertionError(f"unexpected url fetch: {url}")
19
+ return DummyResponse(pages[url])
20
+
21
+ return DummySession
22
+
23
+
24
+ def test_iter_uses_gitbook_sidebar_links(monkeypatch):
25
+ start_url = "https://example.com/book/intro"
26
+ sidebar = """
27
+ <aside data-testid="table-of-contents">
28
+ <a href="/book/intro">Intro</a>
29
+ <a href="/book/chapter-1">Chapter 1</a>
30
+ </aside>
31
+ """
32
+ pages = {
33
+ start_url: f"<html><body>{sidebar}<main><h1>Intro</h1><p>Intro text</p></main></body></html>",
34
+ "https://example.com/book/chapter-1": f"<html><body>{sidebar}<main><h1>Chapter 1</h1><p>Ch1</p></main></body></html>",
35
+ }
36
+
37
+ monkeypatch.setattr(
38
+ "docs2epub.docusaurus_next.requests.Session",
39
+ lambda: _make_session(pages)(),
40
+ )
41
+
42
+ options = DocusaurusNextOptions(start_url=start_url, sleep_s=0)
43
+ chapters = iter_docusaurus_next(options)
44
+
45
+ assert [c.title for c in chapters] == ["Intro", "Chapter 1"]
46
+
47
+
48
+ def test_iter_uses_docusaurus_menu_sidebar(monkeypatch):
49
+ start_url = "https://example.com/docs/intro"
50
+ sidebar = """
51
+ <nav class="menu">
52
+ <a class="menu__link" href="/docs/intro">Intro</a>
53
+ <a class="menu__link" href="/docs/install">Install</a>
54
+ </nav>
55
+ """
56
+ pages = {
57
+ start_url: f"<html><body>{sidebar}<article><h1>Intro</h1><p>Intro text</p></article></body></html>",
58
+ "https://example.com/docs/install": f"<html><body>{sidebar}<article><h1>Install</h1><p>Install text</p></article></body></html>",
59
+ }
60
+
61
+ monkeypatch.setattr(
62
+ "docs2epub.docusaurus_next.requests.Session",
63
+ lambda: _make_session(pages)(),
64
+ )
65
+
66
+ options = DocusaurusNextOptions(start_url=start_url, sleep_s=0)
67
+ chapters = iter_docusaurus_next(options)
68
+
69
+ assert [c.title for c in chapters] == ["Intro", "Install"]
70
+
71
+
72
+ def test_iter_expands_links_from_index_pages(monkeypatch):
73
+ start_url = "https://example.com/docs/intro"
74
+ sidebar = """
75
+ <nav class="menu">
76
+ <a class="menu__link" href="/docs/intro">Intro</a>
77
+ <a class="menu__link" href="/docs/category/getting-started">Getting Started</a>
78
+ </nav>
79
+ """
80
+ pages = {
81
+ start_url: f"<html><body>{sidebar}<article><h1>Intro</h1><p>Intro text</p></article></body></html>",
82
+ "https://example.com/docs/category/getting-started": (
83
+ "<html><body>"
84
+ f"{sidebar}"
85
+ '<article><h1>Getting Started</h1>'
86
+ '<a href="/docs/one">One</a>'
87
+ '<a href="/docs/two">Two</a>'
88
+ "</article></body></html>"
89
+ ),
90
+ "https://example.com/docs/one": f"<html><body>{sidebar}<article><h1>One</h1><p>One text</p></article></body></html>",
91
+ "https://example.com/docs/two": f"<html><body>{sidebar}<article><h1>Two</h1><p>Two text</p></article></body></html>",
92
+ }
93
+
94
+ monkeypatch.setattr(
95
+ "docs2epub.docusaurus_next.requests.Session",
96
+ lambda: _make_session(pages)(),
97
+ )
98
+
99
+ options = DocusaurusNextOptions(start_url=start_url, sleep_s=0)
100
+ chapters = iter_docusaurus_next(options)
101
+
102
+ assert [c.title for c in chapters] == ["Intro", "Getting Started", "One", "Two"]
@@ -16,8 +16,17 @@ def test_build_epub3_smoke(tmp_path):
16
16
 
17
17
  def test_kindle_cleaner_strips_tabindex_and_ol_start():
18
18
  cleaned = clean_html_for_kindle_epub2(
19
- '<div tabindex="0"><ol start="2"><li><u>Hi</u></li></ol></div>'
19
+ '<div tabindex="0"><ol start="2"><li><u>Hi</u></li></ol></div>',
20
+ keep_images=False,
20
21
  )
21
22
  assert "tabindex" not in cleaned
22
23
  assert "start=" not in cleaned
23
24
  assert "underline" in cleaned
25
+
26
+
27
+ def test_kindle_cleaner_drops_remote_images_by_default():
28
+ cleaned = clean_html_for_kindle_epub2(
29
+ '<p>x</p><img src="https://example.com/a.png" /><p>y</p>',
30
+ keep_images=False,
31
+ )
32
+ assert "img" not in cleaned
@@ -166,7 +166,7 @@ wheels = [
166
166
 
167
167
  [[package]]
168
168
  name = "docs2epub"
169
- version = "0.1.0"
169
+ version = "0.1.5"
170
170
  source = { editable = "." }
171
171
  dependencies = [
172
172
  { name = "beautifulsoup4" },
@@ -1,140 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- from dataclasses import dataclass
5
- from urllib.parse import urljoin
6
-
7
- import requests
8
- from bs4 import BeautifulSoup, Tag
9
-
10
- from .model import Chapter
11
-
12
-
13
- DEFAULT_USER_AGENT = "docs2epub/0.1 (+https://github.com/brenorb/docs2epub)"
14
-
15
-
16
- @dataclass(frozen=True)
17
- class DocusaurusNextOptions:
18
- start_url: str
19
- base_url: str | None = None
20
- max_pages: int | None = None
21
- sleep_s: float = 0.5
22
- user_agent: str = DEFAULT_USER_AGENT
23
-
24
-
25
- def _slugify_filename(text: str) -> str:
26
- value = text.strip().lower()
27
- value = re.sub(r"[^\w\s-]", "", value)
28
- value = re.sub(r"[\s_-]+", "-", value)
29
- value = value.strip("-")
30
- return value or "chapter"
31
-
32
-
33
- def _extract_article(soup: BeautifulSoup) -> Tag:
34
- article = soup.find("article")
35
- if article:
36
- return article
37
- main = soup.find("main")
38
- if main:
39
- article = main.find("article")
40
- if article:
41
- return article
42
- raise RuntimeError("Could not find <article> in page HTML")
43
-
44
-
45
- def _remove_unwanted(article: Tag) -> None:
46
- for selector in [
47
- 'nav[aria-label="Breadcrumbs"]',
48
- 'nav[aria-label="Docs pages"]',
49
- "div.theme-doc-footer",
50
- "div.theme-doc-footer-edit-meta-row",
51
- "div.theme-doc-version-badge",
52
- "script",
53
- "style",
54
- "noscript",
55
- "iframe",
56
- "svg",
57
- "button",
58
- ]:
59
- for el in list(article.select(selector)):
60
- el.decompose()
61
-
62
-
63
- def _absolutize_urls(container: Tag, base_url: str) -> None:
64
- for el in container.find_all(True):
65
- if el.has_attr("href"):
66
- href = str(el.get("href") or "")
67
- if href.startswith("/"):
68
- el["href"] = urljoin(base_url, href)
69
- if el.has_attr("src"):
70
- src = str(el.get("src") or "")
71
- if src.startswith("/"):
72
- el["src"] = urljoin(base_url, src)
73
-
74
-
75
- def _extract_next_url(soup: BeautifulSoup, base_url: str) -> str | None:
76
- nav = soup.select_one('nav[aria-label="Docs pages"]')
77
- if not nav:
78
- return None
79
-
80
- for a in nav.find_all("a", href=True):
81
- text = " ".join(a.get_text(" ", strip=True).split())
82
- if text.lower().startswith("next"):
83
- return urljoin(base_url, a["href"])
84
-
85
- return None
86
-
87
-
88
- def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
89
- session = requests.Session()
90
- session.headers.update({"User-Agent": options.user_agent})
91
-
92
- url = options.start_url
93
- base_url = options.base_url or options.start_url
94
-
95
- visited: set[str] = set()
96
- chapters: list[Chapter] = []
97
-
98
- idx = 1
99
- while True:
100
- if options.max_pages is not None and idx > options.max_pages:
101
- break
102
-
103
- if url in visited:
104
- break
105
- visited.add(url)
106
-
107
- resp = session.get(url, timeout=30)
108
- resp.raise_for_status()
109
-
110
- soup = BeautifulSoup(resp.text, "lxml")
111
- article = _extract_article(soup)
112
-
113
- title_el = article.find(["h1", "h2"])
114
- title = (
115
- " ".join(title_el.get_text(" ", strip=True).split()) if title_el else f"Chapter {idx}"
116
- )
117
-
118
- _remove_unwanted(article)
119
- _absolutize_urls(article, base_url=base_url)
120
-
121
- for a in list(article.select('a.hash-link[href^="#"]')):
122
- a.decompose()
123
-
124
- html = article.decode_contents()
125
-
126
- chapters.append(Chapter(index=idx, title=title, url=url, html=html))
127
-
128
- next_url = _extract_next_url(soup, base_url=base_url)
129
- if not next_url:
130
- break
131
-
132
- url = next_url
133
- idx += 1
134
-
135
- if options.sleep_s > 0:
136
- import time
137
-
138
- time.sleep(options.sleep_s)
139
-
140
- return chapters