docs2epub 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
# Continuous integration: run the test suite on every push to main and on PRs.
name: CI

on:
  push:
    branches: [main]
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # uv drives dependency installation and test execution below.
      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      # --extra dev pulls in pytest / pytest-cov from [project.optional-dependencies].
      - name: Install deps
        run: uv sync --extra dev

      - name: Run tests
        run: uv run pytest
@@ -0,0 +1,33 @@
# Release: build the distribution and upload it to PyPI when a v* tag is pushed.
name: Publish

on:
  push:
    tags:
      - "v*"

jobs:
  publish:
    runs-on: ubuntu-latest
    # Read-only repo access; publishing authenticates via the PyPI token below.
    permissions:
      contents: read

    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      # Produces sdist + wheel under dist/.
      - name: Build
        run: uv build

      - name: Publish to PyPI
        env:
          # API token stored as a repository secret.
          UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
        run: uv publish
@@ -0,0 +1,8 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .dist/
5
+ dist/
6
+ .venv/
7
+ .pytest_cache/
8
+ .DS_Store
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: docs2epub
3
+ Version: 0.1.0
4
+ Summary: Turn documentation sites into an EPUB (Kindle-friendly).
5
+ Author: Breno Brito
6
+ License: MIT
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: beautifulsoup4>=4.14.3
9
+ Requires-Dist: ebooklib>=0.20
10
+ Requires-Dist: lxml>=6.0.2
11
+ Requires-Dist: requests>=2.32.5
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
14
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
15
+ Description-Content-Type: text/markdown
16
+
17
+ # docs2epub
18
+
19
+ Turn documentation sites into an EPUB (Kindle-friendly).
20
+
21
+ Initial focus: Docusaurus sites that expose a **Next** button (docs navigation).
22
+
23
+ ## Install (dev)
24
+
25
+ This project uses Python 3.12+.
26
+
27
+ ```bash
28
+ uv sync
29
+ uv run docs2epub --help
30
+ ```
31
+
32
+ ## Usage
33
+
34
+ ### Docusaurus “Next” crawl
35
+
36
+ ```bash
37
+ # Default output is EPUB2 (Kindle-friendly) via pandoc
38
+ uv run docs2epub \
39
+ --start-url "https://www.techinterviewhandbook.org/software-engineering-interview-guide/" \
40
+ --out "dist/tech-interview-handbook.epub" \
41
+ --title "Tech Interview Handbook" \
42
+ --author "Yangshun Tay"
43
+
44
+ # Optional: build EPUB3 (ebooklib)
45
+ uv run docs2epub \
46
+ --format epub3 \
47
+ --start-url "https://www.techinterviewhandbook.org/software-engineering-interview-guide/" \
48
+ --out "dist/tech-interview-handbook.epub" \
49
+ --title "Tech Interview Handbook" \
50
+ --author "Yangshun Tay"
51
+ ```
52
+
53
+ ## Roadmap
54
+
55
+ - Add additional discovery strategies: `sitemap.xml`, sidebar parsing, and explicit link lists.
56
+ - Optional: send-to-kindle (email), once Gmail auth is set up.
@@ -0,0 +1,40 @@
1
+ # docs2epub
2
+
3
+ Turn documentation sites into an EPUB (Kindle-friendly).
4
+
5
+ Initial focus: Docusaurus sites that expose a **Next** button (docs navigation).
6
+
7
+ ## Install (dev)
8
+
9
+ This project uses Python 3.12+.
10
+
11
+ ```bash
12
+ uv sync
13
+ uv run docs2epub --help
14
+ ```
15
+
16
+ ## Usage
17
+
18
+ ### Docusaurus “Next” crawl
19
+
20
+ ```bash
21
+ # Default output is EPUB2 (Kindle-friendly) via pandoc
22
+ uv run docs2epub \
23
+ --start-url "https://www.techinterviewhandbook.org/software-engineering-interview-guide/" \
24
+ --out "dist/tech-interview-handbook.epub" \
25
+ --title "Tech Interview Handbook" \
26
+ --author "Yangshun Tay"
27
+
28
+ # Optional: build EPUB3 (ebooklib)
29
+ uv run docs2epub \
30
+ --format epub3 \
31
+ --start-url "https://www.techinterviewhandbook.org/software-engineering-interview-guide/" \
32
+ --out "dist/tech-interview-handbook.epub" \
33
+ --title "Tech Interview Handbook" \
34
+ --author "Yangshun Tay"
35
+ ```
36
+
37
+ ## Roadmap
38
+
39
+ - Add additional discovery strategies: `sitemap.xml`, sidebar parsing, and explicit link lists.
40
+ - Optional: send-to-kindle (email), once Gmail auth is set up.
@@ -0,0 +1,36 @@
# Packaging metadata for docs2epub (PEP 621).
[project]
name = "docs2epub"
version = "0.1.0"
description = "Turn documentation sites into an EPUB (Kindle-friendly)."
readme = "README.md"
requires-python = ">=3.12"
license = { text = "MIT" }
authors = [
    { name = "Breno Brito" },
]
# Runtime deps: scraping (requests + beautifulsoup4/lxml) and EPUB3 output (ebooklib).
dependencies = [
    "beautifulsoup4>=4.14.3",
    "ebooklib>=0.20",
    "lxml>=6.0.2",
    "requests>=2.32.5",
]

# Console entry point: `docs2epub` invokes docs2epub.cli:main.
[project.scripts]
docs2epub = "docs2epub.cli:main"

[project.optional-dependencies]
dev = [
    "pytest>=8.0.0",
    "pytest-cov>=4.1.0",
]

[build-system]
requires = ["hatchling>=1.26.0"]
build-backend = "hatchling.build"

# Tell uv this project is an installable package, not just a script workspace.
[tool.uv]
package = true

[tool.pytest.ini_options]
addopts = "-q"
testpaths = ["tests"]
@@ -0,0 +1,5 @@
1
+ __all__ = [
2
+ "__version__",
3
+ ]
4
+
5
+ __version__ = "0.1.0"
@@ -0,0 +1,95 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ from .docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
7
+ from .epub import EpubMetadata, build_epub
8
+ from .pandoc_epub2 import build_epub2_with_pandoc
9
+
10
+
11
+ def _build_parser() -> argparse.ArgumentParser:
12
+ p = argparse.ArgumentParser(
13
+ prog="docs2epub",
14
+ description="Turn documentation sites into an EPUB (Kindle-friendly).",
15
+ )
16
+
17
+ p.add_argument(
18
+ "--start-url",
19
+ required=True,
20
+ help="Starting URL for scraping (initially: Docusaurus docs page).",
21
+ )
22
+ p.add_argument(
23
+ "--base-url",
24
+ default=None,
25
+ help="Base URL used to resolve relative links (defaults to start-url).",
26
+ )
27
+ p.add_argument("--max-pages", type=int, default=None)
28
+ p.add_argument("--sleep-s", type=float, default=0.5)
29
+
30
+ p.add_argument("--title", required=True)
31
+ p.add_argument("--author", required=True)
32
+ p.add_argument("--language", default="en")
33
+ p.add_argument("--identifier", default=None)
34
+ p.add_argument("--publisher", default=None)
35
+
36
+ p.add_argument(
37
+ "--format",
38
+ default="epub2",
39
+ choices=["epub2", "epub3"],
40
+ help="Output format. Default: epub2 (Kindle-friendly).",
41
+ )
42
+
43
+ p.add_argument(
44
+ "--out",
45
+ required=True,
46
+ help="Output EPUB file path.",
47
+ )
48
+
49
+ return p
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: crawl the docs site and write an EPUB.

    Returns 0 on success. Exits via SystemExit when no chapters could be
    scraped from the start URL.
    """
    args = _build_parser().parse_args(argv)

    chapters = iter_docusaurus_next(
        DocusaurusNextOptions(
            start_url=args.start_url,
            base_url=args.base_url,
            max_pages=args.max_pages,
            sleep_s=args.sleep_s,
        )
    )
    if not chapters:
        raise SystemExit("No chapters scraped (did not find article content).")

    out_file = Path(args.out)

    if args.format == "epub2":
        # Kindle-friendly EPUB2, produced via pandoc.
        out_path = build_epub2_with_pandoc(
            chapters=chapters,
            out_file=out_file,
            title=args.title,
            author=args.author,
            language=args.language,
            publisher=args.publisher,
            identifier=args.identifier,
        )
    else:
        # EPUB3, produced via ebooklib.
        out_path = build_epub(
            chapters=chapters,
            out_file=out_file,
            meta=EpubMetadata(
                title=args.title,
                author=args.author,
                language=args.language,
                identifier=args.identifier,
                publisher=args.publisher,
            ),
        )

    print(f"Scraped {len(chapters)} pages")
    print(f"EPUB written to: {out_path.resolve()}")
    return 0
@@ -0,0 +1,140 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from urllib.parse import urljoin
6
+
7
+ import requests
8
+ from bs4 import BeautifulSoup, Tag
9
+
10
+ from .model import Chapter
11
+
12
+
# Identifies the crawler to servers and points back at the project repo.
DEFAULT_USER_AGENT = "docs2epub/0.1 (+https://github.com/brenorb/docs2epub)"


@dataclass(frozen=True)
class DocusaurusNextOptions:
    """Configuration for crawling a Docusaurus site via its "Next" link."""

    # First docs page to fetch.
    start_url: str
    # Base for resolving root-relative links; falls back to start_url.
    base_url: str | None = None
    # Stop after this many pages; None means unlimited.
    max_pages: int | None = None
    # Politeness delay between page fetches, in seconds.
    sleep_s: float = 0.5
    user_agent: str = DEFAULT_USER_AGENT
23
+
24
+
25
+ def _slugify_filename(text: str) -> str:
26
+ value = text.strip().lower()
27
+ value = re.sub(r"[^\w\s-]", "", value)
28
+ value = re.sub(r"[\s_-]+", "-", value)
29
+ value = value.strip("-")
30
+ return value or "chapter"
31
+
32
+
def _extract_article(soup: BeautifulSoup) -> Tag:
    """Return the page's <article> element.

    Raises:
        RuntimeError: if the document contains no <article> tag.
    """
    article = soup.find("article")
    if article:
        return article
    # NOTE(review): the original also fell back to searching <main> for an
    # <article>, but soup.find("article") already scans the entire document
    # (including <main>), so that branch could never match and was removed.
    raise RuntimeError("Could not find <article> in page HTML")
43
+
44
+
# CSS selectors for site chrome and non-content tags that should not end
# up in the generated book.
_UNWANTED_SELECTORS = (
    'nav[aria-label="Breadcrumbs"]',
    'nav[aria-label="Docs pages"]',
    "div.theme-doc-footer",
    "div.theme-doc-footer-edit-meta-row",
    "div.theme-doc-version-badge",
    "script",
    "style",
    "noscript",
    "iframe",
    "svg",
    "button",
)


def _remove_unwanted(article: Tag) -> None:
    """Strip navigation, footers, and non-content elements from *article* in place."""
    for selector in _UNWANTED_SELECTORS:
        for element in list(article.select(selector)):
            element.decompose()
61
+
62
+
def _absolutize_urls(container: Tag, base_url: str) -> None:
    """Rewrite root-relative href/src attributes in *container* to absolute URLs.

    Only values starting with "/" are rewritten; absolute URLs, fragments,
    and purely relative paths are left untouched.
    """
    for element in container.find_all(True):
        for attr in ("href", "src"):
            if not element.has_attr(attr):
                continue
            value = str(element.get(attr) or "")
            if value.startswith("/"):
                element[attr] = urljoin(base_url, value)
73
+
74
+
def _extract_next_url(soup: BeautifulSoup, base_url: str) -> str | None:
    """Return the absolute URL of the docs-pagination "Next" link, if any."""
    nav = soup.select_one('nav[aria-label="Docs pages"]')
    if not nav:
        return None

    for anchor in nav.find_all("a", href=True):
        # Normalize whitespace so labels like "Next\nIntro" still match.
        label = " ".join(anchor.get_text(" ", strip=True).split())
        if label.lower().startswith("next"):
            return urljoin(base_url, anchor["href"])

    return None
86
+
87
+
def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
    """Crawl a Docusaurus docs site by following the "Next" pagination link.

    Starting at ``options.start_url``, fetch each page, extract its
    <article> as a chapter, and continue until there is no "Next" link,
    a page repeats (loop guard), or ``options.max_pages`` is reached.

    Returns the chapters in reading order.
    """
    session = requests.Session()
    session.headers.update({"User-Agent": options.user_agent})

    base_url = options.base_url or options.start_url
    current_url = options.start_url

    seen_urls: set[str] = set()
    chapters: list[Chapter] = []
    chapter_no = 1

    while True:
        if options.max_pages is not None and chapter_no > options.max_pages:
            break
        if current_url in seen_urls:
            break  # cycle detected; stop instead of looping forever
        seen_urls.add(current_url)

        response = session.get(current_url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "lxml")
        article = _extract_article(soup)

        # Read the title before cleanup in case the heading gets stripped.
        heading = article.find(["h1", "h2"])
        if heading:
            title = " ".join(heading.get_text(" ", strip=True).split())
        else:
            title = f"Chapter {chapter_no}"

        _remove_unwanted(article)
        _absolutize_urls(article, base_url=base_url)

        # Docusaurus "#" anchor links next to headings are noise in an ebook.
        for anchor in list(article.select('a.hash-link[href^="#"]')):
            anchor.decompose()

        chapters.append(
            Chapter(
                index=chapter_no,
                title=title,
                url=current_url,
                html=article.decode_contents(),
            )
        )

        next_url = _extract_next_url(soup, base_url=base_url)
        if not next_url:
            break

        current_url = next_url
        chapter_no += 1

        # Politeness delay before fetching the next page.
        if options.sleep_s > 0:
            import time

            time.sleep(options.sleep_s)

    return chapters
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+
3
+ import uuid
4
+ from dataclasses import dataclass
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+ from typing import Iterable
8
+
9
+ from bs4 import BeautifulSoup
10
+ from ebooklib import epub
11
+
12
+ from .model import Chapter
13
+
14
+
# Stylesheet embedded in the generated EPUB: readable body text, wrapped and
# boxed code blocks, and a thin rule used as the chapter separator (the
# .chapter-sep div inserted by build_epub).
EPUB_CSS = """
body {
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
  line-height: 1.55;
}
pre, code {
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
}
pre {
  white-space: pre-wrap;
  word-wrap: break-word;
  padding: 0.75rem;
  border: 1px solid #ddd;
  border-radius: 0.5rem;
  background: #f7f7f7;
}
.chapter-sep {
  margin-top: 1.75rem;
  border-top: 1px solid #ddd;
}
"""
36
+
37
+
@dataclass(frozen=True)
class EpubMetadata:
    """Book-level metadata written into the generated EPUB.

    When left as None, ``identifier`` defaults to a random urn:uuid and
    ``created_at`` to the current UTC time (see build_epub).
    """

    title: str
    author: str
    language: str = "en"
    # Unique book identifier, e.g. "urn:uuid:..." or an ISBN.
    identifier: str | None = None
    publisher: str | None = None
    # Written as the Dublin Core "date" metadata entry.
    created_at: datetime | None = None
46
+
47
+
def _extract_body_inner_html(html: str) -> str:
    """Return the inner HTML of <body>, or *html* unchanged if there is none.

    lxml wraps fragments in an <html><body>...</body></html> shell; this
    unwraps it back to the original fragment.
    """
    parsed = BeautifulSoup(html, "lxml")
    body = parsed.find("body")
    return body.decode_contents() if body else html
54
+
55
+
def _strip_first_h1(html_fragment: str) -> str:
    """Remove the first <h1> from *html_fragment*.

    Used because build_epub re-inserts the chapter title as a uniform
    heading, so the scraped one would be duplicated.
    """
    soup = BeautifulSoup(html_fragment, "lxml")
    heading = soup.find("h1")
    if heading:
        heading.decompose()
    body = soup.find("body")
    return body.decode_contents() if body else str(soup)
65
+
66
+
def build_epub(
    *,
    chapters: Iterable[Chapter],
    out_file: str | Path,
    meta: EpubMetadata,
) -> Path:
    """Write *chapters* to an EPUB file at *out_file* using ebooklib.

    Creates parent directories as needed and returns the output path.
    """
    out_path = Path(out_file)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    book = epub.EpubBook()

    # Fall back to a random urn:uuid so the book always has an identifier.
    identifier = meta.identifier or f"urn:uuid:{uuid.uuid4()}"
    book.set_identifier(identifier)
    book.set_title(meta.title)
    book.set_language(meta.language)
    book.add_author(meta.author)

    if meta.publisher:
        book.add_metadata("DC", "publisher", meta.publisher)

    created_at = meta.created_at or datetime.now(timezone.utc)
    book.add_metadata("DC", "date", created_at.isoformat())

    # Shared stylesheet; linked into each chapter item below.
    style_item = epub.EpubItem(
        uid="style_nav",
        file_name="style/style.css",
        media_type="text/css",
        content=EPUB_CSS.encode("utf-8"),
    )
    book.add_item(style_item)

    chapter_items: list[epub.EpubHtml] = []
    toc_items: list[epub.Link] = []

    for ch in chapters:
        # Unwrap lxml's <html><body> shell, then drop the scraped <h1>
        # because the title is re-inserted as a uniform heading just below.
        body_inner = _extract_body_inner_html(ch.html)
        body_inner = _strip_first_h1(body_inner)

        content = f"""<h1>{ch.title}</h1>
    <div class=\"chapter-sep\"></div>
    {body_inner}
    """

        item = epub.EpubHtml(
            title=ch.title,
            file_name=f"chap_{ch.index:03d}.xhtml",
            lang=meta.language,
        )
        item.content = content
        # Links the shared CSS into this chapter's XHTML.
        item.add_item(style_item)

        book.add_item(item)
        chapter_items.append(item)
        toc_items.append(epub.Link(item.file_name, ch.title, f"chap_{ch.index:03d}"))

    book.toc = tuple(toc_items)
    # "nav" first so readers see the generated table-of-contents page.
    book.spine = ["nav", *chapter_items]

    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    epub.write_epub(str(out_path), book, {})
    return out_path
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+
def clean_html_for_kindle_epub2(html_fragment: str) -> str:
    """Best-effort HTML cleanup for Kindle-friendly EPUB2.

    This is intentionally conservative: it strips known-problematic attributes
    and tags that commonly cause Send-to-Kindle conversion issues.
    """

    soup = BeautifulSoup(html_fragment, "lxml")

    # EPUB2: <u> tag isn't consistently supported; convert to a span.
    for u in list(soup.find_all("u")):
        span = soup.new_tag("span")
        span["style"] = "text-decoration: underline;"
        if u.string is not None:
            # Text-only <u>: copy the text.
            span.string = u.string
        else:
            # Nested markup: move the children into the span.
            # BUGFIX: the original first set span.string = u.get_text() and
            # then ALSO appended u's children, duplicating the text content.
            for child in list(u.contents):
                span.append(child)
        u.replace_with(span)

    # Remove tabindex attributes (not allowed in EPUB2 XHTML).
    for el in soup.find_all(attrs={"tabindex": True}):
        try:
            del el["tabindex"]
        except KeyError:
            pass

    # Remove start attribute from ordered lists (not allowed in EPUB2 XHTML).
    for ol in soup.find_all("ol"):
        if ol.has_attr("start"):
            del ol["start"]

    # Strip duplicate ids in a simple way: if an id repeats, rename it.
    seen_ids: set[str] = set()
    for el in soup.find_all(attrs={"id": True}):
        raw = str(el.get("id") or "").strip()
        if not raw:
            continue
        if raw not in seen_ids:
            seen_ids.add(raw)
            continue
        suffix = 2
        new_id = f"{raw}-{suffix}"
        while new_id in seen_ids:
            suffix += 1
            new_id = f"{raw}-{suffix}"
        el["id"] = new_id
        seen_ids.add(new_id)

    # Remove empty fragment links that point to missing ids (best-effort).
    # If href="#something" but no element has id="something", drop href.
    all_ids = {str(el.get("id")) for el in soup.find_all(attrs={"id": True})}
    for a in soup.find_all("a", href=True):
        href = str(a.get("href") or "")
        if href.startswith("#") and len(href) > 1:
            frag = href[1:]
            if frag not in all_ids:
                del a["href"]

    # Normalize weird whitespace artifacts.
    # NOTE(review): this also collapses newlines inside <pre> blocks, losing
    # code formatting — consider exempting <pre> content. Kept as-is (apart
    # from the <u> fix above) to preserve existing output.
    text = str(soup)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
@@ -0,0 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
@dataclass(frozen=True)
class Chapter:
    """One scraped documentation page, in reading order."""

    # 1-based position of the chapter in the book.
    index: int
    # Page heading, or a "Chapter N" fallback when no heading was found.
    title: str
    # URL the content was fetched from.
    url: str
    # Cleaned inner HTML of the page's <article> element.
    html: str