docs2epub 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs2epub-0.1.0/.github/workflows/ci.yml +28 -0
- docs2epub-0.1.0/.github/workflows/publish.yml +33 -0
- docs2epub-0.1.0/.gitignore +8 -0
- docs2epub-0.1.0/PKG-INFO +56 -0
- docs2epub-0.1.0/README.md +40 -0
- docs2epub-0.1.0/pyproject.toml +36 -0
- docs2epub-0.1.0/src/docs2epub/__init__.py +5 -0
- docs2epub-0.1.0/src/docs2epub/cli.py +95 -0
- docs2epub-0.1.0/src/docs2epub/docusaurus_next.py +140 -0
- docs2epub-0.1.0/src/docs2epub/epub.py +129 -0
- docs2epub-0.1.0/src/docs2epub/kindle_html.py +70 -0
- docs2epub-0.1.0/src/docs2epub/model.py +11 -0
- docs2epub-0.1.0/src/docs2epub/pandoc_epub2.py +101 -0
- docs2epub-0.1.0/tests/test_smoke.py +23 -0
- docs2epub-0.1.0/uv.lock +412 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v5
|
|
16
|
+
with:
|
|
17
|
+
version: "latest"
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.12"
|
|
23
|
+
|
|
24
|
+
- name: Install deps
|
|
25
|
+
run: uv sync --extra dev
|
|
26
|
+
|
|
27
|
+
- name: Run tests
|
|
28
|
+
run: uv run pytest
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Install uv
|
|
18
|
+
uses: astral-sh/setup-uv@v5
|
|
19
|
+
with:
|
|
20
|
+
version: "latest"
|
|
21
|
+
|
|
22
|
+
- name: Set up Python
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: "3.12"
|
|
26
|
+
|
|
27
|
+
- name: Build
|
|
28
|
+
run: uv build
|
|
29
|
+
|
|
30
|
+
- name: Publish to PyPI
|
|
31
|
+
env:
|
|
32
|
+
UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
|
33
|
+
run: uv publish
|
docs2epub-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docs2epub
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Turn documentation sites into an EPUB (Kindle-friendly).
|
|
5
|
+
Author: Breno Brito
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: beautifulsoup4>=4.14.3
|
|
9
|
+
Requires-Dist: ebooklib>=0.20
|
|
10
|
+
Requires-Dist: lxml>=6.0.2
|
|
11
|
+
Requires-Dist: requests>=2.32.5
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
|
|
14
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# docs2epub
|
|
18
|
+
|
|
19
|
+
Turn documentation sites into an EPUB (Kindle-friendly).
|
|
20
|
+
|
|
21
|
+
Initial focus: Docusaurus sites that expose a **Next** button (docs navigation).
|
|
22
|
+
|
|
23
|
+
## Install (dev)
|
|
24
|
+
|
|
25
|
+
This project uses Python 3.12+.
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv sync
|
|
29
|
+
uv run docs2epub --help
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
### Docusaurus “Next” crawl
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
# Default output is EPUB2 (Kindle-friendly) via pandoc
|
|
38
|
+
uv run docs2epub \
|
|
39
|
+
--start-url "https://www.techinterviewhandbook.org/software-engineering-interview-guide/" \
|
|
40
|
+
--out "dist/tech-interview-handbook.epub" \
|
|
41
|
+
--title "Tech Interview Handbook" \
|
|
42
|
+
--author "Yangshun Tay"
|
|
43
|
+
|
|
44
|
+
# Optional: build EPUB3 (ebooklib)
|
|
45
|
+
uv run docs2epub \
|
|
46
|
+
--format epub3 \
|
|
47
|
+
--start-url "https://www.techinterviewhandbook.org/software-engineering-interview-guide/" \
|
|
48
|
+
--out "dist/tech-interview-handbook.epub" \
|
|
49
|
+
--title "Tech Interview Handbook" \
|
|
50
|
+
--author "Yangshun Tay"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Roadmap
|
|
54
|
+
|
|
55
|
+
- Add additional discovery strategies: `sitemap.xml`, sidebar parsing, and explicit link lists.
|
|
56
|
+
- Optional: send-to-kindle (email), once Gmail auth is set up.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# docs2epub
|
|
2
|
+
|
|
3
|
+
Turn documentation sites into an EPUB (Kindle-friendly).
|
|
4
|
+
|
|
5
|
+
Initial focus: Docusaurus sites that expose a **Next** button (docs navigation).
|
|
6
|
+
|
|
7
|
+
## Install (dev)
|
|
8
|
+
|
|
9
|
+
This project uses Python 3.12+.
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
uv sync
|
|
13
|
+
uv run docs2epub --help
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
### Docusaurus “Next” crawl
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# Default output is EPUB2 (Kindle-friendly) via pandoc
|
|
22
|
+
uv run docs2epub \
|
|
23
|
+
--start-url "https://www.techinterviewhandbook.org/software-engineering-interview-guide/" \
|
|
24
|
+
--out "dist/tech-interview-handbook.epub" \
|
|
25
|
+
--title "Tech Interview Handbook" \
|
|
26
|
+
--author "Yangshun Tay"
|
|
27
|
+
|
|
28
|
+
# Optional: build EPUB3 (ebooklib)
|
|
29
|
+
uv run docs2epub \
|
|
30
|
+
--format epub3 \
|
|
31
|
+
--start-url "https://www.techinterviewhandbook.org/software-engineering-interview-guide/" \
|
|
32
|
+
--out "dist/tech-interview-handbook.epub" \
|
|
33
|
+
--title "Tech Interview Handbook" \
|
|
34
|
+
--author "Yangshun Tay"
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Roadmap
|
|
38
|
+
|
|
39
|
+
- Add additional discovery strategies: `sitemap.xml`, sidebar parsing, and explicit link lists.
|
|
40
|
+
- Optional: send-to-kindle (email), once Gmail auth is set up.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "docs2epub"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Turn documentation sites into an EPUB (Kindle-friendly)."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Breno Brito" },
|
|
10
|
+
]
|
|
11
|
+
dependencies = [
|
|
12
|
+
"beautifulsoup4>=4.14.3",
|
|
13
|
+
"ebooklib>=0.20",
|
|
14
|
+
"lxml>=6.0.2",
|
|
15
|
+
"requests>=2.32.5",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
docs2epub = "docs2epub.cli:main"
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
dev = [
|
|
23
|
+
"pytest>=8.0.0",
|
|
24
|
+
"pytest-cov>=4.1.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["hatchling>=1.26.0"]
|
|
29
|
+
build-backend = "hatchling.build"
|
|
30
|
+
|
|
31
|
+
[tool.uv]
|
|
32
|
+
package = true
|
|
33
|
+
|
|
34
|
+
[tool.pytest.ini_options]
|
|
35
|
+
addopts = "-q"
|
|
36
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
|
|
7
|
+
from .epub import EpubMetadata, build_epub
|
|
8
|
+
from .pandoc_epub2 import build_epub2_with_pandoc
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _build_parser() -> argparse.ArgumentParser:
|
|
12
|
+
p = argparse.ArgumentParser(
|
|
13
|
+
prog="docs2epub",
|
|
14
|
+
description="Turn documentation sites into an EPUB (Kindle-friendly).",
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
p.add_argument(
|
|
18
|
+
"--start-url",
|
|
19
|
+
required=True,
|
|
20
|
+
help="Starting URL for scraping (initially: Docusaurus docs page).",
|
|
21
|
+
)
|
|
22
|
+
p.add_argument(
|
|
23
|
+
"--base-url",
|
|
24
|
+
default=None,
|
|
25
|
+
help="Base URL used to resolve relative links (defaults to start-url).",
|
|
26
|
+
)
|
|
27
|
+
p.add_argument("--max-pages", type=int, default=None)
|
|
28
|
+
p.add_argument("--sleep-s", type=float, default=0.5)
|
|
29
|
+
|
|
30
|
+
p.add_argument("--title", required=True)
|
|
31
|
+
p.add_argument("--author", required=True)
|
|
32
|
+
p.add_argument("--language", default="en")
|
|
33
|
+
p.add_argument("--identifier", default=None)
|
|
34
|
+
p.add_argument("--publisher", default=None)
|
|
35
|
+
|
|
36
|
+
p.add_argument(
|
|
37
|
+
"--format",
|
|
38
|
+
default="epub2",
|
|
39
|
+
choices=["epub2", "epub3"],
|
|
40
|
+
help="Output format. Default: epub2 (Kindle-friendly).",
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
p.add_argument(
|
|
44
|
+
"--out",
|
|
45
|
+
required=True,
|
|
46
|
+
help="Output EPUB file path.",
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
return p
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: crawl a docs site and write an EPUB.

    Parses CLI arguments, crawls the site by following Docusaurus "Next"
    links, then builds either an EPUB2 (via pandoc, the default) or an
    EPUB3 (via ebooklib) file.

    Args:
        argv: Argument list for testing; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit code (0 on success).

    Raises:
        SystemExit: if the crawl yields no chapters.
    """
    args = _build_parser().parse_args(argv)

    options = DocusaurusNextOptions(
        start_url=args.start_url,
        base_url=args.base_url,
        max_pages=args.max_pages,
        sleep_s=args.sleep_s,
    )

    chapters = iter_docusaurus_next(options)
    if not chapters:
        raise SystemExit("No chapters scraped (did not find article content).")

    out_path: Path

    # epub2 goes through pandoc (Kindle-friendly); epub3 through ebooklib.
    if args.format == "epub2":
        out_path = build_epub2_with_pandoc(
            chapters=chapters,
            out_file=Path(args.out),
            title=args.title,
            author=args.author,
            language=args.language,
            publisher=args.publisher,
            identifier=args.identifier,
        )
    else:
        meta = EpubMetadata(
            title=args.title,
            author=args.author,
            language=args.language,
            identifier=args.identifier,
            publisher=args.publisher,
        )

        out_path = build_epub(
            chapters=chapters,
            out_file=Path(args.out),
            meta=meta,
        )

    print(f"Scraped {len(chapters)} pages")
    print(f"EPUB written to: {out_path.resolve()}")
    return 0
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from urllib.parse import urljoin
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
from bs4 import BeautifulSoup, Tag
|
|
9
|
+
|
|
10
|
+
from .model import Chapter
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
DEFAULT_USER_AGENT = "docs2epub/0.1 (+https://github.com/brenorb/docs2epub)"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class DocusaurusNextOptions:
    """Options for the "follow the Next button" Docusaurus crawl."""

    # Page the crawl begins on.
    start_url: str
    # Base for resolving root-relative links; defaults to start_url when None.
    base_url: str | None = None
    # Hard cap on pages crawled; None means crawl until "Next" disappears.
    max_pages: int | None = None
    # Delay between requests, in seconds (politeness throttle).
    sleep_s: float = 0.5
    # Sent as the User-Agent header on every request.
    user_agent: str = DEFAULT_USER_AGENT
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _slugify_filename(text: str) -> str:
|
|
26
|
+
value = text.strip().lower()
|
|
27
|
+
value = re.sub(r"[^\w\s-]", "", value)
|
|
28
|
+
value = re.sub(r"[\s_-]+", "-", value)
|
|
29
|
+
value = value.strip("-")
|
|
30
|
+
return value or "chapter"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _extract_article(soup: BeautifulSoup) -> Tag:
    """Return the main <article> element of a parsed page.

    ``soup.find("article")`` searches the whole document, which already covers
    an <article> nested inside <main> — the previous fallback that re-searched
    <main> could never find anything the first lookup missed, so it was dead
    code and has been removed.

    Raises:
        RuntimeError: if the page has no (non-empty) <article> element.
    """
    article = soup.find("article")
    if article:
        return article
    raise RuntimeError("Could not find <article> in page HTML")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _remove_unwanted(article: Tag) -> None:
    """Strip Docusaurus chrome and non-content tags from *article* in place.

    Removes breadcrumb/pager navigation, footer/version widgets, and tags
    that carry no readable content in an EPUB (scripts, iframes, buttons, ...).
    """
    unwanted_selectors = (
        'nav[aria-label="Breadcrumbs"]',
        'nav[aria-label="Docs pages"]',
        "div.theme-doc-footer",
        "div.theme-doc-footer-edit-meta-row",
        "div.theme-doc-version-badge",
        "script",
        "style",
        "noscript",
        "iframe",
        "svg",
        "button",
    )
    for css in unwanted_selectors:
        # Copy the result list: decompose() mutates the tree while we iterate.
        for node in list(article.select(css)):
            node.decompose()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _absolutize_urls(container: Tag, base_url: str) -> None:
    """Rewrite root-relative href/src attributes against *base_url*, in place.

    Only values starting with "/" are rewritten; fragment links ("#...") and
    already-absolute URLs are deliberately left alone.
    """
    for tag in container.find_all(True):
        for attr in ("href", "src"):
            if not tag.has_attr(attr):
                continue
            value = str(tag.get(attr) or "")
            if value.startswith("/"):
                tag[attr] = urljoin(base_url, value)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _extract_next_url(soup: BeautifulSoup, base_url: str) -> str | None:
    """Return the absolute URL behind the Docusaurus "Next" pager link.

    Looks for the docs pager nav and the first anchor whose visible text
    starts with "next" (case-insensitive). Returns None when no pager or
    no such link exists.
    """
    pager = soup.select_one('nav[aria-label="Docs pages"]')
    if not pager:
        return None
    for link in pager.find_all("a", href=True):
        label = " ".join(link.get_text(" ", strip=True).split())
        if label.lower().startswith("next"):
            return urljoin(base_url, link["href"])
    return None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
    """Crawl a Docusaurus docs site by following its "Next" pager links.

    Starting at ``options.start_url``, each page's <article> is cleaned and
    collected as a Chapter, then the "Next" link is followed until it
    disappears, a URL repeats (cycle guard), or ``max_pages`` is reached.

    Returns:
        Chapters in reading order.

    Raises:
        RuntimeError: if a visited page has no <article> element.
        requests.HTTPError: on a non-2xx response.
    """
    # Hoisted out of the crawl loop (it used to run on every sleeping
    # iteration); kept function-scoped to match the original import style.
    import time

    url = options.start_url
    base_url = options.base_url or options.start_url

    visited: set[str] = set()
    chapters: list[Chapter] = []

    # Close the session (and its pooled connections) when the crawl ends.
    with requests.Session() as session:
        session.headers.update({"User-Agent": options.user_agent})

        idx = 1
        while True:
            if options.max_pages is not None and idx > options.max_pages:
                break

            # Cycle guard: a repeated URL means the "Next" chain loops.
            if url in visited:
                break
            visited.add(url)

            resp = session.get(url, timeout=30)
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "lxml")
            article = _extract_article(soup)

            # Capture the title before cleanup, in case cleanup removes headings.
            title_el = article.find(["h1", "h2"])
            title = (
                " ".join(title_el.get_text(" ", strip=True).split()) if title_el else f"Chapter {idx}"
            )

            _remove_unwanted(article)
            _absolutize_urls(article, base_url=base_url)

            # Docusaurus heading anchor links ("#") are noise in an EPUB.
            for a in list(article.select('a.hash-link[href^="#"]')):
                a.decompose()

            html = article.decode_contents()

            chapters.append(Chapter(index=idx, title=title, url=url, html=html))

            next_url = _extract_next_url(soup, base_url=base_url)
            if not next_url:
                break

            url = next_url
            idx += 1

            # Politeness throttle between requests.
            if options.sleep_s > 0:
                time.sleep(options.sleep_s)

    return chapters
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Iterable
|
|
8
|
+
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
from ebooklib import epub
|
|
11
|
+
|
|
12
|
+
from .model import Chapter
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
EPUB_CSS = """
|
|
16
|
+
body {
|
|
17
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
|
18
|
+
line-height: 1.55;
|
|
19
|
+
}
|
|
20
|
+
pre, code {
|
|
21
|
+
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
|
|
22
|
+
}
|
|
23
|
+
pre {
|
|
24
|
+
white-space: pre-wrap;
|
|
25
|
+
word-wrap: break-word;
|
|
26
|
+
padding: 0.75rem;
|
|
27
|
+
border: 1px solid #ddd;
|
|
28
|
+
border-radius: 0.5rem;
|
|
29
|
+
background: #f7f7f7;
|
|
30
|
+
}
|
|
31
|
+
.chapter-sep {
|
|
32
|
+
margin-top: 1.75rem;
|
|
33
|
+
border-top: 1px solid #ddd;
|
|
34
|
+
}
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class EpubMetadata:
    """Book-level metadata written into the generated EPUB."""

    title: str
    author: str
    language: str = "en"
    # Falls back to a random "urn:uuid:..." in build_epub when None.
    identifier: str | None = None
    publisher: str | None = None
    # DC "date" value; build_epub defaults it to now() in UTC when None.
    created_at: datetime | None = None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _extract_body_inner_html(html: str) -> str:
    """Return the inner HTML of <body>, or *html* unchanged when absent."""
    body = BeautifulSoup(html, "lxml").find("body")
    return body.decode_contents() if body else html
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _strip_first_h1(html_fragment: str) -> str:
    """Remove the first <h1> from *html_fragment* and return the fragment.

    The caller re-adds its own chapter heading, so the page's original <h1>
    would otherwise appear twice.
    """
    soup = BeautifulSoup(html_fragment, "lxml")
    heading = soup.find("h1")
    if heading:
        heading.decompose()
    # lxml wraps fragments in <html><body>; unwrap back to a fragment.
    body = soup.find("body")
    return body.decode_contents() if body else str(soup)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def build_epub(
    *,
    chapters: Iterable[Chapter],
    out_file: str | Path,
    meta: EpubMetadata,
) -> Path:
    """Assemble an EPUB3 from *chapters* via ebooklib and write it to disk.

    Each chapter becomes one XHTML document (its original first <h1> replaced
    by our own heading), linked from both the TOC and the spine, with a shared
    stylesheet attached.

    Returns:
        The output path as a Path object.
    """
    destination = Path(out_file)
    destination.parent.mkdir(parents=True, exist_ok=True)

    book = epub.EpubBook()

    # Core Dublin Core metadata; identifier falls back to a random UUID URN.
    book.set_identifier(meta.identifier or f"urn:uuid:{uuid.uuid4()}")
    book.set_title(meta.title)
    book.set_language(meta.language)
    book.add_author(meta.author)
    if meta.publisher:
        book.add_metadata("DC", "publisher", meta.publisher)
    timestamp = meta.created_at or datetime.now(timezone.utc)
    book.add_metadata("DC", "date", timestamp.isoformat())

    # One stylesheet shared by every chapter document.
    stylesheet = epub.EpubItem(
        uid="style_nav",
        file_name="style/style.css",
        media_type="text/css",
        content=EPUB_CSS.encode("utf-8"),
    )
    book.add_item(stylesheet)

    spine_docs: list[epub.EpubHtml] = []
    toc_links: list[epub.Link] = []

    for chapter in chapters:
        inner = _strip_first_h1(_extract_body_inner_html(chapter.html))

        doc = epub.EpubHtml(
            title=chapter.title,
            file_name=f"chap_{chapter.index:03d}.xhtml",
            lang=meta.language,
        )
        # Re-add our own heading plus a visual separator above the content.
        doc.content = (
            f'<h1>{chapter.title}</h1>\n<div class="chapter-sep"></div>\n{inner}\n'
        )
        doc.add_item(stylesheet)

        book.add_item(doc)
        spine_docs.append(doc)
        toc_links.append(epub.Link(doc.file_name, chapter.title, f"chap_{chapter.index:03d}"))

    book.toc = tuple(toc_links)
    book.spine = ["nav", *spine_docs]

    # NCX + nav documents are required for a valid package.
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    epub.write_epub(str(destination), book, {})
    return destination
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def clean_html_for_kindle_epub2(html_fragment: str) -> str:
    """Best-effort HTML cleanup for Kindle-friendly EPUB2.

    This is intentionally conservative: it strips known-problematic attributes
    and tags that commonly cause Send-to-Kindle conversion issues.

    Args:
        html_fragment: HTML to clean (a fragment or a full document).

    Returns:
        The cleaned HTML, serialized with runs of whitespace collapsed.
    """

    soup = BeautifulSoup(html_fragment, "lxml")

    # EPUB2: <u> tag isn't consistently supported; convert to a span.
    for u in list(soup.find_all("u")):
        span = soup.new_tag("span")
        span["style"] = "text-decoration: underline;"
        if u.string is not None:
            # Single text node: copy it straight across.
            span.string = u.string
        else:
            # Mixed/nested content: move the children into the span.
            # BUGFIX: the old code also copied u.get_text() into span.string
            # first, so the text appeared twice after the children moved in.
            for child in list(u.contents):
                span.append(child)
        u.replace_with(span)

    # Remove tabindex attributes (not allowed in EPUB2 XHTML).
    # find_all(attrs={"tabindex": True}) only yields elements that have the
    # attribute, so the deletion cannot raise KeyError.
    for el in soup.find_all(attrs={"tabindex": True}):
        del el["tabindex"]

    # Remove start attribute from ordered lists (not allowed in EPUB2 XHTML).
    for ol in soup.find_all("ol"):
        if ol.has_attr("start"):
            del ol["start"]

    # Strip duplicate ids in a simple way: if an id repeats, rename it.
    # The first occurrence keeps the original id, so fragment links to it
    # still resolve.
    seen_ids: set[str] = set()
    for el in soup.find_all(attrs={"id": True}):
        raw = str(el.get("id") or "").strip()
        if not raw:
            continue
        if raw not in seen_ids:
            seen_ids.add(raw)
            continue
        suffix = 2
        new_id = f"{raw}-{suffix}"
        while new_id in seen_ids:
            suffix += 1
            new_id = f"{raw}-{suffix}"
        el["id"] = new_id
        seen_ids.add(new_id)

    # Remove empty fragment links that point to missing ids (best-effort).
    # If href="#something" but no element has id="something", drop href.
    all_ids = {str(el.get("id")) for el in soup.find_all(attrs={"id": True})}
    for a in soup.find_all("a", href=True):
        href = str(a.get("href") or "")
        if href.startswith("#") and len(href) > 1:
            frag = href[1:]
            if frag not in all_ids:
                del a["href"]

    # Normalize weird whitespace artifacts.
    # NOTE(review): this also collapses newlines inside <pre>/<code>, which
    # flattens code samples in the output — confirm this is intended before
    # relying on it for code-heavy docs.
    text = str(soup)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
|