docs2epub 0.1.0__tar.gz → 0.1.2__tar.gz

This diff shows the changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docs2epub
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Turn documentation sites into an EPUB (Kindle-friendly).
5
5
  Author: Breno Brito
6
6
  License: MIT
@@ -31,6 +31,21 @@ uv run docs2epub --help
31
31
 
32
32
  ## Usage
33
33
 
34
+ ### uvx (no install)
35
+
36
+ ```bash
37
+ uvx docs2epub \
38
+ https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
39
+ tech-interview-handbook.epub
40
+
41
+ # Optional (override inferred metadata)
42
+ uvx docs2epub \
43
+ https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
44
+ tech-interview-handbook.epub \
45
+ --title "Tech Interview Handbook" \
46
+ --author "Yangshun Tay"
47
+ ```
48
+
34
49
  ### Docusaurus “Next” crawl
35
50
 
36
51
  ```bash
@@ -15,6 +15,21 @@ uv run docs2epub --help
15
15
 
16
16
  ## Usage
17
17
 
18
+ ### uvx (no install)
19
+
20
+ ```bash
21
+ uvx docs2epub \
22
+ https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
23
+ tech-interview-handbook.epub
24
+
25
+ # Optional (override inferred metadata)
26
+ uvx docs2epub \
27
+ https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
28
+ tech-interview-handbook.epub \
29
+ --title "Tech Interview Handbook" \
30
+ --author "Yangshun Tay"
31
+ ```
32
+
18
33
  ### Docusaurus “Next” crawl
19
34
 
20
35
  ```bash
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docs2epub"
3
- version = "0.1.0"
3
+ version = "0.1.2"
4
4
  description = "Turn documentation sites into an EPUB (Kindle-friendly)."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -2,4 +2,4 @@ __all__ = [
2
2
  "__version__",
3
3
  ]
4
4
 
5
- __version__ = "0.1.0"
5
+ __version__ = "0.1.2"
@@ -0,0 +1,149 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+ from urllib.parse import urlparse
6
+
7
+ from .docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
8
+ from .epub import EpubMetadata, build_epub
9
+ from .pandoc_epub2 import PandocEpub2Options, build_epub2_with_pandoc
10
+
11
+
12
+ def _infer_defaults(start_url: str) -> tuple[str, str, str]:
13
+ parsed = urlparse(start_url)
14
+ host = parsed.netloc or "docs"
15
+ title = host
16
+ author = host
17
+ language = "en"
18
+ return title, author, language
19
+
20
+
21
+ def _build_parser() -> argparse.ArgumentParser:
22
+ p = argparse.ArgumentParser(
23
+ prog="docs2epub",
24
+ description="Turn documentation sites into an EPUB (Kindle-friendly).",
25
+ )
26
+
27
+ # Short positional form (for uvx):
28
+ # docs2epub <START_URL> <OUT.epub>
29
+ p.add_argument(
30
+ "start_url_pos",
31
+ nargs="?",
32
+ help="Starting URL for scraping (initially: Docusaurus docs page).",
33
+ )
34
+ p.add_argument(
35
+ "out_pos",
36
+ nargs="?",
37
+ help="Output EPUB file path.",
38
+ )
39
+
40
+ # Flag form (more explicit):
41
+ p.add_argument(
42
+ "--start-url",
43
+ dest="start_url",
44
+ default=None,
45
+ help="Starting URL for scraping (overrides positional start_url).",
46
+ )
47
+ p.add_argument(
48
+ "--out",
49
+ dest="out",
50
+ default=None,
51
+ help="Output EPUB file path (overrides positional out).",
52
+ )
53
+
54
+ p.add_argument(
55
+ "--base-url",
56
+ default=None,
57
+ help="Base URL used to resolve relative links (defaults to start-url).",
58
+ )
59
+ p.add_argument("--max-pages", type=int, default=None)
60
+ p.add_argument("--sleep-s", type=float, default=0.5)
61
+
62
+ p.add_argument("--title", default=None)
63
+ p.add_argument("--author", default=None)
64
+ p.add_argument("--language", default=None)
65
+ p.add_argument("--identifier", default=None)
66
+ p.add_argument("--publisher", default=None)
67
+
68
+ p.add_argument(
69
+ "--format",
70
+ default="epub2",
71
+ choices=["epub2", "epub3"],
72
+ help="Output format. Default: epub2 (Kindle-friendly).",
73
+ )
74
+
75
+ p.add_argument(
76
+ "--keep-images",
77
+ action="store_true",
78
+ help="Keep and embed remote images (may be slower and can trigger fetch warnings).",
79
+ )
80
+
81
+ p.add_argument(
82
+ "-v",
83
+ "--verbose",
84
+ action="store_true",
85
+ help="Verbose output (shows full pandoc warnings).",
86
+ )
87
+
88
+ return p
89
+
90
+
91
+ def main(argv: list[str] | None = None) -> int:
92
+ args = _build_parser().parse_args(argv)
93
+
94
+ start_url = args.start_url or args.start_url_pos
95
+ out_value = args.out or args.out_pos
96
+
97
+ if not start_url or not out_value:
98
+ raise SystemExit("Usage: docs2epub <START_URL> <OUT.epub> [options]")
99
+
100
+ inferred_title, inferred_author, inferred_language = _infer_defaults(start_url)
101
+
102
+ title = args.title or inferred_title
103
+ author = args.author or inferred_author
104
+ language = args.language or inferred_language
105
+
106
+ options = DocusaurusNextOptions(
107
+ start_url=start_url,
108
+ base_url=args.base_url,
109
+ max_pages=args.max_pages,
110
+ sleep_s=args.sleep_s,
111
+ )
112
+
113
+ chapters = iter_docusaurus_next(options)
114
+ if not chapters:
115
+ raise SystemExit("No pages scraped (did not find article content).")
116
+
117
+ out_path_value = Path(out_value)
118
+
119
+ if args.format == "epub2":
120
+ out_path = build_epub2_with_pandoc(
121
+ chapters=chapters,
122
+ out_file=out_path_value,
123
+ title=title,
124
+ author=author,
125
+ language=language,
126
+ publisher=args.publisher,
127
+ identifier=args.identifier,
128
+ verbose=args.verbose,
129
+ options=PandocEpub2Options(keep_images=args.keep_images),
130
+ )
131
+ else:
132
+ meta = EpubMetadata(
133
+ title=title,
134
+ author=author,
135
+ language=language,
136
+ identifier=args.identifier,
137
+ publisher=args.publisher,
138
+ )
139
+
140
+ out_path = build_epub(
141
+ chapters=chapters,
142
+ out_file=out_path_value,
143
+ meta=meta,
144
+ )
145
+
146
+ size_mb = out_path.stat().st_size / (1024 * 1024)
147
+ print(f"Scraped {len(chapters)} pages")
148
+ print(f"EPUB written to: {out_path.resolve()} ({size_mb:.2f} MB)")
149
+ return 0
@@ -5,37 +5,45 @@ import re
5
5
  from bs4 import BeautifulSoup
6
6
 
7
7
 
8
- def clean_html_for_kindle_epub2(html_fragment: str) -> str:
8
+ def clean_html_for_kindle_epub2(
9
+ html_fragment: str,
10
+ *,
11
+ keep_images: bool,
12
+ ) -> str:
9
13
  """Best-effort HTML cleanup for Kindle-friendly EPUB2.
10
14
 
11
15
  This is intentionally conservative: it strips known-problematic attributes
12
16
  and tags that commonly cause Send-to-Kindle conversion issues.
17
+
18
+ By default we drop remote images to avoid pandoc fetch failures.
13
19
  """
14
20
 
15
21
  soup = BeautifulSoup(html_fragment, "lxml")
16
22
 
23
+ if not keep_images:
24
+ for img in list(soup.find_all("img")):
25
+ src = str(img.get("src") or "")
26
+ if src.startswith("http://") or src.startswith("https://"):
27
+ img.decompose()
28
+
17
29
  # EPUB2: <u> tag isn't consistently supported; convert to a span.
18
30
  for u in list(soup.find_all("u")):
19
31
  span = soup.new_tag("span")
20
32
  span["style"] = "text-decoration: underline;"
21
- span.string = u.get_text() if u.string is None else u.string
22
33
  if u.string is None:
23
- # Keep children by moving them into the span.
24
34
  for child in list(u.contents):
25
35
  span.append(child)
36
+ else:
37
+ span.string = u.string
26
38
  u.replace_with(span)
27
39
 
28
40
  # Remove tabindex attributes (not allowed in EPUB2 XHTML).
29
41
  for el in soup.find_all(attrs={"tabindex": True}):
30
- try:
31
- del el["tabindex"]
32
- except KeyError:
33
- pass
42
+ el.attrs.pop("tabindex", None)
34
43
 
35
44
  # Remove start attribute from ordered lists (not allowed in EPUB2 XHTML).
36
45
  for ol in soup.find_all("ol"):
37
- if ol.has_attr("start"):
38
- del ol["start"]
46
+ ol.attrs.pop("start", None)
39
47
 
40
48
  # Strip duplicate ids in a simple way: if an id repeats, rename it.
41
49
  seen_ids: set[str] = set()
@@ -54,7 +62,6 @@ def clean_html_for_kindle_epub2(html_fragment: str) -> str:
54
62
  el["id"] = new_id
55
63
  seen_ids.add(new_id)
56
64
 
57
- # Remove empty fragment links that point to missing ids (best-effort).
58
65
  # If href="#something" but no element has id="something", drop href.
59
66
  all_ids = {str(el.get("id")) for el in soup.find_all(attrs={"id": True})}
60
67
  for a in soup.find_all("a", href=True):
@@ -62,9 +69,9 @@ def clean_html_for_kindle_epub2(html_fragment: str) -> str:
62
69
  if href.startswith("#") and len(href) > 1:
63
70
  frag = href[1:]
64
71
  if frag not in all_ids:
65
- del a["href"]
72
+ a.attrs.pop("href", None)
66
73
 
67
- # Normalize weird whitespace artifacts.
74
+ # Normalize whitespace a bit (helps keep diffs smaller and reduces odd output).
68
75
  text = str(soup)
69
76
  text = re.sub(r"\s+", " ", text)
70
77
  return text.strip()
@@ -15,7 +15,8 @@ from .model import Chapter
15
15
  class PandocEpub2Options:
16
16
  toc: bool = True
17
17
  toc_depth: int = 2
18
- chapter_level: int = 1
18
+ split_level: int = 1
19
+ keep_images: bool = False
19
20
 
20
21
 
21
22
  def _wrap_html(title: str, body_html: str) -> str:
@@ -34,6 +35,26 @@ def _wrap_html(title: str, body_html: str) -> str:
34
35
  """
35
36
 
36
37
 
38
+ def _summarize_pandoc_warnings(stderr: str) -> str:
39
+ warnings = [line for line in stderr.splitlines() if line.startswith("[WARNING]")]
40
+ if not warnings:
41
+ return ""
42
+
43
+ resource = [w for w in warnings if "Could not fetch resource" in w]
44
+ duplicate = [w for w in warnings if "Duplicate identifier" in w]
45
+
46
+ parts: list[str] = []
47
+ parts.append(f"pandoc warnings: {len(warnings)} (use -v to see full output)")
48
+ if duplicate:
49
+ parts.append(f"- Duplicate identifier: {len(duplicate)} (usually safe; affects internal anchors)")
50
+ if resource:
51
+ parts.append(
52
+ f"- Missing resources: {len(resource)} (some images may be dropped; use --keep-images/-v to inspect)"
53
+ )
54
+
55
+ return "\n".join(parts)
56
+
57
+
37
58
  def build_epub2_with_pandoc(
38
59
  *,
39
60
  chapters: Iterable[Chapter],
@@ -43,6 +64,7 @@ def build_epub2_with_pandoc(
43
64
  language: str,
44
65
  publisher: str | None,
45
66
  identifier: str | None,
67
+ verbose: bool,
46
68
  options: PandocEpub2Options | None = None,
47
69
  ) -> Path:
48
70
  pandoc = shutil.which("pandoc")
@@ -61,7 +83,7 @@ def build_epub2_with_pandoc(
61
83
 
62
84
  html_files: list[Path] = []
63
85
  for ch in chapters:
64
- cleaned = clean_html_for_kindle_epub2(ch.html)
86
+ cleaned = clean_html_for_kindle_epub2(ch.html, keep_images=opts.keep_images)
65
87
  html_doc = _wrap_html(ch.title, cleaned)
66
88
  fp = tmp_path / f"chapter_{ch.index:04d}.html"
67
89
  fp.write_text(html_doc, encoding="utf-8")
@@ -81,14 +103,15 @@ def build_epub2_with_pandoc(
81
103
  "encoding=UTF-8",
82
104
  "--standalone",
83
105
  "--split-level",
84
- str(opts.chapter_level),
106
+ str(opts.split_level),
85
107
  ]
86
108
 
87
109
  if publisher:
88
110
  cmd.extend(["--metadata", f"publisher={publisher}"])
89
111
 
90
112
  if identifier:
91
- cmd.extend(["--epub-metadata", str(identifier)])
113
+ # Keep identifier stable for Kindle.
114
+ cmd.extend(["--metadata", f"identifier={identifier}"])
92
115
 
93
116
  if opts.toc:
94
117
  cmd.extend(["--toc", "--toc-depth", str(opts.toc_depth)])
@@ -96,6 +119,22 @@ def build_epub2_with_pandoc(
96
119
  cmd.extend(["-o", str(out_path)])
97
120
  cmd.extend([str(p) for p in html_files])
98
121
 
99
- subprocess.run(cmd, check=True)
122
+ proc = subprocess.run(
123
+ cmd,
124
+ stdout=subprocess.PIPE,
125
+ stderr=subprocess.PIPE,
126
+ text=True,
127
+ )
128
+
129
+ if proc.returncode != 0:
130
+ # On failure, always show stderr.
131
+ raise RuntimeError(f"pandoc failed (exit {proc.returncode}):\n{proc.stderr.strip()}")
132
+
133
+ if verbose and proc.stderr.strip():
134
+ print(proc.stderr.strip())
135
+ elif proc.stderr.strip():
136
+ summary = _summarize_pandoc_warnings(proc.stderr)
137
+ if summary:
138
+ print(summary)
100
139
 
101
140
  return out_path
@@ -16,8 +16,17 @@ def test_build_epub3_smoke(tmp_path):
16
16
 
17
17
  def test_kindle_cleaner_strips_tabindex_and_ol_start():
18
18
  cleaned = clean_html_for_kindle_epub2(
19
- '<div tabindex="0"><ol start="2"><li><u>Hi</u></li></ol></div>'
19
+ '<div tabindex="0"><ol start="2"><li><u>Hi</u></li></ol></div>',
20
+ keep_images=False,
20
21
  )
21
22
  assert "tabindex" not in cleaned
22
23
  assert "start=" not in cleaned
23
24
  assert "underline" in cleaned
25
+
26
+
27
+ def test_kindle_cleaner_drops_remote_images_by_default():
28
+ cleaned = clean_html_for_kindle_epub2(
29
+ '<p>x</p><img src="https://example.com/a.png" /><p>y</p>',
30
+ keep_images=False,
31
+ )
32
+ assert "img" not in cleaned
@@ -166,7 +166,7 @@ wheels = [
166
166
 
167
167
  [[package]]
168
168
  name = "docs2epub"
169
- version = "0.1.0"
169
+ version = "0.1.1"
170
170
  source = { editable = "." }
171
171
  dependencies = [
172
172
  { name = "beautifulsoup4" },
@@ -1,95 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- from pathlib import Path
5
-
6
- from .docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
7
- from .epub import EpubMetadata, build_epub
8
- from .pandoc_epub2 import build_epub2_with_pandoc
9
-
10
-
11
- def _build_parser() -> argparse.ArgumentParser:
12
- p = argparse.ArgumentParser(
13
- prog="docs2epub",
14
- description="Turn documentation sites into an EPUB (Kindle-friendly).",
15
- )
16
-
17
- p.add_argument(
18
- "--start-url",
19
- required=True,
20
- help="Starting URL for scraping (initially: Docusaurus docs page).",
21
- )
22
- p.add_argument(
23
- "--base-url",
24
- default=None,
25
- help="Base URL used to resolve relative links (defaults to start-url).",
26
- )
27
- p.add_argument("--max-pages", type=int, default=None)
28
- p.add_argument("--sleep-s", type=float, default=0.5)
29
-
30
- p.add_argument("--title", required=True)
31
- p.add_argument("--author", required=True)
32
- p.add_argument("--language", default="en")
33
- p.add_argument("--identifier", default=None)
34
- p.add_argument("--publisher", default=None)
35
-
36
- p.add_argument(
37
- "--format",
38
- default="epub2",
39
- choices=["epub2", "epub3"],
40
- help="Output format. Default: epub2 (Kindle-friendly).",
41
- )
42
-
43
- p.add_argument(
44
- "--out",
45
- required=True,
46
- help="Output EPUB file path.",
47
- )
48
-
49
- return p
50
-
51
-
52
- def main(argv: list[str] | None = None) -> int:
53
- args = _build_parser().parse_args(argv)
54
-
55
- options = DocusaurusNextOptions(
56
- start_url=args.start_url,
57
- base_url=args.base_url,
58
- max_pages=args.max_pages,
59
- sleep_s=args.sleep_s,
60
- )
61
-
62
- chapters = iter_docusaurus_next(options)
63
- if not chapters:
64
- raise SystemExit("No chapters scraped (did not find article content).")
65
-
66
- out_path: Path
67
-
68
- if args.format == "epub2":
69
- out_path = build_epub2_with_pandoc(
70
- chapters=chapters,
71
- out_file=Path(args.out),
72
- title=args.title,
73
- author=args.author,
74
- language=args.language,
75
- publisher=args.publisher,
76
- identifier=args.identifier,
77
- )
78
- else:
79
- meta = EpubMetadata(
80
- title=args.title,
81
- author=args.author,
82
- language=args.language,
83
- identifier=args.identifier,
84
- publisher=args.publisher,
85
- )
86
-
87
- out_path = build_epub(
88
- chapters=chapters,
89
- out_file=Path(args.out),
90
- meta=meta,
91
- )
92
-
93
- print(f"Scraped {len(chapters)} pages")
94
- print(f"EPUB written to: {out_path.resolve()}")
95
- return 0
File without changes