PyPI - docs2epub - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

docs2epub-0.1.1-py3-none-any.whl → docs2epub-0.1.2-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9)

docs2epub/__init__.py +1 -1
docs2epub/cli.py +45 -17
docs2epub/kindle_html.py +19 -12
docs2epub/pandoc_epub2.py +44 -5
{docs2epub-0.1.1.dist-info → docs2epub-0.1.2.dist-info}/METADATA +6 -1
docs2epub-0.1.2.dist-info/RECORD +11 -0
docs2epub-0.1.1.dist-info/RECORD +0 -11
{docs2epub-0.1.1.dist-info → docs2epub-0.1.2.dist-info}/WHEEL +0 -0
{docs2epub-0.1.1.dist-info → docs2epub-0.1.2.dist-info}/entry_points.txt +0 -0

docs2epub/__init__.py CHANGED Viewed

@@ -2,4 +2,4 @@ __all__ = [
   "__version__",
 ]
-__version__ = "0.1.1"
+__version__ = "0.1.2"

docs2epub/cli.py CHANGED Viewed

@@ -2,10 +2,20 @@ from __future__ import annotations
 import argparse
 from pathlib import Path
+from urllib.parse import urlparse
 from .docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
 from .epub import EpubMetadata, build_epub
-from .pandoc_epub2 import build_epub2_with_pandoc
+from .pandoc_epub2 import PandocEpub2Options, build_epub2_with_pandoc
+def _infer_defaults(start_url: str) -> tuple[str, str, str]:
+  parsed = urlparse(start_url)
+  host = parsed.netloc or "docs"
+  title = host
+  author = host
+  language = "en"
+  return title, author, language
 def _build_parser() -> argparse.ArgumentParser:
@@ -49,9 +59,9 @@ def _build_parser() -> argparse.ArgumentParser:
   p.add_argument("--max-pages", type=int, default=None)
   p.add_argument("--sleep-s", type=float, default=0.5)
-  p.add_argument("--title", required=True)
-  p.add_argument("--author", required=True)
-  p.add_argument("--language", default="en")
+  p.add_argument("--title", default=None)
+  p.add_argument("--author", default=None)
+  p.add_argument("--language", default=None)
   p.add_argument("--identifier", default=None)
   p.add_argument("--publisher", default=None)
@@ -62,6 +72,19 @@ def _build_parser() -> argparse.ArgumentParser:
     help="Output format. Default: epub2 (Kindle-friendly).",
   )
+  p.add_argument(
+    "--keep-images",
+    action="store_true",
+    help="Keep and embed remote images (may be slower and can trigger fetch warnings).",
+  )
+  p.add_argument(
+    "-v",
+    "--verbose",
+    action="store_true",
+    help="Verbose output (shows full pandoc warnings).",
+  )
   return p
@@ -72,10 +95,13 @@ def main(argv: list[str] | None = None) -> int:
   out_value = args.out or args.out_pos
   if not start_url or not out_value:
-    raise SystemExit(
-      "Usage: docs2epub <START_URL> <OUT.epub> --title ... --author ...\n"
-      "(or use --start-url/--out flags)"
-    )
+    raise SystemExit("Usage: docs2epub <START_URL> <OUT.epub> [options]")
+  inferred_title, inferred_author, inferred_language = _infer_defaults(start_url)
+  title = args.title or inferred_title
+  author = args.author or inferred_author
+  language = args.language or inferred_language
   options = DocusaurusNextOptions(
     start_url=start_url,
@@ -86,26 +112,27 @@ def main(argv: list[str] | None = None) -> int:
   chapters = iter_docusaurus_next(options)
   if not chapters:
-    raise SystemExit("No chapters scraped (did not find article content).")
+    raise SystemExit("No pages scraped (did not find article content).")
-  out_path: Path
   out_path_value = Path(out_value)
   if args.format == "epub2":
     out_path = build_epub2_with_pandoc(
       chapters=chapters,
       out_file=out_path_value,
-      title=args.title,
-      author=args.author,
-      language=args.language,
+      title=title,
+      author=author,
+      language=language,
       publisher=args.publisher,
       identifier=args.identifier,
+      verbose=args.verbose,
+      options=PandocEpub2Options(keep_images=args.keep_images),
     )
   else:
     meta = EpubMetadata(
-      title=args.title,
-      author=args.author,
-      language=args.language,
+      title=title,
+      author=author,
+      language=language,
       identifier=args.identifier,
       publisher=args.publisher,
     )
@@ -116,6 +143,7 @@ def main(argv: list[str] | None = None) -> int:
       meta=meta,
     )
+  size_mb = out_path.stat().st_size / (1024 * 1024)
   print(f"Scraped {len(chapters)} pages")
-  print(f"EPUB written to: {out_path.resolve()}")
+  print(f"EPUB written to: {out_path.resolve()} ({size_mb:.2f} MB)")
   return 0

docs2epub/kindle_html.py CHANGED Viewed

@@ -5,37 +5,45 @@ import re
 from bs4 import BeautifulSoup
-def clean_html_for_kindle_epub2(html_fragment: str) -> str:
+def clean_html_for_kindle_epub2(
+  html_fragment: str,
+  *,
+  keep_images: bool,
+) -> str:
   """Best-effort HTML cleanup for Kindle-friendly EPUB2.
   This is intentionally conservative: it strips known-problematic attributes
   and tags that commonly cause Send-to-Kindle conversion issues.
+  By default we drop remote images to avoid pandoc fetch failures.
   """
   soup = BeautifulSoup(html_fragment, "lxml")
+  if not keep_images:
+    for img in list(soup.find_all("img")):
+      src = str(img.get("src") or "")
+      if src.startswith("http://") or src.startswith("https://"):
+        img.decompose()
   # EPUB2: <u> tag isn't consistently supported; convert to a span.
   for u in list(soup.find_all("u")):
     span = soup.new_tag("span")
     span["style"] = "text-decoration: underline;"
-    span.string = u.get_text() if u.string is None else u.string
     if u.string is None:
-      # Keep children by moving them into the span.
       for child in list(u.contents):
         span.append(child)
+    else:
+      span.string = u.string
     u.replace_with(span)
   # Remove tabindex attributes (not allowed in EPUB2 XHTML).
   for el in soup.find_all(attrs={"tabindex": True}):
-    try:
-      del el["tabindex"]
-    except KeyError:
-      pass
+    el.attrs.pop("tabindex", None)
   # Remove start attribute from ordered lists (not allowed in EPUB2 XHTML).
   for ol in soup.find_all("ol"):
-    if ol.has_attr("start"):
-      del ol["start"]
+    ol.attrs.pop("start", None)
   # Strip duplicate ids in a simple way: if an id repeats, rename it.
   seen_ids: set[str] = set()
@@ -54,7 +62,6 @@ def clean_html_for_kindle_epub2(html_fragment: str) -> str:
     el["id"] = new_id
     seen_ids.add(new_id)
-  # Remove empty fragment links that point to missing ids (best-effort).
   # If href="#something" but no element has id="something", drop href.
   all_ids = {str(el.get("id")) for el in soup.find_all(attrs={"id": True})}
   for a in soup.find_all("a", href=True):
@@ -62,9 +69,9 @@ def clean_html_for_kindle_epub2(html_fragment: str) -> str:
     if href.startswith("#") and len(href) > 1:
       frag = href[1:]
       if frag not in all_ids:
-        del a["href"]
+        a.attrs.pop("href", None)
-  # Normalize weird whitespace artifacts.
+  # Normalize whitespace a bit (helps keep diffs smaller and reduces odd output).
   text = str(soup)
   text = re.sub(r"\s+", " ", text)
   return text.strip()

docs2epub/pandoc_epub2.py CHANGED Viewed

@@ -15,7 +15,8 @@ from .model import Chapter
 class PandocEpub2Options:
   toc: bool = True
   toc_depth: int = 2
-  chapter_level: int = 1
+  split_level: int = 1
+  keep_images: bool = False
 def _wrap_html(title: str, body_html: str) -> str:
@@ -34,6 +35,26 @@ def _wrap_html(title: str, body_html: str) -> str:
 """
+def _summarize_pandoc_warnings(stderr: str) -> str:
+  warnings = [line for line in stderr.splitlines() if line.startswith("[WARNING]")]
+  if not warnings:
+    return ""
+  resource = [w for w in warnings if "Could not fetch resource" in w]
+  duplicate = [w for w in warnings if "Duplicate identifier" in w]
+  parts: list[str] = []
+  parts.append(f"pandoc warnings: {len(warnings)} (use -v to see full output)")
+  if duplicate:
+    parts.append(f"- Duplicate identifier: {len(duplicate)} (usually safe; affects internal anchors)")
+  if resource:
+    parts.append(
+      f"- Missing resources: {len(resource)} (some images may be dropped; use --keep-images/-v to inspect)"
+    )
+  return "\n".join(parts)
 def build_epub2_with_pandoc(
   *,
   chapters: Iterable[Chapter],
@@ -43,6 +64,7 @@ def build_epub2_with_pandoc(
   language: str,
   publisher: str | None,
   identifier: str | None,
+  verbose: bool,
   options: PandocEpub2Options | None = None,
 ) -> Path:
   pandoc = shutil.which("pandoc")
@@ -61,7 +83,7 @@ def build_epub2_with_pandoc(
     html_files: list[Path] = []
     for ch in chapters:
-      cleaned = clean_html_for_kindle_epub2(ch.html)
+      cleaned = clean_html_for_kindle_epub2(ch.html, keep_images=opts.keep_images)
       html_doc = _wrap_html(ch.title, cleaned)
       fp = tmp_path / f"chapter_{ch.index:04d}.html"
       fp.write_text(html_doc, encoding="utf-8")
@@ -81,14 +103,15 @@ def build_epub2_with_pandoc(
       "encoding=UTF-8",
       "--standalone",
       "--split-level",
-      str(opts.chapter_level),
+      str(opts.split_level),
     ]
     if publisher:
       cmd.extend(["--metadata", f"publisher={publisher}"])
     if identifier:
-      cmd.extend(["--epub-metadata", str(identifier)])
+      # Keep identifier stable for Kindle.
+      cmd.extend(["--metadata", f"identifier={identifier}"])
     if opts.toc:
       cmd.extend(["--toc", "--toc-depth", str(opts.toc_depth)])
@@ -96,6 +119,22 @@ def build_epub2_with_pandoc(
     cmd.extend(["-o", str(out_path)])
     cmd.extend([str(p) for p in html_files])
-    subprocess.run(cmd, check=True)
+    proc = subprocess.run(
+      cmd,
+      stdout=subprocess.PIPE,
+      stderr=subprocess.PIPE,
+      text=True,
+    )
+    if proc.returncode != 0:
+      # On failure, always show stderr.
+      raise RuntimeError(f"pandoc failed (exit {proc.returncode}):\n{proc.stderr.strip()}")
+    if verbose and proc.stderr.strip():
+      print(proc.stderr.strip())
+    elif proc.stderr.strip():
+      summary = _summarize_pandoc_warnings(proc.stderr)
+      if summary:
+        print(summary)
   return out_path

{docs2epub-0.1.1.dist-info → docs2epub-0.1.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docs2epub
-Version: 0.1.1
+Version: 0.1.2
 Summary: Turn documentation sites into an EPUB (Kindle-friendly).
 Author: Breno Brito
 License: MIT
@@ -34,6 +34,11 @@ uv run docs2epub --help
 ### uvx (no install)
 ```bash
+uvx docs2epub \
+  https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
+  tech-interview-handbook.epub
+# Optional (override inferred metadata)
 uvx docs2epub \
   https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
   tech-interview-handbook.epub \

docs2epub-0.1.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+docs2epub/__init__.py,sha256=iccyEu4zlubhvd6pM7Z2Gjwn8tPw9IhZ4ABKhbiFjUY,54
+docs2epub/cli.py,sha256=pt1crvrkr2k1ybf_p0m4xSYyoZVluFsDNGuwJ7CykYM,3863
+docs2epub/docusaurus_next.py,sha256=OybL3KPwDZvp2sOL3locE374Zb80M1kubx81SH2GxgQ,3378
+docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
+docs2epub/kindle_html.py,sha256=LN0CGj9ap9b8iC_MlZcQLuhJ7FehZr_VbIfMOz78E5c,2297
+docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
+docs2epub/pandoc_epub2.py,sha256=l22-QAQcCgJyl7HF0_b5weC3qEGVQLwOhxdbAvd8C2o,3610
+docs2epub-0.1.2.dist-info/METADATA,sha256=UphwnhA-8wtH-DkX1gRRiu4Fu3ukmri0GUGdCFIcU70,1886
+docs2epub-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+docs2epub-0.1.2.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
+docs2epub-0.1.2.dist-info/RECORD,,

docs2epub-0.1.1.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-docs2epub/__init__.py,sha256=zCC51i_uxiMh2wmZYBMWo4zCNEe8sfrk0dhrvo6eZ8E,54
-docs2epub/cli.py,sha256=wq0oQP2W9p_eRR9P0W1AvZTn-yrutxfm7u6qU2JCcRA,3056
-docs2epub/docusaurus_next.py,sha256=OybL3KPwDZvp2sOL3locE374Zb80M1kubx81SH2GxgQ,3378
-docs2epub/epub.py,sha256=OsPWcPGTgazAeNpWASIE6e4HQ5ILQr2VFO1-Aj3y1kg,2986
-docs2epub/kindle_html.py,sha256=iymXlYFDfEIRIKC3vaepV_K-SFEvCPpAI5FESj7udOU,2153
-docs2epub/model.py,sha256=uL7uwbG6yU0bEGpSFxxIv2pcZHQR9cs2prfqk5iNQwc,160
-docs2epub/pandoc_epub2.py,sha256=m_9yJeeciyeoXMx5bVJs2qIutGDXifvd8TtFh2DRm-k,2335
-docs2epub-0.1.1.dist-info/METADATA,sha256=QJ8YaNtyyUqoSf8dYMeT9bNAmZ3RCmb2s74_VYvbdvM,1718
-docs2epub-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-docs2epub-0.1.1.dist-info/entry_points.txt,sha256=DHK4mzthrIXUvM8Y8Vo_3jG2IhegEDDM7T9CvCkUtvw,49
-docs2epub-0.1.1.dist-info/RECORD,,

{docs2epub-0.1.1.dist-info → docs2epub-0.1.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{docs2epub-0.1.1.dist-info → docs2epub-0.1.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docs2epub 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

docs2epub-0.1.1-py3-none-any.whl → docs2epub-0.1.2-py3-none-any.whl