docs2epub 0.1.1__tar.gz → 0.1.5__tar.gz

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -0,0 +1,41 @@
1
+ # docs2epub AGENTS
2
+
3
+ This file documents local conventions for working on `docs2epub`.
4
+
5
+ **Focus**
6
+ - Keep the scraper general across doc-site frontends (GitBook, Docusaurus, and similar).
7
+ - Prefer resilient HTML heuristics over site-specific hacks.
8
+ - Fail gracefully when content is missing; surface actionable errors.
9
+
10
+ **Development Workflow**
11
+ - Use TDD for bug fixes and new behaviors. Add a failing test first.
12
+ - Prefer unit tests with `monkeypatch` and deterministic HTML fixtures.
13
+ - Keep tests fast and offline; use real network checks only for manual validation.
14
+ - Run tests with `uv run pytest -q`.
15
+
16
+ **Scraping Heuristics**
17
+ - Primary crawl: sidebar/index extraction.
18
+ - Expand index/category pages by collecting in-page content links.
19
+ - Fallback crawl: “Next” navigation when no sidebar is found.
20
+ - Normalize URLs: strip fragments and queries; lower-case scheme/host.
21
+ - Filter non-doc links by extension; avoid cross-origin URLs by default.
22
+ - Resolve relative URLs against the page URL, not the site root.
23
+
24
+ **Code Layout**
25
+ - Core crawler logic: `src/docs2epub/docusaurus_next.py`.
26
+ - EPUB generation: `src/docs2epub/epub.py` and `src/docs2epub/pandoc_epub2.py`.
27
+ - HTML cleanup: `src/docs2epub/kindle_html.py`.
28
+ - Tests live in `tests/`.
29
+
30
+ **Release Discipline**
31
+ - Bump version in `pyproject.toml` for user-visible changes.
32
+ - Run `uv lock` after bumping the version.
33
+ - Build artifacts with `uv build` before publishing.
34
+ - Publish with `uv publish` when explicitly requested.
35
+ - Do not commit generated EPUBs or other artifacts.
36
+
37
+ **Validation**
38
+ - Quick manual checks (optional):
39
+ - `uvx --from . docs2epub https://midl.gitbook.io/midl out.epub`
40
+ - `uvx --from . docs2epub https://tutorial.docusaurus.io/docs/intro out.epub`
41
+ - Clean up any generated files after validation.
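
The "Scraping Heuristics" above are implemented later in this diff in `src/docs2epub/docusaurus_next.py`; as a rough standalone sketch of the normalization and filtering rules (illustrative names, not the package's API):

```python
from urllib.parse import urljoin, urlparse

# A few extensions that almost never point at documentation pages (see the
# fuller _NON_DOC_EXTENSIONS set added in docusaurus_next.py below).
NON_DOC_EXTENSIONS = {".png", ".jpg", ".svg", ".css", ".js", ".pdf", ".zip"}


def normalize(url: str) -> str:
    """Strip fragment and query; lower-case scheme and host."""
    p = urlparse(url)
    return p._replace(
        scheme=p.scheme.lower(), netloc=p.netloc.lower(), query="", fragment=""
    ).geturl()


def is_probable_doc_link(href: str, page_url: str) -> bool:
    """Resolve relative to the page URL (not the site root), then filter."""
    absolute = urljoin(page_url, href)
    parsed = urlparse(absolute)
    if parsed.netloc.lower() != urlparse(page_url).netloc.lower():
        return False  # avoid cross-origin URLs by default
    return not any(parsed.path.lower().endswith(ext) for ext in NON_DOC_EXTENSIONS)
```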
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docs2epub
3
- Version: 0.1.1
3
+ Version: 0.1.5
4
4
  Summary: Turn documentation sites into an EPUB (Kindle-friendly).
5
5
  Author: Breno Brito
6
6
  License: MIT
@@ -34,6 +34,11 @@ uv run docs2epub --help
34
34
  ### uvx (no install)
35
35
 
36
36
  ```bash
37
+ uvx docs2epub \
38
+ https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
39
+ tech-interview-handbook.epub
40
+
41
+ # Optional (override inferred metadata)
37
42
  uvx docs2epub \
38
43
  https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
39
44
  tech-interview-handbook.epub \
@@ -18,6 +18,11 @@ uv run docs2epub --help
18
18
  ### uvx (no install)
19
19
 
20
20
  ```bash
21
+ uvx docs2epub \
22
+ https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
23
+ tech-interview-handbook.epub
24
+
25
+ # Optional (override inferred metadata)
21
26
  uvx docs2epub \
22
27
  https://www.techinterviewhandbook.org/software-engineering-interview-guide/ \
23
28
  tech-interview-handbook.epub \
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docs2epub"
3
- version = "0.1.1"
3
+ version = "0.1.5"
4
4
  description = "Turn documentation sites into an EPUB (Kindle-friendly)."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -2,4 +2,4 @@ __all__ = [
2
2
  "__version__",
3
3
  ]
4
4
 
5
- __version__ = "0.1.1"
5
+ __version__ = "0.1.2"
@@ -2,10 +2,20 @@ from __future__ import annotations
2
2
 
3
3
  import argparse
4
4
  from pathlib import Path
5
+ from urllib.parse import urlparse
5
6
 
6
7
  from .docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
7
8
  from .epub import EpubMetadata, build_epub
8
- from .pandoc_epub2 import build_epub2_with_pandoc
9
+ from .pandoc_epub2 import PandocEpub2Options, build_epub2_with_pandoc
10
+
11
+
12
+ def _infer_defaults(start_url: str) -> tuple[str, str, str]:
13
+ parsed = urlparse(start_url)
14
+ host = parsed.netloc or "docs"
15
+ title = host
16
+ author = host
17
+ language = "en"
18
+ return title, author, language
9
19
 
10
20
 
11
21
  def _build_parser() -> argparse.ArgumentParser:
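
For context, the inferred defaults above are purely host-based: title and author both fall back to the start URL's host, and language falls back to "en". A tiny mirror of `_infer_defaults` (a hypothetical standalone copy, shown only to make the behaviour concrete):

```python
from urllib.parse import urlparse


def infer_defaults(start_url: str) -> tuple[str, str, str]:
    # Same shape as the _infer_defaults added above: (title, author, language).
    host = urlparse(start_url).netloc or "docs"
    return host, host, "en"


assert infer_defaults("https://tutorial.docusaurus.io/docs/intro") == (
    "tutorial.docusaurus.io",
    "tutorial.docusaurus.io",
    "en",
)
```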
@@ -49,9 +59,9 @@ def _build_parser() -> argparse.ArgumentParser:
49
59
  p.add_argument("--max-pages", type=int, default=None)
50
60
  p.add_argument("--sleep-s", type=float, default=0.5)
51
61
 
52
- p.add_argument("--title", required=True)
53
- p.add_argument("--author", required=True)
54
- p.add_argument("--language", default="en")
62
+ p.add_argument("--title", default=None)
63
+ p.add_argument("--author", default=None)
64
+ p.add_argument("--language", default=None)
55
65
  p.add_argument("--identifier", default=None)
56
66
  p.add_argument("--publisher", default=None)
57
67
 
@@ -62,6 +72,19 @@ def _build_parser() -> argparse.ArgumentParser:
62
72
  help="Output format. Default: epub2 (Kindle-friendly).",
63
73
  )
64
74
 
75
+ p.add_argument(
76
+ "--keep-images",
77
+ action="store_true",
78
+ help="Keep and embed remote images (may be slower and can trigger fetch warnings).",
79
+ )
80
+
81
+ p.add_argument(
82
+ "-v",
83
+ "--verbose",
84
+ action="store_true",
85
+ help="Verbose output (shows full pandoc warnings).",
86
+ )
87
+
65
88
  return p
66
89
 
67
90
 
@@ -72,10 +95,13 @@ def main(argv: list[str] | None = None) -> int:
72
95
  out_value = args.out or args.out_pos
73
96
 
74
97
  if not start_url or not out_value:
75
- raise SystemExit(
76
- "Usage: docs2epub <START_URL> <OUT.epub> --title ... --author ...\n"
77
- "(or use --start-url/--out flags)"
78
- )
98
+ raise SystemExit("Usage: docs2epub <START_URL> <OUT.epub> [options]")
99
+
100
+ inferred_title, inferred_author, inferred_language = _infer_defaults(start_url)
101
+
102
+ title = args.title or inferred_title
103
+ author = args.author or inferred_author
104
+ language = args.language or inferred_language
79
105
 
80
106
  options = DocusaurusNextOptions(
81
107
  start_url=start_url,
@@ -86,26 +112,27 @@ def main(argv: list[str] | None = None) -> int:
86
112
 
87
113
  chapters = iter_docusaurus_next(options)
88
114
  if not chapters:
89
- raise SystemExit("No chapters scraped (did not find article content).")
115
+ raise SystemExit("No pages scraped (did not find article content).")
90
116
 
91
- out_path: Path
92
117
  out_path_value = Path(out_value)
93
118
 
94
119
  if args.format == "epub2":
95
120
  out_path = build_epub2_with_pandoc(
96
121
  chapters=chapters,
97
122
  out_file=out_path_value,
98
- title=args.title,
99
- author=args.author,
100
- language=args.language,
123
+ title=title,
124
+ author=author,
125
+ language=language,
101
126
  publisher=args.publisher,
102
127
  identifier=args.identifier,
128
+ verbose=args.verbose,
129
+ options=PandocEpub2Options(keep_images=args.keep_images),
103
130
  )
104
131
  else:
105
132
  meta = EpubMetadata(
106
- title=args.title,
107
- author=args.author,
108
- language=args.language,
133
+ title=title,
134
+ author=author,
135
+ language=language,
109
136
  identifier=args.identifier,
110
137
  publisher=args.publisher,
111
138
  )
@@ -116,6 +143,7 @@ def main(argv: list[str] | None = None) -> int:
116
143
  meta=meta,
117
144
  )
118
145
 
146
+ size_mb = out_path.stat().st_size / (1024 * 1024)
119
147
  print(f"Scraped {len(chapters)} pages")
120
- print(f"EPUB written to: {out_path.resolve()}")
148
+ print(f"EPUB written to: {out_path.resolve()} ({size_mb:.2f} MB)")
121
149
  return 0
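
Put together, the revised `main()` amounts to the flow below. This is a sketch that assumes the keyword signatures visible in this diff (`iter_docusaurus_next`, `build_epub2_with_pandoc`, `PandocEpub2Options`), requires `pandoc` on PATH plus network access, and is not a documented public API:

```python
from pathlib import Path

from docs2epub.docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
from docs2epub.pandoc_epub2 import PandocEpub2Options, build_epub2_with_pandoc

start_url = "https://tutorial.docusaurus.io/docs/intro"  # example start page
chapters = iter_docusaurus_next(DocusaurusNextOptions(start_url=start_url, max_pages=5))

out = build_epub2_with_pandoc(
    chapters=chapters,
    out_file=Path("out.epub"),
    title="tutorial.docusaurus.io",   # same host-based defaults the CLI infers
    author="tutorial.docusaurus.io",
    language="en",
    publisher=None,
    identifier=None,
    verbose=False,
    options=PandocEpub2Options(keep_images=False),
)
print(f"{out} ({out.stat().st_size / (1024 * 1024):.2f} MB)")
```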
@@ -0,0 +1,404 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from urllib.parse import urljoin, urlparse
6
+
7
+ import requests
8
+ from bs4 import BeautifulSoup, Tag
9
+
10
+ from .model import Chapter
11
+
12
+
13
+ DEFAULT_USER_AGENT = "docs2epub/0.1 (+https://github.com/brenorb/docs2epub)"
14
+
15
+ _SIDEBAR_SELECTORS = [
16
+ 'aside[data-testid="table-of-contents"]',
17
+ "aside#table-of-contents",
18
+ 'nav[aria-label="Table of contents"]',
19
+ 'nav[aria-label="Table of Contents"]',
20
+ 'nav[aria-label="Docs sidebar"]',
21
+ 'nav[aria-label="Docs navigation"]',
22
+ 'nav[aria-label="Documentation"]',
23
+ 'nav[aria-label="Docs"]',
24
+ "aside.theme-doc-sidebar-container",
25
+ "div.theme-doc-sidebar-container",
26
+ "nav.theme-doc-sidebar-menu",
27
+ "nav.menu",
28
+ 'nav[class*="menu"]',
29
+ 'aside[class*="sidebar"]',
30
+ 'nav[class*="sidebar"]',
31
+ ]
32
+
33
+ _NON_DOC_EXTENSIONS = {
34
+ ".png",
35
+ ".jpg",
36
+ ".jpeg",
37
+ ".gif",
38
+ ".svg",
39
+ ".webp",
40
+ ".css",
41
+ ".js",
42
+ ".map",
43
+ ".json",
44
+ ".xml",
45
+ ".rss",
46
+ ".pdf",
47
+ ".zip",
48
+ ".tar",
49
+ ".gz",
50
+ ".tgz",
51
+ ".epub",
52
+ ".mp4",
53
+ ".mp3",
54
+ ".wav",
55
+ }
56
+
57
+
58
+ @dataclass(frozen=True)
59
+ class DocusaurusNextOptions:
60
+ start_url: str
61
+ base_url: str | None = None
62
+ max_pages: int | None = None
63
+ sleep_s: float = 0.5
64
+ user_agent: str = DEFAULT_USER_AGENT
65
+
66
+
67
+ def _slugify_filename(text: str) -> str:
68
+ value = text.strip().lower()
69
+ value = re.sub(r"[^\w\s-]", "", value)
70
+ value = re.sub(r"[\s_-]+", "-", value)
71
+ value = value.strip("-")
72
+ return value or "chapter"
73
+
74
+
75
+ def _extract_article(soup: BeautifulSoup) -> Tag:
76
+ article = soup.find("article")
77
+ if article:
78
+ return article
79
+ main = soup.find("main")
80
+ if main:
81
+ article = main.find("article")
82
+ if article:
83
+ return article
84
+ return main
85
+ role_main = soup.find(attrs={"role": "main"})
86
+ if role_main:
87
+ return role_main
88
+ raise RuntimeError("Could not find <article> in page HTML")
89
+
90
+
91
+ def _canonicalize_url(url: str) -> str:
92
+ parsed = urlparse(url)
93
+ path = parsed.path or "/"
94
+ if path != "/" and path.endswith("/"):
95
+ path = path.rstrip("/")
96
+ return parsed._replace(
97
+ scheme=parsed.scheme.lower(),
98
+ netloc=parsed.netloc.lower(),
99
+ path=path,
100
+ query="",
101
+ fragment="",
102
+ ).geturl()
103
+
104
+
105
+ def _infer_root_path(start_url: str) -> str:
106
+ parsed = urlparse(start_url)
107
+ path = (parsed.path or "").rstrip("/")
108
+ if not path:
109
+ return ""
110
+ parts = path.split("/")
111
+ if len(parts) <= 2:
112
+ return path
113
+ return "/".join(parts[:-1])
114
+
115
+
116
+ def _path_within_root(path: str, root_path: str) -> bool:
117
+ if not root_path or root_path == "/":
118
+ return True
119
+ if path == root_path:
120
+ return True
121
+ root = root_path if root_path.endswith("/") else f"{root_path}/"
122
+ return path.startswith(root)
123
+
124
+
125
+ def _is_probable_doc_link(url: str) -> bool:
126
+ parsed = urlparse(url)
127
+ path = (parsed.path or "").lower()
128
+ for ext in _NON_DOC_EXTENSIONS:
129
+ if path.endswith(ext):
130
+ return False
131
+ return True
132
+
133
+
134
+ def _sidebar_candidates(soup: BeautifulSoup) -> list[Tag]:
135
+ seen: set[int] = set()
136
+ candidates: list[Tag] = []
137
+
138
+ for selector in _SIDEBAR_SELECTORS:
139
+ for el in soup.select(selector):
140
+ key = id(el)
141
+ if key in seen:
142
+ continue
143
+ seen.add(key)
144
+ candidates.append(el)
145
+
146
+ keywords = ["sidebar", "toc", "table of contents", "table-of-contents", "docs", "documentation"]
147
+ for el in soup.find_all(["nav", "aside", "div"]):
148
+ key = id(el)
149
+ if key in seen:
150
+ continue
151
+ label = str(el.get("aria-label") or "").lower()
152
+ elem_id = str(el.get("id") or "").lower()
153
+ data_testid = str(el.get("data-testid") or "").lower()
154
+ classes = " ".join(el.get("class", [])).lower()
155
+ haystack = " ".join([label, elem_id, data_testid, classes])
156
+ if any(k in haystack for k in keywords):
157
+ seen.add(key)
158
+ candidates.append(el)
159
+
160
+ return candidates
161
+
162
+
163
+ def _looks_like_pager(container: Tag, links: list[Tag]) -> bool:
164
+ label = str(container.get("aria-label") or "").lower()
165
+ if "docs pages" in label or "breadcrumb" in label:
166
+ return True
167
+ if not links:
168
+ return True
169
+ texts = []
170
+ for a in links:
171
+ text = " ".join(a.get_text(" ", strip=True).split()).lower()
172
+ if text:
173
+ texts.append(text)
174
+ if not texts:
175
+ return False
176
+ pager_words = {"next", "previous", "prev", "back"}
177
+ return all(text in pager_words for text in texts)
178
+
179
+
180
+ def _extract_sidebar_urls(
181
+ soup: BeautifulSoup,
182
+ *,
183
+ base_url: str,
184
+ start_url: str,
185
+ ) -> list[str]:
186
+ candidates = _sidebar_candidates(soup)
187
+ if not candidates:
188
+ return []
189
+
190
+ origin = urlparse(start_url).netloc.lower()
191
+ root_path = _infer_root_path(start_url)
192
+ best: list[str] = []
193
+ for container in candidates:
194
+ anchors = list(container.find_all("a", href=True))
195
+ if _looks_like_pager(container, anchors):
196
+ continue
197
+
198
+ urls: list[str] = []
199
+ seen: set[str] = set()
200
+ for a in anchors:
201
+ href = str(a.get("href") or "").strip()
202
+ if not href or href.startswith("#"):
203
+ continue
204
+ if href.startswith(("mailto:", "tel:", "javascript:")):
205
+ continue
206
+ abs_url = urljoin(base_url, href)
207
+ parsed = urlparse(abs_url)
208
+ if parsed.scheme not in ("http", "https"):
209
+ continue
210
+ if origin and parsed.netloc.lower() != origin:
211
+ continue
212
+ if not _is_probable_doc_link(abs_url):
213
+ continue
214
+ if not _path_within_root(parsed.path or "", root_path):
215
+ continue
216
+ canonical = _canonicalize_url(abs_url)
217
+ if canonical in seen:
218
+ continue
219
+ seen.add(canonical)
220
+ urls.append(canonical)
221
+
222
+ if len(urls) > len(best):
223
+ best = urls
224
+
225
+ return best
226
+
227
+
228
+ def _extract_content_urls(
229
+ container: Tag,
230
+ *,
231
+ base_url: str,
232
+ start_url: str,
233
+ ) -> list[str]:
234
+ origin = urlparse(start_url).netloc.lower()
235
+ root_path = _infer_root_path(start_url)
236
+ urls: list[str] = []
237
+ seen: set[str] = set()
238
+
239
+ for a in container.find_all("a", href=True):
240
+ href = str(a.get("href") or "").strip()
241
+ if not href or href.startswith("#"):
242
+ continue
243
+ if href.startswith(("mailto:", "tel:", "javascript:")):
244
+ continue
245
+ abs_url = urljoin(base_url, href)
246
+ parsed = urlparse(abs_url)
247
+ if parsed.scheme not in ("http", "https"):
248
+ continue
249
+ if origin and parsed.netloc.lower() != origin:
250
+ continue
251
+ if not _is_probable_doc_link(abs_url):
252
+ continue
253
+ if not _path_within_root(parsed.path or "", root_path):
254
+ continue
255
+ canonical = _canonicalize_url(abs_url)
256
+ if canonical in seen:
257
+ continue
258
+ seen.add(canonical)
259
+ urls.append(canonical)
260
+
261
+ return urls
262
+
263
+
264
+ def _remove_unwanted(article: Tag) -> None:
265
+ for selector in [
266
+ 'nav[aria-label="Breadcrumbs"]',
267
+ 'nav[aria-label="Breadcrumb"]',
268
+ 'nav[aria-label="Docs pages"]',
269
+ "div.theme-doc-footer",
270
+ "div.theme-doc-footer-edit-meta-row",
271
+ "div.theme-doc-version-badge",
272
+ "script",
273
+ "style",
274
+ "noscript",
275
+ "iframe",
276
+ "svg",
277
+ "button",
278
+ ]:
279
+ for el in list(article.select(selector)):
280
+ el.decompose()
281
+
282
+
283
+ def _absolutize_urls(container: Tag, base_url: str) -> None:
284
+ for el in container.find_all(True):
285
+ if el.has_attr("href"):
286
+ href = str(el.get("href") or "")
287
+ if href.startswith("/"):
288
+ el["href"] = urljoin(base_url, href)
289
+ if el.has_attr("src"):
290
+ src = str(el.get("src") or "")
291
+ if src.startswith("/"):
292
+ el["src"] = urljoin(base_url, src)
293
+
294
+
295
+ def _extract_next_url(soup: BeautifulSoup, base_url: str) -> str | None:
296
+ nav = soup.select_one('nav[aria-label="Docs pages"]')
297
+ if not nav:
298
+ return None
299
+
300
+ for a in nav.find_all("a", href=True):
301
+ text = " ".join(a.get_text(" ", strip=True).split())
302
+ if text.lower().startswith("next"):
303
+ return urljoin(base_url, a["href"])
304
+
305
+ return None
306
+
307
+
308
+ def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
309
+ session = requests.Session()
310
+ session.headers.update({"User-Agent": options.user_agent})
311
+
312
+ url = options.start_url
313
+ base_url = options.base_url or options.start_url
314
+
315
+ visited: set[str] = set()
316
+ chapters: list[Chapter] = []
317
+
318
+ def fetch_soup(target_url: str) -> BeautifulSoup:
319
+ resp = session.get(target_url, timeout=30)
320
+ resp.raise_for_status()
321
+ return BeautifulSoup(resp.text, "lxml")
322
+
323
+ initial_soup = fetch_soup(url)
324
+ sidebar_urls = _extract_sidebar_urls(initial_soup, base_url=base_url, start_url=url)
325
+ initial_key = _canonicalize_url(url)
326
+
327
+ def consume_page(target_url: str, *, soup: BeautifulSoup | None = None) -> Tag | None:
328
+ if options.max_pages is not None and len(chapters) >= options.max_pages:
329
+ return None
330
+ key = _canonicalize_url(target_url)
331
+ if key in visited:
332
+ return None
333
+ visited.add(key)
334
+
335
+ page_soup = soup if soup is not None else fetch_soup(target_url)
336
+
337
+ article = _extract_article(page_soup)
338
+ title_el = article.find(["h1", "h2"])
339
+ title = (
340
+ " ".join(title_el.get_text(" ", strip=True).split())
341
+ if title_el
342
+ else f"Chapter {len(chapters) + 1}"
343
+ )
344
+
345
+ _remove_unwanted(article)
346
+ _absolutize_urls(article, base_url=target_url)
347
+
348
+ for a in list(article.select('a.hash-link[href^="#"]')):
349
+ a.decompose()
350
+
351
+ html = article.decode_contents()
352
+ chapters.append(Chapter(index=len(chapters) + 1, title=title, url=target_url, html=html))
353
+
354
+ if options.sleep_s > 0 and (options.max_pages is None or len(chapters) < options.max_pages):
355
+ import time
356
+
357
+ time.sleep(options.sleep_s)
358
+
359
+ return article
360
+
361
+ if sidebar_urls:
362
+ if initial_key not in {_canonicalize_url(u) for u in sidebar_urls}:
363
+ sidebar_urls.insert(0, url)
364
+ queue = list(sidebar_urls)
365
+ discovered = {_canonicalize_url(u) for u in queue}
366
+ idx = 0
367
+ while idx < len(queue):
368
+ if options.max_pages is not None and len(chapters) >= options.max_pages:
369
+ break
370
+ target_url = queue[idx]
371
+ use_soup = initial_soup if _canonicalize_url(target_url) == initial_key else None
372
+ article = consume_page(target_url, soup=use_soup)
373
+ if article is None:
374
+ idx += 1
375
+ continue
376
+ extra = _extract_content_urls(article, base_url=target_url, start_url=url)
377
+ for link in extra:
378
+ key = _canonicalize_url(link)
379
+ if key in discovered:
380
+ continue
381
+ discovered.add(key)
382
+ queue.append(link)
383
+ idx += 1
384
+ return chapters
385
+
386
+ # Fallback: follow next/previous navigation.
387
+ current_url = url
388
+ soup = initial_soup
389
+ while True:
390
+ if options.max_pages is not None and len(chapters) >= options.max_pages:
391
+ break
392
+
393
+ article = consume_page(current_url, soup=soup)
394
+ if article is None:
395
+ break
396
+
397
+ next_url = _extract_next_url(soup, base_url=base_url)
398
+ if not next_url:
399
+ break
400
+
401
+ current_url = next_url
402
+ soup = fetch_soup(current_url)
403
+
404
+ return chapters
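
Two of the private helpers above encode the less obvious rules: `_canonicalize_url` lower-cases scheme and host, drops query/fragment, and trims a trailing slash (path case is preserved), while `_infer_root_path` keeps a shallow start path as-is but otherwise scopes the crawl to the parent of the start page. Illustrative checks (these are underscore-prefixed helpers, so treat them as internals):

```python
from docs2epub.docusaurus_next import (
    _canonicalize_url,
    _infer_root_path,
    _path_within_root,
)

assert _canonicalize_url("https://Example.COM/Docs/intro/?x=1#setup") == "https://example.com/Docs/intro"
assert _infer_root_path("https://example.com/docs/intro") == "/docs"   # crawl stays under /docs
assert _infer_root_path("https://example.com/intro") == "/intro"       # shallow path kept as-is
assert _path_within_root("/docs/guides/setup", "/docs")
assert not _path_within_root("/blog/post", "/docs")
```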
@@ -5,37 +5,45 @@ import re
5
5
  from bs4 import BeautifulSoup
6
6
 
7
7
 
8
- def clean_html_for_kindle_epub2(html_fragment: str) -> str:
8
+ def clean_html_for_kindle_epub2(
9
+ html_fragment: str,
10
+ *,
11
+ keep_images: bool,
12
+ ) -> str:
9
13
  """Best-effort HTML cleanup for Kindle-friendly EPUB2.
10
14
 
11
15
  This is intentionally conservative: it strips known-problematic attributes
12
16
  and tags that commonly cause Send-to-Kindle conversion issues.
17
+
18
+ By default we drop remote images to avoid pandoc fetch failures.
13
19
  """
14
20
 
15
21
  soup = BeautifulSoup(html_fragment, "lxml")
16
22
 
23
+ if not keep_images:
24
+ for img in list(soup.find_all("img")):
25
+ src = str(img.get("src") or "")
26
+ if src.startswith("http://") or src.startswith("https://"):
27
+ img.decompose()
28
+
17
29
  # EPUB2: <u> tag isn't consistently supported; convert to a span.
18
30
  for u in list(soup.find_all("u")):
19
31
  span = soup.new_tag("span")
20
32
  span["style"] = "text-decoration: underline;"
21
- span.string = u.get_text() if u.string is None else u.string
22
33
  if u.string is None:
23
- # Keep children by moving them into the span.
24
34
  for child in list(u.contents):
25
35
  span.append(child)
36
+ else:
37
+ span.string = u.string
26
38
  u.replace_with(span)
27
39
 
28
40
  # Remove tabindex attributes (not allowed in EPUB2 XHTML).
29
41
  for el in soup.find_all(attrs={"tabindex": True}):
30
- try:
31
- del el["tabindex"]
32
- except KeyError:
33
- pass
42
+ el.attrs.pop("tabindex", None)
34
43
 
35
44
  # Remove start attribute from ordered lists (not allowed in EPUB2 XHTML).
36
45
  for ol in soup.find_all("ol"):
37
- if ol.has_attr("start"):
38
- del ol["start"]
46
+ ol.attrs.pop("start", None)
39
47
 
40
48
  # Strip duplicate ids in a simple way: if an id repeats, rename it.
41
49
  seen_ids: set[str] = set()
@@ -54,7 +62,6 @@ def clean_html_for_kindle_epub2(html_fragment: str) -> str:
54
62
  el["id"] = new_id
55
63
  seen_ids.add(new_id)
56
64
 
57
- # Remove empty fragment links that point to missing ids (best-effort).
58
65
  # If href="#something" but no element has id="something", drop href.
59
66
  all_ids = {str(el.get("id")) for el in soup.find_all(attrs={"id": True})}
60
67
  for a in soup.find_all("a", href=True):
@@ -62,9 +69,9 @@ def clean_html_for_kindle_epub2(html_fragment: str) -> str:
62
69
  if href.startswith("#") and len(href) > 1:
63
70
  frag = href[1:]
64
71
  if frag not in all_ids:
65
- del a["href"]
72
+ a.attrs.pop("href", None)
66
73
 
67
- # Normalize weird whitespace artifacts.
74
+ # Normalize whitespace a bit (helps keep diffs smaller and reduces odd output).
68
75
  text = str(soup)
69
76
  text = re.sub(r"\s+", " ", text)
70
77
  return text.strip()
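
With the signature change above, callers now choose the image policy explicitly. A quick check of the default-off behaviour (assertions only; the exact serialized markup also goes through the whitespace normalization at the end of the function):

```python
from docs2epub.kindle_html import clean_html_for_kindle_epub2

html = (
    '<p tabindex="0">intro</p>'
    '<img src="https://example.com/fig.png"/>'
    '<ol start="3"><li><u>item</u></li></ol>'
)
cleaned = clean_html_for_kindle_epub2(html, keep_images=False)

assert "img" not in cleaned                       # remote images dropped by default
assert "tabindex" not in cleaned                  # EPUB2-invalid attribute removed
assert "start=" not in cleaned                    # <ol start> stripped
assert "text-decoration: underline" in cleaned    # <u> rewritten as a styled <span>
```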
@@ -15,7 +15,8 @@ from .model import Chapter
15
15
  class PandocEpub2Options:
16
16
  toc: bool = True
17
17
  toc_depth: int = 2
18
- chapter_level: int = 1
18
+ split_level: int = 1
19
+ keep_images: bool = False
19
20
 
20
21
 
21
22
  def _wrap_html(title: str, body_html: str) -> str:
@@ -34,6 +35,26 @@ def _wrap_html(title: str, body_html: str) -> str:
34
35
  """
35
36
 
36
37
 
38
+ def _summarize_pandoc_warnings(stderr: str) -> str:
39
+ warnings = [line for line in stderr.splitlines() if line.startswith("[WARNING]")]
40
+ if not warnings:
41
+ return ""
42
+
43
+ resource = [w for w in warnings if "Could not fetch resource" in w]
44
+ duplicate = [w for w in warnings if "Duplicate identifier" in w]
45
+
46
+ parts: list[str] = []
47
+ parts.append(f"pandoc warnings: {len(warnings)} (use -v to see full output)")
48
+ if duplicate:
49
+ parts.append(f"- Duplicate identifier: {len(duplicate)} (usually safe; affects internal anchors)")
50
+ if resource:
51
+ parts.append(
52
+ f"- Missing resources: {len(resource)} (some images may be dropped; use --keep-images/-v to inspect)"
53
+ )
54
+
55
+ return "\n".join(parts)
56
+
57
+
37
58
  def build_epub2_with_pandoc(
38
59
  *,
39
60
  chapters: Iterable[Chapter],
@@ -43,6 +64,7 @@ def build_epub2_with_pandoc(
43
64
  language: str,
44
65
  publisher: str | None,
45
66
  identifier: str | None,
67
+ verbose: bool,
46
68
  options: PandocEpub2Options | None = None,
47
69
  ) -> Path:
48
70
  pandoc = shutil.which("pandoc")
@@ -61,7 +83,7 @@ def build_epub2_with_pandoc(
61
83
 
62
84
  html_files: list[Path] = []
63
85
  for ch in chapters:
64
- cleaned = clean_html_for_kindle_epub2(ch.html)
86
+ cleaned = clean_html_for_kindle_epub2(ch.html, keep_images=opts.keep_images)
65
87
  html_doc = _wrap_html(ch.title, cleaned)
66
88
  fp = tmp_path / f"chapter_{ch.index:04d}.html"
67
89
  fp.write_text(html_doc, encoding="utf-8")
@@ -81,14 +103,15 @@ def build_epub2_with_pandoc(
81
103
  "encoding=UTF-8",
82
104
  "--standalone",
83
105
  "--split-level",
84
- str(opts.chapter_level),
106
+ str(opts.split_level),
85
107
  ]
86
108
 
87
109
  if publisher:
88
110
  cmd.extend(["--metadata", f"publisher={publisher}"])
89
111
 
90
112
  if identifier:
91
- cmd.extend(["--epub-metadata", str(identifier)])
113
+ # Keep identifier stable for Kindle.
114
+ cmd.extend(["--metadata", f"identifier={identifier}"])
92
115
 
93
116
  if opts.toc:
94
117
  cmd.extend(["--toc", "--toc-depth", str(opts.toc_depth)])
@@ -96,6 +119,22 @@ def build_epub2_with_pandoc(
96
119
  cmd.extend(["-o", str(out_path)])
97
120
  cmd.extend([str(p) for p in html_files])
98
121
 
99
- subprocess.run(cmd, check=True)
122
+ proc = subprocess.run(
123
+ cmd,
124
+ stdout=subprocess.PIPE,
125
+ stderr=subprocess.PIPE,
126
+ text=True,
127
+ )
128
+
129
+ if proc.returncode != 0:
130
+ # On failure, always show stderr.
131
+ raise RuntimeError(f"pandoc failed (exit {proc.returncode}):\n{proc.stderr.strip()}")
132
+
133
+ if verbose and proc.stderr.strip():
134
+ print(proc.stderr.strip())
135
+ elif proc.stderr.strip():
136
+ summary = _summarize_pandoc_warnings(proc.stderr)
137
+ if summary:
138
+ print(summary)
100
139
 
101
140
  return out_path
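
The summary helper above only looks at stderr lines that pandoc prefixes with `[WARNING]` and buckets the two cases the CLI cares about. An illustration with made-up stderr text (the printed output follows the format strings in this diff):

```python
from docs2epub.pandoc_epub2 import _summarize_pandoc_warnings

stderr = "\n".join(
    [
        "[WARNING] Duplicate identifier 'intro' in chapter_0002.html",
        "[WARNING] Could not fetch resource https://example.com/fig.png",
        "other pandoc chatter that is not a warning",
    ]
)
print(_summarize_pandoc_warnings(stderr))
# pandoc warnings: 2 (use -v to see full output)
# - Duplicate identifier: 1 (usually safe; affects internal anchors)
# - Missing resources: 1 (some images may be dropped; use --keep-images/-v to inspect)
```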
@@ -0,0 +1,42 @@
1
+ from docs2epub.docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
2
+
3
+
4
+ def test_iter_docusaurus_next_falls_back_to_main_when_no_article(monkeypatch):
5
+ html = """
6
+ <!doctype html>
7
+ <html>
8
+ <body>
9
+ <main>
10
+ <div>
11
+ <h1>Overview</h1>
12
+ <p>Hello world</p>
13
+ </div>
14
+ </main>
15
+ </body>
16
+ </html>
17
+ """
18
+
19
+ class DummyResponse:
20
+ text = html
21
+
22
+ def raise_for_status(self) -> None:
23
+ return None
24
+
25
+ class DummySession:
26
+ def __init__(self) -> None:
27
+ self.headers = {}
28
+
29
+ def get(self, url: str, timeout: int = 30) -> DummyResponse:
30
+ return DummyResponse()
31
+
32
+ monkeypatch.setattr(
33
+ "docs2epub.docusaurus_next.requests.Session",
34
+ lambda: DummySession(),
35
+ )
36
+
37
+ options = DocusaurusNextOptions(start_url="https://example.com/docs", sleep_s=0)
38
+ chapters = iter_docusaurus_next(options)
39
+
40
+ assert len(chapters) == 1
41
+ assert chapters[0].title == "Overview"
42
+ assert "Hello world" in chapters[0].html
@@ -0,0 +1,102 @@
1
+ from docs2epub.docusaurus_next import DocusaurusNextOptions, iter_docusaurus_next
2
+
3
+
4
+ def _make_session(pages: dict[str, str]):
5
+ class DummyResponse:
6
+ def __init__(self, text: str) -> None:
7
+ self.text = text
8
+
9
+ def raise_for_status(self) -> None:
10
+ return None
11
+
12
+ class DummySession:
13
+ def __init__(self) -> None:
14
+ self.headers = {}
15
+
16
+ def get(self, url: str, timeout: int = 30) -> DummyResponse:
17
+ if url not in pages:
18
+ raise AssertionError(f"unexpected url fetch: {url}")
19
+ return DummyResponse(pages[url])
20
+
21
+ return DummySession
22
+
23
+
24
+ def test_iter_uses_gitbook_sidebar_links(monkeypatch):
25
+ start_url = "https://example.com/book/intro"
26
+ sidebar = """
27
+ <aside data-testid="table-of-contents">
28
+ <a href="/book/intro">Intro</a>
29
+ <a href="/book/chapter-1">Chapter 1</a>
30
+ </aside>
31
+ """
32
+ pages = {
33
+ start_url: f"<html><body>{sidebar}<main><h1>Intro</h1><p>Intro text</p></main></body></html>",
34
+ "https://example.com/book/chapter-1": f"<html><body>{sidebar}<main><h1>Chapter 1</h1><p>Ch1</p></main></body></html>",
35
+ }
36
+
37
+ monkeypatch.setattr(
38
+ "docs2epub.docusaurus_next.requests.Session",
39
+ lambda: _make_session(pages)(),
40
+ )
41
+
42
+ options = DocusaurusNextOptions(start_url=start_url, sleep_s=0)
43
+ chapters = iter_docusaurus_next(options)
44
+
45
+ assert [c.title for c in chapters] == ["Intro", "Chapter 1"]
46
+
47
+
48
+ def test_iter_uses_docusaurus_menu_sidebar(monkeypatch):
49
+ start_url = "https://example.com/docs/intro"
50
+ sidebar = """
51
+ <nav class="menu">
52
+ <a class="menu__link" href="/docs/intro">Intro</a>
53
+ <a class="menu__link" href="/docs/install">Install</a>
54
+ </nav>
55
+ """
56
+ pages = {
57
+ start_url: f"<html><body>{sidebar}<article><h1>Intro</h1><p>Intro text</p></article></body></html>",
58
+ "https://example.com/docs/install": f"<html><body>{sidebar}<article><h1>Install</h1><p>Install text</p></article></body></html>",
59
+ }
60
+
61
+ monkeypatch.setattr(
62
+ "docs2epub.docusaurus_next.requests.Session",
63
+ lambda: _make_session(pages)(),
64
+ )
65
+
66
+ options = DocusaurusNextOptions(start_url=start_url, sleep_s=0)
67
+ chapters = iter_docusaurus_next(options)
68
+
69
+ assert [c.title for c in chapters] == ["Intro", "Install"]
70
+
71
+
72
+ def test_iter_expands_links_from_index_pages(monkeypatch):
73
+ start_url = "https://example.com/docs/intro"
74
+ sidebar = """
75
+ <nav class="menu">
76
+ <a class="menu__link" href="/docs/intro">Intro</a>
77
+ <a class="menu__link" href="/docs/category/getting-started">Getting Started</a>
78
+ </nav>
79
+ """
80
+ pages = {
81
+ start_url: f"<html><body>{sidebar}<article><h1>Intro</h1><p>Intro text</p></article></body></html>",
82
+ "https://example.com/docs/category/getting-started": (
83
+ "<html><body>"
84
+ f"{sidebar}"
85
+ '<article><h1>Getting Started</h1>'
86
+ '<a href="/docs/one">One</a>'
87
+ '<a href="/docs/two">Two</a>'
88
+ "</article></body></html>"
89
+ ),
90
+ "https://example.com/docs/one": f"<html><body>{sidebar}<article><h1>One</h1><p>One text</p></article></body></html>",
91
+ "https://example.com/docs/two": f"<html><body>{sidebar}<article><h1>Two</h1><p>Two text</p></article></body></html>",
92
+ }
93
+
94
+ monkeypatch.setattr(
95
+ "docs2epub.docusaurus_next.requests.Session",
96
+ lambda: _make_session(pages)(),
97
+ )
98
+
99
+ options = DocusaurusNextOptions(start_url=start_url, sleep_s=0)
100
+ chapters = iter_docusaurus_next(options)
101
+
102
+ assert [c.title for c in chapters] == ["Intro", "Getting Started", "One", "Two"]
@@ -16,8 +16,17 @@ def test_build_epub3_smoke(tmp_path):
16
16
 
17
17
  def test_kindle_cleaner_strips_tabindex_and_ol_start():
18
18
  cleaned = clean_html_for_kindle_epub2(
19
- '<div tabindex="0"><ol start="2"><li><u>Hi</u></li></ol></div>'
19
+ '<div tabindex="0"><ol start="2"><li><u>Hi</u></li></ol></div>',
20
+ keep_images=False,
20
21
  )
21
22
  assert "tabindex" not in cleaned
22
23
  assert "start=" not in cleaned
23
24
  assert "underline" in cleaned
25
+
26
+
27
+ def test_kindle_cleaner_drops_remote_images_by_default():
28
+ cleaned = clean_html_for_kindle_epub2(
29
+ '<p>x</p><img src="https://example.com/a.png" /><p>y</p>',
30
+ keep_images=False,
31
+ )
32
+ assert "img" not in cleaned
@@ -166,7 +166,7 @@ wheels = [
166
166
 
167
167
  [[package]]
168
168
  name = "docs2epub"
169
- version = "0.1.0"
169
+ version = "0.1.5"
170
170
  source = { editable = "." }
171
171
  dependencies = [
172
172
  { name = "beautifulsoup4" },
@@ -1,140 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- from dataclasses import dataclass
5
- from urllib.parse import urljoin
6
-
7
- import requests
8
- from bs4 import BeautifulSoup, Tag
9
-
10
- from .model import Chapter
11
-
12
-
13
- DEFAULT_USER_AGENT = "docs2epub/0.1 (+https://github.com/brenorb/docs2epub)"
14
-
15
-
16
- @dataclass(frozen=True)
17
- class DocusaurusNextOptions:
18
- start_url: str
19
- base_url: str | None = None
20
- max_pages: int | None = None
21
- sleep_s: float = 0.5
22
- user_agent: str = DEFAULT_USER_AGENT
23
-
24
-
25
- def _slugify_filename(text: str) -> str:
26
- value = text.strip().lower()
27
- value = re.sub(r"[^\w\s-]", "", value)
28
- value = re.sub(r"[\s_-]+", "-", value)
29
- value = value.strip("-")
30
- return value or "chapter"
31
-
32
-
33
- def _extract_article(soup: BeautifulSoup) -> Tag:
34
- article = soup.find("article")
35
- if article:
36
- return article
37
- main = soup.find("main")
38
- if main:
39
- article = main.find("article")
40
- if article:
41
- return article
42
- raise RuntimeError("Could not find <article> in page HTML")
43
-
44
-
45
- def _remove_unwanted(article: Tag) -> None:
46
- for selector in [
47
- 'nav[aria-label="Breadcrumbs"]',
48
- 'nav[aria-label="Docs pages"]',
49
- "div.theme-doc-footer",
50
- "div.theme-doc-footer-edit-meta-row",
51
- "div.theme-doc-version-badge",
52
- "script",
53
- "style",
54
- "noscript",
55
- "iframe",
56
- "svg",
57
- "button",
58
- ]:
59
- for el in list(article.select(selector)):
60
- el.decompose()
61
-
62
-
63
- def _absolutize_urls(container: Tag, base_url: str) -> None:
64
- for el in container.find_all(True):
65
- if el.has_attr("href"):
66
- href = str(el.get("href") or "")
67
- if href.startswith("/"):
68
- el["href"] = urljoin(base_url, href)
69
- if el.has_attr("src"):
70
- src = str(el.get("src") or "")
71
- if src.startswith("/"):
72
- el["src"] = urljoin(base_url, src)
73
-
74
-
75
- def _extract_next_url(soup: BeautifulSoup, base_url: str) -> str | None:
76
- nav = soup.select_one('nav[aria-label="Docs pages"]')
77
- if not nav:
78
- return None
79
-
80
- for a in nav.find_all("a", href=True):
81
- text = " ".join(a.get_text(" ", strip=True).split())
82
- if text.lower().startswith("next"):
83
- return urljoin(base_url, a["href"])
84
-
85
- return None
86
-
87
-
88
- def iter_docusaurus_next(options: DocusaurusNextOptions) -> list[Chapter]:
89
- session = requests.Session()
90
- session.headers.update({"User-Agent": options.user_agent})
91
-
92
- url = options.start_url
93
- base_url = options.base_url or options.start_url
94
-
95
- visited: set[str] = set()
96
- chapters: list[Chapter] = []
97
-
98
- idx = 1
99
- while True:
100
- if options.max_pages is not None and idx > options.max_pages:
101
- break
102
-
103
- if url in visited:
104
- break
105
- visited.add(url)
106
-
107
- resp = session.get(url, timeout=30)
108
- resp.raise_for_status()
109
-
110
- soup = BeautifulSoup(resp.text, "lxml")
111
- article = _extract_article(soup)
112
-
113
- title_el = article.find(["h1", "h2"])
114
- title = (
115
- " ".join(title_el.get_text(" ", strip=True).split()) if title_el else f"Chapter {idx}"
116
- )
117
-
118
- _remove_unwanted(article)
119
- _absolutize_urls(article, base_url=base_url)
120
-
121
- for a in list(article.select('a.hash-link[href^="#"]')):
122
- a.decompose()
123
-
124
- html = article.decode_contents()
125
-
126
- chapters.append(Chapter(index=idx, title=title, url=url, html=html))
127
-
128
- next_url = _extract_next_url(soup, base_url=base_url)
129
- if not next_url:
130
- break
131
-
132
- url = next_url
133
- idx += 1
134
-
135
- if options.sleep_s > 0:
136
- import time
137
-
138
- time.sleep(options.sleep_s)
139
-
140
- return chapters