PyPI - epub2pdf-cli - Versions diffs - 0.3.0__py3-none-any.whl - Mend

epub2pdf-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

epub2pdf_cli/__init__.py +5 -0
epub2pdf_cli/__main__.py +4 -0
epub2pdf_cli/api.py +160 -0
epub2pdf_cli/cli.py +223 -0
epub2pdf_cli/config.py +109 -0
epub2pdf_cli/epub/__init__.py +3 -0
epub2pdf_cli/epub/chapters.py +81 -0
epub2pdf_cli/epub/container.py +25 -0
epub2pdf_cli/epub/href.py +24 -0
epub2pdf_cli/epub/opf.py +159 -0
epub2pdf_cli/epub/parser.py +64 -0
epub2pdf_cli/epub/toc.py +101 -0
epub2pdf_cli/errors.py +27 -0
epub2pdf_cli/html/__init__.py +3 -0
epub2pdf_cli/html/builder.py +190 -0
epub2pdf_cli/html/css.py +49 -0
epub2pdf_cli/html/links.py +144 -0
epub2pdf_cli/html/template.py +92 -0
epub2pdf_cli/io_utils.py +24 -0
epub2pdf_cli/markdown.py +97 -0
epub2pdf_cli/mcp_server.py +189 -0
epub2pdf_cli/models.py +116 -0
epub2pdf_cli/pdf/__init__.py +5 -0
epub2pdf_cli/pdf/extract.py +79 -0
epub2pdf_cli/pdf/extractors/__init__.py +0 -0
epub2pdf_cli/pdf/extractors/base.py +23 -0
epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
epub2pdf_cli/pdf/text.py +45 -0
epub2pdf_cli/pdf/validate.py +37 -0
epub2pdf_cli/pipeline/__init__.py +6 -0
epub2pdf_cli/pipeline/batch.py +84 -0
epub2pdf_cli/pipeline/convert.py +122 -0
epub2pdf_cli/pipeline/extract.py +64 -0
epub2pdf_cli/pipeline/inspect.py +15 -0
epub2pdf_cli/render/__init__.py +17 -0
epub2pdf_cli/render/options.py +19 -0
epub2pdf_cli/render/playwright.py +91 -0
epub2pdf_cli/render/protocol.py +13 -0
epub2pdf_cli/render/weasyprint.py +28 -0
epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0

epub2pdf_cli/html/links.py ADDED Viewed

@@ -0,0 +1,144 @@
+from __future__ import annotations
+import base64
+import posixpath
+from html import escape
+from typing import Any
+from urllib.parse import urlparse
+from epub2pdf_cli.epub.href import split_href
+from epub2pdf_cli.models import Chapter, EpubBook
+# Value prefixes that _rewrite_attr returns unchanged without any resolution.
+DATA_SCHEMES = ("http://", "https://", "mailto:", "data:")
+# Attribute names that rewrite_resources inspects on every tag.
+LINK_ATTRS = ("src", "href", "poster", "xlink:href")
+def rewrite_resources(
+    body: Any,
+    current_href: str,
+    chapter_lookup: dict[str, Chapter],
+    chapter_section_ids: dict[str, str],
+    element_id_map: dict[tuple[str, str], str],
+    book: EpubBook,
+    assets: dict[str, dict[str, Any]],
+    warnings: list[str],
+) -> None:
+    for tag in body.find_all(True):
+        for attr in LINK_ATTRS:
+            value = tag.get(attr)
+            if not value:
+                continue
+            rewritten = _rewrite_attr(
+                attr,
+                value,
+                current_href=current_href,
+                chapter_lookup=chapter_lookup,
+                chapter_section_ids=chapter_section_ids,
+                element_id_map=element_id_map,
+                book=book,
+                assets=assets,
+                warnings=warnings,
+            )
+            if rewritten is None:
+                tag.attrs.pop(attr, None)
+            else:
+                tag[attr] = rewritten
+def _rewrite_attr(
+    attr: str,
+    value: str,
+    *,
+    current_href: str,
+    chapter_lookup: dict[str, Chapter],
+    chapter_section_ids: dict[str, str],
+    element_id_map: dict[tuple[str, str], str],
+    book: EpubBook,
+    assets: dict[str, dict[str, Any]],
+    warnings: list[str],
+) -> str | None:
+    if any(value.startswith(prefix) for prefix in DATA_SCHEMES):
+        return value
+    parsed = urlparse(value)
+    if parsed.scheme and parsed.scheme not in {"file"}:
+        return value
+    target_path, fragment = split_href(value)
+    resolved_path = (
+        posixpath.normpath(posixpath.join(posixpath.dirname(current_href), target_path))
+        if target_path
+        else current_href
+    )
+    if attr == "href":
+        if resolved_path in chapter_lookup:
+            if fragment:
+                target_id = element_id_map.get((resolved_path, fragment))
+                if target_id:
+                    return f"#{target_id}"
+            return f"#{chapter_section_ids[resolved_path]}"
+        manifest_item = book.manifest_by_href.get(resolved_path)
+        if manifest_item and manifest_item.media_type.startswith("image/"):
+            _record_asset(assets, resolved_path, manifest_item, "linked-image")
+            return _data_uri(manifest_item.content, manifest_item.media_type)
+        return value
+    manifest_item = book.manifest_by_href.get(resolved_path)
+    if manifest_item and manifest_item.content:
+        _record_asset(assets, resolved_path, manifest_item, attr)
+        return _data_uri(manifest_item.content, manifest_item.media_type)
+    warnings.append(f"Missing asset during normalization: {resolved_path}")
+    return None
+def _record_asset(
+    assets: dict[str, dict[str, Any]],
+    resolved_path: str,
+    manifest_item: Any,
+    usage: str,
+) -> None:
+    assets[resolved_path] = {
+        "href": resolved_path,
+        "media_type": manifest_item.media_type,
+        "rewritten_as": "data-uri",
+        "usage": usage,
+    }
+def _data_uri(content: bytes, media_type: str) -> str:
+    encoded = base64.b64encode(content).decode("ascii")
+    return f"data:{media_type};base64,{encoded}"
+def map_toc_href(
+    href: str,
+    chapter_section_ids: dict[str, str],
+    element_id_map: dict[tuple[str, str], str],
+) -> str:
+    if not href or "://" in href or href.startswith("mailto:"):
+        return href
+    path, fragment = split_href(href)
+    if path in chapter_section_ids:
+        if fragment:
+            mapped = element_id_map.get((path, fragment))
+            if mapped:
+                return f"#{mapped}"
+        return f"#{chapter_section_ids[path]}"
+    return href
+def render_toc_items(
+    entries: list[Any],
+    chapter_section_ids: dict[str, str],
+    element_id_map: dict[tuple[str, str], str],
+) -> str:
+    rendered: list[str] = []
+    for entry in entries:
+        href = map_toc_href(entry.href, chapter_section_ids, element_id_map)
+        label = escape(entry.title or entry.href)
+        children = render_toc_items(entry.children, chapter_section_ids, element_id_map)
+        child_html = f"<ol>{children}</ol>" if children else ""
+        link = f'<a href="{escape(href)}">{label}</a>' if href else label
+        rendered.append(f"<li>{link}{child_html}</li>")
+    return "".join(rendered)

epub2pdf_cli/html/template.py ADDED Viewed

@@ -0,0 +1,92 @@
+from __future__ import annotations
+from html import escape
+from epub2pdf_cli.config import PageSize
+def base_css(page_size: PageSize, margin_mm: int) -> str:
+    margin = max(margin_mm, 0)
+    return f"""
+@page {{
+  size: {page_size};
+  margin: {margin}mm;
+}}
+html {{
+  font-size: 11pt;
+  line-height: 1.6;
+  color: #111;
+}}
+body {{
+  margin: 0;
+  font-family: serif;
+  print-color-adjust: exact;
+  -webkit-print-color-adjust: exact;
+}}
+h1, h2, h3, h4, h5, h6 {{
+  break-after: avoid;
+  break-inside: avoid;
+}}
+img, svg {{
+  max-width: 100%;
+  height: auto;
+  break-inside: avoid;
+}}
+a {{
+  color: #0b57d0;
+  text-decoration: none;
+}}
+.page-break {{
+  break-before: page;
+}}
+.page-break:first-child {{
+  break-before: auto;
+}}
+.epub-cover {{
+  min-height: 90vh;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+}}
+.epub-cover img {{
+  max-height: 90vh;
+  object-fit: contain;
+}}
+.generated-toc ol {{
+  padding-left: 1.25rem;
+}}
+.chapter-title {{
+  margin-top: 0;
+}}
+"""
+def wrap_document(
+    *,
+    title: str,
+    language: str,
+    author: str,
+    stylesheets: list[str],
+    body_sections: list[str],
+) -> str:
+    lang = language or "en"
+    head_bits = [
+        '<meta charset="utf-8" />',
+        f"<title>{escape(title)}</title>",
+        f'<meta name="author" content="{escape(author)}" />' if author else "",
+    ]
+    head_bits.extend(f"<style>{css}</style>" for css in stylesheets)
+    return "\n".join(
+        [
+            "<!DOCTYPE html>",
+            f'<html lang="{escape(lang)}">',
+            "<head>",
+            *[part for part in head_bits if part],
+            "</head>",
+            "<body>",
+            *body_sections,
+            "</body>",
+            "</html>",
+        ]
+    )

epub2pdf_cli/io_utils.py ADDED Viewed

@@ -0,0 +1,24 @@
+from __future__ import annotations
+import hashlib
+import json
+from pathlib import Path
+from typing import Any
+def sha256(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(65536), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+def write_text(path: Path, content: str) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(content, encoding="utf-8")
+def write_json(path: Path, payload: Any) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

epub2pdf_cli/markdown.py ADDED Viewed

@@ -0,0 +1,97 @@
+from __future__ import annotations
+from typing import Any
+from bs4 import BeautifulSoup, NavigableString
+from epub2pdf_cli.models import EpubBook
+def build_markdown(book: EpubBook) -> str:
+    parts: list[str] = []
+    title = book.metadata.get("title", "")
+    creators = book.metadata.get("creators", [])
+    if title:
+        parts.append(f"# {title}")
+        parts.append("")
+    if creators:
+        parts.append(f"*{'*, *'.join(creators)}*")
+        parts.append("")
+    if book.toc:
+        parts.append("## Table of Contents")
+        parts.append("")
+        parts.extend(_render_toc_entries(book.toc))
+        parts.append("")
+    for index, chapter in enumerate(book.chapters, start=1):
+        if not chapter.linear:
+            continue
+        parts.append(f"## {chapter.title or f'Chapter {index}'}")
+        parts.append("")
+        parts.append(_html_to_markdown(chapter.html))
+        parts.append("")
+    return "\n".join(parts).strip() + "\n"
+def _render_toc_entries(entries: list[Any], level: int = 0) -> list[str]:
+    lines: list[str] = []
+    for entry in entries:
+        prefix = "  " * level + "- "
+        lines.append(f"{prefix}[{entry.title}]({entry.href})")
+        if entry.children:
+            lines.extend(_render_toc_entries(entry.children, level + 1))
+    return lines
+def _html_to_markdown(html: str) -> str:
+    soup = BeautifulSoup(html, "lxml")
+    body = soup.body
+    if body is None:
+        body = soup
+    return _convert_node(body).strip()
+def _convert_node(node: Any) -> str:
+    if isinstance(node, NavigableString):
+        return str(node)
+    name = node.name
+    if name is None:
+        return str(node)
+    inner = "".join(_convert_node(child) for child in node.contents)
+    inner = inner.strip()
+    handlers = {
+        "h1": lambda t: f"# {t}\n\n",
+        "h2": lambda t: f"## {t}\n\n",
+        "h3": lambda t: f"### {t}\n\n",
+        "h4": lambda t: f"#### {t}\n\n",
+        "h5": lambda t: f"##### {t}\n\n",
+        "h6": lambda t: f"###### {t}\n\n",
+        "p": lambda t: f"{t}\n\n" if t else "",
+        "br": lambda t: "\n",
+        "strong": lambda t: f"**{t}**",
+        "b": lambda t: f"**{t}**",
+        "em": lambda t: f"*{t}*",
+        "i": lambda t: f"*{t}*",
+        "code": lambda t: f"`{t}`",
+        "a": lambda t: f"[{t}]({node.get('href', '')})" if node.get("href") else t,
+        "img": lambda t: f"![{node.get('alt', '')}]({node.get('src', '')})",
+        "li": lambda t: f"- {t}\n",
+        "blockquote": lambda t: f"> {t.replace(chr(10), chr(10)+'> ')}\n\n",
+        "pre": lambda t: f"```\n{t}\n```\n\n",
+    }
+    if name in ("ol", "ul"):
+        return inner + "\n"
+    if name in handlers:
+        return handlers[name](inner)
+    # Inline elements we don't explicitly handle: span, div, section, etc.
+    if name in ("span", "div", "section", "article", "header", "footer", "nav"):
+        return inner + "\n\n" if inner else ""
+    return inner

epub2pdf_cli/mcp_server.py ADDED Viewed

@@ -0,0 +1,189 @@
+"""Lightweight MCP server for epub2pdf.
+This server exposes epub2pdf tools to MCP clients (e.g., Claude Desktop) using
+the default low-resource settings: WeasyPrint renderer, no PDF validation, and
+no long-lived browser process. Each tool spawns the CLI in a subprocess so the
+server itself stays small and releases resources after every call.
+"""
+from __future__ import annotations
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any
+# Import the MCP SDK eagerly so a missing install fails at import time with an
+# install hint instead of an error at first use.
+try:
+    from mcp.server.fastmcp import FastMCP
+except Exception as exc:
+    raise RuntimeError(
+        "The MCP Python SDK is not installed. Install with `python3 -m pip install -e '.[mcp]'`."
+    ) from exc
+# Server instance on which the tool functions below are registered via @mcp.tool().
+mcp = FastMCP("epub2pdf")
+def _run_cli(*args: str) -> dict[str, Any]:
+    """Run the epub2pdf CLI and return a structured result."""
+    env = dict(os.environ)
+    env.setdefault("PYTHONPATH", str(Path(__file__).resolve().parents[2]))
+    env.setdefault("PYTHONWARNINGS", "ignore")
+    result = subprocess.run(
+        [sys.executable, "-m", "epub2pdf_cli", *args],
+        env=env,
+        text=True,
+        capture_output=True,
+    )
+    return {
+        "success": result.returncode == 0,
+        "returncode": result.returncode,
+        "stdout": result.stdout.strip(),
+        "stderr": result.stderr.strip(),
+    }
+@mcp.tool()
+def convert_epub(
+    input_path: str,
+    output_path: str,
+    *,
+    engine: str = "weasyprint",
+    no_validate: bool = True,
+    sidecar_json: bool = False,
+    sidecar_html: bool = False,
+    sidecar_markdown: bool = False,
+    page_size: str = "A4",
+    margin_mm: int = 12,
+    cover: str = "first",
+    force: bool = False,
+) -> dict[str, Any]:
+    """Convert a single EPUB file to PDF.
+    Defaults to the lightweight WeasyPrint backend and skips PDF validation to
+    keep resource usage low. Use engine="playwright" only when Chromium output
+    is explicitly required.
+    """
+    args: list[str] = [
+        "convert",
+        input_path,
+        "--engine",
+        engine,
+        "--output",
+        output_path,
+        "--page-size",
+        page_size,
+        "--margin-mm",
+        str(margin_mm),
+        "--cover",
+        cover,
+    ]
+    if no_validate:
+        args.append("--no-validate")
+    if sidecar_json:
+        args.extend(["--sidecar-json", str(Path(output_path).with_suffix(".json"))])
+    if sidecar_html:
+        args.extend(["--sidecar-html", str(Path(output_path).with_suffix(".html"))])
+    if sidecar_markdown:
+        args.extend(["--sidecar-markdown", str(Path(output_path).with_suffix(".md"))])
+    if force:
+        args.append("--force")
+    return _run_cli(*args)
+@mcp.tool()
+def batch_convert(
+    input_paths: list[str],
+    output_dir: str,
+    *,
+    workers: int = 1,
+    engine: str = "weasyprint",
+    no_validate: bool = True,
+    sidecar_json: bool = False,
+    sidecar_html: bool = False,
+    sidecar_markdown: bool = False,
+    page_size: str = "A4",
+    margin_mm: int = 12,
+    cover: str = "first",
+    force: bool = False,
+) -> dict[str, Any]:
+    """Convert multiple EPUBs in parallel using low-resource defaults."""
+    args: list[str] = [
+        "batch",
+        *input_paths,
+        "--output-dir",
+        output_dir,
+        "--engine",
+        engine,
+        "--workers",
+        str(workers),
+        "--page-size",
+        page_size,
+        "--margin-mm",
+        str(margin_mm),
+        "--cover",
+        cover,
+    ]
+    if no_validate:
+        args.append("--no-validate")
+    if sidecar_json:
+        args.append("--sidecar-json")
+    if sidecar_html:
+        args.append("--sidecar-html")
+    if sidecar_markdown:
+        args.append("--sidecar-markdown")
+    if force:
+        args.append("--force")
+    return _run_cli(*args)
+@mcp.tool()
+def inspect_epub(
+    input_path: str,
+    *,
+    json_path: str | None = None,
+) -> dict[str, Any]:
+    """Inspect EPUB metadata, manifest, spine, and TOC."""
+    args = ["inspect", input_path]
+    if json_path:
+        args.extend(["--json", json_path])
+    return _run_cli(*args)
+@mcp.tool()
+def extract_pdf(
+    input_path: str,
+    output_dir: str,
+    *,
+    formats: str = "markdown,json",
+    engine: str = "pypdfium2",
+    pages: str | None = None,
+    sidecar_json: bool = False,
+    force: bool = False,
+) -> dict[str, Any]:
+    """Extract Markdown, JSON, text, or HTML from an existing PDF."""
+    args: list[str] = [
+        "pdf-extract",
+        input_path,
+        "--output-dir",
+        output_dir,
+        "--format",
+        formats,
+        "--engine",
+        engine,
+    ]
+    if pages:
+        args.extend(["--pages", pages])
+    if sidecar_json:
+        args.extend(["--sidecar-json", str(Path(output_dir) / f"{Path(input_path).stem}.json")])
+    if force:
+        args.append("--force")
+    return _run_cli(*args)
+def main() -> None:
+    """Start the MCP server using the stdio transport."""
+    mcp.run(transport="stdio")
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    main()

epub2pdf_cli/models.py ADDED Viewed

@@ -0,0 +1,116 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+@dataclass(frozen=True, slots=True)
+class ManifestItem:
+    id: str
+    href: str
+    media_type: str
+    properties: tuple[str, ...] = ()
+    fallback: str | None = None
+    content: bytes = b""
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "id": self.id,
+            "href": self.href,
+            "media_type": self.media_type,
+            "properties": list(self.properties),
+            "fallback": self.fallback,
+            "size_bytes": len(self.content),
+        }
+@dataclass(frozen=True, slots=True)
+class SpineItem:
+    idref: str
+    href: str
+    media_type: str
+    linear: bool = True
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "idref": self.idref,
+            "href": self.href,
+            "media_type": self.media_type,
+            "linear": self.linear,
+        }
+@dataclass(frozen=True, slots=True)
+class TocEntry:
+    title: str
+    href: str
+    children: list[TocEntry] = field(default_factory=list)
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "title": self.title,
+            "href": self.href,
+            "children": [child.to_dict() for child in self.children],
+        }
+@dataclass(frozen=True, slots=True)
+class Chapter:
+    idref: str
+    href: str
+    media_type: str
+    title: str
+    html: str
+    text: str
+    linear: bool = True
+    def to_dict(self) -> dict[str, Any]:
+        text = self.text.strip()
+        return {
+            "idref": self.idref,
+            "href": self.href,
+            "media_type": self.media_type,
+            "title": self.title,
+            "linear": self.linear,
+            "text_length": len(text),
+            "word_count": len(text.split()),
+            "has_text": bool(text),
+        }
+@dataclass(frozen=True, slots=True)
+class CoverAsset:
+    """Immutable record holding a cover resource: its href, media type and raw bytes."""
+    href: str
+    media_type: str
+    content: bytes
+@dataclass(frozen=True, slots=True)
+class EpubBook:
+    source_path: str
+    rootfile_path: str
+    metadata: dict[str, Any]
+    manifest: dict[str, ManifestItem]
+    spine: list[SpineItem]
+    chapters: list[Chapter]
+    toc: list[TocEntry]
+    warnings: list[str] = field(default_factory=list)
+    cover: CoverAsset | None = None
+    @property
+    def manifest_by_href(self) -> dict[str, ManifestItem]:
+        return {item.href: item for item in self.manifest.values()}
+    def to_inspection_dict(self) -> dict[str, Any]:
+        return {
+            "source": {
+                "path": self.source_path,
+                "rootfile": self.rootfile_path,
+            },
+            "metadata": self.metadata,
+            "manifest": [item.to_dict() for item in self.manifest.values()],
+            "spine": [item.to_dict() for item in self.spine],
+            "toc": [entry.to_dict() for entry in self.toc],
+            "chapters": [chapter.to_dict() for chapter in self.chapters],
+            "warnings": self.warnings,
+        }

epub2pdf_cli/pdf/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from epub2pdf_cli.pdf.extract import find_extract_outputs, planned_extract_paths, run_pdf_extraction
+from epub2pdf_cli.pdf.text import extract_text
+from epub2pdf_cli.pdf.validate import validate_pdf
+# Names re-exported by this package from its extract, text and validate modules.
+__all__ = ["validate_pdf", "extract_text", "run_pdf_extraction", "find_extract_outputs", "planned_extract_paths"]