PyPI - epub2pdf-cli - Versions diffs - 0.3.0__py3-none-any.whl - Mend

epub2pdf-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

epub2pdf_cli/__init__.py +5 -0
epub2pdf_cli/__main__.py +4 -0
epub2pdf_cli/api.py +160 -0
epub2pdf_cli/cli.py +223 -0
epub2pdf_cli/config.py +109 -0
epub2pdf_cli/epub/__init__.py +3 -0
epub2pdf_cli/epub/chapters.py +81 -0
epub2pdf_cli/epub/container.py +25 -0
epub2pdf_cli/epub/href.py +24 -0
epub2pdf_cli/epub/opf.py +159 -0
epub2pdf_cli/epub/parser.py +64 -0
epub2pdf_cli/epub/toc.py +101 -0
epub2pdf_cli/errors.py +27 -0
epub2pdf_cli/html/__init__.py +3 -0
epub2pdf_cli/html/builder.py +190 -0
epub2pdf_cli/html/css.py +49 -0
epub2pdf_cli/html/links.py +144 -0
epub2pdf_cli/html/template.py +92 -0
epub2pdf_cli/io_utils.py +24 -0
epub2pdf_cli/markdown.py +97 -0
epub2pdf_cli/mcp_server.py +189 -0
epub2pdf_cli/models.py +116 -0
epub2pdf_cli/pdf/__init__.py +5 -0
epub2pdf_cli/pdf/extract.py +79 -0
epub2pdf_cli/pdf/extractors/__init__.py +0 -0
epub2pdf_cli/pdf/extractors/base.py +23 -0
epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
epub2pdf_cli/pdf/text.py +45 -0
epub2pdf_cli/pdf/validate.py +37 -0
epub2pdf_cli/pipeline/__init__.py +6 -0
epub2pdf_cli/pipeline/batch.py +84 -0
epub2pdf_cli/pipeline/convert.py +122 -0
epub2pdf_cli/pipeline/extract.py +64 -0
epub2pdf_cli/pipeline/inspect.py +15 -0
epub2pdf_cli/render/__init__.py +17 -0
epub2pdf_cli/render/options.py +19 -0
epub2pdf_cli/render/playwright.py +91 -0
epub2pdf_cli/render/protocol.py +13 -0
epub2pdf_cli/render/weasyprint.py +28 -0
epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0

epub2pdf_cli/epub/opf.py ADDED Viewed

@@ -0,0 +1,159 @@
+from __future__ import annotations
+import posixpath
+import zipfile
+from typing import Any
+from xml.etree import ElementTree as ET
+from epub2pdf_cli.errors import StageError
+from epub2pdf_cli.models import CoverAsset, ManifestItem, SpineItem
+# Prefix-to-URI map passed to ElementTree find/findall calls in this module:
+# "opf" is used for package-document elements, "dc" for Dublin Core metadata.
+OPF_NS = {
+    "opf": "http://www.idpf.org/2007/opf",
+    "dc": "http://purl.org/dc/elements/1.1/",
+}
+def read_required(archive: zipfile.ZipFile, path: str, *, stage: str) -> bytes:
+    try:
+        return archive.read(path)
+    except KeyError as exc:
+        raise StageError(stage, f"Missing required EPUB resource: {path}") from exc
+def parse_opf(archive: zipfile.ZipFile, rootfile_path: str) -> tuple[ET.Element, str]:
+    opf_bytes = read_required(archive, rootfile_path, stage="opf")
+    try:
+        package = ET.fromstring(opf_bytes)
+    except ET.ParseError as exc:
+        raise StageError("opf", f"Unable to parse package document: {rootfile_path}") from exc
+    opf_dir = posixpath.dirname(rootfile_path)
+    return package, opf_dir
+def read_manifest(
+    archive: zipfile.ZipFile,
+    package: ET.Element,
+    opf_dir: str,
+    warnings: list[str],
+) -> dict[str, ManifestItem]:
+    manifest: dict[str, ManifestItem] = {}
+    manifest_node = package.find("opf:manifest", OPF_NS)
+    if manifest_node is None:
+        raise StageError("opf", "Package document is missing a manifest")
+    for item in manifest_node.findall("opf:item", OPF_NS):
+        item_id = item.attrib.get("id")
+        href = item.attrib.get("href")
+        media_type = item.attrib.get("media-type")
+        if not item_id or not href or not media_type:
+            continue
+        normalized_href = posixpath.normpath(posixpath.join(opf_dir, href))
+        properties = tuple(item.attrib.get("properties", "").split())
+        content = b""
+        try:
+            content = archive.read(normalized_href)
+        except KeyError:
+            warnings.append(f"Missing manifest resource: {normalized_href}")
+        manifest[item_id] = ManifestItem(
+            id=item_id,
+            href=normalized_href,
+            media_type=media_type,
+            properties=properties,
+            fallback=item.attrib.get("fallback"),
+            content=content,
+        )
+    return manifest
+def read_metadata(package: ET.Element) -> dict[str, Any]:
+    metadata_node = package.find("opf:metadata", OPF_NS)
+    metadata: dict[str, Any] = {
+        "title": "",
+        "language": "",
+        "creators": [],
+        "identifiers": [],
+        "publisher": "",
+        "dates": [],
+        "subjects": [],
+        "descriptions": [],
+        "contributors": [],
+        "rights": [],
+    }
+    if metadata_node is None:
+        return metadata
+    def read_texts(tag: str) -> list[str]:
+        values = []
+        for element in metadata_node.findall(f"dc:{tag}", OPF_NS):
+            text = (element.text or "").strip()
+            if text:
+                values.append(text)
+        return values
+    titles = read_texts("title")
+    metadata["title"] = titles[0] if titles else ""
+    languages = read_texts("language")
+    metadata["language"] = languages[0] if languages else ""
+    metadata["creators"] = read_texts("creator")
+    metadata["identifiers"] = read_texts("identifier")
+    metadata["subjects"] = read_texts("subject")
+    metadata["descriptions"] = read_texts("description")
+    metadata["contributors"] = read_texts("contributor")
+    metadata["rights"] = read_texts("rights")
+    publishers = read_texts("publisher")
+    metadata["publisher"] = publishers[0] if publishers else ""
+    metadata["dates"] = read_texts("date")
+    return metadata
+def read_cover_asset(
+    package: ET.Element,
+    manifest: dict[str, ManifestItem],
+) -> CoverAsset | None:
+    metadata_node = package.find("opf:metadata", OPF_NS)
+    if metadata_node is not None:
+        for meta in metadata_node.findall("opf:meta", OPF_NS):
+            if meta.attrib.get("name") == "cover":
+                cover_id = meta.attrib.get("content")
+                item = manifest.get(cover_id or "")
+                if item and item.content:
+                    return CoverAsset(href=item.href, media_type=item.media_type, content=item.content)
+    for item in manifest.values():
+        if "cover-image" in item.properties and item.content:
+            return CoverAsset(href=item.href, media_type=item.media_type, content=item.content)
+    return None
+def read_spine(package: ET.Element, manifest: dict[str, ManifestItem]) -> list[SpineItem]:
+    spine_node = package.find("opf:spine", OPF_NS)
+    if spine_node is None:
+        raise StageError("spine", "Package document is missing a spine")
+    spine: list[SpineItem] = []
+    for itemref in spine_node.findall("opf:itemref", OPF_NS):
+        idref = itemref.attrib.get("idref")
+        if not idref:
+            continue
+        manifest_item = manifest.get(idref)
+        if manifest_item is None:
+            raise StageError("spine", f"Spine references missing manifest item: {idref}")
+        spine.append(
+            SpineItem(
+                idref=idref,
+                href=manifest_item.href,
+                media_type=manifest_item.media_type,
+                linear=itemref.attrib.get("linear", "yes").lower() != "no",
+            )
+        )
+    if not spine:
+        raise StageError("spine", "Spine does not contain any readable items")
+    return spine
+def get_toc_id(package: ET.Element) -> str | None:
+    spine_node = package.find("opf:spine", OPF_NS)
+    if spine_node is None:
+        return None
+    return spine_node.attrib.get("toc")

epub2pdf_cli/epub/parser.py ADDED Viewed

@@ -0,0 +1,64 @@
+from __future__ import annotations
+import zipfile
+from pathlib import Path
+from epub2pdf_cli.epub.chapters import manifest_warnings, read_chapters
+from epub2pdf_cli.epub.container import read_rootfile_path
+from epub2pdf_cli.epub.opf import (
+    get_toc_id,
+    parse_opf,
+    read_cover_asset,
+    read_manifest,
+    read_metadata,
+    read_spine,
+)
+from epub2pdf_cli.epub.toc import read_toc
+from epub2pdf_cli.errors import ExitCode, StageError
+from epub2pdf_cli.models import EpubBook, ManifestItem
+def read_epub(input_path: Path) -> EpubBook:
+    try:
+        archive = zipfile.ZipFile(input_path)
+    except FileNotFoundError as exc:
+        raise StageError("container", f"Input file does not exist: {input_path}", exit_code=ExitCode.USAGE) from exc
+    except zipfile.BadZipFile as exc:
+        raise StageError("container", f"Input is not a valid EPUB/ZIP archive: {input_path}", exit_code=ExitCode.USAGE) from exc
+    with archive:
+        rootfile_path = read_rootfile_path(archive)
+        package, opf_dir = parse_opf(archive, rootfile_path)
+        warnings: list[str] = []
+        manifest = read_manifest(archive, package, opf_dir, warnings)
+        metadata = read_metadata(package)
+        cover = read_cover_asset(package, manifest)
+        spine = read_spine(package, manifest)
+        chapters, chapter_warnings = read_chapters(spine, manifest)
+        warnings.extend(chapter_warnings)
+        warnings.extend(manifest_warnings(manifest))
+        toc = read_toc(
+            nav_item=next((item for item in manifest.values() if "nav" in item.properties), None),
+            ncx_item=_find_ncx_item(manifest, get_toc_id(package)),
+            warnings=warnings,
+        )
+        return EpubBook(
+            source_path=str(input_path),
+            rootfile_path=rootfile_path,
+            metadata=metadata,
+            manifest=manifest,
+            spine=spine,
+            chapters=chapters,
+            toc=toc,
+            warnings=warnings,
+            cover=cover,
+        )
+def _find_ncx_item(manifest: dict[str, ManifestItem], toc_id: str | None) -> ManifestItem | None:
+    if not toc_id or toc_id not in manifest:
+        return None
+    return manifest[toc_id]

epub2pdf_cli/epub/toc.py ADDED Viewed

@@ -0,0 +1,101 @@
+from __future__ import annotations
+from typing import Any
+from xml.etree import ElementTree as ET
+from bs4 import BeautifulSoup
+from epub2pdf_cli.epub.href import resolve_relative_href
+from epub2pdf_cli.models import ManifestItem, TocEntry
+# Prefix-to-URI map passed to ElementTree find/findall calls on NCX documents.
+NCX_NS = {"ncx": "http://www.daisy.org/z3986/2005/ncx/"}
+def read_toc(
+    nav_item: ManifestItem | None,
+    ncx_item: ManifestItem | None,
+    warnings: list[str],
+) -> list[TocEntry]:
+    if nav_item and nav_item.content:
+        toc = _parse_nav_document(nav_item.content, nav_item.href)
+        if toc:
+            return toc
+        warnings.append("EPUB nav document did not contain a usable toc")
+    if ncx_item and ncx_item.content:
+        toc = _parse_ncx_document(ncx_item.content, ncx_item.href)
+        if toc:
+            return toc
+        warnings.append("NCX document did not contain a usable toc")
+    return []
+def _parse_nav_document(content: bytes, base_href: str) -> list[TocEntry]:
+    try:
+        soup = BeautifulSoup(content, "lxml")
+    except Exception:
+        return []
+    nav = None
+    for candidate in soup.find_all("nav"):
+        epub_type = str(candidate.get("epub:type") or candidate.get("type") or "")
+        if "toc" in epub_type.split():
+            nav = candidate
+            break
+    if nav is None:
+        nav = soup.find("nav")
+    if nav is None:
+        return []
+    list_node = nav.find(["ol", "ul"])
+    return _parse_nav_list(list_node, base_href) if list_node else []
+def _parse_nav_list(list_node: Any, base_href: str) -> list[TocEntry]:
+    entries: list[TocEntry] = []
+    for li in list_node.find_all("li", recursive=False):
+        link = li.find("a", recursive=False)
+        title = ""
+        href = base_href
+        if link:
+            title = link.get_text(" ", strip=True)
+            href = resolve_relative_href(base_href, link.get("href") or "")
+        else:
+            title = li.get_text(" ", strip=True)
+        child_list = li.find(["ol", "ul"], recursive=False)
+        entries.append(
+            TocEntry(
+                title=title,
+                href=href,
+                children=_parse_nav_list(child_list, base_href) if child_list else [],
+            )
+        )
+    return entries
+def _parse_ncx_document(content: bytes, base_href: str) -> list[TocEntry]:
+    try:
+        root = ET.fromstring(content)
+    except ET.ParseError:
+        return []
+    nav_map = root.find("ncx:navMap", NCX_NS)
+    if nav_map is None:
+        return []
+    return [_parse_navpoint(node, base_href) for node in nav_map.findall("ncx:navPoint", NCX_NS)]
+def _parse_navpoint(node: ET.Element, base_href: str) -> TocEntry:
+    label_node = node.find("ncx:navLabel/ncx:text", NCX_NS)
+    content_node = node.find("ncx:content", NCX_NS)
+    title = (label_node.text or "").strip() if label_node is not None and label_node.text else ""
+    href = resolve_relative_href(
+        base_href,
+        content_node.attrib.get("src", "") if content_node is not None else "",
+    )
+    return TocEntry(
+        title=title,
+        href=href,
+        children=[_parse_navpoint(child, base_href) for child in node.findall("ncx:navPoint", NCX_NS)],
+    )

epub2pdf_cli/errors.py ADDED Viewed

@@ -0,0 +1,27 @@
+from __future__ import annotations
+from enum import IntEnum
+class ExitCode(IntEnum):
+    """Integer exit codes carried by Epub2PdfError instances."""
+    OK = 0
+    UNEXPECTED = 1  # default exit code of Epub2PdfError
+    USAGE = 2
+    STAGE = 3  # default exit code of StageError
+    OUTPUT_EXISTS = 5  # note: the value 4 is not assigned in this enum
+class Epub2PdfError(Exception):
+    """Base error for CLI failures."""
+    def __init__(self, message: str, *, exit_code: ExitCode = ExitCode.UNEXPECTED) -> None:
+        super().__init__(message)
+        self.exit_code = exit_code
+class StageError(Epub2PdfError):
+    """Error raised for pipeline stage failures."""
+    def __init__(self, stage: str, message: str, *, exit_code: ExitCode = ExitCode.STAGE) -> None:
+        super().__init__(f"[{stage}] {message}", exit_code=exit_code)
+        self.stage = stage

epub2pdf_cli/html/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from epub2pdf_cli.html.builder import BuildResult, build_html
+__all__ = ["BuildResult", "build_html"]

epub2pdf_cli/html/builder.py ADDED Viewed

@@ -0,0 +1,190 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from html import escape
+from typing import Any, cast
+from bs4 import BeautifulSoup
+from epub2pdf_cli.config import ConvertConfig
+from epub2pdf_cli.html.css import rewrite_css_item
+from epub2pdf_cli.html.links import (
+    render_toc_items,
+    rewrite_resources,
+)
+from epub2pdf_cli.html.template import base_css, wrap_document
+from epub2pdf_cli.models import Chapter, EpubBook
+@dataclass(frozen=True, slots=True)
+class BuildResult:
+    """Immutable result returned by build_html."""
+    # Complete HTML document produced by wrap_document.
+    html: str
+    # One dict per chapter: chapter.to_dict() plus "section_id" and "anchors".
+    chapters: list[dict[str, Any]]
+    # Records describing assets that were rewritten while building the HTML.
+    assets: list[dict[str, Any]]
+    # Warnings collected while rewriting CSS and chapter resources.
+    warnings: list[str]
+def build_html(book: EpubBook, config: ConvertConfig) -> BuildResult:
+    chapter_lookup = {chapter.href: chapter for chapter in book.chapters}
+    chapter_section_ids = {chapter.href: f"chapter-{index + 1}" for index, chapter in enumerate(book.chapters)}
+    soups = {chapter.href: BeautifulSoup(chapter.html, "lxml") for chapter in book.chapters}
+    element_id_map = _build_element_id_map(soups)
+    assets: dict[str, dict[str, Any]] = {}
+    warnings: list[str] = []
+    stylesheet_blocks = [
+        rewrite_css_item(item.href, item.content.decode("utf-8", errors="replace"), book, assets, warnings)
+        for item in book.manifest.values()
+        if item.media_type == "text/css" and item.content
+    ]
+    stylesheet_blocks = [block for block in stylesheet_blocks if block.strip()]
+    rendered_sections: list[str] = []
+    sidecar_chapters: list[dict[str, Any]] = []
+    if config.cover == "first" and book.cover is not None:
+        cover_src = _data_uri(book.cover.content, book.cover.media_type)
+        assets[book.cover.href] = {
+            "href": book.cover.href,
+            "media_type": book.cover.media_type,
+            "rewritten_as": "data-uri",
+            "usage": "cover",
+        }
+        rendered_sections.append(
+            "\n".join(
+                [
+                    '<section class="epub-cover page-break" id="cover-page">',
+                    f'<img alt="Cover image" src="{cover_src}" />',
+                    "</section>",
+                ]
+            )
+        )
+    if book.toc:
+        rendered_sections.append(_render_generated_toc(book.toc, chapter_section_ids, element_id_map))
+    for index, chapter in enumerate(book.chapters, start=1):
+        section_id = chapter_section_ids[chapter.href]
+        section_html, chapter_info = _render_chapter(
+            chapter,
+            soup=soups[chapter.href],
+            chapter_index=index,
+            section_id=section_id,
+            chapter_lookup=chapter_lookup,
+            chapter_section_ids=chapter_section_ids,
+            element_id_map=element_id_map,
+            book=book,
+            assets=assets,
+            warnings=warnings,
+        )
+        rendered_sections.append(section_html)
+        sidecar_chapters.append(chapter_info)
+    title = book.metadata.get("title") or "Untitled EPUB"
+    author = ", ".join(book.metadata.get("creators", []))
+    html = wrap_document(
+        title=title,
+        language=book.metadata.get("language", ""),
+        author=author,
+        stylesheets=[base_css(config.page_size, config.margin_mm), *stylesheet_blocks],
+        body_sections=rendered_sections,
+    )
+    return BuildResult(
+        html=html,
+        chapters=sidecar_chapters,
+        assets=list(assets.values()),
+        warnings=warnings,
+    )
+def _build_element_id_map(soups: dict[str, BeautifulSoup]) -> dict[tuple[str, str], str]:
+    element_id_map: dict[tuple[str, str], str] = {}
+    for index, (href, soup) in enumerate(soups.items(), start=1):
+        for node in soup.find_all(id=True):
+            original = cast(str, node.get("id"))
+            if not original:
+                continue
+            element_id_map[(href, original)] = f"chapter-{index}-{original}"
+    return element_id_map
+def _render_chapter(
+    chapter: Chapter,
+    *,
+    soup: BeautifulSoup,
+    chapter_index: int,
+    section_id: str,
+    chapter_lookup: dict[str, Chapter],
+    chapter_section_ids: dict[str, str],
+    element_id_map: dict[tuple[str, str], str],
+    book: EpubBook,
+    assets: dict[str, dict[str, Any]],
+    warnings: list[str],
+) -> tuple[str, dict[str, Any]]:
+    for link in soup.find_all("link"):
+        if (link.get("rel") or [""])[0].lower() == "stylesheet":
+            link.decompose()
+    body = soup.body
+    if body is None:
+        body = soup
+        # If there is no body, avoid wrapping the entire document including head
+        for tag in list(body.find_all()):
+            if tag.name in {"head", "title", "meta", "link", "style", "script"}:
+                tag.decompose()
+    for node in body.find_all(id=True):
+        original = cast(str, node.get("id"))
+        if not original:
+            continue
+        node["id"] = element_id_map.get((chapter.href, original), original)
+    rewrite_resources(body, chapter.href, chapter_lookup, chapter_section_ids, element_id_map, book, assets, warnings)
+    title = chapter.title or f"Chapter {chapter_index}"
+    chapter_info = chapter.to_dict()
+    chapter_info.update(
+        {
+            "section_id": section_id,
+            "anchors": sorted(
+                mapped_id for (href, _), mapped_id in element_id_map.items() if href == chapter.href
+            ),
+        }
+    )
+    section_html = "\n".join(
+        [
+            f'<section class="epub-chapter page-break" id="{escape(section_id)}" data-source-href="{escape(chapter.href)}">',
+            f'<h1 class="chapter-title">{escape(title)}</h1>',
+            "".join(str(child) for child in body.contents),
+            "</section>",
+        ]
+    )
+    return section_html, chapter_info
+def _render_generated_toc(
+    toc: list[Any],
+    chapter_section_ids: dict[str, str],
+    element_id_map: dict[tuple[str, str], str],
+) -> str:
+    items = render_toc_items(toc, chapter_section_ids, element_id_map)
+    if not items:
+        return ""
+    return "\n".join(
+        [
+            '<section class="generated-toc page-break" id="generated-toc">',
+            "<h1>Table of Contents</h1>",
+            "<nav>",
+            f"<ol>{items}</ol>",
+            "</nav>",
+            "</section>",
+        ]
+    )
+def _data_uri(content: bytes, media_type: str) -> str:
+    import base64
+    encoded = base64.b64encode(content).decode("ascii")
+    return f"data:{media_type};base64,{encoded}"

epub2pdf_cli/html/css.py ADDED Viewed

@@ -0,0 +1,49 @@
+from __future__ import annotations
+import re
+from typing import Any
+from epub2pdf_cli.epub.href import split_href
+from epub2pdf_cli.models import EpubBook
+# Matches CSS url(...) references with an optional matching quote pair; the
+# "target" group holds the referenced text.
+URL_PATTERN = re.compile(r"url\((?P<quote>['\"]?)(?P<target>[^)'\"]+)(?P=quote)\)")
+# url() targets starting with one of these prefixes are left unchanged by
+# rewrite_css_item.
+DATA_SCHEMES = ("http://", "https://", "mailto:", "data:")
+def rewrite_css_item(
+    css_href: str,
+    css_text: str,
+    book: EpubBook,
+    assets: dict[str, dict[str, Any]],
+    warnings: list[str],
+) -> str:
+    def replace(match: re.Match[str]) -> str:
+        target = match.group("target").strip()
+        if any(target.startswith(prefix) for prefix in DATA_SCHEMES):
+            return match.group(0)
+        path, _fragment = split_href(target)
+        if not path:
+            return match.group(0)
+        import posixpath
+        resolved = posixpath.normpath(posixpath.join(posixpath.dirname(css_href), path))
+        item = book.manifest_by_href.get(resolved)
+        if not item or not item.content:
+            warnings.append(f"Missing CSS asset during normalization: {resolved}")
+            return "url()"
+        assets[resolved] = {
+            "href": resolved,
+            "media_type": item.media_type,
+            "rewritten_as": "data-uri",
+            "usage": "css-url",
+        }
+        return f"url('{_data_uri(item.content, item.media_type)}')"
+    return URL_PATTERN.sub(replace, css_text)
+def _data_uri(content: bytes, media_type: str) -> str:
+    import base64
+    encoded = base64.b64encode(content).decode("ascii")
+    return f"data:{media_type};base64,{encoded}"