epub2pdf-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. epub2pdf_cli/__init__.py +5 -0
  2. epub2pdf_cli/__main__.py +4 -0
  3. epub2pdf_cli/api.py +160 -0
  4. epub2pdf_cli/cli.py +223 -0
  5. epub2pdf_cli/config.py +109 -0
  6. epub2pdf_cli/epub/__init__.py +3 -0
  7. epub2pdf_cli/epub/chapters.py +81 -0
  8. epub2pdf_cli/epub/container.py +25 -0
  9. epub2pdf_cli/epub/href.py +24 -0
  10. epub2pdf_cli/epub/opf.py +159 -0
  11. epub2pdf_cli/epub/parser.py +64 -0
  12. epub2pdf_cli/epub/toc.py +101 -0
  13. epub2pdf_cli/errors.py +27 -0
  14. epub2pdf_cli/html/__init__.py +3 -0
  15. epub2pdf_cli/html/builder.py +190 -0
  16. epub2pdf_cli/html/css.py +49 -0
  17. epub2pdf_cli/html/links.py +144 -0
  18. epub2pdf_cli/html/template.py +92 -0
  19. epub2pdf_cli/io_utils.py +24 -0
  20. epub2pdf_cli/markdown.py +97 -0
  21. epub2pdf_cli/mcp_server.py +189 -0
  22. epub2pdf_cli/models.py +116 -0
  23. epub2pdf_cli/pdf/__init__.py +5 -0
  24. epub2pdf_cli/pdf/extract.py +79 -0
  25. epub2pdf_cli/pdf/extractors/__init__.py +0 -0
  26. epub2pdf_cli/pdf/extractors/base.py +23 -0
  27. epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
  28. epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
  29. epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
  30. epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
  31. epub2pdf_cli/pdf/text.py +45 -0
  32. epub2pdf_cli/pdf/validate.py +37 -0
  33. epub2pdf_cli/pipeline/__init__.py +6 -0
  34. epub2pdf_cli/pipeline/batch.py +84 -0
  35. epub2pdf_cli/pipeline/convert.py +122 -0
  36. epub2pdf_cli/pipeline/extract.py +64 -0
  37. epub2pdf_cli/pipeline/inspect.py +15 -0
  38. epub2pdf_cli/render/__init__.py +17 -0
  39. epub2pdf_cli/render/options.py +19 -0
  40. epub2pdf_cli/render/playwright.py +91 -0
  41. epub2pdf_cli/render/protocol.py +13 -0
  42. epub2pdf_cli/render/weasyprint.py +28 -0
  43. epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
  44. epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
  45. epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
  46. epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
  47. epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
  48. epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,144 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import posixpath
5
+ from html import escape
6
+ from typing import Any
7
+ from urllib.parse import urlparse
8
+
9
+ from epub2pdf_cli.epub.href import split_href
10
+ from epub2pdf_cli.models import Chapter, EpubBook
11
+
12
# URL prefixes that _rewrite_attr returns untouched before any further parsing.
DATA_SCHEMES = ("http://", "https://", "mailto:", "data:")
# Attribute names that rewrite_resources inspects on every tag.
LINK_ATTRS = ("src", "href", "poster", "xlink:href")
14
+
15
+
16
def rewrite_resources(
    body: Any,
    current_href: str,
    chapter_lookup: dict[str, Chapter],
    chapter_section_ids: dict[str, str],
    element_id_map: dict[tuple[str, str], str],
    book: EpubBook,
    assets: dict[str, dict[str, Any]],
    warnings: list[str],
) -> None:
    """Rewrite every link-like attribute below ``body`` in place.

    Each tag returned by ``body.find_all(True)`` is checked for the attribute
    names in ``LINK_ATTRS``. Non-empty values are passed to ``_rewrite_attr``;
    a ``None`` result removes the attribute, any other result replaces it.
    ``assets`` and ``warnings`` are forwarded and may be mutated by the helper.
    """
    # Keyword arguments shared by every _rewrite_attr call for this chapter.
    shared = {
        "current_href": current_href,
        "chapter_lookup": chapter_lookup,
        "chapter_section_ids": chapter_section_ids,
        "element_id_map": element_id_map,
        "book": book,
        "assets": assets,
        "warnings": warnings,
    }
    for element in body.find_all(True):
        for attr_name in LINK_ATTRS:
            original = element.get(attr_name)
            if original:
                replacement = _rewrite_attr(attr_name, original, **shared)
                if replacement is not None:
                    element[attr_name] = replacement
                else:
                    element.attrs.pop(attr_name, None)
46
+
47
+
48
def _rewrite_attr(
    attr: str,
    value: str,
    *,
    current_href: str,
    chapter_lookup: dict[str, Chapter],
    chapter_section_ids: dict[str, str],
    element_id_map: dict[tuple[str, str], str],
    book: EpubBook,
    assets: dict[str, dict[str, Any]],
    warnings: list[str],
) -> str | None:
    """Return the replacement for one link-like attribute value.

    Args:
        attr: Attribute name (one of ``LINK_ATTRS``).
        value: Current, non-empty attribute value.
        current_href: Href of the chapter being processed; relative targets
            are resolved against its directory.
        chapter_lookup: Chapters keyed by href.
        chapter_section_ids: Section anchor id for each chapter href.
        element_id_map: Maps ``(chapter href, original id)`` to the anchor id
            used in the merged document.
        book: Source of manifest items (``manifest_by_href``).
        assets: Updated with a record for every asset embedded as a data URI.
        warnings: Receives a message when a referenced asset is missing.

    Returns:
        The new attribute value, or ``None`` when the attribute should be
        removed because the referenced asset is missing from the manifest.
    """
    # External / already-inlined references are passed through untouched.
    if value.startswith(DATA_SCHEMES):
        return value
    parsed = urlparse(value)
    if parsed.scheme and parsed.scheme not in {"file"}:
        return value

    target_path, fragment = split_href(value)
    resolved_path = (
        posixpath.normpath(posixpath.join(posixpath.dirname(current_href), target_path))
        if target_path
        else current_href
    )

    if attr == "href":
        if resolved_path in chapter_lookup:
            # Internal link: point at the element anchor when known,
            # otherwise at the chapter's section anchor.
            if fragment:
                target_id = element_id_map.get((resolved_path, fragment))
                if target_id:
                    return f"#{target_id}"
            return f"#{chapter_section_ids[resolved_path]}"
        manifest_item = book.manifest_by_href.get(resolved_path)
        if manifest_item and manifest_item.media_type.startswith("image/"):
            _record_asset(assets, resolved_path, manifest_item, "linked-image")
            return _data_uri(manifest_item.content, manifest_item.media_type)
        return value

    if not target_path:
        # A fragment-only reference on a resource attribute (for example SVG
        # ``<use xlink:href="#icon">``) points inside the current document.
        # Without this guard it would resolve to the chapter itself and the
        # whole chapter would be embedded as a data URI.
        target_id = element_id_map.get((resolved_path, fragment)) if fragment else None
        return f"#{target_id}" if target_id else value

    manifest_item = book.manifest_by_href.get(resolved_path)
    if manifest_item and manifest_item.content:
        _record_asset(assets, resolved_path, manifest_item, attr)
        return _data_uri(manifest_item.content, manifest_item.media_type)

    warnings.append(f"Missing asset during normalization: {resolved_path}")
    return None
93
+
94
+
95
def _record_asset(
    assets: dict[str, dict[str, Any]],
    resolved_path: str,
    manifest_item: Any,
    usage: str,
) -> None:
    """Store (or overwrite) the bookkeeping entry for an embedded asset.

    The entry is keyed by ``resolved_path`` and records the media type taken
    from ``manifest_item`` together with the ``usage`` label.
    """
    entry: dict[str, Any] = dict(
        href=resolved_path,
        media_type=manifest_item.media_type,
        rewritten_as="data-uri",
        usage=usage,
    )
    assets[resolved_path] = entry
107
+
108
+
109
def _data_uri(content: bytes, media_type: str) -> str:
    """Return ``content`` as a base64 ``data:`` URI labelled with ``media_type``."""
    payload = str(base64.b64encode(content), "ascii")
    return "data:" + media_type + ";base64," + payload
112
+
113
+
114
def map_toc_href(
    href: str,
    chapter_section_ids: dict[str, str],
    element_id_map: dict[tuple[str, str], str],
) -> str:
    """Translate a TOC href into an in-document ``#anchor`` where possible.

    Empty hrefs, hrefs containing ``://`` and ``mailto:`` links are returned
    unchanged, as are hrefs whose path is not a known chapter. For a known
    chapter the mapped element anchor is preferred when the href carries a
    fragment that appears in ``element_id_map``; otherwise the chapter's
    section anchor is used.
    """
    if not href:
        return href
    if "://" in href or href.startswith("mailto:"):
        return href

    path, fragment = split_href(href)
    if path not in chapter_section_ids:
        return href

    element_anchor = element_id_map.get((path, fragment)) if fragment else None
    anchor = element_anchor if element_anchor else chapter_section_ids[path]
    return f"#{anchor}"
129
+
130
+
131
def render_toc_items(
    entries: list[Any],
    chapter_section_ids: dict[str, str],
    element_id_map: dict[tuple[str, str], str],
) -> str:
    """Render TOC entries as a run of ``<li>`` elements (no wrapping list tag).

    Each entry becomes ``<li>`` containing an escaped label, linked when the
    mapped href is non-empty, followed by a nested ``<ol>`` when the entry has
    children that render to something.
    """

    def render_one(entry: Any) -> str:
        target = map_toc_href(entry.href, chapter_section_ids, element_id_map)
        text = escape(entry.title or entry.href)
        nested = render_toc_items(entry.children, chapter_section_ids, element_id_map)
        sublist = f"<ol>{nested}</ol>" if nested else ""
        anchor = f'<a href="{escape(target)}">{text}</a>' if target else text
        return f"<li>{anchor}{sublist}</li>"

    return "".join(render_one(entry) for entry in entries)
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ from html import escape
4
+
5
+ from epub2pdf_cli.config import PageSize
6
+
7
+
8
def base_css(page_size: PageSize, margin_mm: int) -> str:
    """Return the built-in print stylesheet as a CSS string.

    ``page_size`` is interpolated into the ``@page`` ``size`` declaration and
    ``margin_mm`` into its ``margin`` (in millimetres); negative margins are
    clamped to 0.
    """
    margin = max(margin_mm, 0)
    # Doubled braces are literal CSS braces inside the f-string.
    return f"""
@page {{
size: {page_size};
margin: {margin}mm;
}}
html {{
font-size: 11pt;
line-height: 1.6;
color: #111;
}}
body {{
margin: 0;
font-family: serif;
print-color-adjust: exact;
-webkit-print-color-adjust: exact;
}}
h1, h2, h3, h4, h5, h6 {{
break-after: avoid;
break-inside: avoid;
}}
img, svg {{
max-width: 100%;
height: auto;
break-inside: avoid;
}}
a {{
color: #0b57d0;
text-decoration: none;
}}
.page-break {{
break-before: page;
}}
.page-break:first-child {{
break-before: auto;
}}
.epub-cover {{
min-height: 90vh;
display: flex;
align-items: center;
justify-content: center;
}}
.epub-cover img {{
max-height: 90vh;
object-fit: contain;
}}
.generated-toc ol {{
padding-left: 1.25rem;
}}
.chapter-title {{
margin-top: 0;
}}
"""
62
+
63
+
64
def wrap_document(
    *,
    title: str,
    language: str,
    author: str,
    stylesheets: list[str],
    body_sections: list[str],
) -> str:
    """Assemble the final HTML document around pre-rendered body sections.

    Args:
        title: Document title; HTML-escaped.
        language: Value for ``<html lang>``; falls back to ``"en"`` when empty.
        author: Optional author; emitted as a ``<meta>`` tag when non-empty.
        stylesheets: CSS sources, each wrapped in its own ``<style>`` element.
        body_sections: HTML fragments inserted verbatim inside ``<body>``.

    Returns:
        The document as a single newline-joined string.
    """
    import re  # local import keeps this block self-contained

    lang = language or "en"
    head_bits = [
        '<meta charset="utf-8" />',
        f"<title>{escape(title)}</title>",
    ]
    if author:
        head_bits.append(f'<meta name="author" content="{escape(author)}" />')

    # A literal "</style" inside CSS would terminate the <style> element early
    # and let the remainder be parsed as markup. Rewriting it to "<\/style"
    # keeps it inert; inside a CSS string "\/" is just an escaped "/".
    style_close = re.compile(r"</(?=style)", re.IGNORECASE)
    for css in stylesheets:
        safe_css = style_close.sub(r"<\\/", css)
        head_bits.append(f"<style>{safe_css}</style>")

    return "\n".join(
        [
            "<!DOCTYPE html>",
            f'<html lang="{escape(lang)}">',
            "<head>",
            *head_bits,
            "</head>",
            "<body>",
            *body_sections,
            "</body>",
            "</html>",
        ]
    )
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+
9
def sha256(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in 65536-byte pieces so it is never held in memory whole.
    """
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while piece := stream.read(65536):
            hasher.update(piece)
    return hasher.hexdigest()
15
+
16
+
17
def write_text(path: Path, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8, creating parent directories first."""
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
20
+
21
+
22
def write_json(path: Path, payload: Any) -> None:
    """Serialize ``payload`` as indented JSON and write it to ``path`` as UTF-8.

    Parent directories are created as needed. Non-ASCII characters are written
    as-is rather than as ``\\u`` escapes.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    path.write_text(serialized, encoding="utf-8")
@@ -0,0 +1,97 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from bs4 import BeautifulSoup, NavigableString
6
+
7
+ from epub2pdf_cli.models import EpubBook
8
+
9
+
10
def build_markdown(book: EpubBook) -> str:
    """Render ``book`` as one Markdown document.

    Output order: title heading, creators line, table of contents, then one
    ``##`` section per linear chapter. Chapters without a title are labelled
    ``Chapter N`` using their 1-based position among all chapters. The result
    is stripped and terminated with a single newline.
    """
    lines: list[str] = []
    meta = book.metadata
    title = meta.get("title", "")
    creators = meta.get("creators", [])

    if title:
        lines += [f"# {title}", ""]
    if creators:
        byline = "*, *".join(creators)
        lines += [f"*{byline}*", ""]

    if book.toc:
        lines += ["## Table of Contents", "", *_render_toc_entries(book.toc), ""]

    for position, chapter in enumerate(book.chapters, start=1):
        if chapter.linear:
            heading = chapter.title or f"Chapter {position}"
            lines += [f"## {heading}", "", _html_to_markdown(chapter.html), ""]

    return "\n".join(lines).strip() + "\n"
36
+
37
+
38
def _render_toc_entries(entries: list[Any], level: int = 0) -> list[str]:
    """Return Markdown bullet lines for a (possibly nested) list of TOC entries.

    Args:
        entries: Objects exposing ``title``, ``href`` and ``children``.
        level: Current nesting depth; 0 for top-level entries.

    Returns:
        One ``- [label](href)`` line per entry, in depth-first order.
    """
    # A nested "- " item must be indented to the parent's content column,
    # i.e. two spaces per level, for Markdown to treat it as a child item.
    prefix = "  " * level + "- "
    lines: list[str] = []
    for entry in entries:
        # Fall back to the href so an untitled entry never yields an empty
        # link label (same fallback as the HTML TOC renderer).
        label = entry.title or entry.href
        lines.append(f"{prefix}[{label}]({entry.href})")
        if entry.children:
            lines.extend(_render_toc_entries(entry.children, level + 1))
    return lines
46
+
47
+
48
def _html_to_markdown(html: str) -> str:
    """Parse ``html`` with the lxml parser and return its Markdown rendering.

    Conversion starts at ``<body>`` when the parsed document has one and at
    the document root otherwise; the result is stripped of outer whitespace.
    """
    document = BeautifulSoup(html, "lxml")
    root = document.body
    markdown = _convert_node(document if root is None else root)
    return markdown.strip()
54
+
55
+
56
def _convert_node(node: Any) -> str:
    """Recursively convert one parsed HTML node to Markdown text.

    Text nodes are returned verbatim. For elements, the children are converted
    first and the joined result is stripped before element-specific markup is
    applied. Elements without a dedicated rule contribute only their converted
    children.

    Args:
        node: A BeautifulSoup text node or tag.

    Returns:
        The Markdown fragment for ``node``.
    """
    if isinstance(node, NavigableString):
        return str(node)

    name = node.name
    if name is None:
        return str(node)

    if name == "pre":
        # Take the raw text: converting children would wrap a nested <code>
        # in backticks inside the fence and strip the code's own whitespace.
        code = node.get_text().strip("\n")
        return f"```\n{code}\n```\n\n"

    inner = "".join(_convert_node(child) for child in node.contents).strip()

    if name in ("ol", "ul"):
        return inner + "\n"
    if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        return f"{'#' * int(name[1])} {inner}\n\n"
    if name == "p":
        return f"{inner}\n\n" if inner else ""
    if name == "br":
        return "\n"
    if name in ("strong", "b"):
        return f"**{inner}**"
    if name in ("em", "i"):
        return f"*{inner}*"
    if name == "code":
        return f"`{inner}`"
    if name == "a":
        href = node.get("href")
        return f"[{inner}]({href})" if href else inner
    if name == "img":
        return f"![{node.get('alt', '')}]({node.get('src', '')})"
    if name == "li":
        return f"- {inner}\n"
    if name == "blockquote":
        quoted = inner.replace("\n", "\n> ")
        return f"> {quoted}\n\n"

    # Block-level containers end with a paragraph break. <span> is inline and
    # deliberately excluded: a break after it would split the surrounding
    # paragraph mid-sentence.
    if name in ("div", "section", "article", "header", "footer", "nav"):
        return inner + "\n\n" if inner else ""

    return inner
@@ -0,0 +1,189 @@
1
+ """Lightweight MCP server for epub2pdf.
2
+
3
+ This server exposes epub2pdf tools to MCP clients (e.g., Claude Desktop) using
4
+ the default low-resource settings: WeasyPrint renderer, no PDF validation, and
5
+ no long-lived browser process. Each tool spawns the CLI in a subprocess so the
6
+ server itself stays small and releases resources after every call.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ import subprocess
13
+ import sys
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
try:
    from mcp.server.fastmcp import FastMCP
except Exception as exc:
    # Any failure while importing the SDK is re-raised as RuntimeError with an
    # install hint; the original exception stays attached as the cause.
    raise RuntimeError(
        "The MCP Python SDK is not installed. Install with `python3 -m pip install -e '.[mcp]'`."
    ) from exc

# Server instance on which the @mcp.tool() functions in this module register.
mcp = FastMCP("epub2pdf")
25
+
26
+
27
def _run_cli(*args: str) -> dict[str, Any]:
    """Run the epub2pdf CLI and return a structured result.

    The CLI is started as ``<current interpreter> -m epub2pdf_cli <args>`` with
    a copy of the current environment. ``PYTHONPATH`` and ``PYTHONWARNINGS``
    are only filled in when the caller's environment does not define them.

    Args:
        *args: Command-line arguments passed through verbatim.

    Returns:
        A dict with ``success`` (exit code was 0), ``returncode``, and the
        stripped ``stdout`` / ``stderr`` text.
    """
    env = dict(os.environ)
    # parents[0] is the package directory itself; parents[1] is the directory
    # that contains the package, which is what must be on PYTHONPATH for
    # ``-m epub2pdf_cli`` to be importable. (parents[2] is one level too high.)
    env.setdefault("PYTHONPATH", str(Path(__file__).resolve().parents[1]))
    env.setdefault("PYTHONWARNINGS", "ignore")
    result = subprocess.run(
        [sys.executable, "-m", "epub2pdf_cli", *args],
        env=env,
        text=True,
        capture_output=True,
        check=False,  # a non-zero exit is reported in the result, not raised
    )
    return {
        "success": result.returncode == 0,
        "returncode": result.returncode,
        "stdout": result.stdout.strip(),
        "stderr": result.stderr.strip(),
    }
44
+
45
+
46
@mcp.tool()
def convert_epub(
    input_path: str,
    output_path: str,
    *,
    engine: str = "weasyprint",
    no_validate: bool = True,
    sidecar_json: bool = False,
    sidecar_html: bool = False,
    sidecar_markdown: bool = False,
    page_size: str = "A4",
    margin_mm: int = 12,
    cover: str = "first",
    force: bool = False,
) -> dict[str, Any]:
    """Convert a single EPUB file to PDF.

    Defaults to the lightweight WeasyPrint backend and skips PDF validation to
    keep resource usage low. Use engine="playwright" only when Chromium output
    is explicitly required.
    """
    # Options that are always passed, in the order the CLI call lists them.
    args: list[str] = ["convert", input_path]
    args += ["--engine", engine]
    args += ["--output", output_path]
    args += ["--page-size", page_size]
    args += ["--margin-mm", str(margin_mm)]
    args += ["--cover", cover]

    if no_validate:
        args.append("--no-validate")

    # Each requested sidecar is written next to the PDF, differing only in suffix.
    sidecars = (
        (sidecar_json, "--sidecar-json", ".json"),
        (sidecar_html, "--sidecar-html", ".html"),
        (sidecar_markdown, "--sidecar-markdown", ".md"),
    )
    for wanted, flag, suffix in sidecars:
        if wanted:
            args += [flag, str(Path(output_path).with_suffix(suffix))]

    if force:
        args.append("--force")
    return _run_cli(*args)
92
+
93
+
94
@mcp.tool()
def batch_convert(
    input_paths: list[str],
    output_dir: str,
    *,
    workers: int = 1,
    engine: str = "weasyprint",
    no_validate: bool = True,
    sidecar_json: bool = False,
    sidecar_html: bool = False,
    sidecar_markdown: bool = False,
    page_size: str = "A4",
    margin_mm: int = 12,
    cover: str = "first",
    force: bool = False,
) -> dict[str, Any]:
    """Convert multiple EPUBs in parallel using low-resource defaults."""
    args: list[str] = ["batch", *input_paths]
    # Valued options, in the order the CLI call lists them.
    args += [
        "--output-dir", output_dir,
        "--engine", engine,
        "--workers", str(workers),
        "--page-size", page_size,
        "--margin-mm", str(margin_mm),
        "--cover", cover,
    ]
    # Bare switches, appended only when enabled.
    switches = (
        (no_validate, "--no-validate"),
        (sidecar_json, "--sidecar-json"),
        (sidecar_html, "--sidecar-html"),
        (sidecar_markdown, "--sidecar-markdown"),
        (force, "--force"),
    )
    args.extend(flag for enabled, flag in switches if enabled)
    return _run_cli(*args)
138
+
139
+
140
@mcp.tool()
def inspect_epub(
    input_path: str,
    *,
    json_path: str | None = None,
) -> dict[str, Any]:
    """Inspect EPUB metadata, manifest, spine, and TOC."""
    # --json is only passed when a destination path was supplied.
    json_option = ["--json", json_path] if json_path else []
    return _run_cli("inspect", input_path, *json_option)
151
+
152
+
153
@mcp.tool()
def extract_pdf(
    input_path: str,
    output_dir: str,
    *,
    formats: str = "markdown,json",
    engine: str = "pypdfium2",
    pages: str | None = None,
    sidecar_json: bool = False,
    force: bool = False,
) -> dict[str, Any]:
    """Extract Markdown, JSON, text, or HTML from an existing PDF."""
    args: list[str] = ["pdf-extract", input_path]
    always = (
        ("--output-dir", output_dir),
        ("--format", formats),
        ("--engine", engine),
    )
    for flag, value in always:
        args += [flag, value]

    if pages:
        args += ["--pages", pages]
    if sidecar_json:
        # Sidecar is named after the input PDF's stem and placed in output_dir.
        sidecar_name = f"{Path(input_path).stem}.json"
        args += ["--sidecar-json", str(Path(output_dir) / sidecar_name)]
    if force:
        args.append("--force")
    return _run_cli(*args)
182
+
183
+
184
def main() -> None:
    """Run the MCP server using the stdio transport."""
    mcp.run(transport="stdio")


# Start the server when this module is executed as a script.
if __name__ == "__main__":
    main()
epub2pdf_cli/models.py ADDED
@@ -0,0 +1,116 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+
7
+ @dataclass(frozen=True, slots=True)
8
+ class ManifestItem:
9
+ id: str
10
+ href: str
11
+ media_type: str
12
+ properties: tuple[str, ...] = ()
13
+ fallback: str | None = None
14
+ content: bytes = b""
15
+
16
+ def to_dict(self) -> dict[str, Any]:
17
+ return {
18
+ "id": self.id,
19
+ "href": self.href,
20
+ "media_type": self.media_type,
21
+ "properties": list(self.properties),
22
+ "fallback": self.fallback,
23
+ "size_bytes": len(self.content),
24
+ }
25
+
26
+
27
+ @dataclass(frozen=True, slots=True)
28
+ class SpineItem:
29
+ idref: str
30
+ href: str
31
+ media_type: str
32
+ linear: bool = True
33
+
34
+ def to_dict(self) -> dict[str, Any]:
35
+ return {
36
+ "idref": self.idref,
37
+ "href": self.href,
38
+ "media_type": self.media_type,
39
+ "linear": self.linear,
40
+ }
41
+
42
+
43
+ @dataclass(frozen=True, slots=True)
44
+ class TocEntry:
45
+ title: str
46
+ href: str
47
+ children: list[TocEntry] = field(default_factory=list)
48
+
49
+ def to_dict(self) -> dict[str, Any]:
50
+ return {
51
+ "title": self.title,
52
+ "href": self.href,
53
+ "children": [child.to_dict() for child in self.children],
54
+ }
55
+
56
+
57
+ @dataclass(frozen=True, slots=True)
58
+ class Chapter:
59
+ idref: str
60
+ href: str
61
+ media_type: str
62
+ title: str
63
+ html: str
64
+ text: str
65
+ linear: bool = True
66
+
67
+ def to_dict(self) -> dict[str, Any]:
68
+ text = self.text.strip()
69
+ return {
70
+ "idref": self.idref,
71
+ "href": self.href,
72
+ "media_type": self.media_type,
73
+ "title": self.title,
74
+ "linear": self.linear,
75
+ "text_length": len(text),
76
+ "word_count": len(text.split()),
77
+ "has_text": bool(text),
78
+ }
79
+
80
+
81
@dataclass(frozen=True, slots=True)
class CoverAsset:
    """Immutable holder for a cover resource: its href, media type and raw bytes."""

    href: str
    media_type: str
    content: bytes
86
+
87
+
88
+ @dataclass(frozen=True, slots=True)
89
+ class EpubBook:
90
+ source_path: str
91
+ rootfile_path: str
92
+ metadata: dict[str, Any]
93
+ manifest: dict[str, ManifestItem]
94
+ spine: list[SpineItem]
95
+ chapters: list[Chapter]
96
+ toc: list[TocEntry]
97
+ warnings: list[str] = field(default_factory=list)
98
+ cover: CoverAsset | None = None
99
+
100
+ @property
101
+ def manifest_by_href(self) -> dict[str, ManifestItem]:
102
+ return {item.href: item for item in self.manifest.values()}
103
+
104
+ def to_inspection_dict(self) -> dict[str, Any]:
105
+ return {
106
+ "source": {
107
+ "path": self.source_path,
108
+ "rootfile": self.rootfile_path,
109
+ },
110
+ "metadata": self.metadata,
111
+ "manifest": [item.to_dict() for item in self.manifest.values()],
112
+ "spine": [item.to_dict() for item in self.spine],
113
+ "toc": [entry.to_dict() for entry in self.toc],
114
+ "chapters": [chapter.to_dict() for chapter in self.chapters],
115
+ "warnings": self.warnings,
116
+ }
@@ -0,0 +1,5 @@
1
+ from epub2pdf_cli.pdf.extract import find_extract_outputs, planned_extract_paths, run_pdf_extraction
2
+ from epub2pdf_cli.pdf.text import extract_text
3
+ from epub2pdf_cli.pdf.validate import validate_pdf
4
+
5
# Names re-exported from the extract, text and validate submodules imported above.
__all__ = ["validate_pdf", "extract_text", "run_pdf_extraction", "find_extract_outputs", "planned_extract_paths"]