epub2pdf-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. epub2pdf_cli/__init__.py +5 -0
  2. epub2pdf_cli/__main__.py +4 -0
  3. epub2pdf_cli/api.py +160 -0
  4. epub2pdf_cli/cli.py +223 -0
  5. epub2pdf_cli/config.py +109 -0
  6. epub2pdf_cli/epub/__init__.py +3 -0
  7. epub2pdf_cli/epub/chapters.py +81 -0
  8. epub2pdf_cli/epub/container.py +25 -0
  9. epub2pdf_cli/epub/href.py +24 -0
  10. epub2pdf_cli/epub/opf.py +159 -0
  11. epub2pdf_cli/epub/parser.py +64 -0
  12. epub2pdf_cli/epub/toc.py +101 -0
  13. epub2pdf_cli/errors.py +27 -0
  14. epub2pdf_cli/html/__init__.py +3 -0
  15. epub2pdf_cli/html/builder.py +190 -0
  16. epub2pdf_cli/html/css.py +49 -0
  17. epub2pdf_cli/html/links.py +144 -0
  18. epub2pdf_cli/html/template.py +92 -0
  19. epub2pdf_cli/io_utils.py +24 -0
  20. epub2pdf_cli/markdown.py +97 -0
  21. epub2pdf_cli/mcp_server.py +189 -0
  22. epub2pdf_cli/models.py +116 -0
  23. epub2pdf_cli/pdf/__init__.py +5 -0
  24. epub2pdf_cli/pdf/extract.py +79 -0
  25. epub2pdf_cli/pdf/extractors/__init__.py +0 -0
  26. epub2pdf_cli/pdf/extractors/base.py +23 -0
  27. epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
  28. epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
  29. epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
  30. epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
  31. epub2pdf_cli/pdf/text.py +45 -0
  32. epub2pdf_cli/pdf/validate.py +37 -0
  33. epub2pdf_cli/pipeline/__init__.py +6 -0
  34. epub2pdf_cli/pipeline/batch.py +84 -0
  35. epub2pdf_cli/pipeline/convert.py +122 -0
  36. epub2pdf_cli/pipeline/extract.py +64 -0
  37. epub2pdf_cli/pipeline/inspect.py +15 -0
  38. epub2pdf_cli/render/__init__.py +17 -0
  39. epub2pdf_cli/render/options.py +19 -0
  40. epub2pdf_cli/render/playwright.py +91 -0
  41. epub2pdf_cli/render/protocol.py +13 -0
  42. epub2pdf_cli/render/weasyprint.py +28 -0
  43. epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
  44. epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
  45. epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
  46. epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
  47. epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
  48. epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,5 @@
1
+ """epub2pdf package."""
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = "0.3.0"
@@ -0,0 +1,4 @@
1
+ from epub2pdf_cli.cli import main
2
+
3
+ if __name__ == "__main__":
4
+ raise SystemExit(main())
epub2pdf_cli/api.py ADDED
@@ -0,0 +1,160 @@
1
+ """Programmatic API for epub2pdf.
2
+
3
+ The :class:`Epub2Pdf` client provides a reusable context manager. When the
4
+ ``playwright`` engine is selected, the browser instance is launched once and
5
+ reused across conversions, avoiding per-call launch overhead.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterable
11
+ from concurrent.futures import ProcessPoolExecutor
12
+ from contextlib import suppress
13
+ from pathlib import Path
14
+ from types import TracebackType
15
+ from typing import Any
16
+
17
+ from epub2pdf_cli.config import ConvertConfig, EngineName, PageSize
18
+ from epub2pdf_cli.pipeline.batch import _convert_one
19
+ from epub2pdf_cli.pipeline.convert import convert_epub
20
+ from epub2pdf_cli.render.playwright import PlaywrightEngine
21
+ from epub2pdf_cli.render.protocol import Renderer
22
+
23
+
24
class Epub2Pdf:
    """Reusable client that converts EPUB files to PDF.

    The client stores default conversion settings and, for the
    ``playwright`` engine, can hold one Chromium browser open for its whole
    lifetime. Entering the client as a context manager launches that browser;
    leaving it (or calling :meth:`close`) shuts it down::

        with Epub2Pdf(engine="playwright") as client:
            client.convert("a.epub", "a.pdf")
            client.convert("b.epub", "b.pdf")

    With any other engine, entering the context manager does nothing, so the
    client can be used directly.
    """

    def __init__(
        self,
        engine: EngineName = "weasyprint",
        *,
        page_size: PageSize = "A4",
        margin_mm: int = 12,
        cover: str = "first",
        validate: bool = True,
        verbose: bool = False,
        **defaults: Any,
    ) -> None:
        """Store the per-client defaults.

        Args:
            engine: Rendering backend used for every conversion.
            page_size: Default page size.
            margin_mm: Default page margin in millimetres.
            cover: Default cover mode.
            validate: Default for the ``validate`` config field.
            verbose: Default for the ``verbose`` config field.
            **defaults: Extra ``ConvertConfig`` keyword arguments applied to
                every conversion unless overridden per call.
        """
        # Pooled Playwright handles; both stay None until _start_browser() succeeds.
        self._playwright: Any | None = None
        self._browser: Any | None = None
        self._defaults = defaults
        self.engine = engine
        self.page_size = page_size
        self.margin_mm = margin_mm
        self.cover = cover
        self.validate = validate
        self.verbose = verbose

    def __enter__(self) -> Epub2Pdf:
        """Launch the pooled browser when the engine is ``playwright``."""
        if self.engine == "playwright":
            self._start_browser()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        """Release pooled resources; exceptions from the body are not suppressed."""
        self.close()

    def close(self) -> None:
        """Shut down the pooled browser and Playwright driver, if any.

        Shutdown is best-effort: errors raised while closing are swallowed and
        the handle is cleared regardless, so calling this repeatedly is safe.
        """
        # Browser first, then the driver that owns it.
        for attr_name, shutdown_name in (("_browser", "close"), ("_playwright", "stop")):
            handle = getattr(self, attr_name)
            if handle is None:
                continue
            with suppress(Exception):
                getattr(handle, shutdown_name)()
            setattr(self, attr_name, None)

    def _start_browser(self) -> None:
        """Start Playwright and launch Chromium, storing both handles.

        Raises:
            RuntimeError: If Playwright cannot be imported, or if starting the
                driver / launching Chromium fails (partially started resources
                are released first).
        """
        try:
            from playwright.sync_api import sync_playwright
        except Exception as exc:
            raise RuntimeError(
                "Playwright is not installed. Install with `python3 -m pip install -e '.[playwright]'`."
            ) from exc

        try:
            driver = sync_playwright().start()
            self._playwright = driver
            self._browser = driver.chromium.launch()
        except Exception as exc:
            # Do not leave a half-started driver behind.
            self.close()
            raise RuntimeError(
                "Playwright failed to launch Chromium. Ensure `playwright install chromium` has been run."
            ) from exc

    def convert(
        self,
        input_path: Path | str,
        output_path: Path | str,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Convert one EPUB to PDF and return the conversion report.

        Keyword arguments take precedence over the client's defaults for this
        call only.
        """
        return convert_epub(
            self._build_config(input_path, output_path, **kwargs),
            engine=self._render_engine(),
        )

    def batch_convert(
        self,
        jobs: Iterable[tuple[Path | str, Path | str]],
        max_workers: int = 1,
        **kwargs: Any,
    ) -> list[dict[str, Any]]:
        """Convert several EPUBs and return their reports in job order.

        With ``max_workers == 1`` the jobs run sequentially in this process
        and share the pooled browser (if one is open). Any other value hands
        the jobs to a ``ProcessPoolExecutor``; the pooled browser is not
        passed to the workers.
        """
        configs = [self._build_config(src, dst, **kwargs) for src, dst in jobs]
        if max_workers != 1:
            with ProcessPoolExecutor(max_workers=max_workers) as pool:
                return list(pool.map(_convert_one, configs))

        shared_engine = self._render_engine()
        return [convert_epub(cfg, engine=shared_engine) for cfg in configs]

    def _render_engine(self) -> Renderer | None:
        """Return a renderer bound to the pooled browser, or ``None`` if there is none."""
        if self.engine != "playwright" or self._browser is None:
            return None
        return PlaywrightEngine(browser=self._browser)

    def _build_config(
        self,
        input_path: Path | str,
        output_path: Path | str,
        **kwargs: Any,
    ) -> ConvertConfig:
        """Merge client defaults with per-call overrides into a ``ConvertConfig``.

        Precedence, lowest to highest: named constructor settings, extra
        constructor ``**defaults``, per-call ``**kwargs``.
        """
        settings: dict[str, Any] = {
            "page_size": self.page_size,
            "margin_mm": self.margin_mm,
            "cover": self.cover,
            "validate": self.validate,
            "verbose": self.verbose,
        }
        settings.update(self._defaults)
        settings.update(kwargs)
        return ConvertConfig(
            input_path=Path(input_path),
            output_path=Path(output_path),
            engine=self.engine,
            **settings,
        )
epub2pdf_cli/cli.py ADDED
@@ -0,0 +1,223 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import logging
6
+ import sys
7
+ from collections.abc import Sequence
8
+ from pathlib import Path
9
+
10
+ from epub2pdf_cli import __version__
11
+ from epub2pdf_cli.config import (
12
+ BatchConfig,
13
+ ConvertConfig,
14
+ InspectConfig,
15
+ PdfExtractConfig,
16
+ PdfExtractFormat,
17
+ )
18
+ from epub2pdf_cli.errors import Epub2PdfError, ExitCode
19
+ from epub2pdf_cli.pipeline import batch_convert, convert_epub, extract_pdf, inspect_epub
20
+
21
+ PDF_EXTRACT_FORMATS: tuple[PdfExtractFormat, ...] = (
22
+ "markdown",
23
+ "json",
24
+ "text",
25
+ "html",
26
+ "markdown-with-html",
27
+ "markdown-with-images",
28
+ "tagged-pdf",
29
+ )
30
+
31
+
32
def build_parser() -> argparse.ArgumentParser:
    """Build the ``epub2pdf`` command-line parser.

    The parser requires one of four subcommands (stored in ``args.command``):
    ``convert``, ``batch``, ``inspect`` and ``pdf-extract``. Every subcommand
    accepts ``--verbose``.
    """

    def add_render_engine(target: argparse.ArgumentParser) -> None:
        # Same backend switch for `convert` and `batch`.
        target.add_argument(
            "--engine",
            choices=("playwright", "weasyprint"),
            default="weasyprint",
            help="Rendering backend. Default: weasyprint.",
        )

    def add_layout_options(target: argparse.ArgumentParser) -> None:
        # Page layout and validation flags shared by `convert` and `batch`.
        target.add_argument("--page-size", choices=("A4", "Letter"), default="A4")
        target.add_argument("--margin-mm", type=int, default=12)
        target.add_argument("--cover", choices=("first", "none"), default="first")
        target.add_argument("--no-validate", action="store_true", help="Skip PDF validation after rendering.")

    def add_verbose(target: argparse.ArgumentParser) -> None:
        target.add_argument("--verbose", action="store_true", help="Enable verbose logs.")

    root = argparse.ArgumentParser(prog="epub2pdf", description="Convert EPUB files into machine-readable PDFs.")
    root.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    commands = root.add_subparsers(dest="command", required=True)

    # --- convert -----------------------------------------------------------
    convert_cmd = commands.add_parser("convert", help="Render an EPUB into PDF.")
    convert_cmd.add_argument("input", help="Path to the input .epub file.")
    convert_cmd.add_argument(
        "-o",
        "--output",
        help="Path to the output PDF. Defaults to the input basename with .pdf.",
    )
    add_render_engine(convert_cmd)
    convert_cmd.add_argument("--sidecar-json", help="Write structured conversion output JSON to this path.")
    convert_cmd.add_argument("--sidecar-html", help="Write the normalized merged HTML to this path.")
    convert_cmd.add_argument("--sidecar-markdown", help="Write a Markdown version of the EPUB to this path.")
    add_layout_options(convert_cmd)
    convert_cmd.add_argument(
        "--force",
        action="store_true",
        help="Overwrite the output file if it already exists.",
    )
    add_verbose(convert_cmd)

    # --- batch -------------------------------------------------------------
    batch_cmd = commands.add_parser("batch", help="Convert multiple EPUBs in parallel.")
    batch_cmd.add_argument("inputs", nargs="+", help="Paths to input .epub files.")
    batch_cmd.add_argument("-o", "--output-dir", required=True, help="Directory for output PDFs.")
    add_render_engine(batch_cmd)
    batch_cmd.add_argument(
        "-j",
        "--workers",
        type=int,
        default=1,
        help="Number of parallel worker processes. Default: 1.",
    )
    # In batch mode the sidecar options are switches, not paths.
    batch_cmd.add_argument("--sidecar-json", action="store_true", help="Write a JSON report next to each PDF.")
    batch_cmd.add_argument("--sidecar-html", action="store_true", help="Write merged HTML next to each PDF.")
    batch_cmd.add_argument("--sidecar-markdown", action="store_true", help="Write Markdown next to each PDF.")
    add_layout_options(batch_cmd)
    batch_cmd.add_argument("--force", action="store_true", help="Overwrite existing output files.")
    add_verbose(batch_cmd)

    # --- inspect -----------------------------------------------------------
    inspect_cmd = commands.add_parser("inspect", help="Inspect EPUB metadata, manifest, spine, and TOC.")
    inspect_cmd.add_argument("input", help="Path to the input .epub file.")
    inspect_cmd.add_argument("--json", help="Write inspection output JSON to this path. Defaults to stdout.")
    add_verbose(inspect_cmd)

    # --- pdf-extract -------------------------------------------------------
    extract_cmd = commands.add_parser("pdf-extract", help="Extract Markdown/JSON/HTML from a PDF.")
    extract_cmd.add_argument("input", help="Path to the input .pdf file.")
    extract_cmd.add_argument(
        "-o",
        "--output-dir",
        help="Directory for extracted files. Defaults to <pdf-stem>_extracted.",
    )
    extract_cmd.add_argument(
        "--engine",
        choices=("pypdfium2", "docling", "pdfplumber", "opendataloader"),
        default="pypdfium2",
        help="Extraction backend. Default: pypdfium2.",
    )
    extract_cmd.add_argument(
        "--format",
        default="markdown,json",
        help="Comma-separated output formats. Default: markdown,json.",
    )
    extract_cmd.add_argument("--pages", help='Pages to extract, for example "1,3,5-7".')
    extract_cmd.add_argument("--password", help="Password for encrypted PDF files.")
    extract_cmd.add_argument(
        "--use-struct-tree",
        action="store_true",
        help="Use tagged PDF structure tree when available.",
    )
    extract_cmd.add_argument(
        "--sanitize",
        action="store_true",
        help="Sanitize emails, phone numbers, IPs, cards, and URLs.",
    )
    extract_cmd.add_argument("--keep-line-breaks", action="store_true", help="Preserve original text line breaks.")
    extract_cmd.add_argument("--include-header-footer", action="store_true", help="Include page headers and footers.")
    extract_cmd.add_argument(
        "--detect-strikethrough",
        action="store_true",
        help="Detect strikethrough text in Markdown/HTML.",
    )
    extract_cmd.add_argument("--table-method", choices=("default", "cluster"), help="Table detection method.")
    extract_cmd.add_argument(
        "--reading-order",
        choices=("off", "xycut"),
        default="xycut",
        help="Reading order algorithm.",
    )
    # `%%` is argparse's escape for a literal percent sign in help text.
    extract_cmd.add_argument(
        "--markdown-page-separator",
        help="Separator between Markdown pages; supports %%page-number%%.",
    )
    extract_cmd.add_argument(
        "--html-page-separator",
        help="Separator between HTML pages; supports %%page-number%%.",
    )
    extract_cmd.add_argument("--image-output", choices=("off", "embedded", "external"), default="external")
    extract_cmd.add_argument("--image-dir", help="Directory for extracted images.")
    extract_cmd.add_argument("--threads", type=int, help="Worker thread count for native extraction.")
    extract_cmd.add_argument("--sidecar-json", help="Write structured extraction report JSON to this path.")
    extract_cmd.add_argument("--force", action="store_true", help="Overwrite existing extraction outputs.")
    add_verbose(extract_cmd)
    return root
98
+
99
+
100
def main(argv: Sequence[str] | None = None) -> int:
    """Run the CLI and return its exit code.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``ExitCode.OK`` on success. A batch with at least one failure returns
        ``ExitCode.UNEXPECTED``. An ``Epub2PdfError`` is printed to stderr and
        its own ``exit_code`` returned; any other exception is logged with a
        traceback, summarised on stderr, and mapped to ``ExitCode.UNEXPECTED``.
    """
    parser = build_parser()
    args = parser.parse_args(None if argv is None else list(argv))
    _configure_logging(args.verbose)
    log = logging.getLogger(__name__)

    def optional_path(raw: str | None) -> Path | None:
        # Unset (or empty) CLI values mean "no path".
        return Path(raw) if raw else None

    try:
        if args.command == "convert":
            source = Path(args.input)
            destination = Path(args.output) if args.output else source.with_suffix(".pdf")
            convert_config = ConvertConfig(
                input_path=source,
                output_path=destination,
                engine=args.engine,
                sidecar_json_path=optional_path(args.sidecar_json),
                sidecar_html_path=optional_path(args.sidecar_html),
                sidecar_markdown_path=optional_path(args.sidecar_markdown),
                page_size=args.page_size,
                margin_mm=args.margin_mm,
                cover=args.cover,
                validate=not args.no_validate,
                force=args.force,
                verbose=args.verbose,
            )
            report = convert_epub(convert_config)
            print(str(destination))
            log.debug("Conversion report: %s", json.dumps(report, ensure_ascii=False))
            return ExitCode.OK

        if args.command == "batch":
            batch_config = BatchConfig(
                input_paths=[Path(raw) for raw in args.inputs],
                output_dir=Path(args.output_dir),
                engine=args.engine,
                workers=args.workers,
                sidecar_json=args.sidecar_json,
                sidecar_html=args.sidecar_html,
                sidecar_markdown=args.sidecar_markdown,
                page_size=args.page_size,
                margin_mm=args.margin_mm,
                cover=args.cover,
                validate=not args.no_validate,
                force=args.force,
                verbose=args.verbose,
            )
            report = batch_convert(batch_config)
            # Only results that carry an output path are echoed.
            for entry in report["results"]:
                written = entry.get("output", {}).get("path")
                if written:
                    print(written)
            log.debug("Batch report: %s", json.dumps(report, ensure_ascii=False))
            if report["failures"] == 0:
                return ExitCode.OK
            return ExitCode.UNEXPECTED

        if args.command == "inspect":
            inspect_config = InspectConfig(
                input_path=Path(args.input),
                json_path=optional_path(args.json),
            )
            report = inspect_epub(inspect_config)
            if not args.json:
                # No --json target: dump the report to stdout instead.
                json.dump(report, sys.stdout, ensure_ascii=False, indent=2)
                sys.stdout.write("\n")
            return ExitCode.OK

        if args.command == "pdf-extract":
            pdf_path = Path(args.input)
            if args.output_dir:
                extract_dir = Path(args.output_dir)
            else:
                extract_dir = pdf_path.with_name(f"{pdf_path.stem}_extracted")
            extract_config = PdfExtractConfig(
                input_path=pdf_path,
                output_dir=extract_dir,
                formats=_parse_pdf_formats(args.format),
                engine=args.engine,
                pages=args.pages,
                password=args.password,
                use_struct_tree=args.use_struct_tree,
                sanitize=args.sanitize,
                keep_line_breaks=args.keep_line_breaks,
                include_header_footer=args.include_header_footer,
                detect_strikethrough=args.detect_strikethrough,
                table_method=args.table_method,
                reading_order=args.reading_order,
                markdown_page_separator=args.markdown_page_separator,
                html_page_separator=args.html_page_separator,
                image_output=args.image_output,
                image_dir=optional_path(args.image_dir),
                threads=args.threads,
                sidecar_json_path=optional_path(args.sidecar_json),
                force=args.force,
                verbose=args.verbose,
            )
            report = extract_pdf(extract_config)
            for produced in report["outputs"]:
                print(produced)
            return ExitCode.OK
    except Epub2PdfError as exc:
        print(str(exc), file=sys.stderr)
        return exc.exit_code
    except Exception as exc:
        log.exception("Unexpected error")
        print(f"Unexpected error: {exc}", file=sys.stderr)
        return ExitCode.UNEXPECTED

    # Unreachable because subparsers are required, but kept for safety.
    parser.error(f"Unsupported command: {args.command}")
    return ExitCode.USAGE
208
+
209
+
210
+ def _configure_logging(verbose: bool) -> None:
211
+ level = logging.DEBUG if verbose else logging.WARNING
212
+ logging.basicConfig(level=level, format="%(levelname)s %(message)s")
213
+
214
+
215
def _parse_pdf_formats(raw: str) -> list[PdfExtractFormat]:
    """Parse the comma-separated ``--format`` value into a list of format names.

    Surrounding whitespace is trimmed and empty entries are dropped; order and
    duplicates are kept as given.

    Raises:
        Epub2PdfError: With ``ExitCode.USAGE`` when no format remains after
            trimming, or when any name is not in ``PDF_EXTRACT_FORMATS``.
    """
    requested = [token for token in (piece.strip() for piece in raw.split(",")) if token]
    if not requested:
        raise Epub2PdfError("At least one --format value is required.", exit_code=ExitCode.USAGE)

    rejected = [name for name in requested if name not in PDF_EXTRACT_FORMATS]
    if not rejected:
        return requested  # type: ignore[return-value]

    allowed = ", ".join(PDF_EXTRACT_FORMATS)
    raise Epub2PdfError(
        f"Unsupported PDF extract format(s): {', '.join(rejected)}. Allowed: {allowed}",
        exit_code=ExitCode.USAGE,
    )
epub2pdf_cli/config.py ADDED
@@ -0,0 +1,109 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Literal
6
+
7
# Closed sets of string values accepted by the config dataclasses in this module.

# Rendering backends for EPUB -> PDF conversion.
EngineName = Literal["playwright", "weasyprint"]
# Cover modes.
CoverMode = Literal["first", "none"]
# Supported page sizes.
PageSize = Literal["A4", "Letter"]
# Output formats that PDF extraction can be asked to produce.
PdfExtractFormat = Literal[
    "markdown", "json", "text", "html",
    "markdown-with-html", "markdown-with-images", "tagged-pdf",
]
# Image output modes for PDF extraction.
ImageOutputMode = Literal["off", "embedded", "external"]
# Table detection methods for PDF extraction.
TableMethod = Literal["default", "cluster"]
# Reading-order algorithms for PDF extraction.
ReadingOrder = Literal["off", "xycut"]
# PDF extraction backends.
PdfExtractorName = Literal["pypdfium2", "docling", "pdfplumber", "opendataloader"]
23
+
24
+
25
+ @dataclass(frozen=True, slots=True)
26
+ class ConvertConfig:
27
+ input_path: Path
28
+ output_path: Path
29
+ engine: EngineName = "weasyprint"
30
+ sidecar_json_path: Path | None = None
31
+ sidecar_html_path: Path | None = None
32
+ sidecar_markdown_path: Path | None = None
33
+ page_size: PageSize = "A4"
34
+ margin_mm: int = 12
35
+ cover: CoverMode = "first"
36
+ validate: bool = True
37
+ force: bool = False
38
+ verbose: bool = False
39
+
40
+ def __post_init__(self) -> None:
41
+ if self.margin_mm < 0:
42
+ raise ValueError("margin_mm must be non-negative")
43
+
44
+
45
+ @dataclass(frozen=True, slots=True)
46
+ class BatchConfig:
47
+ input_paths: list[Path]
48
+ output_dir: Path
49
+ engine: EngineName = "weasyprint"
50
+ workers: int = 1
51
+ sidecar_json: bool = False
52
+ sidecar_html: bool = False
53
+ sidecar_markdown: bool = False
54
+ page_size: PageSize = "A4"
55
+ margin_mm: int = 12
56
+ cover: CoverMode = "first"
57
+ validate: bool = True
58
+ force: bool = False
59
+ verbose: bool = False
60
+
61
+ def __post_init__(self) -> None:
62
+ if self.margin_mm < 0:
63
+ raise ValueError("margin_mm must be non-negative")
64
+ if self.workers < 1:
65
+ raise ValueError("workers must be at least 1")
66
+
67
+
68
+ @dataclass(frozen=True, slots=True)
69
+ class InspectConfig:
70
+ input_path: Path
71
+ json_path: Path | None = None
72
+
73
+
74
+ @dataclass(frozen=True, slots=True)
75
+ class PdfExtractConfig:
76
+ input_path: Path
77
+ output_dir: Path
78
+ formats: list[PdfExtractFormat]
79
+ engine: PdfExtractorName = "pypdfium2"
80
+ pages: str | None = None
81
+ password: str | None = None
82
+ use_struct_tree: bool = False
83
+ sanitize: bool = False
84
+ keep_line_breaks: bool = False
85
+ include_header_footer: bool = False
86
+ detect_strikethrough: bool = False
87
+ table_method: TableMethod | None = None
88
+ reading_order: ReadingOrder | None = "xycut"
89
+ markdown_page_separator: str | None = None
90
+ html_page_separator: str | None = None
91
+ image_output: ImageOutputMode | None = "external"
92
+ image_dir: Path | None = None
93
+ threads: int | None = None
94
+ sidecar_json_path: Path | None = None
95
+ force: bool = False
96
+ verbose: bool = False
97
+
98
+
99
+ @dataclass(frozen=True, slots=True)
100
+ class RenderOptions:
101
+ output_path: Path
102
+ page_size: PageSize
103
+ margin_mm: int
104
+ cover: CoverMode
105
+ title: str = ""
106
+
107
+ def __post_init__(self) -> None:
108
+ if self.margin_mm < 0:
109
+ raise ValueError("margin_mm must be non-negative")
@@ -0,0 +1,3 @@
1
+ from epub2pdf_cli.epub.parser import read_epub
2
+
3
+ __all__ = ["read_epub"]
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+
6
+ from bs4 import BeautifulSoup
7
+
8
+ from epub2pdf_cli.errors import StageError
9
+ from epub2pdf_cli.models import Chapter, ManifestItem, SpineItem
10
+
11
+ TEXT_MEDIA_TYPES = {"application/xhtml+xml", "text/html"}
12
+ UNSUPPORTED_TAGS = {"script", "audio", "video", "canvas", "iframe", "form"}
13
+
14
+
15
def read_chapters(
    spine: Iterable[SpineItem],
    manifest: dict[str, ManifestItem],
) -> tuple[list[Chapter], list[str]]:
    """Build ``Chapter`` objects for the XHTML/HTML documents in the spine.

    Spine items whose media type is not in ``TEXT_MEDIA_TYPES`` are skipped.
    Each remaining item is looked up in ``manifest`` by ``idref`` and parsed
    to derive a title, plain text and per-chapter warnings.

    Returns:
        ``(chapters, warnings)`` in spine order.

    Raises:
        StageError: If a text spine item has no content, or if no chapter was
            produced at all.
    """
    collected: list[Chapter] = []
    notes: list[str] = []

    text_entries = (entry for entry in spine if entry.media_type in TEXT_MEDIA_TYPES)
    for entry in text_entries:
        resource = manifest[entry.idref]
        if not resource.content:
            raise StageError("spine", f"Spine item is missing chapter content: {resource.href}")

        parsed = BeautifulSoup(resource.content, "lxml")
        # Title falls back to the file stem when the document has none.
        chapter_title = _extract_title(parsed, fallback=Path(resource.href).stem)
        plain_text = parsed.get_text(" ", strip=True)
        notes.extend(_chapter_warnings(parsed, resource.href, plain_text))

        # The stored HTML is the raw payload decoded as UTF-8 (undecodable
        # bytes are replaced), not the re-serialised parse tree.
        raw_html = resource.content.decode("utf-8", errors="replace")
        collected.append(
            Chapter(
                idref=entry.idref,
                href=resource.href,
                media_type=entry.media_type,
                title=chapter_title,
                html=raw_html,
                text=plain_text,
                linear=entry.linear,
            )
        )

    if collected:
        return collected, notes
    raise StageError("spine", "Spine did not contain any XHTML/HTML documents")
46
+
47
+
48
def _extract_title(soup: BeautifulSoup, fallback: str) -> str:
    """Return the first non-empty text among ``<title>``, ``<h1>``, ``<h2>``.

    The tags are tried in that order, using the first occurrence of each;
    ``fallback`` is returned when none yields text.
    """
    for tag_name in ("title", "h1", "h2"):
        element = soup.find(tag_name)
        if not element:
            continue
        candidate = str(element.get_text(" ", strip=True))
        if candidate:
            return candidate
    return fallback
56
+
57
+
58
def _chapter_warnings(soup: BeautifulSoup, href: str, text: str) -> list[str]:
    """Collect warnings about one parsed chapter.

    Emits one warning when the chapter has no text but contains an ``<img>``,
    followed by one warning per distinct tag from ``UNSUPPORTED_TAGS`` found
    in the document, in alphabetical order.
    """
    notes: list[str] = []
    if not text.strip() and soup.find("img"):
        notes.append(f"Chapter is image-heavy or image-only: {href}")

    # A set collapses repeated occurrences: one warning per tag name.
    present = {node.name for node in soup.find_all(True) if node.name in UNSUPPORTED_TAGS}
    notes.extend(f"Unsupported <{name}> content detected in {href}" for name in sorted(present))
    return notes
69
+
70
+
71
def manifest_warnings(manifest: dict[str, ManifestItem]) -> list[str]:
    """Warn about manifest items whose media type may not carry over to the PDF.

    An item is considered supported when its media type is XHTML/HTML, CSS,
    any ``image/*`` or ``font/*`` type, the NCX type, or one of the legacy
    ``application/*`` font types that EPUBs commonly declare. Media types are
    compared case-insensitively, since they are case-insensitive by
    definition.

    Args:
        manifest: Manifest items keyed by id; only ``href`` and ``media_type``
            of each item are read.

    Returns:
        One warning string per unsupported item, in manifest iteration order.
    """
    supported_prefixes = ("application/xhtml+xml", "text/html", "text/css", "image/", "font/")
    supported_exact = frozenset(
        {
            "application/x-dtbncx+xml",
            # Pre-``font/*`` font media types listed among the EPUB 3 core
            # media types; without them every embedded font that uses the
            # older names triggers a spurious warning.
            "application/font-sfnt",
            "application/font-woff",
            "application/vnd.ms-opentype",
        }
    )

    warnings: list[str] = []
    for item in manifest.values():
        media_type = item.media_type.lower()
        # str.startswith accepts a tuple, so one call covers every prefix.
        if media_type.startswith(supported_prefixes) or media_type in supported_exact:
            continue
        # Report the media type exactly as declared in the manifest.
        warnings.append(f"Manifest item may not be fully represented in PDF: {item.href} ({item.media_type})")
    return warnings
@@ -0,0 +1,25 @@
1
+ from __future__ import annotations
2
+
3
+ import zipfile
4
+ from xml.etree import ElementTree as ET
5
+
6
+ from epub2pdf_cli.errors import ExitCode, StageError
7
+
8
CONTAINER_NS = {"c": "urn:oasis:names:tc:opendocument:xmlns:container"}


def read_rootfile_path(archive: zipfile.ZipFile) -> str:
    """Return the package-document path declared in ``META-INF/container.xml``.

    The first ``<rootfile>`` whose ``media-type`` is
    ``application/oebps-package+xml`` and which has a non-empty ``full-path``
    is preferred, so a container that lists a non-OPF alternate first still
    resolves to the package document. When no rootfile is typed that way, the
    first ``<rootfile>`` is used.

    Args:
        archive: Open EPUB archive.

    Returns:
        The ``full-path`` attribute of the selected rootfile.

    Raises:
        StageError: If ``container.xml`` is missing (``ExitCode.USAGE``),
            cannot be parsed, or declares no usable rootfile.
    """
    try:
        container_bytes = archive.read("META-INF/container.xml")
    except KeyError as exc:
        raise StageError("container", "Missing required EPUB resource: META-INF/container.xml", exit_code=ExitCode.USAGE) from exc

    # NOTE(review): xml.etree does not protect against maliciously
    # constructed XML (e.g. entity expansion); EPUBs are untrusted input.
    try:
        container = ET.fromstring(container_bytes)
    except ET.ParseError as exc:
        raise StageError("container", "Unable to parse META-INF/container.xml") from exc

    rootfiles = container.findall("c:rootfiles/c:rootfile", CONTAINER_NS)
    package_rootfile = next(
        (
            node
            for node in rootfiles
            if node.attrib.get("media-type") == "application/oebps-package+xml"
            and node.attrib.get("full-path")
        ),
        None,
    )
    if package_rootfile is not None:
        chosen = package_rootfile
    else:
        # No explicitly typed package document: fall back to the first entry.
        chosen = rootfiles[0] if rootfiles else None

    if chosen is None or not chosen.attrib.get("full-path"):
        raise StageError("container", "container.xml does not declare a rootfile")
    return chosen.attrib["full-path"]
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ import posixpath
4
+
5
+
6
def split_href(href: str) -> tuple[str, str]:
    """Split ``href`` at its first ``#`` into ``(path, fragment)``.

    The fragment is returned without the ``#``; it is the empty string when
    ``href`` contains no ``#``.
    """
    path, _, fragment = href.partition("#")
    return path, fragment
11
+
12
+
13
def resolve_relative_href(base_href: str, target: str) -> str:
    """Resolve ``target`` against the directory of ``base_href``.

    Args:
        base_href: Path of the document that contains the link.
        target: Link as written in that document.

    Returns:
        * ``base_href`` when ``target`` is empty;
        * ``target`` unchanged when it is an absolute reference, i.e. it
          contains ``://`` or starts with a URI scheme such as ``mailto:``,
          ``data:`` or ``tel:``;
        * otherwise the normalised POSIX path of ``target`` relative to the
          directory of ``base_href``, with any ``#fragment`` re-attached. A
          fragment-only target resolves to ``base_href#fragment``.
    """
    if not target:
        return base_href
    # Any scheme-qualified reference is absolute. Joining e.g. a ``data:`` URI
    # onto the base directory and normalising it would corrupt the reference.
    if "://" in target or _has_uri_scheme(target):
        return target

    path, _, fragment = target.partition("#")
    if path:
        resolved_path = posixpath.normpath(posixpath.join(posixpath.dirname(base_href), path))
    else:
        resolved_path = base_href
    return f"{resolved_path}#{fragment}" if fragment else resolved_path


def _has_uri_scheme(reference: str) -> bool:
    """Return True when ``reference`` starts with an RFC 3986 ``scheme:`` prefix."""
    scheme, colon, _ = reference.partition(":")
    return (
        bool(colon)
        and scheme.isascii()
        and scheme[:1].isalpha()
        and all(char.isalnum() or char in "+-." for char in scheme)
    )