epub2pdf-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. epub2pdf_cli/__init__.py +5 -0
  2. epub2pdf_cli/__main__.py +4 -0
  3. epub2pdf_cli/api.py +160 -0
  4. epub2pdf_cli/cli.py +223 -0
  5. epub2pdf_cli/config.py +109 -0
  6. epub2pdf_cli/epub/__init__.py +3 -0
  7. epub2pdf_cli/epub/chapters.py +81 -0
  8. epub2pdf_cli/epub/container.py +25 -0
  9. epub2pdf_cli/epub/href.py +24 -0
  10. epub2pdf_cli/epub/opf.py +159 -0
  11. epub2pdf_cli/epub/parser.py +64 -0
  12. epub2pdf_cli/epub/toc.py +101 -0
  13. epub2pdf_cli/errors.py +27 -0
  14. epub2pdf_cli/html/__init__.py +3 -0
  15. epub2pdf_cli/html/builder.py +190 -0
  16. epub2pdf_cli/html/css.py +49 -0
  17. epub2pdf_cli/html/links.py +144 -0
  18. epub2pdf_cli/html/template.py +92 -0
  19. epub2pdf_cli/io_utils.py +24 -0
  20. epub2pdf_cli/markdown.py +97 -0
  21. epub2pdf_cli/mcp_server.py +189 -0
  22. epub2pdf_cli/models.py +116 -0
  23. epub2pdf_cli/pdf/__init__.py +5 -0
  24. epub2pdf_cli/pdf/extract.py +79 -0
  25. epub2pdf_cli/pdf/extractors/__init__.py +0 -0
  26. epub2pdf_cli/pdf/extractors/base.py +23 -0
  27. epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
  28. epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
  29. epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
  30. epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
  31. epub2pdf_cli/pdf/text.py +45 -0
  32. epub2pdf_cli/pdf/validate.py +37 -0
  33. epub2pdf_cli/pipeline/__init__.py +6 -0
  34. epub2pdf_cli/pipeline/batch.py +84 -0
  35. epub2pdf_cli/pipeline/convert.py +122 -0
  36. epub2pdf_cli/pipeline/extract.py +64 -0
  37. epub2pdf_cli/pipeline/inspect.py +15 -0
  38. epub2pdf_cli/render/__init__.py +17 -0
  39. epub2pdf_cli/render/options.py +19 -0
  40. epub2pdf_cli/render/playwright.py +91 -0
  41. epub2pdf_cli/render/protocol.py +13 -0
  42. epub2pdf_cli/render/weasyprint.py +28 -0
  43. epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
  44. epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
  45. epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
  46. epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
  47. epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
  48. epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from pypdf import PdfReader
7
+
8
+ from epub2pdf_cli.errors import StageError
9
+ from epub2pdf_cli.pdf.text import extract_text
10
+
11
+
12
def validate_pdf(output_path: Path, *, expect_text: bool) -> dict[str, Any]:
    """Check that a rendered PDF exists, can be read, and contains pages.

    Args:
        output_path: Location of the PDF to validate.
        expect_text: When true, a PDF without extractable text is rejected.

    Returns:
        A dict with ``page_count`` plus the ``has_text``, ``text_length``,
        ``extractor`` and ``text_sample`` values reported by ``extract_text``.

    Raises:
        StageError: If the file is missing, empty, unreadable, has no pages,
            or has no extractable text while ``expect_text`` is set.
    """
    if not output_path.exists():
        raise StageError("validate", f"Output PDF was not created: {output_path}")
    if output_path.stat().st_size == 0:
        raise StageError("validate", f"Output PDF is empty: {output_path}")

    try:
        reader = PdfReader(str(output_path))
        # Counting pages has to walk the document structure and can fail on a
        # damaged file just like opening it, so it is guarded the same way
        # and reported as a validation failure rather than a raw library error.
        page_count = len(reader.pages)
    except Exception as exc:
        raise StageError("validate", f"Unable to read output PDF: {output_path}") from exc

    if page_count <= 0:
        raise StageError("validate", "Output PDF does not contain any pages")

    extraction = extract_text(output_path, reader=reader)
    if expect_text and not extraction["has_text"]:
        raise StageError("validate", "Output PDF does not contain extractable text")

    return {
        "page_count": page_count,
        "has_text": extraction["has_text"],
        "text_length": extraction["text_length"],
        "extractor": extraction["extractor"],
        "text_sample": extraction["text_sample"],
    }
@@ -0,0 +1,6 @@
1
+ from epub2pdf_cli.pipeline.batch import batch_convert
2
+ from epub2pdf_cli.pipeline.convert import convert_epub
3
+ from epub2pdf_cli.pipeline.extract import extract_pdf
4
+ from epub2pdf_cli.pipeline.inspect import inspect_epub
5
+
6
+ __all__ = ["inspect_epub", "convert_epub", "extract_pdf", "batch_convert"]
@@ -0,0 +1,84 @@
1
+ """Batch conversion pipeline with optional process-level parallelism."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import time
7
+ from concurrent.futures import ProcessPoolExecutor
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from epub2pdf_cli.config import BatchConfig, ConvertConfig
13
+ from epub2pdf_cli.errors import Epub2PdfError
14
+ from epub2pdf_cli.pipeline.convert import convert_epub
15
+
16
+ LOGGER = logging.getLogger(__name__)
17
+
18
+
19
def batch_convert(config: BatchConfig) -> dict[str, Any]:
    """Convert every EPUB listed in ``config.input_paths`` and summarize the run.

    With ``config.workers == 1`` the conversions run one after another in the
    current process; otherwise they are mapped over a ``ProcessPoolExecutor``
    created with ``max_workers=config.workers``.

    Returns:
        A summary dict holding the engine, worker count, output directory,
        total wall time, success/failure counts, the per-file results (in
        input order) and a UTC completion timestamp.
    """
    config.output_dir.mkdir(parents=True, exist_ok=True)

    jobs = [_build_convert_config(path, config) for path in config.input_paths]
    started = time.perf_counter()

    if config.workers != 1:
        with ProcessPoolExecutor(max_workers=config.workers) as pool:
            reports = list(pool.map(_convert_one, jobs))
    else:
        reports = [_convert_one(job) for job in jobs]

    elapsed = round(time.perf_counter() - started, 3)

    # A per-file report carries an "error" key only when its conversion failed.
    failed = sum(1 for report in reports if "error" in report)
    succeeded = len(reports) - failed

    LOGGER.info(
        "Batch conversion finished: %d succeeded, %d failed, %.3fs total",
        succeeded,
        failed,
        elapsed,
    )

    summary: dict[str, Any] = {
        "engine": config.engine,
        "workers": config.workers,
        "output_dir": str(config.output_dir),
        "total_time": elapsed,
        "successes": succeeded,
        "failures": failed,
        "results": reports,
        "completed_at": datetime.now(timezone.utc).isoformat(),
    }
    return summary
53
+
54
+
55
def _build_convert_config(input_path: Path, batch_config: BatchConfig) -> ConvertConfig:
    """Derive the single-file conversion config for one input of a batch.

    All outputs are placed in ``batch_config.output_dir`` and named after the
    input file's stem; a sidecar path is set only when the matching batch flag
    is truthy.
    """
    out_dir = batch_config.output_dir
    stem = input_path.stem

    def sidecar(enabled: Any, extension: str) -> Path | None:
        # Sidecars sit next to the PDF and share its stem.
        return out_dir / f"{stem}.{extension}" if enabled else None

    return ConvertConfig(
        input_path=input_path,
        output_path=out_dir / f"{stem}.pdf",
        engine=batch_config.engine,
        sidecar_json_path=sidecar(batch_config.sidecar_json, "json"),
        sidecar_html_path=sidecar(batch_config.sidecar_html, "html"),
        sidecar_markdown_path=sidecar(batch_config.sidecar_markdown, "md"),
        page_size=batch_config.page_size,
        margin_mm=batch_config.margin_mm,
        cover=batch_config.cover,
        validate=batch_config.validate,
        force=batch_config.force,
        verbose=batch_config.verbose,
    )
72
+
73
+
74
def _convert_one(convert_config: ConvertConfig) -> dict[str, Any]:
    """Run one conversion, turning an ``Epub2PdfError`` into an error report.

    Only ``Epub2PdfError`` is handled here; any other exception propagates to
    the caller.
    """
    try:
        report = convert_epub(convert_config)
    except Epub2PdfError as exc:
        LOGGER.warning("Conversion failed for %s: %s", convert_config.input_path, exc)
        failure: dict[str, Any] = {
            "source": {"path": str(convert_config.input_path)},
            "output": {"path": str(convert_config.output_path), "error": str(exc)},
            "error": str(exc),
            "exit_code": exc.exit_code,
        }
        return failure
    else:
        return report
@@ -0,0 +1,122 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import time
5
+ from collections.abc import Callable
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+ from typing import Any, TypeVar
9
+
10
+ from epub2pdf_cli.config import ConvertConfig
11
+ from epub2pdf_cli.epub import read_epub
12
+ from epub2pdf_cli.errors import ExitCode, StageError
13
+ from epub2pdf_cli.html.builder import build_html
14
+ from epub2pdf_cli.io_utils import sha256, write_json, write_text
15
+ from epub2pdf_cli.markdown import build_markdown
16
+ from epub2pdf_cli.pdf import validate_pdf
17
+ from epub2pdf_cli.render import ENGINES
18
+ from epub2pdf_cli.render.options import RenderOptions
19
+ from epub2pdf_cli.render.protocol import Renderer
20
+
21
+ LOGGER = logging.getLogger(__name__)
22
+
23
+ T = TypeVar("T")
24
+
25
+
26
def convert_epub(config: ConvertConfig, engine: Renderer | None = None) -> dict[str, Any]:
    """Convert one EPUB to PDF and return a report describing the run.

    Stages run in this order: read the EPUB, build the HTML, write the
    Markdown sidecar (if configured), render the PDF, validate it (if
    enabled), then write the HTML and JSON sidecars (if configured).

    Args:
        config: Paths, engine name and rendering settings for this conversion.
        engine: Renderer to use. When omitted, one is instantiated from
            ``ENGINES`` using ``config.engine``.

    Returns:
        A report dict with ``source``, ``output``, ``html`` and
        ``converted_at`` entries.

    Raises:
        StageError: If the output already exists without ``force``, or if
            looking up and instantiating the engine raises ``KeyError``.
    """
    _check_output_path(config.output_path, force=config.force)

    stage_seconds: dict[str, float] = {}

    book, elapsed = _timed_stage("read_epub", lambda: read_epub(config.input_path))
    stage_seconds["read_epub"] = elapsed

    built, elapsed = _timed_stage("build_html", lambda: build_html(book, config))
    stage_seconds["build_html"] = elapsed

    markdown_target = config.sidecar_markdown_path
    if markdown_target:
        stage_seconds["markdown"] = _timed_stage_void(
            "markdown",
            lambda: write_text(markdown_target, build_markdown(book)),
        )

    render_options = RenderOptions(
        output_path=config.output_path,
        page_size=config.page_size,
        margin_mm=config.margin_mm,
        cover=config.cover,
        title=book.metadata.get("title") or "Untitled EPUB",
    )

    config.output_path.parent.mkdir(parents=True, exist_ok=True)

    renderer = engine
    if renderer is None:
        try:
            renderer = ENGINES[config.engine]()
        except KeyError as exc:
            hint = (
                f"Rendering engine '{config.engine}' is not installed. "
                f"Install with `python3 -m pip install -e '.[{config.engine}]'`."
            )
            raise StageError("convert", hint, exit_code=ExitCode.USAGE) from exc

    # The renderer's return value is not used; only the duration is recorded.
    stage_seconds["render"] = _timed_stage(
        "render",
        lambda: renderer.render(built.html, render_options),
    )[1]

    # A skipped validation is recorded as a zero-duration stage with no result.
    validation = None
    stage_seconds["validate_pdf"] = 0.0
    if config.validate:
        validation, elapsed = _timed_stage(
            "validate_pdf",
            lambda: validate_pdf(config.output_path, expect_text=True),
        )
        stage_seconds["validate_pdf"] = elapsed

    html_target = config.sidecar_html_path
    if html_target:
        write_text(html_target, built.html)

    source_info = {
        "path": str(config.input_path),
        "sha256": sha256(config.input_path),
    }
    output_info = {
        "path": str(config.output_path),
        "engine": config.engine,
        "validation": validation,
        "timings": stage_seconds,
    }
    html_info = {
        "chapters": built.chapters,
        "assets": built.assets,
        "warnings": built.warnings,
    }
    report: dict[str, Any] = {
        "source": source_info,
        "output": output_info,
        "html": html_info,
        "converted_at": datetime.now(timezone.utc).isoformat(),
    }

    json_target = config.sidecar_json_path
    if json_target:
        write_json(json_target, report)

    return report
98
+
99
+
100
+ def _check_output_path(output_path: Path, *, force: bool) -> None:
101
+ if output_path.exists() and not force:
102
+ raise StageError(
103
+ "convert",
104
+ f"Output already exists: {output_path}. Use --force to overwrite.",
105
+ exit_code=ExitCode.OUTPUT_EXISTS,
106
+ )
107
+
108
+
109
def _timed_stage(name: str, fn: Callable[[], T]) -> tuple[T, float]:
    """Call ``fn`` once and return its result with the elapsed seconds.

    The duration is rounded to three decimals and logged at INFO level. If
    ``fn`` raises, the exception propagates and nothing is logged.
    """
    began = time.perf_counter()
    outcome = fn()
    elapsed = round(time.perf_counter() - began, 3)
    LOGGER.info("Stage %s took %.3fs", name, elapsed)
    return outcome, elapsed
115
+
116
+
117
def _timed_stage_void(name: str, fn: Callable[[], None]) -> float:
    """Call ``fn`` for its side effects and return the elapsed seconds.

    Timing, rounding and logging are shared with ``_timed_stage``; only the
    (unused) result is dropped.
    """
    _unused, elapsed = _timed_stage(name, fn)
    return elapsed
@@ -0,0 +1,64 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import time
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from epub2pdf_cli.config import PdfExtractConfig
10
+ from epub2pdf_cli.errors import ExitCode, StageError
11
+ from epub2pdf_cli.io_utils import sha256, write_json
12
+ from epub2pdf_cli.pdf.extract import planned_extract_paths, run_pdf_extraction
13
+
14
+ LOGGER = logging.getLogger(__name__)
15
+
16
+
17
def extract_pdf(config: PdfExtractConfig) -> dict[str, Any]:
    """Extract content from a PDF and return a report about the outputs.

    The input path and the planned outputs are checked first, then
    ``run_pdf_extraction`` is timed as the ``pdf-extract`` stage. The report
    is also written to ``config.sidecar_json_path`` when that is set.

    Raises:
        StageError: If the pre-checks fail or the extraction returns no
            outputs.
    """
    _check_input_path(config.input_path, suffix=".pdf")
    _check_extract_outputs(config)

    stage_seconds: dict[str, float] = {}
    began = time.perf_counter()
    outputs = run_pdf_extraction(config, timings=stage_seconds)
    elapsed = round(time.perf_counter() - began, 3)
    stage_seconds["pdf-extract"] = elapsed
    LOGGER.info("Stage pdf-extract took %.3fs", elapsed)

    if not outputs:
        raise StageError("pdf-extract", f"No extraction outputs were created in: {config.output_dir}")

    source_info = {
        "path": str(config.input_path),
        "sha256": sha256(config.input_path),
        "extracted_at": datetime.now(timezone.utc).isoformat(),
    }
    report: dict[str, Any] = {
        "source": source_info,
        "formats": config.formats,
        "output_dir": str(config.output_dir),
        "outputs": outputs,
        "engine": config.engine,
        "mode": "local",
        "timings": stage_seconds,
    }

    json_target = config.sidecar_json_path
    if json_target:
        write_json(json_target, report)

    return report
48
+
49
+
50
+ def _check_input_path(path: Path, *, suffix: str) -> None:
51
+ if not path.exists():
52
+ raise StageError("pdf-extract", f"Input file does not exist: {path}", exit_code=ExitCode.USAGE)
53
+ if path.suffix.lower() != suffix:
54
+ raise StageError("pdf-extract", f"Expected a {suffix} input file: {path}", exit_code=ExitCode.USAGE)
55
+
56
+
57
+ def _check_extract_outputs(config: PdfExtractConfig) -> None:
58
+ if config.force:
59
+ return
60
+ planned = planned_extract_paths(config.input_path, config.output_dir, list(config.formats))
61
+ existing = [path for path in planned if path.exists()]
62
+ if existing:
63
+ formatted = ", ".join(str(path) for path in existing)
64
+ raise StageError("pdf-extract", f"Output already exists: {formatted}. Use --force to overwrite.", exit_code=ExitCode.OUTPUT_EXISTS)
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from epub2pdf_cli.config import InspectConfig
6
+ from epub2pdf_cli.epub import read_epub
7
+ from epub2pdf_cli.io_utils import write_json
8
+
9
+
10
def inspect_epub(config: InspectConfig) -> dict[str, Any]:
    """Read an EPUB and return its inspection report.

    The report comes from the parsed book's ``to_inspection_dict()`` and is
    also written as JSON when ``config.json_path`` is set.
    """
    report = read_epub(config.input_path).to_inspection_dict()
    json_target = config.json_path
    if json_target:
        write_json(json_target, report)
    return report
@@ -0,0 +1,17 @@
1
+ from epub2pdf_cli.render.options import RenderOptions
2
+ from epub2pdf_cli.render.protocol import Renderer
3
+ from epub2pdf_cli.render.weasyprint import WeasyPrintEngine
4
+
5
# Registry of available rendering engines, keyed by engine name.
ENGINES: dict[str, type[Renderer]] = {"weasyprint": WeasyPrintEngine}

# The Playwright engine is registered only when its module imports cleanly;
# any failure while importing it leaves the registry without that entry.
try:
    from epub2pdf_cli.render.playwright import PlaywrightEngine
except Exception:
    PlaywrightEngine = None  # type: ignore[misc,assignment]
else:
    ENGINES["playwright"] = PlaywrightEngine

__all__ = ["Renderer", "RenderOptions", "WeasyPrintEngine", "ENGINES"]
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ from epub2pdf_cli.config import CoverMode, PageSize
7
+
8
+
9
+ @dataclass(frozen=True, slots=True)
10
+ class RenderOptions:
11
+ output_path: Path
12
+ page_size: PageSize
13
+ margin_mm: int
14
+ cover: CoverMode
15
+ title: str = ""
16
+
17
+ def __post_init__(self) -> None:
18
+ if self.margin_mm < 0:
19
+ raise ValueError("margin_mm must be non-negative")
@@ -0,0 +1,91 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import suppress
4
+ from typing import Any
5
+
6
+ from playwright.sync_api import Browser, sync_playwright
7
+ from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
8
+
9
+ from epub2pdf_cli.errors import StageError
10
+ from epub2pdf_cli.render.options import RenderOptions
11
+
12
DEFAULT_TIMEOUT_MS = 120_000


class PlaywrightEngine:
    """Renderer that prints HTML to PDF through a Playwright browser page.

    A browser may be supplied by the caller, in which case this engine never
    closes it. Otherwise each ``render`` call launches Chromium and shuts it
    down again once the call finishes.
    """

    name = "playwright"

    def __init__(self, timeout_ms: int = DEFAULT_TIMEOUT_MS, *, browser: Browser | None = None) -> None:
        self.timeout_ms = timeout_ms
        self._provided_browser = browser
        # Set only while this engine holds a browser it launched itself.
        self._owned_browser: Browser | None = None
        self._owned_playwright: Any | None = None

    def render(self, html: str, options: RenderOptions) -> None:
        """Print ``html`` to ``options.output_path`` as a PDF.

        Raises:
            StageError: If launching the browser fails, the page operations
                time out, or any other error occurs while rendering.
        """
        if self._provided_browser is not None:
            browser, launched_here = self._provided_browser, False
        else:
            browser, launched_here = self._launch_browser()

        try:
            self._print_page(browser, html, options)
        except PlaywrightTimeoutError as exc:
            raise StageError(
                "render",
                f"Playwright rendering timed out after {self.timeout_ms}ms.",
            ) from exc
        except Exception as exc:
            raise StageError(
                "render",
                "Playwright rendering failed. Ensure `playwright install chromium` has been run.",
            ) from exc
        finally:
            if launched_here:
                self._close_owned_browser()

    def _print_page(self, browser: Any, html: str, options: RenderOptions) -> None:
        """Load ``html`` into a fresh page, print it, and always close the page."""
        page = browser.new_page()
        try:
            page.set_default_timeout(self.timeout_ms)
            page.set_content(html, wait_until="load")
            page.emulate_media(media="print")
            # The same margin is applied to all four edges.
            edge = f"{options.margin_mm}mm"
            page.pdf(
                path=str(options.output_path),
                format=options.page_size,
                print_background=True,
                prefer_css_page_size=True,
                margin={"top": edge, "bottom": edge, "left": edge, "right": edge},
                tagged=True,
                outline=False,
            )
        finally:
            page.close()

    def _launch_browser(self) -> tuple[Browser, bool]:
        """Start Playwright and Chromium; return the browser and an ownership flag."""
        try:
            driver = sync_playwright().start()
            self._owned_playwright = driver
            self._owned_browser = driver.chromium.launch()
        except Exception as exc:
            # Release whatever was started before the failure.
            self._close_owned_browser()
            raise StageError(
                "render",
                "Playwright failed to launch Chromium. Ensure `playwright install chromium` has been run.",
            ) from exc
        return self._owned_browser, True

    def _close_owned_browser(self) -> None:
        """Shut down the owned browser and driver, ignoring errors from either."""
        teardown = (("_owned_browser", "close"), ("_owned_playwright", "stop"))
        for attribute, method in teardown:
            resource = getattr(self, attribute)
            if resource:
                with suppress(Exception):
                    getattr(resource, method)()
                setattr(self, attribute, None)

    def __enter__(self) -> PlaywrightEngine:
        return self

    def __exit__(self, *exc: Any) -> None:
        self._close_owned_browser()
@@ -0,0 +1,13 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol, runtime_checkable
4
+
5
+ from epub2pdf_cli.render.options import RenderOptions
6
+
7
+
8
@runtime_checkable
class Renderer(Protocol):
    """Structural interface for rendering engines.

    Any object with a ``name`` attribute and a ``render`` method matches;
    no inheritance is required.
    """

    name: str

    def render(self, html: str, options: RenderOptions) -> None:
        """Render ``html`` using the settings in ``options``."""
@@ -0,0 +1,28 @@
1
+ from __future__ import annotations
2
+
3
+ from epub2pdf_cli.errors import ExitCode, StageError
4
+ from epub2pdf_cli.render.options import RenderOptions
5
+
6
+
7
class WeasyPrintEngine:
    """Renderer that produces the PDF with WeasyPrint."""

    name = "weasyprint"

    def render(self, html: str, options: RenderOptions) -> None:
        """Render ``html`` to ``options.output_path``.

        ``size``, ``margin`` and ``title`` are not WeasyPrint rendering
        options, so passing them as keyword arguments does not apply them.
        The page geometry is supplied as an ``@page`` rule in a user
        stylesheet instead, and the title is set on the rendered document's
        metadata when the HTML did not provide one.

        Raises:
            StageError: With ``ExitCode.USAGE`` if WeasyPrint cannot be
                imported, or a plain render-stage error if rendering fails.
        """
        # Imported lazily so this module can be loaded without WeasyPrint.
        try:
            from weasyprint import CSS, HTML
        except Exception as exc:
            raise StageError(
                "render",
                "WeasyPrint is not installed. Install with `python3 -m pip install -e '.[weasyprint]'`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        try:
            # NOTE(review): assumes page_size formats as a CSS page-size
            # keyword such as "A4" -- confirm against PageSize in config.
            # Passed as a user stylesheet, so an @page rule in the document's
            # own CSS is expected to keep precedence over this one.
            page_rule = CSS(
                string=f"@page {{ size: {options.page_size}; margin: {options.margin_mm}mm; }}"
            )
            document = HTML(string=html).render(stylesheets=[page_rule])
            # Keep a title declared by the HTML; only fill in a missing one.
            if options.title and not document.metadata.title:
                document.metadata.title = options.title
            document.write_pdf(str(options.output_path))
        except Exception as exc:
            raise StageError("render", "WeasyPrint rendering failed.") from exc