epub2pdf-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub2pdf_cli/__init__.py +5 -0
- epub2pdf_cli/__main__.py +4 -0
- epub2pdf_cli/api.py +160 -0
- epub2pdf_cli/cli.py +223 -0
- epub2pdf_cli/config.py +109 -0
- epub2pdf_cli/epub/__init__.py +3 -0
- epub2pdf_cli/epub/chapters.py +81 -0
- epub2pdf_cli/epub/container.py +25 -0
- epub2pdf_cli/epub/href.py +24 -0
- epub2pdf_cli/epub/opf.py +159 -0
- epub2pdf_cli/epub/parser.py +64 -0
- epub2pdf_cli/epub/toc.py +101 -0
- epub2pdf_cli/errors.py +27 -0
- epub2pdf_cli/html/__init__.py +3 -0
- epub2pdf_cli/html/builder.py +190 -0
- epub2pdf_cli/html/css.py +49 -0
- epub2pdf_cli/html/links.py +144 -0
- epub2pdf_cli/html/template.py +92 -0
- epub2pdf_cli/io_utils.py +24 -0
- epub2pdf_cli/markdown.py +97 -0
- epub2pdf_cli/mcp_server.py +189 -0
- epub2pdf_cli/models.py +116 -0
- epub2pdf_cli/pdf/__init__.py +5 -0
- epub2pdf_cli/pdf/extract.py +79 -0
- epub2pdf_cli/pdf/extractors/__init__.py +0 -0
- epub2pdf_cli/pdf/extractors/base.py +23 -0
- epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
- epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
- epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
- epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
- epub2pdf_cli/pdf/text.py +45 -0
- epub2pdf_cli/pdf/validate.py +37 -0
- epub2pdf_cli/pipeline/__init__.py +6 -0
- epub2pdf_cli/pipeline/batch.py +84 -0
- epub2pdf_cli/pipeline/convert.py +122 -0
- epub2pdf_cli/pipeline/extract.py +64 -0
- epub2pdf_cli/pipeline/inspect.py +15 -0
- epub2pdf_cli/render/__init__.py +17 -0
- epub2pdf_cli/render/options.py +19 -0
- epub2pdf_cli/render/playwright.py +91 -0
- epub2pdf_cli/render/protocol.py +13 -0
- epub2pdf_cli/render/weasyprint.py +28 -0
- epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
- epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
- epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
- epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
- epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
- epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0
epub2pdf_cli/__init__.py
ADDED
epub2pdf_cli/__main__.py
ADDED
epub2pdf_cli/api.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Programmatic API for epub2pdf.
|
|
2
|
+
|
|
3
|
+
The :class:`Epub2Pdf` client provides a reusable context manager. When the
|
|
4
|
+
``playwright`` engine is selected, the browser instance is launched once and
|
|
5
|
+
reused across conversions, avoiding per-call launch overhead.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Iterable
|
|
11
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
12
|
+
from contextlib import suppress
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from types import TracebackType
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from epub2pdf_cli.config import ConvertConfig, EngineName, PageSize
|
|
18
|
+
from epub2pdf_cli.pipeline.batch import _convert_one
|
|
19
|
+
from epub2pdf_cli.pipeline.convert import convert_epub
|
|
20
|
+
from epub2pdf_cli.render.playwright import PlaywrightEngine
|
|
21
|
+
from epub2pdf_cli.render.protocol import Renderer
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Epub2Pdf:
    """High-level client for converting EPUB files to PDF.

    Use as a context manager when ``engine="playwright"`` to keep a single
    browser process alive for multiple conversions:

        with Epub2Pdf(engine="playwright") as client:
            report1 = client.convert("a.epub", "a.pdf")
            report2 = client.convert("b.epub", "b.pdf")

    The WeasyPrint engine does not require context-manager entry.

    The constructor arguments are per-client defaults; they are merged into a
    ``ConvertConfig`` for every conversion and can be overridden per call.
    """

    def __init__(
        self,
        engine: EngineName = "weasyprint",
        *,
        page_size: PageSize = "A4",
        margin_mm: int = 12,
        cover: str = "first",
        validate: bool = True,
        verbose: bool = False,
        **defaults: Any,
    ) -> None:
        """Store the default conversion settings; no browser is started here.

        Args:
            engine: Rendering backend used for every conversion.
            page_size: Default page size.
            margin_mm: Default page margin in millimetres.
            cover: Default cover mode.
            validate: Default for the ``validate`` config field.
            verbose: Default for the ``verbose`` config field.
            **defaults: Extra ``ConvertConfig`` fields applied to every call.
        """
        self.engine = engine
        self.page_size = page_size
        self.margin_mm = margin_mm
        self.cover = cover
        self.validate = validate
        self.verbose = verbose
        self._defaults = defaults
        # Pooled Playwright handles; both stay None unless _start_browser runs.
        self._browser: Any | None = None
        self._playwright: Any | None = None

    def __enter__(self) -> Epub2Pdf:
        """Start the pooled browser (Playwright engine only) and return self."""
        if self.engine == "playwright":
            self._start_browser()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        """Release pooled resources; exceptions from the body are not suppressed."""
        self.close()

    def close(self) -> None:
        """Release any pooled browser resources.

        Shutdown is best-effort: errors raised while closing the browser or
        stopping Playwright are deliberately ignored so that cleanup always
        completes and both handles are reset to ``None``.
        """
        if self._browser is not None:
            with suppress(Exception):
                self._browser.close()
            self._browser = None
        if self._playwright is not None:
            with suppress(Exception):
                self._playwright.stop()
            self._playwright = None

    def _start_browser(self) -> None:
        """Launch the pooled Chromium instance if it is not already running.

        Raises:
            RuntimeError: If Playwright cannot be imported or Chromium fails
                to launch.
        """
        if self._browser is not None:
            # Already pooled (e.g. the context was entered twice). Starting a
            # second instance would orphan the first one, and a failed second
            # start would tear down the browser that is currently working.
            return

        try:
            from playwright.sync_api import sync_playwright
        except Exception as exc:
            raise RuntimeError(
                "Playwright is not installed. Install with `python3 -m pip install -e '.[playwright]'`."
            ) from exc

        try:
            self._playwright = sync_playwright().start()
            self._browser = self._playwright.chromium.launch()
        except Exception as exc:
            # Undo a partial start (Playwright running but no browser).
            self.close()
            raise RuntimeError(
                "Playwright failed to launch Chromium. Ensure `playwright install chromium` has been run."
            ) from exc

    def convert(
        self,
        input_path: Path | str,
        output_path: Path | str,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Convert a single EPUB to PDF.

        Keyword arguments override the client's default settings for this call.

        Returns:
            The report produced by ``convert_epub``.
        """
        config = self._build_config(input_path, output_path, **kwargs)
        engine = self._render_engine()
        return convert_epub(config, engine=engine)

    def batch_convert(
        self,
        jobs: Iterable[tuple[Path | str, Path | str]],
        max_workers: int = 1,
        **kwargs: Any,
    ) -> list[dict[str, Any]]:
        """Convert multiple EPUBs.

        When ``max_workers`` is greater than 1, worker processes are used. For
        Playwright this means each worker starts its own browser; the client's
        pooled browser is only reused when ``max_workers`` is 1.

        Args:
            jobs: ``(input_path, output_path)`` pairs.
            max_workers: Number of worker processes; 1 runs in-process.
            **kwargs: Per-batch overrides applied to every job.

        Returns a list of conversion reports in the same order as ``jobs``.
        """
        # Configs are built eagerly so invalid settings fail before any work starts.
        configs = [self._build_config(input_path, output_path, **kwargs) for input_path, output_path in jobs]
        if max_workers == 1:
            engine = self._render_engine()
            return [convert_epub(config, engine=engine) for config in configs]

        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            return list(executor.map(_convert_one, configs))

    def _render_engine(self) -> Renderer | None:
        """Return a renderer bound to the pooled browser, or ``None``.

        ``None`` is passed through to ``convert_epub`` unchanged, which
        presumably selects its own renderer from the config in that case.
        """
        if self.engine == "playwright" and self._browser is not None:
            return PlaywrightEngine(browser=self._browser)
        return None

    def _build_config(
        self,
        input_path: Path | str,
        output_path: Path | str,
        **kwargs: Any,
    ) -> ConvertConfig:
        """Merge client defaults with per-call overrides into a ``ConvertConfig``.

        Precedence, lowest to highest: named constructor arguments, extra
        constructor ``**defaults``, then the per-call ``**kwargs``.
        """
        merged = {
            "page_size": self.page_size,
            "margin_mm": self.margin_mm,
            "cover": self.cover,
            "validate": self.validate,
            "verbose": self.verbose,
            **self._defaults,
            **kwargs,
        }
        return ConvertConfig(
            input_path=Path(input_path),
            output_path=Path(output_path),
            engine=self.engine,
            **merged,
        )
|
epub2pdf_cli/cli.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
from collections.abc import Sequence
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from epub2pdf_cli import __version__
|
|
11
|
+
from epub2pdf_cli.config import (
|
|
12
|
+
BatchConfig,
|
|
13
|
+
ConvertConfig,
|
|
14
|
+
InspectConfig,
|
|
15
|
+
PdfExtractConfig,
|
|
16
|
+
PdfExtractFormat,
|
|
17
|
+
)
|
|
18
|
+
from epub2pdf_cli.errors import Epub2PdfError, ExitCode
|
|
19
|
+
from epub2pdf_cli.pipeline import batch_convert, convert_epub, extract_pdf, inspect_epub
|
|
20
|
+
|
|
21
|
+
PDF_EXTRACT_FORMATS: tuple[PdfExtractFormat, ...] = (
|
|
22
|
+
"markdown",
|
|
23
|
+
"json",
|
|
24
|
+
"text",
|
|
25
|
+
"html",
|
|
26
|
+
"markdown-with-html",
|
|
27
|
+
"markdown-with-images",
|
|
28
|
+
"tagged-pdf",
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``epub2pdf`` command-line parser.

    The parser exposes four required subcommands: ``convert``, ``batch``,
    ``inspect`` and ``pdf-extract``. ``--margin-mm`` and ``--workers`` are
    range-checked while parsing, so out-of-range values are reported as
    argparse usage errors instead of surfacing later as a ``ValueError`` from
    the config dataclasses.
    """
    parser = argparse.ArgumentParser(prog="epub2pdf", description="Convert EPUB files into machine-readable PDFs.")
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Bounds mirror the checks in ConvertConfig / BatchConfig.__post_init__.
    margin_type = _int_at_least(0)
    workers_type = _int_at_least(1)

    convert_parser = subparsers.add_parser("convert", help="Render an EPUB into PDF.")
    convert_parser.add_argument("input", help="Path to the input .epub file.")
    convert_parser.add_argument("-o", "--output", help="Path to the output PDF. Defaults to the input basename with .pdf.")
    convert_parser.add_argument("--engine", choices=("playwright", "weasyprint"), default="weasyprint", help="Rendering backend. Default: weasyprint.")
    convert_parser.add_argument("--sidecar-json", help="Write structured conversion output JSON to this path.")
    convert_parser.add_argument("--sidecar-html", help="Write the normalized merged HTML to this path.")
    convert_parser.add_argument("--sidecar-markdown", help="Write a Markdown version of the EPUB to this path.")
    convert_parser.add_argument("--page-size", choices=("A4", "Letter"), default="A4")
    convert_parser.add_argument("--margin-mm", type=margin_type, default=12)
    convert_parser.add_argument("--cover", choices=("first", "none"), default="first")
    convert_parser.add_argument("--no-validate", action="store_true", help="Skip PDF validation after rendering.")
    convert_parser.add_argument("--force", action="store_true", help="Overwrite the output file if it already exists.")
    convert_parser.add_argument("--verbose", action="store_true", help="Enable verbose logs.")

    batch_parser = subparsers.add_parser("batch", help="Convert multiple EPUBs in parallel.")
    batch_parser.add_argument("inputs", nargs="+", help="Paths to input .epub files.")
    batch_parser.add_argument("-o", "--output-dir", required=True, help="Directory for output PDFs.")
    batch_parser.add_argument("--engine", choices=("playwright", "weasyprint"), default="weasyprint", help="Rendering backend. Default: weasyprint.")
    batch_parser.add_argument("-j", "--workers", type=workers_type, default=1, help="Number of parallel worker processes. Default: 1.")
    batch_parser.add_argument("--sidecar-json", action="store_true", help="Write a JSON report next to each PDF.")
    batch_parser.add_argument("--sidecar-html", action="store_true", help="Write merged HTML next to each PDF.")
    batch_parser.add_argument("--sidecar-markdown", action="store_true", help="Write Markdown next to each PDF.")
    batch_parser.add_argument("--page-size", choices=("A4", "Letter"), default="A4")
    batch_parser.add_argument("--margin-mm", type=margin_type, default=12)
    batch_parser.add_argument("--cover", choices=("first", "none"), default="first")
    batch_parser.add_argument("--no-validate", action="store_true", help="Skip PDF validation after rendering.")
    batch_parser.add_argument("--force", action="store_true", help="Overwrite existing output files.")
    batch_parser.add_argument("--verbose", action="store_true", help="Enable verbose logs.")

    inspect_parser = subparsers.add_parser("inspect", help="Inspect EPUB metadata, manifest, spine, and TOC.")
    inspect_parser.add_argument("input", help="Path to the input .epub file.")
    inspect_parser.add_argument("--json", help="Write inspection output JSON to this path. Defaults to stdout.")
    inspect_parser.add_argument("--verbose", action="store_true", help="Enable verbose logs.")

    pdf_parser = subparsers.add_parser("pdf-extract", help="Extract Markdown/JSON/HTML from a PDF.")
    pdf_parser.add_argument("input", help="Path to the input .pdf file.")
    pdf_parser.add_argument("-o", "--output-dir", help="Directory for extracted files. Defaults to <pdf-stem>_extracted.")
    pdf_parser.add_argument("--engine", choices=("pypdfium2", "docling", "pdfplumber", "opendataloader"), default="pypdfium2", help="Extraction backend. Default: pypdfium2.")
    # --format stays a free-form string here; it is validated after parsing.
    pdf_parser.add_argument(
        "--format",
        default="markdown,json",
        help="Comma-separated output formats. Default: markdown,json.",
    )
    pdf_parser.add_argument("--pages", help='Pages to extract, for example "1,3,5-7".')
    pdf_parser.add_argument("--password", help="Password for encrypted PDF files.")
    pdf_parser.add_argument("--use-struct-tree", action="store_true", help="Use tagged PDF structure tree when available.")
    pdf_parser.add_argument("--sanitize", action="store_true", help="Sanitize emails, phone numbers, IPs, cards, and URLs.")
    pdf_parser.add_argument("--keep-line-breaks", action="store_true", help="Preserve original text line breaks.")
    pdf_parser.add_argument("--include-header-footer", action="store_true", help="Include page headers and footers.")
    pdf_parser.add_argument("--detect-strikethrough", action="store_true", help="Detect strikethrough text in Markdown/HTML.")
    pdf_parser.add_argument("--table-method", choices=("default", "cluster"), help="Table detection method.")
    pdf_parser.add_argument("--reading-order", choices=("off", "xycut"), default="xycut", help="Reading order algorithm.")
    # "%%" is argparse's escape for a literal "%" inside help text.
    pdf_parser.add_argument("--markdown-page-separator", help="Separator between Markdown pages; supports %%page-number%%.")
    pdf_parser.add_argument("--html-page-separator", help="Separator between HTML pages; supports %%page-number%%.")
    pdf_parser.add_argument("--image-output", choices=("off", "embedded", "external"), default="external")
    pdf_parser.add_argument("--image-dir", help="Directory for extracted images.")
    pdf_parser.add_argument("--threads", type=int, help="Worker thread count for native extraction.")
    pdf_parser.add_argument("--sidecar-json", help="Write structured extraction report JSON to this path.")
    pdf_parser.add_argument("--force", action="store_true", help="Overwrite existing extraction outputs.")
    pdf_parser.add_argument("--verbose", action="store_true", help="Enable verbose logs.")
    return parser


def _int_at_least(minimum: int):
    """Return an argparse ``type=`` callable accepting integers >= ``minimum``."""

    def _parse(raw: str) -> int:
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"expected an integer, got {raw!r}") from None
        if value < minimum:
            raise argparse.ArgumentTypeError(f"must be at least {minimum}, got {value}")
        return value

    return _parse
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the ``epub2pdf`` CLI and return its process exit code.

    Args:
        argv: Argument vector without the program name; ``None`` lets
            argparse read ``sys.argv``.

    Returns:
        ``ExitCode.OK`` on success, the ``exit_code`` carried by an
        ``Epub2PdfError``, or ``ExitCode.UNEXPECTED`` for any other exception
        (and for a batch run that reports failures).
    """
    parser = build_parser()
    args = parser.parse_args(None if argv is None else list(argv))
    _configure_logging(args.verbose)

    dispatch = {
        "convert": _run_convert,
        "batch": _run_batch,
        "inspect": _run_inspect,
        "pdf-extract": _run_pdf_extract,
    }
    handler = dispatch.get(args.command)
    if handler is not None:
        try:
            return handler(args)
        except Epub2PdfError as exc:
            print(str(exc), file=sys.stderr)
            return exc.exit_code
        except Exception as exc:
            logging.getLogger(__name__).exception("Unexpected error")
            print(f"Unexpected error: {exc}", file=sys.stderr)
            return ExitCode.UNEXPECTED

    # Unreachable because subparsers are required, but kept for safety.
    parser.error(f"Unsupported command: {args.command}")
    return ExitCode.USAGE


def _optional_path(value: str | None) -> Path | None:
    """Wrap a non-empty CLI string in ``Path``; map missing/empty to ``None``."""
    return Path(value) if value else None


def _run_convert(args: argparse.Namespace) -> int:
    """Handle ``convert``: render one EPUB and print the output PDF path."""
    source = Path(args.input)
    output_path = Path(args.output) if args.output else source.with_suffix(".pdf")
    config = ConvertConfig(
        input_path=source,
        output_path=output_path,
        engine=args.engine,
        sidecar_json_path=_optional_path(args.sidecar_json),
        sidecar_html_path=_optional_path(args.sidecar_html),
        sidecar_markdown_path=_optional_path(args.sidecar_markdown),
        page_size=args.page_size,
        margin_mm=args.margin_mm,
        cover=args.cover,
        validate=not args.no_validate,
        force=args.force,
        verbose=args.verbose,
    )
    report = convert_epub(config)
    print(str(output_path))
    logging.getLogger(__name__).debug("Conversion report: %s", json.dumps(report, ensure_ascii=False))
    return ExitCode.OK


def _run_batch(args: argparse.Namespace) -> int:
    """Handle ``batch``: convert several EPUBs and print each produced path."""
    config = BatchConfig(
        input_paths=[Path(raw) for raw in args.inputs],
        output_dir=Path(args.output_dir),
        engine=args.engine,
        workers=args.workers,
        sidecar_json=args.sidecar_json,
        sidecar_html=args.sidecar_html,
        sidecar_markdown=args.sidecar_markdown,
        page_size=args.page_size,
        margin_mm=args.margin_mm,
        cover=args.cover,
        validate=not args.no_validate,
        force=args.force,
        verbose=args.verbose,
    )
    report = batch_convert(config)
    for result in report["results"]:
        # Results without an output path are skipped rather than printed as blanks.
        output_path = result.get("output", {}).get("path")
        if output_path:
            print(output_path)
    logging.getLogger(__name__).debug("Batch report: %s", json.dumps(report, ensure_ascii=False))
    if report["failures"] == 0:
        return ExitCode.OK
    return ExitCode.UNEXPECTED


def _run_inspect(args: argparse.Namespace) -> int:
    """Handle ``inspect``: dump the report to stdout unless ``--json`` was given."""
    config = InspectConfig(
        input_path=Path(args.input),
        json_path=_optional_path(args.json),
    )
    report = inspect_epub(config)
    if not args.json:
        json.dump(report, sys.stdout, ensure_ascii=False, indent=2)
        sys.stdout.write("\n")
    return ExitCode.OK


def _run_pdf_extract(args: argparse.Namespace) -> int:
    """Handle ``pdf-extract``: run extraction and print every reported output."""
    source = Path(args.input)
    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        output_dir = source.with_name(f"{source.stem}_extracted")
    config = PdfExtractConfig(
        input_path=source,
        output_dir=output_dir,
        formats=_parse_pdf_formats(args.format),
        engine=args.engine,
        pages=args.pages,
        password=args.password,
        use_struct_tree=args.use_struct_tree,
        sanitize=args.sanitize,
        keep_line_breaks=args.keep_line_breaks,
        include_header_footer=args.include_header_footer,
        detect_strikethrough=args.detect_strikethrough,
        table_method=args.table_method,
        reading_order=args.reading_order,
        markdown_page_separator=args.markdown_page_separator,
        html_page_separator=args.html_page_separator,
        image_output=args.image_output,
        image_dir=_optional_path(args.image_dir),
        threads=args.threads,
        sidecar_json_path=_optional_path(args.sidecar_json),
        force=args.force,
        verbose=args.verbose,
    )
    report = extract_pdf(config)
    for output in report["outputs"]:
        print(output)
    return ExitCode.OK
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _configure_logging(verbose: bool) -> None:
|
|
211
|
+
level = logging.DEBUG if verbose else logging.WARNING
|
|
212
|
+
logging.basicConfig(level=level, format="%(levelname)s %(message)s")
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _parse_pdf_formats(raw: str) -> list[PdfExtractFormat]:
    """Split a comma-separated ``--format`` value into validated format names.

    Surrounding whitespace is stripped and empty entries are dropped.

    Raises:
        Epub2PdfError: With ``ExitCode.USAGE`` when no format remains or when
            any name is not listed in ``PDF_EXTRACT_FORMATS``.
    """
    stripped = (piece.strip() for piece in raw.split(","))
    requested = [name for name in stripped if name]
    if not requested:
        raise Epub2PdfError("At least one --format value is required.", exit_code=ExitCode.USAGE)

    unknown = [name for name in requested if name not in PDF_EXTRACT_FORMATS]
    if unknown:
        allowed = ", ".join(PDF_EXTRACT_FORMATS)
        raise Epub2PdfError(f"Unsupported PDF extract format(s): {', '.join(unknown)}. Allowed: {allowed}", exit_code=ExitCode.USAGE)
    return requested  # type: ignore[return-value]
|
epub2pdf_cli/config.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
EngineName = Literal["playwright", "weasyprint"]
|
|
8
|
+
CoverMode = Literal["first", "none"]
|
|
9
|
+
PageSize = Literal["A4", "Letter"]
|
|
10
|
+
PdfExtractFormat = Literal[
|
|
11
|
+
"markdown",
|
|
12
|
+
"json",
|
|
13
|
+
"text",
|
|
14
|
+
"html",
|
|
15
|
+
"markdown-with-html",
|
|
16
|
+
"markdown-with-images",
|
|
17
|
+
"tagged-pdf",
|
|
18
|
+
]
|
|
19
|
+
ImageOutputMode = Literal["off", "embedded", "external"]
|
|
20
|
+
TableMethod = Literal["default", "cluster"]
|
|
21
|
+
ReadingOrder = Literal["off", "xycut"]
|
|
22
|
+
PdfExtractorName = Literal["pypdfium2", "docling", "pdfplumber", "opendataloader"]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True, slots=True)
|
|
26
|
+
class ConvertConfig:
|
|
27
|
+
input_path: Path
|
|
28
|
+
output_path: Path
|
|
29
|
+
engine: EngineName = "weasyprint"
|
|
30
|
+
sidecar_json_path: Path | None = None
|
|
31
|
+
sidecar_html_path: Path | None = None
|
|
32
|
+
sidecar_markdown_path: Path | None = None
|
|
33
|
+
page_size: PageSize = "A4"
|
|
34
|
+
margin_mm: int = 12
|
|
35
|
+
cover: CoverMode = "first"
|
|
36
|
+
validate: bool = True
|
|
37
|
+
force: bool = False
|
|
38
|
+
verbose: bool = False
|
|
39
|
+
|
|
40
|
+
def __post_init__(self) -> None:
|
|
41
|
+
if self.margin_mm < 0:
|
|
42
|
+
raise ValueError("margin_mm must be non-negative")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True, slots=True)
|
|
46
|
+
class BatchConfig:
|
|
47
|
+
input_paths: list[Path]
|
|
48
|
+
output_dir: Path
|
|
49
|
+
engine: EngineName = "weasyprint"
|
|
50
|
+
workers: int = 1
|
|
51
|
+
sidecar_json: bool = False
|
|
52
|
+
sidecar_html: bool = False
|
|
53
|
+
sidecar_markdown: bool = False
|
|
54
|
+
page_size: PageSize = "A4"
|
|
55
|
+
margin_mm: int = 12
|
|
56
|
+
cover: CoverMode = "first"
|
|
57
|
+
validate: bool = True
|
|
58
|
+
force: bool = False
|
|
59
|
+
verbose: bool = False
|
|
60
|
+
|
|
61
|
+
def __post_init__(self) -> None:
|
|
62
|
+
if self.margin_mm < 0:
|
|
63
|
+
raise ValueError("margin_mm must be non-negative")
|
|
64
|
+
if self.workers < 1:
|
|
65
|
+
raise ValueError("workers must be at least 1")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass(frozen=True, slots=True)
|
|
69
|
+
class InspectConfig:
|
|
70
|
+
input_path: Path
|
|
71
|
+
json_path: Path | None = None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(frozen=True, slots=True)
|
|
75
|
+
class PdfExtractConfig:
|
|
76
|
+
input_path: Path
|
|
77
|
+
output_dir: Path
|
|
78
|
+
formats: list[PdfExtractFormat]
|
|
79
|
+
engine: PdfExtractorName = "pypdfium2"
|
|
80
|
+
pages: str | None = None
|
|
81
|
+
password: str | None = None
|
|
82
|
+
use_struct_tree: bool = False
|
|
83
|
+
sanitize: bool = False
|
|
84
|
+
keep_line_breaks: bool = False
|
|
85
|
+
include_header_footer: bool = False
|
|
86
|
+
detect_strikethrough: bool = False
|
|
87
|
+
table_method: TableMethod | None = None
|
|
88
|
+
reading_order: ReadingOrder | None = "xycut"
|
|
89
|
+
markdown_page_separator: str | None = None
|
|
90
|
+
html_page_separator: str | None = None
|
|
91
|
+
image_output: ImageOutputMode | None = "external"
|
|
92
|
+
image_dir: Path | None = None
|
|
93
|
+
threads: int | None = None
|
|
94
|
+
sidecar_json_path: Path | None = None
|
|
95
|
+
force: bool = False
|
|
96
|
+
verbose: bool = False
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass(frozen=True, slots=True)
|
|
100
|
+
class RenderOptions:
|
|
101
|
+
output_path: Path
|
|
102
|
+
page_size: PageSize
|
|
103
|
+
margin_mm: int
|
|
104
|
+
cover: CoverMode
|
|
105
|
+
title: str = ""
|
|
106
|
+
|
|
107
|
+
def __post_init__(self) -> None:
|
|
108
|
+
if self.margin_mm < 0:
|
|
109
|
+
raise ValueError("margin_mm must be non-negative")
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from bs4 import BeautifulSoup
|
|
7
|
+
|
|
8
|
+
from epub2pdf_cli.errors import StageError
|
|
9
|
+
from epub2pdf_cli.models import Chapter, ManifestItem, SpineItem
|
|
10
|
+
|
|
11
|
+
TEXT_MEDIA_TYPES = {"application/xhtml+xml", "text/html"}
|
|
12
|
+
UNSUPPORTED_TAGS = {"script", "audio", "video", "canvas", "iframe", "form"}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_chapters(
    spine: Iterable[SpineItem],
    manifest: dict[str, ManifestItem],
) -> tuple[list[Chapter], list[str]]:
    """Build ``Chapter`` objects for every XHTML/HTML document in the spine.

    Spine items whose media type is not in ``TEXT_MEDIA_TYPES`` are skipped.

    Args:
        spine: Spine entries, in reading order.
        manifest: Manifest items keyed by id; each spine ``idref`` is looked
            up here.

    Returns:
        ``(chapters, warnings)`` where ``warnings`` collects the per-chapter
        messages produced by ``_chapter_warnings``.

    Raises:
        StageError: Stage ``"spine"`` when a spine entry references an id that
            is not in the manifest, when a chapter has no content, or when no
            XHTML/HTML document was found at all.
    """
    chapters: list[Chapter] = []
    warnings: list[str] = []
    for item in spine:
        if item.media_type not in TEXT_MEDIA_TYPES:
            continue
        manifest_item = manifest.get(item.idref)
        if manifest_item is None:
            # A dangling idref is a malformed-EPUB condition; report it as a
            # pipeline error like the checks below, not as a bare KeyError.
            raise StageError("spine", f"Spine item references unknown manifest id: {item.idref}")
        if not manifest_item.content:
            raise StageError("spine", f"Spine item is missing chapter content: {manifest_item.href}")
        soup = BeautifulSoup(manifest_item.content, "lxml")
        title = _extract_title(soup, fallback=Path(manifest_item.href).stem)
        text = soup.get_text(" ", strip=True)
        warnings.extend(_chapter_warnings(soup, manifest_item.href, text))
        chapters.append(
            Chapter(
                idref=item.idref,
                href=manifest_item.href,
                media_type=item.media_type,
                title=title,
                # The raw markup is kept as text; undecodable bytes become U+FFFD.
                html=manifest_item.content.decode("utf-8", errors="replace"),
                text=text,
                linear=item.linear,
            )
        )
    if not chapters:
        raise StageError("spine", "Spine did not contain any XHTML/HTML documents")
    return chapters, warnings
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _extract_title(soup: BeautifulSoup, fallback: str) -> str:
|
|
49
|
+
for selector in ("title", "h1", "h2"):
|
|
50
|
+
node = soup.find(selector)
|
|
51
|
+
if node:
|
|
52
|
+
text = str(node.get_text(" ", strip=True))
|
|
53
|
+
if text:
|
|
54
|
+
return text
|
|
55
|
+
return fallback
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _chapter_warnings(soup: BeautifulSoup, href: str, text: str) -> list[str]:
    """Collect warnings about content in one chapter.

    A chapter with no text but at least one ``<img>`` is flagged as
    image-only, and each distinct tag listed in ``UNSUPPORTED_TAGS`` that
    occurs in the document yields one warning, in alphabetical order.
    """
    notes: list[str] = []
    is_blank = not text.strip()
    if is_blank and soup.find("img"):
        notes.append(f"Chapter is image-heavy or image-only: {href}")

    unsupported = {
        element.name
        for element in soup.find_all(True)
        if element.name in UNSUPPORTED_TAGS
    }
    notes.extend(
        f"Unsupported <{tag_name}> content detected in {href}"
        for tag_name in sorted(unsupported)
    )
    return notes
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def manifest_warnings(manifest: dict[str, ManifestItem]) -> list[str]:
    """List manifest items whose media type falls outside the supported set.

    An item is considered supported when its media type starts with one of
    the known prefixes (XHTML, HTML, CSS, ``image/``, ``font/``) or is exactly
    the NCX media type; every other item yields one warning.
    """
    known_prefixes = ("application/xhtml+xml", "text/html", "text/css", "image/", "font/")
    known_exact = {"application/x-dtbncx+xml"}

    def _is_supported(media_type: str) -> bool:
        return media_type.startswith(known_prefixes) or media_type in known_exact

    return [
        f"Manifest item may not be fully represented in PDF: {entry.href} ({entry.media_type})"
        for entry in manifest.values()
        if not _is_supported(entry.media_type)
    ]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import zipfile
|
|
4
|
+
from xml.etree import ElementTree as ET
|
|
5
|
+
|
|
6
|
+
from epub2pdf_cli.errors import ExitCode, StageError
|
|
7
|
+
|
|
8
|
+
CONTAINER_NS = {"c": "urn:oasis:names:tc:opendocument:xmlns:container"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def read_rootfile_path(archive: zipfile.ZipFile) -> str:
    """Return the ``full-path`` of the first rootfile declared in container.xml.

    Raises:
        StageError: Stage ``"container"`` when ``META-INF/container.xml`` is
            missing (with ``ExitCode.USAGE``), cannot be parsed, or declares
            no rootfile with a non-empty ``full-path``.
    """
    container_name = "META-INF/container.xml"
    try:
        raw_xml = archive.read(container_name)
    except KeyError as exc:
        raise StageError("container", "Missing required EPUB resource: META-INF/container.xml", exit_code=ExitCode.USAGE) from exc

    try:
        root = ET.fromstring(raw_xml)
    except ET.ParseError as exc:
        raise StageError("container", "Unable to parse META-INF/container.xml") from exc

    rootfile = root.find("c:rootfiles/c:rootfile", CONTAINER_NS)
    full_path = rootfile.attrib.get("full-path") if rootfile is not None else None
    if not full_path:
        raise StageError("container", "container.xml does not declare a rootfile")
    return full_path
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import posixpath
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def split_href(href: str) -> tuple[str, str]:
    """Split ``href`` at its first ``#`` into ``(path, fragment)``.

    The fragment is ``""`` when ``href`` contains no ``#``.
    """
    path, _, fragment = href.partition("#")
    return path, fragment
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def resolve_relative_href(base_href: str, target: str) -> str:
    """Resolve ``target`` against the document at ``base_href``.

    Args:
        base_href: Path of the document containing the link.
        target: The link as written in that document.

    Returns:
        * ``base_href`` when ``target`` is empty;
        * ``target`` unchanged when it is an absolute URI (it contains
          ``://`` or starts with a URI scheme such as ``mailto:``, ``tel:``
          or ``data:``);
        * otherwise the normalised POSIX path of ``target`` relative to the
          directory of ``base_href``, with any ``#fragment`` re-attached. A
          fragment-only target resolves to ``base_href#fragment``.
    """
    if not target:
        return base_href

    # A leading "scheme:" (RFC 3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ))
    # marks an absolute URI. Checking the general form, not only "mailto:",
    # keeps links such as "tel:" or "data:" from being joined onto the base
    # directory and mangled by normpath. A colon after "/" or "#" does not count.
    scheme, colon, _ = target.partition(":")
    has_scheme = (
        bool(colon)
        and scheme.isascii()
        and scheme[:1].isalpha()
        and all(ch.isalnum() or ch in "+-." for ch in scheme)
    )
    if "://" in target or has_scheme:
        return target

    path, _, fragment = target.partition("#")
    if path:
        resolved_path = posixpath.normpath(posixpath.join(posixpath.dirname(base_href), path))
    else:
        resolved_path = base_href
    return f"{resolved_path}#{fragment}" if fragment else resolved_path
|