epub2pdf-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub2pdf_cli/__init__.py +5 -0
- epub2pdf_cli/__main__.py +4 -0
- epub2pdf_cli/api.py +160 -0
- epub2pdf_cli/cli.py +223 -0
- epub2pdf_cli/config.py +109 -0
- epub2pdf_cli/epub/__init__.py +3 -0
- epub2pdf_cli/epub/chapters.py +81 -0
- epub2pdf_cli/epub/container.py +25 -0
- epub2pdf_cli/epub/href.py +24 -0
- epub2pdf_cli/epub/opf.py +159 -0
- epub2pdf_cli/epub/parser.py +64 -0
- epub2pdf_cli/epub/toc.py +101 -0
- epub2pdf_cli/errors.py +27 -0
- epub2pdf_cli/html/__init__.py +3 -0
- epub2pdf_cli/html/builder.py +190 -0
- epub2pdf_cli/html/css.py +49 -0
- epub2pdf_cli/html/links.py +144 -0
- epub2pdf_cli/html/template.py +92 -0
- epub2pdf_cli/io_utils.py +24 -0
- epub2pdf_cli/markdown.py +97 -0
- epub2pdf_cli/mcp_server.py +189 -0
- epub2pdf_cli/models.py +116 -0
- epub2pdf_cli/pdf/__init__.py +5 -0
- epub2pdf_cli/pdf/extract.py +79 -0
- epub2pdf_cli/pdf/extractors/__init__.py +0 -0
- epub2pdf_cli/pdf/extractors/base.py +23 -0
- epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
- epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
- epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
- epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
- epub2pdf_cli/pdf/text.py +45 -0
- epub2pdf_cli/pdf/validate.py +37 -0
- epub2pdf_cli/pipeline/__init__.py +6 -0
- epub2pdf_cli/pipeline/batch.py +84 -0
- epub2pdf_cli/pipeline/convert.py +122 -0
- epub2pdf_cli/pipeline/extract.py +64 -0
- epub2pdf_cli/pipeline/inspect.py +15 -0
- epub2pdf_cli/render/__init__.py +17 -0
- epub2pdf_cli/render/options.py +19 -0
- epub2pdf_cli/render/playwright.py +91 -0
- epub2pdf_cli/render/protocol.py +13 -0
- epub2pdf_cli/render/weasyprint.py +28 -0
- epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
- epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
- epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
- epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
- epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
- epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from epub2pdf_cli.config import PdfExtractConfig
|
|
7
|
+
from epub2pdf_cli.errors import ExitCode, StageError
|
|
8
|
+
from epub2pdf_cli.pdf.extractors.base import Extractor
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def run_pdf_extraction(config: PdfExtractConfig, timings: dict[str, float] | None = None) -> list[str]:
    """Run the configured PDF extraction engine and return its result.

    The engine named by ``config.engine`` is resolved to an extractor, the
    tunable settings are read off ``config`` (settings whose value is ``None``
    are not forwarded), and the extractor's ``extract`` return value is passed
    back unchanged.

    Args:
        config: Extraction settings: engine name, input path, output
            directory, formats, page selection, password and the
            engine-specific options listed below.
        timings: Optional mapping handed through to the extractor.

    Returns:
        Whatever the selected extractor's ``extract`` call returns.
    """
    extractor = _select_extractor(config.engine)

    option_names = (
        "sanitize",
        "keep_line_breaks",
        "use_struct_tree",
        "table_method",
        "reading_order",
        "markdown_page_separator",
        "html_page_separator",
        "image_output",
        "image_dir",
        "include_header_footer",
        "detect_strikethrough",
        "threads",
    )
    # Read every setting first, then drop the unset ones so the extractor
    # only sees options that carry a value.
    collected = {option_name: getattr(config, option_name) for option_name in option_names}
    forwarded = {key: value for key, value in collected.items() if value is not None}

    return extractor.extract(
        config.input_path,
        config.output_dir,
        config.formats,
        pages=config.pages,
        password=config.password,
        options=forwarded,
        timings=timings,
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _select_extractor(name: str) -> Extractor:
    """Instantiate the extractor registered under ``name``.

    Args:
        name: Engine identifier; one of ``pypdfium2``, ``docling``,
            ``pdfplumber`` or ``opendataloader``.

    Returns:
        A new instance of the matching extractor class.

    Raises:
        StageError: With ``ExitCode.USAGE`` when ``name`` is not registered.
    """
    # The extractor modules are imported inside the function rather than at
    # module import time.
    from epub2pdf_cli.pdf.extractors.docling_extractor import DoclingExtractor
    from epub2pdf_cli.pdf.extractors.opendataloader_extractor import OpendataloaderExtractor
    from epub2pdf_cli.pdf.extractors.pdfplumber_extractor import PdfPlumberExtractor
    from epub2pdf_cli.pdf.extractors.pypdfium2_extractor import Pypdfium2Extractor

    registry: dict[str, type[Extractor]] = {
        "pypdfium2": Pypdfium2Extractor,
        "docling": DoclingExtractor,
        "pdfplumber": PdfPlumberExtractor,
        "opendataloader": OpendataloaderExtractor,
    }
    if name not in registry:
        raise StageError("pdf-extract", f"Unsupported extractor: {name}. Choose from {', '.join(registry)}.", exit_code=ExitCode.USAGE)
    extractor_cls = registry[name]
    return extractor_cls()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def find_extract_outputs(input_path: Path, output_dir: Path, formats: Sequence[str]) -> list[str]:
    """Locate the files an extraction run left in ``output_dir``.

    The paths predicted by ``planned_extract_paths`` are checked first. When
    none of them exists, every regular file in ``output_dir`` whose name
    starts with the input file's stem is returned instead, sorted by path.

    Args:
        input_path: The PDF that was extracted; only its stem is used.
        output_dir: Directory the extractor wrote into.
        formats: Requested output formats.

    Returns:
        The matching paths as strings (possibly empty).
    """
    from glob import escape as glob_escape

    existing = [path for path in planned_extract_paths(input_path, output_dir, formats) if path.exists()]
    if existing:
        return [str(path) for path in existing]

    # The stem is a literal file-name prefix, not a pattern: escape it so
    # names such as "report [draft]" or "what?" are not read as glob syntax.
    pattern = f"{glob_escape(input_path.stem)}*"
    return [str(path) for path in sorted(output_dir.glob(pattern)) if path.is_file()]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def planned_extract_paths(input_path: Path, output_dir: Path, formats: Sequence[str]) -> list[Path]:
    """Predict the output path for each requested format.

    Every recognised format maps to ``<output_dir>/<input stem><suffix>``.
    Unrecognised formats are skipped. Formats that share a suffix (the
    Markdown variants, ``pdf``/``tagged-pdf``) each contribute their own
    entry, so the result can contain the same path more than once.

    Args:
        input_path: Source file; only its stem is used.
        output_dir: Directory the outputs are expected in.
        formats: Requested format names, in order.

    Returns:
        Expected paths in the order of ``formats``.
    """
    suffix_by_format = {
        "json": ".json",
        "text": ".txt",
        "html": ".html",
        "pdf": ".pdf",
        "markdown": ".md",
        "markdown-with-html": ".md",
        "markdown-with-images": ".md",
        "tagged-pdf": ".pdf",
    }
    suffixes = (suffix_by_format.get(fmt) for fmt in formats)
    return [output_dir / f"{input_path.stem}{suffix}" for suffix in suffixes if suffix]
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@runtime_checkable
class Extractor(Protocol):
    """Structural interface shared by the PDF extraction backends.

    Decorated with ``runtime_checkable`` so ``isinstance`` checks against the
    protocol are allowed.
    """

    # Engine identifier (the implementations set e.g. "docling", "pdfplumber").
    name: str

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Extract content from ``input_path`` into ``output_dir``.

        Args:
            input_path: PDF to read.
            output_dir: Directory to write results into.
            formats: Requested output format names.
            pages: Optional page selection string.
            password: Optional password for the PDF.
            options: Optional engine-specific settings.
            timings: Optional mapping for per-stage durations.

        Returns:
            A list of strings; the implementations in this package return the
            paths of the files they produced.
        """
        ...
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable, Sequence
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, TypeVar
|
|
8
|
+
|
|
9
|
+
from epub2pdf_cli.errors import ExitCode, StageError
|
|
10
|
+
from epub2pdf_cli.pdf.extractors.base import Extractor
|
|
11
|
+
|
|
12
|
+
T = TypeVar("T")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DoclingExtractor(Extractor):
    """PDF extractor backed by Docling's ``DocumentConverter``.

    Markdown is taken from Docling's Markdown export, plain text and HTML are
    derived from that Markdown by the small converters on this class, and
    JSON is Docling's dict export.
    """

    name = "docling"

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Convert ``input_path`` with Docling and write the requested formats.

        Supported formats are ``markdown``, ``text``, ``html`` and ``json``;
        other names are ignored. ``pages``, ``password`` and ``options`` are
        accepted for interface compatibility but are not used here.

        Args:
            input_path: PDF to convert.
            output_dir: Directory for the outputs; created if missing.
            formats: Requested output format names.
            pages: Unused.
            password: Unused.
            options: Unused.
            timings: Optional mapping that receives per-stage durations.

        Returns:
            Paths of the files written, as strings.

        Raises:
            StageError: When Docling cannot be imported (``ExitCode.USAGE``)
                or the conversion status is not ``SUCCESS``.
        """
        try:
            from docling.datamodel.base_models import ConversionStatus
            from docling.document_converter import DocumentConverter
        except Exception as exc:
            # Any failure while importing Docling is reported as a missing install.
            raise StageError(
                "pdf-extract",
                "Docling is not installed. Install with `python3 -m pip install -e '.[docling]'`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        output_dir.mkdir(parents=True, exist_ok=True)
        base_name = input_path.stem

        converter = DocumentConverter()
        result, _ = _timed_stage(
            "convert_document",
            lambda: converter.convert(str(input_path)),
            timings,
        )

        if result.status != ConversionStatus.SUCCESS:
            raise StageError("pdf-extract", f"Docling conversion status: {result.status.name}")

        outputs: list[str] = []

        # Markdown, text and HTML all start from the same Markdown export, so
        # export once instead of once per format.
        md = ""
        if any(fmt in formats for fmt in ("markdown", "text", "html")):
            md, _ = _timed_stage("export_markdown", lambda: result.document.export_to_markdown(), timings)

        if "markdown" in formats:
            path = output_dir / f"{base_name}.md"
            _timed_stage_void("write_markdown", lambda: path.write_text(md, encoding="utf-8"), timings)
            outputs.append(str(path))

        if "text" in formats:
            text, _ = _timed_stage("markdown_to_text", lambda: self._markdown_to_text(md), timings)
            path = output_dir / f"{base_name}.txt"
            _timed_stage_void("write_text", lambda: path.write_text(text, encoding="utf-8"), timings)
            outputs.append(str(path))

        if "html" in formats:
            html, _ = _timed_stage("markdown_to_html", lambda: self._markdown_to_html(md), timings)
            path = output_dir / f"{base_name}.html"
            _timed_stage_void("write_html", lambda: path.write_text(html, encoding="utf-8"), timings)
            outputs.append(str(path))

        if "json" in formats:
            data, _ = _timed_stage("export_to_dict", lambda: result.document.export_to_dict(), timings)
            path = output_dir / f"{base_name}.json"
            _timed_stage_void("write_json", lambda: path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"), timings)
            outputs.append(str(path))

        return outputs

    def _markdown_to_text(self, markdown: str) -> str:
        """Strip leading heading, blockquote and list markers from each line.

        Only real markers are removed: ``#``..``######`` or ``-``/``*``
        followed by whitespace, and ``>``. Lines that merely begin with one
        of those characters (``**bold**``, ``-5``, ``#tag``) are kept intact,
        as is the indentation of lines without a marker.

        Args:
            markdown: Markdown source.

        Returns:
            The text with markers removed, lines joined by ``\\n``.
        """
        import re

        # One or more stacked markers, e.g. "> - item" or "  ## Title".
        markers = re.compile(r"^(?:[ \t]*(?:#{1,6}[ \t]+|>[ \t]?|[-*][ \t]+))+")
        return "\n".join(markers.sub("", line) for line in markdown.splitlines())

    def _markdown_to_html(self, markdown: str) -> str:
        """Render a small Markdown subset as a standalone HTML document.

        Handles ``#``..``######`` headings, ``-``/``*`` list items and plain
        paragraphs; all text is HTML-escaped. An open list is closed before
        any line that is not a list item, so headings never end up inside a
        ``<ul>``.

        Args:
            markdown: Markdown source.

        Returns:
            The HTML document as a string.
        """
        from html import escape

        lines: list[str] = []
        in_list = False
        for raw in markdown.splitlines():
            stripped = raw.strip()
            is_item = stripped.startswith(("- ", "* "))

            if in_list and not is_item:
                lines.append("</ul>")
                in_list = False

            if is_item:
                if not in_list:
                    lines.append("<ul>")
                    in_list = True
                lines.append(f"<li>{escape(stripped[2:])}</li>")
                continue

            # Number of leading '#' characters; a heading needs 1-6 of them
            # followed by a space.
            level = len(stripped) - len(stripped.lstrip("#"))
            if 1 <= level <= 6 and stripped[level:level + 1] == " ":
                lines.append(f"<h{level}>{escape(stripped[level + 1:])}</h{level}>")
            elif stripped:
                lines.append(f"<p>{escape(stripped)}</p>")

        if in_list:
            lines.append("</ul>")
        body = "\n".join(lines)
        return f"<!DOCTYPE html>\n<html><body>\n{body}\n</body></html>\n"
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _timed_stage(name: str, fn: Callable[[], T], timings: dict[str, float] | None) -> tuple[T, float]:
|
|
125
|
+
start = time.perf_counter()
|
|
126
|
+
result = fn()
|
|
127
|
+
duration = round(time.perf_counter() - start, 3)
|
|
128
|
+
if timings is not None:
|
|
129
|
+
timings[name] = duration
|
|
130
|
+
return result, duration
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _timed_stage_void(name: str, fn: Callable[[], object], timings: dict[str, float] | None) -> float:
|
|
134
|
+
start = time.perf_counter()
|
|
135
|
+
fn()
|
|
136
|
+
duration = round(time.perf_counter() - start, 3)
|
|
137
|
+
if timings is not None:
|
|
138
|
+
timings[name] = duration
|
|
139
|
+
return duration
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from collections.abc import Callable, Sequence
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, TypeVar
|
|
7
|
+
|
|
8
|
+
from epub2pdf_cli.errors import ExitCode, StageError
|
|
9
|
+
from epub2pdf_cli.pdf.extractors.base import Extractor
|
|
10
|
+
|
|
11
|
+
T = TypeVar("T")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class OpendataloaderExtractor(Extractor):
    """PDF extractor that delegates to ``opendataloader_pdf.convert``."""

    name = "opendataloader"

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Run opendataloader-pdf and report the files found afterwards.

        Args:
            input_path: PDF to convert.
            output_dir: Directory for the outputs; created if missing.
            formats: Format names, passed on as one comma-joined string.
            pages: Page selection, passed through as given.
            password: PDF password, passed through as given.
            options: Optional settings; boolean switches default to ``False``
                and the remaining ones to ``None`` when absent.
            timings: Optional mapping that receives per-stage durations.

        Returns:
            The paths reported by ``find_extract_outputs``.

        Raises:
            StageError: With ``ExitCode.USAGE`` when ``opendataloader_pdf``
                cannot be imported.
        """
        try:
            import opendataloader_pdf
        except Exception as exc:
            raise StageError(
                "pdf-extract",
                "opendataloader-pdf is not installed. Install with `python3 -m pip install -e '.[legacy-pdf]'`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        output_dir.mkdir(parents=True, exist_ok=True)
        settings = options or {}
        image_dir = settings.get("image_dir")

        def _run_conversion() -> None:
            call_kwargs: dict[str, Any] = {
                "input_path": str(input_path),
                "output_dir": str(output_dir),
                "password": password,
                "format": ",".join(formats),
                "quiet": True,
                "pages": pages,
                "image_dir": str(image_dir) if image_dir else None,
            }
            # Boolean switches fall back to False when not supplied.
            for flag in (
                "sanitize",
                "keep_line_breaks",
                "use_struct_tree",
                "include_header_footer",
                "detect_strikethrough",
            ):
                call_kwargs[flag] = settings.get(flag, False)
            # Everything else falls back to None.
            for key in (
                "table_method",
                "reading_order",
                "markdown_page_separator",
                "html_page_separator",
                "image_output",
                "threads",
            ):
                call_kwargs[key] = settings.get(key)
            opendataloader_pdf.convert(**call_kwargs)

        _timed_stage_void("opendataloader_convert", _run_conversion, timings)

        from epub2pdf_cli.pdf.extract import find_extract_outputs

        found, _ = _timed_stage("find_outputs", lambda: find_extract_outputs(input_path, output_dir, formats), timings)
        return found
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _timed_stage(name: str, fn: Callable[[], T], timings: dict[str, float] | None) -> tuple[T, float]:
|
|
72
|
+
start = time.perf_counter()
|
|
73
|
+
result = fn()
|
|
74
|
+
duration = round(time.perf_counter() - start, 3)
|
|
75
|
+
if timings is not None:
|
|
76
|
+
timings[name] = duration
|
|
77
|
+
return result, duration
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _timed_stage_void(name: str, fn: Callable[[], object], timings: dict[str, float] | None) -> float:
|
|
81
|
+
start = time.perf_counter()
|
|
82
|
+
fn()
|
|
83
|
+
duration = round(time.perf_counter() - start, 3)
|
|
84
|
+
if timings is not None:
|
|
85
|
+
timings[name] = duration
|
|
86
|
+
return duration
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable, Sequence
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, TypeVar
|
|
8
|
+
|
|
9
|
+
from epub2pdf_cli.errors import ExitCode, StageError
|
|
10
|
+
from epub2pdf_cli.pdf.extractors.base import Extractor
|
|
11
|
+
|
|
12
|
+
T = TypeVar("T")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PdfPlumberExtractor(Extractor):
    """PDF extractor built on ``pdfplumber``.

    Produces plain text, a paragraph-per-block Markdown and HTML rendering of
    that text, and a JSON document with per-page text and tables.
    """

    name = "pdfplumber"

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Extract the requested formats from ``input_path``.

        Supported formats are ``text``, ``markdown``, ``html`` and ``json``;
        other names are ignored. ``options`` is accepted for interface
        compatibility but is not used here.

        Args:
            input_path: PDF to read.
            output_dir: Directory for the outputs; created if missing.
            formats: Requested output format names.
            pages: Optional 1-based page selection such as ``"1,3-5"``;
                all pages when empty or ``None``.
            password: Optional password handed to ``pdfplumber.open``.
            options: Unused.
            timings: Optional mapping that receives per-stage durations.

        Returns:
            Paths of the files written, as strings.

        Raises:
            StageError: When pdfplumber cannot be imported
                (``ExitCode.USAGE``) or the PDF cannot be opened.
        """
        try:
            import pdfplumber
        except Exception as exc:
            raise StageError(
                "pdf-extract",
                "pdfplumber is not installed. Install with `python3 -m pip install -e '.[pdfplumber]'`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        output_dir.mkdir(parents=True, exist_ok=True)
        base_name = input_path.stem

        try:
            pdf = _timed_stage("open_pdf", lambda: pdfplumber.open(str(input_path), password=password), timings)[0]
        except Exception as exc:
            raise StageError("pdf-extract", f"Unable to open PDF: {input_path}") from exc

        with pdf:
            page_numbers = _parse_page_numbers(pages, len(pdf.pages)) if pages else list(range(1, len(pdf.pages) + 1))

            outputs: list[str] = []

            # Markdown and HTML are derived from the plain text, so the text
            # is extracted once for all three.
            if "text" in formats or "markdown" in formats or "html" in formats:
                text, _ = _timed_stage("extract_text", lambda: self._extract_text(pdf, page_numbers), timings)
                if "text" in formats:
                    path = output_dir / f"{base_name}.txt"
                    _timed_stage_void("write_text", lambda: path.write_text(text, encoding="utf-8"), timings)
                    outputs.append(str(path))
                if "markdown" in formats:
                    md, _ = _timed_stage("text_to_markdown", lambda: self._text_to_markdown(text), timings)
                    path = output_dir / f"{base_name}.md"
                    _timed_stage_void("write_markdown", lambda: path.write_text(md, encoding="utf-8"), timings)
                    outputs.append(str(path))
                if "html" in formats:
                    html, _ = _timed_stage("text_to_html", lambda: self._text_to_html(text), timings)
                    path = output_dir / f"{base_name}.html"
                    _timed_stage_void("write_html", lambda: path.write_text(html, encoding="utf-8"), timings)
                    outputs.append(str(path))

            if "json" in formats:
                data, _ = _timed_stage("extract_json", lambda: self._extract_json(pdf, page_numbers), timings)
                path = output_dir / f"{base_name}.json"
                _timed_stage_void("write_json", lambda: path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"), timings)
                outputs.append(str(path))

            return outputs

    def _extract_text(self, pdf: Any, page_numbers: list[int]) -> str:
        """Join the text of the given 1-based pages with blank lines.

        Pages that yield no text, or whose extraction raises, are skipped.
        """
        parts: list[str] = []
        for num in page_numbers:
            try:
                page = pdf.pages[num - 1]
                text = page.extract_text() or ""
                if text:
                    parts.append(text)
            except Exception:
                # Best effort: one unreadable page must not abort the run.
                continue
        return "\n\n".join(parts)

    def _text_to_markdown(self, text: str) -> str:
        """Turn blank-line separated blocks into single-line paragraphs.

        Each non-empty block has its internal newlines replaced by spaces and
        is followed by an empty line.
        """
        lines = []
        for paragraph in text.split("\n\n"):
            paragraph = paragraph.strip().replace("\n", " ")
            if paragraph:
                lines.append(paragraph)
                lines.append("")
        return "\n".join(lines)

    def _text_to_html(self, text: str) -> str:
        """Render blank-line separated blocks as ``<p>`` elements.

        The text is HTML-escaped before the line breaks are turned into
        ``<br/>``, so characters such as ``<`` and ``&`` from the PDF cannot
        be interpreted as markup.
        """
        from html import escape

        paragraphs = [escape(p.strip()).replace("\n", "<br/>") for p in text.split("\n\n") if p.strip()]
        body = "\n".join(f"<p>{p}</p>" for p in paragraphs)
        return f"<!DOCTYPE html>\n<html><body>\n{body}\n</body></html>\n"

    def _extract_json(self, pdf: Any, page_numbers: list[int]) -> dict[str, Any]:
        """Collect per-page text and tables for the given 1-based pages.

        A page whose extraction raises is still listed, with empty text and
        tables plus an ``error`` entry holding the exception message.

        Returns:
            A dict with ``source`` (``pdf.stream.name``), ``page_count``,
            ``extracted_pages`` and ``pages``.
        """
        pages = []
        for num in page_numbers:
            try:
                page = pdf.pages[num - 1]
                tables = page.extract_tables() or []
                pages.append({
                    "page": num,
                    "text": page.extract_text() or "",
                    "tables": tables,
                })
            except Exception as exc:
                pages.append({"page": num, "text": "", "tables": [], "error": str(exc)})
        return {
            "source": str(pdf.stream.name),
            "page_count": len(pdf.pages),
            "extracted_pages": page_numbers,
            "pages": pages,
        }
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _parse_page_numbers(pages: str, total: int) -> list[int]:
|
|
124
|
+
numbers: set[int] = set()
|
|
125
|
+
for part in pages.split(","):
|
|
126
|
+
part = part.strip()
|
|
127
|
+
if "-" in part:
|
|
128
|
+
start, end = part.split("-", 1)
|
|
129
|
+
numbers.update(range(int(start), int(end) + 1))
|
|
130
|
+
else:
|
|
131
|
+
numbers.add(int(part))
|
|
132
|
+
return sorted(n for n in numbers if 1 <= n <= total)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _timed_stage(name: str, fn: Callable[[], T], timings: dict[str, float] | None) -> tuple[T, float]:
|
|
136
|
+
start = time.perf_counter()
|
|
137
|
+
result = fn()
|
|
138
|
+
duration = round(time.perf_counter() - start, 3)
|
|
139
|
+
if timings is not None:
|
|
140
|
+
timings[name] = duration
|
|
141
|
+
return result, duration
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _timed_stage_void(name: str, fn: Callable[[], object], timings: dict[str, float] | None) -> float:
|
|
145
|
+
start = time.perf_counter()
|
|
146
|
+
fn()
|
|
147
|
+
duration = round(time.perf_counter() - start, 3)
|
|
148
|
+
if timings is not None:
|
|
149
|
+
timings[name] = duration
|
|
150
|
+
return duration
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable, Sequence
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, TypeVar
|
|
8
|
+
|
|
9
|
+
from epub2pdf_cli.errors import ExitCode, StageError
|
|
10
|
+
from epub2pdf_cli.pdf.extractors.base import Extractor
|
|
11
|
+
|
|
12
|
+
T = TypeVar("T")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Pypdfium2Extractor(Extractor):
    """PDF extractor built on ``pypdfium2``.

    Produces plain text, a paragraph-per-block Markdown and HTML rendering of
    that text, and a JSON document with per-page text.
    """

    name = "pypdfium2"

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Extract the requested formats from ``input_path``.

        Supported formats are ``text``, ``markdown``, ``html`` and ``json``;
        other names are ignored. ``options`` is accepted for interface
        compatibility but is not used here.

        Args:
            input_path: PDF to read.
            output_dir: Directory for the outputs; created if missing.
            formats: Requested output format names.
            pages: Optional 1-based page selection such as ``"1,3-5"``;
                all pages when empty or ``None``.
            password: Optional password; only passed on when non-empty.
            options: Unused.
            timings: Optional mapping that receives per-stage durations.

        Returns:
            Paths of the files written, as strings.

        Raises:
            StageError: When pypdfium2 cannot be imported
                (``ExitCode.USAGE``) or the PDF cannot be opened.
        """
        try:
            import pypdfium2 as pdfium
        except Exception as exc:
            raise StageError(
                "pdf-extract",
                "pypdfium2 is not installed. Install with `python3 -m pip install pypdfium2`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        output_dir.mkdir(parents=True, exist_ok=True)
        opts: dict[str, Any] = {}
        if password:
            opts["password"] = password

        # Report an unreadable or locked file as a stage failure, the same
        # way the pdfplumber extractor does, instead of leaking the backend
        # exception.
        try:
            document, _ = _timed_stage("open_pdf", lambda: pdfium.PdfDocument(str(input_path), **opts), timings)
        except Exception as exc:
            raise StageError("pdf-extract", f"Unable to open PDF: {input_path}") from exc

        try:
            page_indices = _parse_page_range(pages, len(document)) if pages else range(len(document))

            outputs: list[str] = []
            base_name = input_path.stem

            # Markdown and HTML are derived from the plain text, so the text
            # is extracted once for all three.
            if "text" in formats or "markdown" in formats or "html" in formats:
                text, _ = _timed_stage("extract_text", lambda: self._extract_text(document, page_indices), timings)
                if "text" in formats:
                    path = output_dir / f"{base_name}.txt"
                    _timed_stage_void("write_text", lambda: path.write_text(text, encoding="utf-8"), timings)
                    outputs.append(str(path))
                if "markdown" in formats:
                    md, _ = _timed_stage("text_to_markdown", lambda: self._text_to_markdown(text), timings)
                    path = output_dir / f"{base_name}.md"
                    _timed_stage_void("write_markdown", lambda: path.write_text(md, encoding="utf-8"), timings)
                    outputs.append(str(path))
                if "html" in formats:
                    html, _ = _timed_stage("text_to_html", lambda: self._text_to_html(text), timings)
                    path = output_dir / f"{base_name}.html"
                    _timed_stage_void("write_html", lambda: path.write_text(html, encoding="utf-8"), timings)
                    outputs.append(str(path))

            if "json" in formats:
                data, _ = _timed_stage(
                    "extract_json",
                    lambda: self._extract_json(document, page_indices, source=str(input_path)),
                    timings,
                )
                path = output_dir / f"{base_name}.json"
                _timed_stage_void("write_json", lambda: path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"), timings)
                outputs.append(str(path))

            return outputs
        finally:
            document.close()

    def _extract_text(self, document: Any, page_indices: Sequence[int]) -> str:
        """Join the text of the given 0-based pages with blank lines.

        Pages that yield no text, or whose extraction raises, are skipped.
        """
        parts: list[str] = []
        for idx in page_indices:
            try:
                textpage = document[idx].get_textpage()
                page_text = textpage.get_text_bounded()
                if page_text:
                    parts.append(page_text)
            except Exception:
                # Best effort: one unreadable page must not abort the run.
                continue
        return "\n\n".join(parts)

    def _text_to_markdown(self, text: str) -> str:
        """Turn blank-line separated blocks into single-line paragraphs.

        Each non-empty block has its internal newlines replaced by spaces and
        is followed by an empty line.
        """
        lines = []
        for paragraph in text.split("\n\n"):
            paragraph = paragraph.strip().replace("\n", " ")
            if paragraph:
                lines.append(paragraph)
                lines.append("")
        return "\n".join(lines)

    def _text_to_html(self, text: str) -> str:
        """Render blank-line separated blocks as ``<p>`` elements.

        The text is HTML-escaped before the line breaks are turned into
        ``<br/>``, so characters such as ``<`` and ``&`` from the PDF cannot
        be interpreted as markup.
        """
        from html import escape

        paragraphs = [escape(p.strip()).replace("\n", "<br/>") for p in text.split("\n\n") if p.strip()]
        body = "\n".join(f"<p>{p}</p>" for p in paragraphs)
        return f"<!DOCTYPE html>\n<html><body>\n{body}\n</body></html>\n"

    def _extract_json(self, document: Any, page_indices: Sequence[int], source: str | None = None) -> dict[str, Any]:
        """Collect per-page text for the given 0-based page indices.

        A page whose extraction raises is still listed, with empty text plus
        an ``error`` entry holding the exception message.

        Args:
            document: Open pypdfium2 document.
            page_indices: 0-based indices of the pages to read.
            source: Value for the ``source`` field; falls back to
                ``str(document)`` when omitted.

        Returns:
            A dict with ``source``, ``page_count``, ``extracted_pages`` and
            ``pages``.
        """
        pages = []
        for idx in page_indices:
            try:
                textpage = document[idx].get_textpage()
                text = textpage.get_text_bounded()
                pages.append({"page": idx + 1, "text": text})
            except Exception as exc:
                pages.append({"page": idx + 1, "text": "", "error": str(exc)})
        return {
            "source": source if source is not None else str(document),
            "page_count": len(document),
            # NOTE: these are the 0-based indices, while each entry in
            # "pages" carries a 1-based "page" number.
            "extracted_pages": list(page_indices),
            "pages": pages,
        }
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _parse_page_range(pages: str, total: int) -> range:
|
|
122
|
+
indices: set[int] = set()
|
|
123
|
+
for part in pages.split(","):
|
|
124
|
+
part = part.strip()
|
|
125
|
+
if "-" in part:
|
|
126
|
+
start, end = part.split("-", 1)
|
|
127
|
+
indices.update(range(int(start) - 1, int(end)))
|
|
128
|
+
else:
|
|
129
|
+
indices.add(int(part) - 1)
|
|
130
|
+
# Clamp and sort
|
|
131
|
+
valid = sorted(i for i in indices if 0 <= i < total)
|
|
132
|
+
return range(valid[0], valid[-1] + 1) if valid else range(total)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _timed_stage(name: str, fn: Callable[[], T], timings: dict[str, float] | None) -> tuple[T, float]:
|
|
137
|
+
start = time.perf_counter()
|
|
138
|
+
result = fn()
|
|
139
|
+
duration = round(time.perf_counter() - start, 3)
|
|
140
|
+
if timings is not None:
|
|
141
|
+
timings[name] = duration
|
|
142
|
+
return result, duration
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _timed_stage_void(name: str, fn: Callable[[], object], timings: dict[str, float] | None) -> float:
|
|
146
|
+
start = time.perf_counter()
|
|
147
|
+
fn()
|
|
148
|
+
duration = round(time.perf_counter() - start, 3)
|
|
149
|
+
if timings is not None:
|
|
150
|
+
timings[name] = duration
|
|
151
|
+
return duration
|
epub2pdf_cli/pdf/text.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pypdf import PdfReader
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def extract_text(output_path: Path, *, reader: PdfReader | None = None) -> dict[str, Any]:
    """Summarise the text content of the PDF at ``output_path``.

    The ``pdftotext`` executable is tried first when it is found on the
    ``PATH``. If it is missing or the attempt fails for any reason, the text
    is read page by page through pypdf instead; pages whose extraction raises
    are skipped.

    Args:
        output_path: PDF file to inspect.
        reader: Optional already-open ``PdfReader`` used for the pypdf path;
            one is created from ``output_path`` when omitted.

    Returns:
        A dict with ``has_text``, ``text_length``, ``extractor``
        (``"pdftotext"`` or ``"pypdf"``) and ``text_sample`` (the first 240
        characters of the stripped text).
    """

    def _summarize(content: str, extractor: str) -> dict[str, Any]:
        return {
            "has_text": bool(content),
            "text_length": len(content),
            "extractor": extractor,
            "text_sample": content[:240],
        }

    pdftotext_bin = shutil.which("pdftotext")
    if pdftotext_bin:
        try:
            completed = subprocess.run(
                [pdftotext_bin, str(output_path), "-"],
                check=True,
                capture_output=True,
                text=True,
            )
            return _summarize(completed.stdout.strip(), "pdftotext")
        except Exception:
            # Any pdftotext failure falls through to the pypdf path below.
            pass

    pdf = PdfReader(str(output_path)) if reader is None else reader
    chunks = []
    for page in pdf.pages:
        try:
            chunks.append(page.extract_text() or "")
        except Exception:
            continue
    return _summarize("\n".join(chunks).strip(), "pypdf")
|