epub2pdf-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. epub2pdf_cli/__init__.py +5 -0
  2. epub2pdf_cli/__main__.py +4 -0
  3. epub2pdf_cli/api.py +160 -0
  4. epub2pdf_cli/cli.py +223 -0
  5. epub2pdf_cli/config.py +109 -0
  6. epub2pdf_cli/epub/__init__.py +3 -0
  7. epub2pdf_cli/epub/chapters.py +81 -0
  8. epub2pdf_cli/epub/container.py +25 -0
  9. epub2pdf_cli/epub/href.py +24 -0
  10. epub2pdf_cli/epub/opf.py +159 -0
  11. epub2pdf_cli/epub/parser.py +64 -0
  12. epub2pdf_cli/epub/toc.py +101 -0
  13. epub2pdf_cli/errors.py +27 -0
  14. epub2pdf_cli/html/__init__.py +3 -0
  15. epub2pdf_cli/html/builder.py +190 -0
  16. epub2pdf_cli/html/css.py +49 -0
  17. epub2pdf_cli/html/links.py +144 -0
  18. epub2pdf_cli/html/template.py +92 -0
  19. epub2pdf_cli/io_utils.py +24 -0
  20. epub2pdf_cli/markdown.py +97 -0
  21. epub2pdf_cli/mcp_server.py +189 -0
  22. epub2pdf_cli/models.py +116 -0
  23. epub2pdf_cli/pdf/__init__.py +5 -0
  24. epub2pdf_cli/pdf/extract.py +79 -0
  25. epub2pdf_cli/pdf/extractors/__init__.py +0 -0
  26. epub2pdf_cli/pdf/extractors/base.py +23 -0
  27. epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
  28. epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
  29. epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
  30. epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
  31. epub2pdf_cli/pdf/text.py +45 -0
  32. epub2pdf_cli/pdf/validate.py +37 -0
  33. epub2pdf_cli/pipeline/__init__.py +6 -0
  34. epub2pdf_cli/pipeline/batch.py +84 -0
  35. epub2pdf_cli/pipeline/convert.py +122 -0
  36. epub2pdf_cli/pipeline/extract.py +64 -0
  37. epub2pdf_cli/pipeline/inspect.py +15 -0
  38. epub2pdf_cli/render/__init__.py +17 -0
  39. epub2pdf_cli/render/options.py +19 -0
  40. epub2pdf_cli/render/playwright.py +91 -0
  41. epub2pdf_cli/render/protocol.py +13 -0
  42. epub2pdf_cli/render/weasyprint.py +28 -0
  43. epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
  44. epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
  45. epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
  46. epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
  47. epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
  48. epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Sequence
4
+ from pathlib import Path
5
+
6
+ from epub2pdf_cli.config import PdfExtractConfig
7
+ from epub2pdf_cli.errors import ExitCode, StageError
8
+ from epub2pdf_cli.pdf.extractors.base import Extractor
9
+
10
+
11
def run_pdf_extraction(config: PdfExtractConfig, timings: dict[str, float] | None = None) -> list[str]:
    """Run the extraction engine named by ``config.engine``.

    Engine-specific settings are read from ``config`` and forwarded as an
    ``options`` mapping from which every ``None`` entry has been dropped.

    Args:
        config: Extraction settings (engine, input/output paths, formats,
            page selection, password and the optional engine settings).
        timings: Optional mapping passed through to the extractor unchanged.

    Returns:
        Whatever the selected extractor's ``extract`` call returns.
    """
    # Resolve the engine first so an unknown engine fails before anything else.
    engine = _select_extractor(config.engine)

    option_names = (
        "sanitize",
        "keep_line_breaks",
        "use_struct_tree",
        "table_method",
        "reading_order",
        "markdown_page_separator",
        "html_page_separator",
        "image_output",
        "image_dir",
        "include_header_footer",
        "detect_strikethrough",
        "threads",
    )
    candidates = {key: getattr(config, key) for key in option_names}
    # Only settings that were actually provided are forwarded to the engine.
    forwarded = {key: value for key, value in candidates.items() if value is not None}

    return engine.extract(
        config.input_path,
        config.output_dir,
        config.formats,
        pages=config.pages,
        password=config.password,
        options=forwarded,
        timings=timings,
    )
36
+
37
+
38
def _select_extractor(name: str) -> Extractor:
    """Return a new extractor instance for the engine called ``name``.

    The extractor modules are imported inside the function rather than at
    module import time.

    Raises:
        StageError: With ``ExitCode.USAGE`` when ``name`` is not a known engine.
    """
    from epub2pdf_cli.pdf.extractors.docling_extractor import DoclingExtractor
    from epub2pdf_cli.pdf.extractors.opendataloader_extractor import OpendataloaderExtractor
    from epub2pdf_cli.pdf.extractors.pdfplumber_extractor import PdfPlumberExtractor
    from epub2pdf_cli.pdf.extractors.pypdfium2_extractor import Pypdfium2Extractor

    # Insertion order is the order in which engines are listed in the error text.
    known_engines: dict[str, type[Extractor]] = dict(
        pypdfium2=Pypdfium2Extractor,
        docling=DoclingExtractor,
        pdfplumber=PdfPlumberExtractor,
        opendataloader=OpendataloaderExtractor,
    )
    if name not in known_engines:
        choices = ", ".join(known_engines)
        raise StageError(
            "pdf-extract",
            f"Unsupported extractor: {name}. Choose from {choices}.",
            exit_code=ExitCode.USAGE,
        )
    return known_engines[name]()
54
+
55
+
56
def find_extract_outputs(input_path: Path, output_dir: Path, formats: Sequence[str]) -> list[str]:
    """Locate the files an extraction run left in ``output_dir``.

    The planned output paths for ``formats`` are checked first. If none of
    them exists, every regular file in ``output_dir`` whose name starts with
    the input file's stem is returned instead, sorted by path.

    Args:
        input_path: The PDF that was extracted; only its stem is used.
        output_dir: Directory that was given to the extractor.
        formats: Requested output formats.

    Returns:
        Path strings of the files found, without duplicates.
    """
    import glob  # local import: only needed to escape the fallback pattern

    # Several formats share one suffix (e.g. the markdown variants map to
    # ``.md``); dict.fromkeys drops the repeats while keeping request order.
    planned = dict.fromkeys(planned_extract_paths(input_path, output_dir, formats))
    existing = [path for path in planned if path.exists()]
    if existing:
        return [str(path) for path in existing]

    # Escape the stem so characters such as ``[`` or ``*`` in the file name
    # are matched literally instead of being read as glob syntax.
    pattern = f"{glob.escape(input_path.stem)}*"
    return [str(path) for path in sorted(output_dir.glob(pattern)) if path.is_file()]


def planned_extract_paths(input_path: Path, output_dir: Path, formats: Sequence[str]) -> list[Path]:
    """Return the expected output path for each recognised format.

    Formats missing from the suffix table are skipped. One path is produced
    per recognised entry of ``formats``, so formats sharing a suffix yield the
    same path more than once.
    """
    suffix_by_format = {
        "json": ".json",
        "text": ".txt",
        "html": ".html",
        "pdf": ".pdf",
        "markdown": ".md",
        "markdown-with-html": ".md",
        "markdown-with-images": ".md",
        "tagged-pdf": ".pdf",
    }
    stem = input_path.stem
    return [
        output_dir / f"{stem}{suffix}"
        for suffix in map(suffix_by_format.get, formats)
        if suffix
    ]
File without changes
@@ -0,0 +1,23 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Sequence
4
+ from pathlib import Path
5
+ from typing import Any, Protocol, runtime_checkable
6
+
7
+
8
@runtime_checkable
class Extractor(Protocol):
    """Structural interface shared by the PDF extraction engines.

    Any object exposing a ``name`` attribute and an ``extract`` method with
    the signature below satisfies the protocol. Because the protocol is
    runtime-checkable, ``isinstance`` checks member presence only, not
    signatures.
    """

    # Identifier of the engine.
    name: str

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Extract ``input_path`` into ``output_dir`` in the requested formats.

        Args:
            input_path: Source PDF.
            output_dir: Directory that receives the generated files.
            formats: Names of the output formats to produce.
            pages: Optional page selection string.
            password: Optional password for the PDF.
            options: Optional engine-specific settings.
            timings: Optional mapping an implementation may fill with
                per-stage durations.

        Returns:
            A list of strings; per the implementations, output file paths.
        """
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from collections.abc import Callable, Sequence
6
+ from pathlib import Path
7
+ from typing import Any, TypeVar
8
+
9
+ from epub2pdf_cli.errors import ExitCode, StageError
10
+ from epub2pdf_cli.pdf.extractors.base import Extractor
11
+
12
+ T = TypeVar("T")
13
+
14
+
15
class DoclingExtractor(Extractor):
    """PDF extractor backed by Docling's ``DocumentConverter``.

    Docling's Markdown export is the pivot format: the ``text`` and ``html``
    outputs are derived from it with the small converters below, while
    ``json`` is written from Docling's dict export.
    """

    name = "docling"

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Convert ``input_path`` with Docling and write the requested formats.

        Args:
            input_path: PDF to convert.
            output_dir: Directory for the outputs; created when missing.
            formats: Requested formats. ``markdown``, ``text``, ``html`` and
                ``json`` are handled; other names are ignored.
            pages: Accepted for interface compatibility; not used here.
            password: Accepted for interface compatibility; not used here.
            options: Accepted for interface compatibility; not used here.
            timings: Optional mapping that receives per-stage durations.

        Returns:
            Paths of the files written, in markdown, text, html, json order.

        Raises:
            StageError: If Docling cannot be imported, or if the conversion
                does not finish with ``ConversionStatus.SUCCESS``.
        """
        try:
            from docling.datamodel.base_models import ConversionStatus
            from docling.document_converter import DocumentConverter
        except Exception as exc:
            raise StageError(
                "pdf-extract",
                "Docling is not installed. Install with `python3 -m pip install -e '.[docling]'`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        output_dir.mkdir(parents=True, exist_ok=True)
        base_name = input_path.stem

        converter = DocumentConverter()
        result, _ = _timed_stage(
            "convert_document",
            lambda: converter.convert(str(input_path)),
            timings,
        )

        if result.status != ConversionStatus.SUCCESS:
            raise StageError("pdf-extract", f"Docling conversion status: {result.status.name}")

        outputs: list[str] = []

        if "markdown" in formats or "text" in formats or "html" in formats:
            # Export Markdown once and reuse it for every derived format, so
            # the export is not repeated and its timing is recorded once.
            md, _ = _timed_stage("export_markdown", lambda: result.document.export_to_markdown(), timings)

            if "markdown" in formats:
                path = output_dir / f"{base_name}.md"
                _timed_stage_void("write_markdown", lambda: path.write_text(md, encoding="utf-8"), timings)
                outputs.append(str(path))

            if "text" in formats:
                text, _ = _timed_stage("markdown_to_text", lambda: self._markdown_to_text(md), timings)
                path = output_dir / f"{base_name}.txt"
                _timed_stage_void("write_text", lambda: path.write_text(text, encoding="utf-8"), timings)
                outputs.append(str(path))

            if "html" in formats:
                html, _ = _timed_stage("markdown_to_html", lambda: self._markdown_to_html(md), timings)
                path = output_dir / f"{base_name}.html"
                _timed_stage_void("write_html", lambda: path.write_text(html, encoding="utf-8"), timings)
                outputs.append(str(path))

        if "json" in formats:
            data, _ = _timed_stage("export_to_dict", lambda: result.document.export_to_dict(), timings)
            path = output_dir / f"{base_name}.json"
            _timed_stage_void(
                "write_json",
                lambda: path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"),
                timings,
            )
            outputs.append(str(path))

        return outputs

    def _markdown_to_text(self, markdown: str) -> str:
        """Strip leading Markdown block markers from every line.

        Leading whitespace, blockquote markers (``>``), one heading marker
        (``#`` to ``######`` followed by whitespace) and one bullet marker
        (``-`` or ``*`` followed by whitespace) are removed. Only real markers
        are removed, so lines such as ``**bold**``, ``-5`` or ``#1`` keep
        their leading characters.
        """
        import re  # local import: not among this module's top-level imports

        marker = re.compile(r"^\s*(?:>\s*)*(?:#{1,6}(?:\s+|$)|[-*]\s+)?")
        return "\n".join(marker.sub("", line, count=1) for line in markdown.splitlines())

    def _markdown_to_html(self, markdown: str) -> str:
        """Render a minimal HTML document from Markdown.

        Handles headings (``#`` to ``######``), ``-``/``*`` bullet items and
        plain paragraphs (one ``<p>`` per non-empty line). All text is
        HTML-escaped. An open list is closed before any non-item line, so a
        heading that directly follows a list item is not nested inside it.
        """
        import re  # local import: not among this module's top-level imports
        from html import escape

        heading = re.compile(r"(#{1,6})\s+(.*)")
        lines: list[str] = []
        in_list = False
        for raw in markdown.splitlines():
            stripped = raw.strip()
            is_item = stripped.startswith(("- ", "* "))

            if in_list and not is_item:
                lines.append("</ul>")
                in_list = False

            if is_item:
                if not in_list:
                    lines.append("<ul>")
                    in_list = True
                lines.append(f"<li>{escape(stripped[2:])}</li>")
                continue

            match = heading.fullmatch(stripped)
            if match:
                level = len(match.group(1))
                lines.append(f"<h{level}>{escape(match.group(2))}</h{level}>")
            elif stripped:
                lines.append(f"<p>{escape(stripped)}</p>")

        if in_list:
            lines.append("</ul>")
        body = "\n".join(lines)
        return f"<!DOCTYPE html>\n<html><body>\n{body}\n</body></html>\n"
122
+
123
+
124
def _timed_stage(name: str, fn: Callable[[], T], timings: dict[str, float] | None) -> tuple[T, float]:
    """Call ``fn`` and measure how long it takes.

    The elapsed ``time.perf_counter`` time is rounded to three decimals and,
    when ``timings`` is given, stored under ``name`` (replacing any earlier
    value). Nothing is recorded if ``fn`` raises.

    Returns:
        A ``(result, duration)`` tuple.
    """
    began = time.perf_counter()
    outcome = fn()
    elapsed = round(time.perf_counter() - began, 3)
    if timings is None:
        return outcome, elapsed
    timings[name] = elapsed
    return outcome, elapsed
131
+
132
+
133
def _timed_stage_void(name: str, fn: Callable[[], object], timings: dict[str, float] | None) -> float:
    """Call ``fn`` for its side effect and return the time it took.

    The return value of ``fn`` is discarded. The elapsed
    ``time.perf_counter`` time is rounded to three decimals and, when
    ``timings`` is given, stored under ``name``.
    """
    began = time.perf_counter()
    fn()
    elapsed = round(time.perf_counter() - began, 3)
    if timings is None:
        return elapsed
    timings[name] = elapsed
    return elapsed
@@ -0,0 +1,86 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from collections.abc import Callable, Sequence
5
+ from pathlib import Path
6
+ from typing import Any, TypeVar
7
+
8
+ from epub2pdf_cli.errors import ExitCode, StageError
9
+ from epub2pdf_cli.pdf.extractors.base import Extractor
10
+
11
+ T = TypeVar("T")
12
+
13
+
14
class OpendataloaderExtractor(Extractor):
    """Extractor that delegates the conversion to ``opendataloader_pdf.convert``."""

    name = "opendataloader"

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Run ``opendataloader_pdf.convert`` and report the files it produced.

        ``formats`` is passed as one comma-joined string. Settings missing
        from ``options`` fall back to ``False`` for the boolean flags and to
        ``None`` for everything else. The outputs are located afterwards with
        ``find_extract_outputs``.

        Raises:
            StageError: With ``ExitCode.USAGE`` when ``opendataloader_pdf``
                cannot be imported.
        """
        try:
            import opendataloader_pdf
        except Exception as exc:
            raise StageError(
                "pdf-extract",
                "opendataloader-pdf is not installed. Install with `python3 -m pip install -e '.[legacy-pdf]'`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        output_dir.mkdir(parents=True, exist_ok=True)
        settings = options if options else {}
        raw_image_dir = settings.get("image_dir")

        def _run_conversion() -> None:
            # Arguments are assembled inside the timed callable, as before.
            call_kwargs = {
                "input_path": str(input_path),
                "output_dir": str(output_dir),
                "password": password,
                "format": ",".join(formats),
                "quiet": True,
                "sanitize": settings.get("sanitize", False),
                "keep_line_breaks": settings.get("keep_line_breaks", False),
                "use_struct_tree": settings.get("use_struct_tree", False),
                "table_method": settings.get("table_method"),
                "reading_order": settings.get("reading_order"),
                "markdown_page_separator": settings.get("markdown_page_separator"),
                "html_page_separator": settings.get("html_page_separator"),
                "image_output": settings.get("image_output"),
                "image_dir": str(raw_image_dir) if raw_image_dir else None,
                "pages": pages,
                "include_header_footer": settings.get("include_header_footer", False),
                "detect_strikethrough": settings.get("detect_strikethrough", False),
                "threads": settings.get("threads"),
            }
            opendataloader_pdf.convert(**call_kwargs)

        _timed_stage_void("opendataloader_convert", _run_conversion, timings)

        # Imported here rather than at module top level.
        from epub2pdf_cli.pdf.extract import find_extract_outputs

        return _timed_stage(
            "find_outputs",
            lambda: find_extract_outputs(input_path, output_dir, formats),
            timings,
        )[0]
69
+
70
+
71
def _timed_stage(name: str, fn: Callable[[], T], timings: dict[str, float] | None) -> tuple[T, float]:
    """Run ``fn`` and return ``(result, seconds)``.

    ``seconds`` is the ``time.perf_counter`` difference rounded to three
    decimals. When ``timings`` is not ``None`` it is also stored there under
    ``name``. An exception from ``fn`` propagates and records nothing.
    """
    t0 = time.perf_counter()
    produced = fn()
    seconds = round(time.perf_counter() - t0, 3)
    if timings is None:
        return produced, seconds
    timings[name] = seconds
    return produced, seconds
78
+
79
+
80
def _timed_stage_void(name: str, fn: Callable[[], object], timings: dict[str, float] | None) -> float:
    """Run ``fn``, ignore its result and return the elapsed seconds.

    The duration is the ``time.perf_counter`` difference rounded to three
    decimals. When ``timings`` is not ``None`` it is stored under ``name``.
    """
    t0 = time.perf_counter()
    fn()
    seconds = round(time.perf_counter() - t0, 3)
    if timings is None:
        return seconds
    timings[name] = seconds
    return seconds
@@ -0,0 +1,150 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from collections.abc import Callable, Sequence
6
+ from pathlib import Path
7
+ from typing import Any, TypeVar
8
+
9
+ from epub2pdf_cli.errors import ExitCode, StageError
10
+ from epub2pdf_cli.pdf.extractors.base import Extractor
11
+
12
+ T = TypeVar("T")
13
+
14
+
15
class PdfPlumberExtractor(Extractor):
    """PDF extractor built on ``pdfplumber``.

    Plain text is the pivot format for the ``text``, ``markdown`` and ``html``
    outputs; ``json`` additionally carries the tables found on each page.
    """

    name = "pdfplumber"

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Extract ``input_path`` with pdfplumber and write the requested formats.

        Args:
            input_path: PDF to read.
            output_dir: Directory for the outputs; created when missing.
            formats: Requested formats. ``text``, ``markdown``, ``html`` and
                ``json`` are handled; other names are ignored.
            pages: Optional 1-based page selection such as ``"1,3-5"``.
            password: Optional password passed to ``pdfplumber.open``.
            options: Accepted for interface compatibility; not used here.
            timings: Optional mapping that receives per-stage durations.

        Returns:
            Paths of the files written, in text, markdown, html, json order.

        Raises:
            StageError: If pdfplumber cannot be imported, the PDF cannot be
                opened, or ``pages`` is not a valid selection.
        """
        try:
            import pdfplumber
        except Exception as exc:
            raise StageError(
                "pdf-extract",
                "pdfplumber is not installed. Install with `python3 -m pip install -e '.[pdfplumber]'`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        output_dir.mkdir(parents=True, exist_ok=True)
        base_name = input_path.stem

        try:
            pdf = _timed_stage("open_pdf", lambda: pdfplumber.open(str(input_path), password=password), timings)[0]
        except Exception as exc:
            raise StageError("pdf-extract", f"Unable to open PDF: {input_path}") from exc

        with pdf:
            if pages:
                try:
                    page_numbers = _parse_page_numbers(pages, len(pdf.pages))
                except ValueError as exc:
                    # Report a malformed selection as a usage error instead of
                    # letting a bare ValueError escape the stage.
                    raise StageError(
                        "pdf-extract",
                        f"Invalid page selection: {pages}",
                        exit_code=ExitCode.USAGE,
                    ) from exc
            else:
                page_numbers = list(range(1, len(pdf.pages) + 1))

            outputs: list[str] = []

            if "text" in formats or "markdown" in formats or "html" in formats:
                text, _ = _timed_stage("extract_text", lambda: self._extract_text(pdf, page_numbers), timings)
                if "text" in formats:
                    path = output_dir / f"{base_name}.txt"
                    _timed_stage_void("write_text", lambda: path.write_text(text, encoding="utf-8"), timings)
                    outputs.append(str(path))
                if "markdown" in formats:
                    md, _ = _timed_stage("text_to_markdown", lambda: self._text_to_markdown(text), timings)
                    path = output_dir / f"{base_name}.md"
                    _timed_stage_void("write_markdown", lambda: path.write_text(md, encoding="utf-8"), timings)
                    outputs.append(str(path))
                if "html" in formats:
                    html, _ = _timed_stage("text_to_html", lambda: self._text_to_html(text), timings)
                    path = output_dir / f"{base_name}.html"
                    _timed_stage_void("write_html", lambda: path.write_text(html, encoding="utf-8"), timings)
                    outputs.append(str(path))

            if "json" in formats:
                data, _ = _timed_stage("extract_json", lambda: self._extract_json(pdf, page_numbers), timings)
                path = output_dir / f"{base_name}.json"
                _timed_stage_void(
                    "write_json",
                    lambda: path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"),
                    timings,
                )
                outputs.append(str(path))

            return outputs

    def _extract_text(self, pdf: Any, page_numbers: list[int]) -> str:
        """Join the text of the selected pages with blank lines.

        Best effort: a page that raises during extraction, or yields no text,
        is skipped.
        """
        parts: list[str] = []
        for num in page_numbers:
            try:
                text = pdf.pages[num - 1].extract_text() or ""
            except Exception:
                # Deliberate best effort: one bad page must not abort the run.
                continue
            if text:
                parts.append(text)
        return "\n\n".join(parts)

    def _text_to_markdown(self, text: str) -> str:
        """Turn blank-line separated paragraphs into single-line Markdown paragraphs."""
        lines: list[str] = []
        for paragraph in text.split("\n\n"):
            paragraph = paragraph.strip().replace("\n", " ")
            if paragraph:
                lines.append(paragraph)
                lines.append("")
        return "\n".join(lines)

    def _text_to_html(self, text: str) -> str:
        """Wrap each paragraph in ``<p>`` and turn line breaks into ``<br/>``.

        The text is HTML-escaped first, so ``<``, ``>`` and ``&`` from the PDF
        cannot be read as markup.
        """
        from html import escape  # local import: not a top-level import of this module

        paragraphs = [
            escape(p.strip()).replace("\n", "<br/>")
            for p in text.split("\n\n")
            if p.strip()
        ]
        body = "\n".join(f"<p>{p}</p>" for p in paragraphs)
        return f"<!DOCTYPE html>\n<html><body>\n{body}\n</body></html>\n"

    def _extract_json(self, pdf: Any, page_numbers: list[int]) -> dict[str, Any]:
        """Collect text and tables for each selected page.

        A page that fails is still listed, with empty content and an
        ``error`` message.
        """
        pages = []
        for num in page_numbers:
            try:
                page = pdf.pages[num - 1]
                tables = page.extract_tables() or []
                pages.append({
                    "page": num,
                    "text": page.extract_text() or "",
                    "tables": tables,
                })
            except Exception as exc:
                pages.append({"page": num, "text": "", "tables": [], "error": str(exc)})
        return {
            "source": str(pdf.stream.name),
            "page_count": len(pdf.pages),
            "extracted_pages": page_numbers,
            "pages": pages,
        }
121
+
122
+
123
def _parse_page_numbers(pages: str, total: int) -> list[int]:
    """Parse a 1-based page selection such as ``"1,3-5"``.

    Comma-separated parts are either a single page number or a ``start-end``
    range (inclusive). A missing start means the first page and a missing end
    means the last page. Empty parts (stray commas) are ignored, and pages
    outside ``1..total`` are dropped.

    Args:
        pages: Selection string.
        total: Number of pages in the document.

    Returns:
        The selected page numbers, sorted and without duplicates.

    Raises:
        ValueError: If a part is not a number or range, or is a bare ``-``.
    """
    selected: set[int] = set()
    for part in pages.split(","):
        part = part.strip()
        if not part:
            continue
        if "-" in part:
            first_text, last_text = (piece.strip() for piece in part.split("-", 1))
            if not first_text and not last_text:
                raise ValueError(f"invalid page range: {part!r}")
            first = int(first_text) if first_text else 1
            last = int(last_text) if last_text else total
            # Clamp before building the range so a huge bound such as
            # "1-999999999" cannot materialise millions of integers.
            selected.update(range(max(first, 1), min(last, total) + 1))
        else:
            number = int(part)
            if 1 <= number <= total:
                selected.add(number)
    return sorted(selected)
133
+
134
+
135
def _timed_stage(name: str, fn: Callable[[], T], timings: dict[str, float] | None) -> tuple[T, float]:
    """Time a call to ``fn`` and hand back its result with the duration.

    The duration is measured with ``time.perf_counter`` and rounded to three
    decimals. It is written to ``timings[name]`` when a mapping is supplied.
    If ``fn`` raises, the exception propagates and nothing is recorded.
    """
    started_at = time.perf_counter()
    value = fn()
    took = round(time.perf_counter() - started_at, 3)
    if timings is None:
        return value, took
    timings[name] = took
    return value, took
142
+
143
+
144
def _timed_stage_void(name: str, fn: Callable[[], object], timings: dict[str, float] | None) -> float:
    """Time a call to ``fn`` whose result is not needed.

    Returns the ``time.perf_counter`` duration rounded to three decimals and
    writes it to ``timings[name]`` when a mapping is supplied.
    """
    started_at = time.perf_counter()
    fn()
    took = round(time.perf_counter() - started_at, 3)
    if timings is None:
        return took
    timings[name] = took
    return took
@@ -0,0 +1,151 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from collections.abc import Callable, Sequence
6
+ from pathlib import Path
7
+ from typing import Any, TypeVar
8
+
9
+ from epub2pdf_cli.errors import ExitCode, StageError
10
+ from epub2pdf_cli.pdf.extractors.base import Extractor
11
+
12
+ T = TypeVar("T")
13
+
14
+
15
class Pypdfium2Extractor(Extractor):
    """PDF extractor built on ``pypdfium2``.

    Plain text is the pivot format for the ``text``, ``markdown`` and ``html``
    outputs; ``json`` carries the text of each selected page.
    """

    name = "pypdfium2"

    def extract(
        self,
        input_path: Path,
        output_dir: Path,
        formats: Sequence[str],
        *,
        pages: str | None = None,
        password: str | None = None,
        options: dict[str, Any] | None = None,
        timings: dict[str, float] | None = None,
    ) -> list[str]:
        """Extract ``input_path`` with pypdfium2 and write the requested formats.

        Args:
            input_path: PDF to read.
            output_dir: Directory for the outputs; created when missing.
            formats: Requested formats. ``text``, ``markdown``, ``html`` and
                ``json`` are handled; other names are ignored.
            pages: Optional 1-based page selection such as ``"1,3-5"``.
            password: Optional password; passed to ``PdfDocument`` only when set.
            options: Accepted for interface compatibility; not used here.
            timings: Optional mapping that receives per-stage durations.

        Returns:
            Paths of the files written, in text, markdown, html, json order.

        Raises:
            StageError: If pypdfium2 cannot be imported, the PDF cannot be
                opened, or ``pages`` is not a valid selection.
        """
        try:
            import pypdfium2 as pdfium
        except Exception as exc:
            raise StageError(
                "pdf-extract",
                "pypdfium2 is not installed. Install with `python3 -m pip install pypdfium2`.",
                exit_code=ExitCode.USAGE,
            ) from exc

        output_dir.mkdir(parents=True, exist_ok=True)
        opts: dict[str, Any] = {}
        if password:
            opts["password"] = password

        try:
            document, _ = _timed_stage("open_pdf", lambda: pdfium.PdfDocument(str(input_path), **opts), timings)
        except Exception as exc:
            # Same failure report as the pdfplumber engine.
            raise StageError("pdf-extract", f"Unable to open PDF: {input_path}") from exc

        try:
            if pages:
                try:
                    page_indices = _parse_page_range(pages, len(document))
                except ValueError as exc:
                    # Report a malformed selection as a usage error instead of
                    # letting a bare ValueError escape the stage.
                    raise StageError(
                        "pdf-extract",
                        f"Invalid page selection: {pages}",
                        exit_code=ExitCode.USAGE,
                    ) from exc
            else:
                page_indices = range(len(document))

            outputs: list[str] = []
            base_name = input_path.stem

            if "text" in formats or "markdown" in formats or "html" in formats:
                text, _ = _timed_stage("extract_text", lambda: self._extract_text(document, page_indices), timings)
                if "text" in formats:
                    path = output_dir / f"{base_name}.txt"
                    _timed_stage_void("write_text", lambda: path.write_text(text, encoding="utf-8"), timings)
                    outputs.append(str(path))
                if "markdown" in formats:
                    md, _ = _timed_stage("text_to_markdown", lambda: self._text_to_markdown(text), timings)
                    path = output_dir / f"{base_name}.md"
                    _timed_stage_void("write_markdown", lambda: path.write_text(md, encoding="utf-8"), timings)
                    outputs.append(str(path))
                if "html" in formats:
                    html, _ = _timed_stage("text_to_html", lambda: self._text_to_html(text), timings)
                    path = output_dir / f"{base_name}.html"
                    _timed_stage_void("write_html", lambda: path.write_text(html, encoding="utf-8"), timings)
                    outputs.append(str(path))

            if "json" in formats:
                data, _ = _timed_stage(
                    "extract_json",
                    lambda: self._extract_json(document, page_indices, source=str(input_path)),
                    timings,
                )
                path = output_dir / f"{base_name}.json"
                _timed_stage_void(
                    "write_json",
                    lambda: path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"),
                    timings,
                )
                outputs.append(str(path))

            return outputs
        finally:
            document.close()

    def _extract_text(self, document: Any, page_indices: Sequence[int]) -> str:
        """Join the text of the selected 0-based pages with blank lines.

        Best effort: a page that raises during extraction, or yields no text,
        is skipped.
        """
        parts: list[str] = []
        for idx in page_indices:
            try:
                page_text = document[idx].get_textpage().get_text_bounded()
            except Exception:
                # Deliberate best effort: one bad page must not abort the run.
                continue
            if page_text:
                parts.append(page_text)
        return "\n\n".join(parts)

    def _text_to_markdown(self, text: str) -> str:
        """Turn blank-line separated paragraphs into single-line Markdown paragraphs."""
        lines: list[str] = []
        for paragraph in text.split("\n\n"):
            paragraph = paragraph.strip().replace("\n", " ")
            if paragraph:
                lines.append(paragraph)
                lines.append("")
        return "\n".join(lines)

    def _text_to_html(self, text: str) -> str:
        """Wrap each paragraph in ``<p>`` and turn line breaks into ``<br/>``.

        The text is HTML-escaped first, so ``<``, ``>`` and ``&`` from the PDF
        cannot be read as markup.
        """
        from html import escape  # local import: not a top-level import of this module

        paragraphs = [
            escape(p.strip()).replace("\n", "<br/>")
            for p in text.split("\n\n")
            if p.strip()
        ]
        body = "\n".join(f"<p>{p}</p>" for p in paragraphs)
        return f"<!DOCTYPE html>\n<html><body>\n{body}\n</body></html>\n"

    def _extract_json(
        self,
        document: Any,
        page_indices: Sequence[int],
        source: str | None = None,
    ) -> dict[str, Any]:
        """Collect the text of each selected page.

        ``page_indices`` are 0-based. Both the per-page ``page`` field and
        ``extracted_pages`` are reported 1-based so the two agree. A page that
        fails is still listed, with empty text and an ``error`` message.

        Args:
            document: The open pypdfium2 document.
            page_indices: 0-based indices of the pages to read.
            source: Value for the ``source`` field; falls back to
                ``str(document)`` when omitted.
        """
        pages = []
        for idx in page_indices:
            try:
                textpage = document[idx].get_textpage()
                text = textpage.get_text_bounded()
                pages.append({"page": idx + 1, "text": text})
            except Exception as exc:
                pages.append({"page": idx + 1, "text": "", "error": str(exc)})
        return {
            "source": source if source is not None else str(document),
            "page_count": len(document),
            "extracted_pages": [idx + 1 for idx in page_indices],
            "pages": pages,
        }
119
+
120
+
121
def _parse_page_range(pages: str, total: int) -> Sequence[int]:
    """Parse a 1-based page selection such as ``"1,3-5"`` into 0-based indices.

    Comma-separated parts are either a single page number or an inclusive
    ``start-end`` range. Indices outside ``0..total-1`` are dropped. Exactly
    the requested pages are returned, so ``"1,5"`` selects two pages rather
    than the whole span between them.

    Args:
        pages: Selection string.
        total: Number of pages in the document.

    Returns:
        The selected indices, sorted and without duplicates. When nothing
        valid was selected, ``range(total)`` (every page) is returned.

    Raises:
        ValueError: If a part is not a number or a ``start-end`` range.
    """
    chosen: set[int] = set()
    for part in pages.split(","):
        part = part.strip()
        if "-" in part:
            start, end = part.split("-", 1)
            # Clamp before building the range so a huge bound cannot
            # materialise millions of integers.
            first = max(int(start) - 1, 0)
            stop = min(int(end), total)
            chosen.update(range(first, stop))
        else:
            index = int(part) - 1
            if 0 <= index < total:
                chosen.add(index)
    # An empty selection falls back to the whole document.
    return sorted(chosen) if chosen else range(total)
133
+
134
+
135
+
136
def _timed_stage(name: str, fn: Callable[[], T], timings: dict[str, float] | None) -> tuple[T, float]:
    """Invoke ``fn`` and return its result together with the elapsed time.

    Elapsed time comes from ``time.perf_counter`` and is rounded to three
    decimals. It is stored in ``timings`` under ``name`` when ``timings`` is
    not ``None``. An exception from ``fn`` propagates and records nothing.
    """
    mark = time.perf_counter()
    returned = fn()
    spent = round(time.perf_counter() - mark, 3)
    if timings is None:
        return returned, spent
    timings[name] = spent
    return returned, spent
143
+
144
+
145
def _timed_stage_void(name: str, fn: Callable[[], object], timings: dict[str, float] | None) -> float:
    """Invoke ``fn`` for its side effect and return the elapsed time.

    Elapsed time comes from ``time.perf_counter`` and is rounded to three
    decimals. It is stored in ``timings`` under ``name`` when ``timings`` is
    not ``None``.
    """
    mark = time.perf_counter()
    fn()
    spent = round(time.perf_counter() - mark, 3)
    if timings is None:
        return spent
    timings[name] = spent
    return spent
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+
3
+ import shutil
4
+ import subprocess
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from pypdf import PdfReader
9
+
10
+
11
def extract_text(output_path: Path, *, reader: PdfReader | None = None) -> dict[str, Any]:
    """Summarise the text content of the PDF at ``output_path``.

    The ``pdftotext`` executable is tried first when it is found on ``PATH``.
    If it is missing, or anything goes wrong while running it, the text is
    read page by page with pypdf instead; pages that fail to extract are
    skipped.

    Args:
        output_path: PDF file to inspect.
        reader: Optional already-open ``PdfReader`` used for the pypdf path;
            one is created from ``output_path`` when omitted.

    Returns:
        A dict with ``has_text``, ``text_length``, ``extractor`` (either
        ``"pdftotext"`` or ``"pypdf"``) and ``text_sample`` (the first 240
        characters of the stripped text).
    """

    def _summary(content: str, extractor: str) -> dict[str, Any]:
        return {
            "has_text": bool(content),
            "text_length": len(content),
            "extractor": extractor,
            "text_sample": content[:240],
        }

    pdftotext_bin = shutil.which("pdftotext")
    if pdftotext_bin:
        try:
            completed = subprocess.run(
                [pdftotext_bin, str(output_path), "-"],
                check=True,
                capture_output=True,
                text=True,
            )
            return _summary(completed.stdout.strip(), "pdftotext")
        except Exception:
            # Any pdftotext failure falls through to the pypdf path below.
            pass

    pdf_reader = PdfReader(str(output_path)) if reader is None else reader
    chunks = []
    for page in pdf_reader.pages:
        try:
            chunks.append(page.extract_text() or "")
        except Exception:
            continue
    return _summary("\n".join(chunks).strip(), "pypdf")