epub2pdf-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. epub2pdf_cli/__init__.py +5 -0
  2. epub2pdf_cli/__main__.py +4 -0
  3. epub2pdf_cli/api.py +160 -0
  4. epub2pdf_cli/cli.py +223 -0
  5. epub2pdf_cli/config.py +109 -0
  6. epub2pdf_cli/epub/__init__.py +3 -0
  7. epub2pdf_cli/epub/chapters.py +81 -0
  8. epub2pdf_cli/epub/container.py +25 -0
  9. epub2pdf_cli/epub/href.py +24 -0
  10. epub2pdf_cli/epub/opf.py +159 -0
  11. epub2pdf_cli/epub/parser.py +64 -0
  12. epub2pdf_cli/epub/toc.py +101 -0
  13. epub2pdf_cli/errors.py +27 -0
  14. epub2pdf_cli/html/__init__.py +3 -0
  15. epub2pdf_cli/html/builder.py +190 -0
  16. epub2pdf_cli/html/css.py +49 -0
  17. epub2pdf_cli/html/links.py +144 -0
  18. epub2pdf_cli/html/template.py +92 -0
  19. epub2pdf_cli/io_utils.py +24 -0
  20. epub2pdf_cli/markdown.py +97 -0
  21. epub2pdf_cli/mcp_server.py +189 -0
  22. epub2pdf_cli/models.py +116 -0
  23. epub2pdf_cli/pdf/__init__.py +5 -0
  24. epub2pdf_cli/pdf/extract.py +79 -0
  25. epub2pdf_cli/pdf/extractors/__init__.py +0 -0
  26. epub2pdf_cli/pdf/extractors/base.py +23 -0
  27. epub2pdf_cli/pdf/extractors/docling_extractor.py +139 -0
  28. epub2pdf_cli/pdf/extractors/opendataloader_extractor.py +86 -0
  29. epub2pdf_cli/pdf/extractors/pdfplumber_extractor.py +150 -0
  30. epub2pdf_cli/pdf/extractors/pypdfium2_extractor.py +151 -0
  31. epub2pdf_cli/pdf/text.py +45 -0
  32. epub2pdf_cli/pdf/validate.py +37 -0
  33. epub2pdf_cli/pipeline/__init__.py +6 -0
  34. epub2pdf_cli/pipeline/batch.py +84 -0
  35. epub2pdf_cli/pipeline/convert.py +122 -0
  36. epub2pdf_cli/pipeline/extract.py +64 -0
  37. epub2pdf_cli/pipeline/inspect.py +15 -0
  38. epub2pdf_cli/render/__init__.py +17 -0
  39. epub2pdf_cli/render/options.py +19 -0
  40. epub2pdf_cli/render/playwright.py +91 -0
  41. epub2pdf_cli/render/protocol.py +13 -0
  42. epub2pdf_cli/render/weasyprint.py +28 -0
  43. epub2pdf_cli-0.3.0.dist-info/METADATA +443 -0
  44. epub2pdf_cli-0.3.0.dist-info/RECORD +48 -0
  45. epub2pdf_cli-0.3.0.dist-info/WHEEL +5 -0
  46. epub2pdf_cli-0.3.0.dist-info/entry_points.txt +3 -0
  47. epub2pdf_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
  48. epub2pdf_cli-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ import posixpath
4
+ import zipfile
5
+ from typing import Any
6
+ from xml.etree import ElementTree as ET
7
+
8
+ from epub2pdf_cli.errors import StageError
9
+ from epub2pdf_cli.models import CoverAsset, ManifestItem, SpineItem
10
+
11
# Namespace prefix map passed to the ElementTree find/findall calls in this
# module: "opf" qualifies package-document elements (manifest, metadata,
# spine, item, itemref, meta) and "dc" qualifies the Dublin Core metadata tags.
OPF_NS = {
    "opf": "http://www.idpf.org/2007/opf",
    "dc": "http://purl.org/dc/elements/1.1/",
}
15
+
16
+
17
def read_required(archive: zipfile.ZipFile, path: str, *, stage: str) -> bytes:
    """Return the raw bytes stored under ``path`` inside ``archive``.

    Args:
        archive: Open ZIP archive to read from.
        path: Member name inside the archive.
        stage: Pipeline stage label attached to the error on failure.

    Raises:
        StageError: If the archive has no member named ``path``.
    """
    try:
        payload = archive.read(path)
    except KeyError as missing:
        raise StageError(stage, f"Missing required EPUB resource: {path}") from missing
    return payload
22
+
23
+
24
def parse_opf(archive: zipfile.ZipFile, rootfile_path: str) -> tuple[ET.Element, str]:
    """Load and parse the package document stored at ``rootfile_path``.

    Returns:
        A ``(package, opf_dir)`` pair: the parsed root element and the POSIX
        directory part of ``rootfile_path`` (empty string when the package
        document sits at the archive root).

    Raises:
        StageError: With stage ``"opf"`` when the member is missing or is not
            well-formed XML.
    """
    raw_opf = read_required(archive, rootfile_path, stage="opf")
    try:
        root = ET.fromstring(raw_opf)
    except ET.ParseError as parse_error:
        raise StageError("opf", f"Unable to parse package document: {rootfile_path}") from parse_error
    return root, posixpath.dirname(rootfile_path)
32
+
33
+
34
def read_manifest(
    archive: zipfile.ZipFile,
    package: ET.Element,
    opf_dir: str,
    warnings: list[str],
) -> dict[str, ManifestItem]:
    """Build the manifest lookup (item id -> ManifestItem) and load each resource.

    Items lacking an ``id``, ``href`` or ``media-type`` attribute are skipped.
    Each href is joined onto ``opf_dir`` and normalized to an archive path.
    When the archive has no such member, the item is still recorded with empty
    content and a message is appended to ``warnings``.

    Raises:
        StageError: With stage ``"opf"`` when the package has no manifest element.
    """
    manifest_node = package.find("opf:manifest", OPF_NS)
    if manifest_node is None:
        raise StageError("opf", "Package document is missing a manifest")

    manifest: dict[str, ManifestItem] = {}
    for entry in manifest_node.findall("opf:item", OPF_NS):
        attrs = entry.attrib
        item_id = attrs.get("id")
        href = attrs.get("href")
        media_type = attrs.get("media-type")
        if not (item_id and href and media_type):
            continue

        archive_path = posixpath.normpath(posixpath.join(opf_dir, href))
        try:
            payload = archive.read(archive_path)
        except KeyError:
            # Keep the entry so later lookups by id still resolve; only the bytes are absent.
            payload = b""
            warnings.append(f"Missing manifest resource: {archive_path}")

        manifest[item_id] = ManifestItem(
            id=item_id,
            href=archive_path,
            media_type=media_type,
            properties=tuple(attrs.get("properties", "").split()),
            fallback=attrs.get("fallback"),
            content=payload,
        )
    return manifest
67
+
68
+
69
def read_metadata(package: ET.Element) -> dict[str, Any]:
    """Collect Dublin Core metadata from the package document.

    The result always has the same ten keys. ``title``, ``language`` and
    ``publisher`` hold the first non-blank value (or ``""``); every other key
    holds the list of all non-blank values in document order. When the package
    has no metadata element the empty defaults are returned.
    """
    metadata: dict[str, Any] = {
        "title": "",
        "language": "",
        "creators": [],
        "identifiers": [],
        "publisher": "",
        "dates": [],
        "subjects": [],
        "descriptions": [],
        "contributors": [],
        "rights": [],
    }
    metadata_node = package.find("opf:metadata", OPF_NS)
    if metadata_node is None:
        return metadata

    def non_blank(tag: str) -> list[str]:
        stripped = ((node.text or "").strip() for node in metadata_node.findall(f"dc:{tag}", OPF_NS))
        return [value for value in stripped if value]

    # Single-valued fields: the key doubles as the dc tag name.
    for key in ("title", "language", "publisher"):
        found = non_blank(key)
        metadata[key] = found[0] if found else ""

    # Multi-valued fields: (result key, dc tag name).
    multi_valued = (
        ("creators", "creator"),
        ("identifiers", "identifier"),
        ("subjects", "subject"),
        ("descriptions", "description"),
        ("contributors", "contributor"),
        ("rights", "rights"),
        ("dates", "date"),
    )
    for key, tag in multi_valued:
        metadata[key] = non_blank(tag)
    return metadata
108
+
109
+
110
def read_cover_asset(
    package: ET.Element,
    manifest: dict[str, ManifestItem],
) -> CoverAsset | None:
    """Locate the cover image, if the book declares one with loaded content.

    Lookup order:
      1. ``<meta name="cover" content="ID">`` entries, resolved by manifest id.
      2. The first manifest item whose properties include ``cover-image``.

    Candidates with empty content are ignored. Returns ``None`` when nothing
    usable is found.
    """

    def as_cover(item: ManifestItem) -> CoverAsset:
        return CoverAsset(href=item.href, media_type=item.media_type, content=item.content)

    metadata_node = package.find("opf:metadata", OPF_NS)
    if metadata_node is not None:
        for meta in metadata_node.findall("opf:meta", OPF_NS):
            if meta.attrib.get("name") != "cover":
                continue
            declared = manifest.get(meta.attrib.get("content") or "")
            if declared and declared.content:
                return as_cover(declared)

    flagged = next(
        (item for item in manifest.values() if "cover-image" in item.properties and item.content),
        None,
    )
    return None if flagged is None else as_cover(flagged)
127
+
128
+
129
def read_spine(package: ET.Element, manifest: dict[str, ManifestItem]) -> list[SpineItem]:
    """Resolve the spine's itemrefs against the manifest, in reading order.

    Itemrefs without an ``idref`` are skipped. An item is linear unless its
    ``linear`` attribute equals ``no`` (case-insensitive).

    Raises:
        StageError: With stage ``"spine"`` when the spine element is missing,
            an idref is not in the manifest, or no items remain.
    """
    spine_node = package.find("opf:spine", OPF_NS)
    if spine_node is None:
        raise StageError("spine", "Package document is missing a spine")

    entries: list[SpineItem] = []
    for ref in spine_node.findall("opf:itemref", OPF_NS):
        idref = ref.attrib.get("idref")
        if not idref:
            continue
        target = manifest.get(idref)
        if target is None:
            raise StageError("spine", f"Spine references missing manifest item: {idref}")
        is_linear = ref.attrib.get("linear", "yes").lower() != "no"
        entries.append(
            SpineItem(
                idref=idref,
                href=target.href,
                media_type=target.media_type,
                linear=is_linear,
            )
        )

    if entries:
        return entries
    raise StageError("spine", "Spine does not contain any readable items")
153
+
154
+
155
def get_toc_id(package: ET.Element) -> str | None:
    """Return the spine's ``toc`` attribute, or ``None`` if the spine or the attribute is absent."""
    spine_node = package.find("opf:spine", OPF_NS)
    return None if spine_node is None else spine_node.attrib.get("toc")
@@ -0,0 +1,64 @@
1
+ from __future__ import annotations
2
+
3
+ import zipfile
4
+ from pathlib import Path
5
+
6
+ from epub2pdf_cli.epub.chapters import manifest_warnings, read_chapters
7
+ from epub2pdf_cli.epub.container import read_rootfile_path
8
+ from epub2pdf_cli.epub.opf import (
9
+ get_toc_id,
10
+ parse_opf,
11
+ read_cover_asset,
12
+ read_manifest,
13
+ read_metadata,
14
+ read_spine,
15
+ )
16
+ from epub2pdf_cli.epub.toc import read_toc
17
+ from epub2pdf_cli.errors import ExitCode, StageError
18
+ from epub2pdf_cli.models import EpubBook, ManifestItem
19
+
20
+
21
def read_epub(input_path: Path) -> EpubBook:
    """Open the EPUB at ``input_path`` and parse it into an EpubBook.

    Reads, in order: the container rootfile path, the package document, the
    manifest, metadata, cover, spine, chapters and table of contents. Warnings
    from the manifest, chapter and TOC steps are gathered into one list on the
    returned book.

    Raises:
        StageError: With stage ``"container"`` and ``ExitCode.USAGE`` when the
            file does not exist or is not a ZIP archive; the parsing helpers
            raise their own StageErrors.
    """
    try:
        archive = zipfile.ZipFile(input_path)
    except FileNotFoundError as exc:
        message = f"Input file does not exist: {input_path}"
        raise StageError("container", message, exit_code=ExitCode.USAGE) from exc
    except zipfile.BadZipFile as exc:
        message = f"Input is not a valid EPUB/ZIP archive: {input_path}"
        raise StageError("container", message, exit_code=ExitCode.USAGE) from exc

    with archive:
        rootfile_path = read_rootfile_path(archive)
        package, opf_dir = parse_opf(archive, rootfile_path)

        warnings: list[str] = []
        manifest = read_manifest(archive, package, opf_dir, warnings)
        metadata = read_metadata(package)
        cover = read_cover_asset(package, manifest)
        spine = read_spine(package, manifest)

        chapters, chapter_warnings = read_chapters(spine, manifest)
        warnings.extend(chapter_warnings)
        warnings.extend(manifest_warnings(manifest))

        # The nav document is the first manifest item flagged "nav"; the NCX
        # is whichever item the spine's toc attribute points at.
        nav_item = next((item for item in manifest.values() if "nav" in item.properties), None)
        ncx_item = _find_ncx_item(manifest, get_toc_id(package))
        toc = read_toc(nav_item=nav_item, ncx_item=ncx_item, warnings=warnings)

        book = EpubBook(
            source_path=str(input_path),
            rootfile_path=rootfile_path,
            metadata=metadata,
            manifest=manifest,
            spine=spine,
            chapters=chapters,
            toc=toc,
            warnings=warnings,
            cover=cover,
        )
    return book
59
+
60
+
61
def _find_ncx_item(manifest: dict[str, ManifestItem], toc_id: str | None) -> ManifestItem | None:
    """Return the manifest item registered under ``toc_id``, or ``None`` when the id is empty or unknown."""
    return manifest.get(toc_id) if toc_id else None
@@ -0,0 +1,101 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+ from xml.etree import ElementTree as ET
5
+
6
+ from bs4 import BeautifulSoup
7
+
8
+ from epub2pdf_cli.epub.href import resolve_relative_href
9
+ from epub2pdf_cli.models import ManifestItem, TocEntry
10
+
11
# Namespace prefix map for the ElementTree queries against NCX documents below.
NCX_NS = {"ncx": "http://www.daisy.org/z3986/2005/ncx/"}
12
+
13
+
14
def read_toc(
    nav_item: ManifestItem | None,
    ncx_item: ManifestItem | None,
    warnings: list[str],
) -> list[TocEntry]:
    """Return the book's table of contents, preferring the nav document over the NCX.

    A source is consulted only when it is present and has content. If a
    consulted source yields no entries, a message is appended to ``warnings``
    and the next source is tried. Returns an empty list when neither source
    produces entries.
    """
    nav_content = nav_item.content if nav_item else None
    if nav_content:
        entries = _parse_nav_document(nav_content, nav_item.href)
        if entries:
            return entries
        warnings.append("EPUB nav document did not contain a usable toc")

    ncx_content = ncx_item.content if ncx_item else None
    if ncx_content:
        entries = _parse_ncx_document(ncx_content, ncx_item.href)
        if entries:
            return entries
        warnings.append("NCX document did not contain a usable toc")

    return []
32
+
33
+
34
def _parse_nav_document(content: bytes, base_href: str) -> list[TocEntry]:
    """Extract TOC entries from a nav document.

    Picks the first ``<nav>`` whose ``epub:type`` (or ``type``) attribute
    contains the token ``toc``, falling back to the first ``<nav>`` in the
    document. Entries come from the first ``<ol>``/``<ul>`` inside that nav.
    Returns an empty list when parsing fails or no nav/list is found.
    """
    try:
        soup = BeautifulSoup(content, "lxml")
    except Exception:
        return []

    nav_nodes = soup.find_all("nav")
    if not nav_nodes:
        return []

    def declares_toc(node: Any) -> bool:
        declared = str(node.get("epub:type") or node.get("type") or "")
        return "toc" in declared.split()

    chosen = next((node for node in nav_nodes if declares_toc(node)), nav_nodes[0])
    top_list = chosen.find(["ol", "ul"])
    if not top_list:
        return []
    return _parse_nav_list(top_list, base_href)
53
+
54
+
55
def _nav_item_label(li: Any) -> str:
    """Return the text of an ``<li>`` itself, excluding any nested ``<ol>``/``<ul>``."""
    parts: list[str] = []
    for child in li.children:
        if getattr(child, "name", None) in ("ol", "ul"):
            continue
        # Tags (and strings, on current bs4 releases) expose get_text; fall
        # back to the plain string value otherwise.
        if hasattr(child, "get_text"):
            text = child.get_text(" ", strip=True)
        else:
            text = str(child).strip()
        if text:
            parts.append(text)
    return " ".join(parts)


def _parse_nav_list(list_node: Any, base_href: str) -> list[TocEntry]:
    """Convert one nav ``<ol>``/``<ul>`` into TocEntry objects, recursing into nested lists.

    Only direct ``<li>`` children are read. For an item with a direct ``<a>``
    child, the title is the link text and the href is the link target resolved
    against ``base_href``. An item without a link keeps ``base_href`` as its
    href and takes its title from its own content only: text belonging to a
    nested list is excluded so that an unlinked heading (typically a
    ``<span>`` followed by an ``<ol>``) does not absorb its children's titles.

    Args:
        list_node: Parsed list element (a BeautifulSoup tag).
        base_href: Archive path of the nav document.
    """
    entries: list[TocEntry] = []
    for li in list_node.find_all("li", recursive=False):
        link = li.find("a", recursive=False)
        href = base_href
        if link:
            title = link.get_text(" ", strip=True)
            href = resolve_relative_href(base_href, link.get("href") or "")
        else:
            title = _nav_item_label(li)
        child_list = li.find(["ol", "ul"], recursive=False)
        entries.append(
            TocEntry(
                title=title,
                href=href,
                children=_parse_nav_list(child_list, base_href) if child_list else [],
            )
        )
    return entries
75
+
76
+
77
def _parse_ncx_document(content: bytes, base_href: str) -> list[TocEntry]:
    """Extract TOC entries from an NCX document.

    Returns one entry per top-level ``navPoint`` under ``navMap``, or an empty
    list when the XML is malformed or has no ``navMap``.
    """
    try:
        ncx_root = ET.fromstring(content)
    except ET.ParseError:
        return []

    nav_map = ncx_root.find("ncx:navMap", NCX_NS)
    top_level = [] if nav_map is None else nav_map.findall("ncx:navPoint", NCX_NS)
    return [_parse_navpoint(point, base_href) for point in top_level]
87
+
88
+
89
def _parse_navpoint(node: ET.Element, base_href: str) -> TocEntry:
    """Convert one NCX ``navPoint`` (and its nested navPoints) into a TocEntry.

    The title is the stripped ``navLabel/text`` content, or ``""`` when
    absent. The href is the ``content`` element's ``src`` (``""`` when absent)
    resolved against ``base_href``.
    """
    label = node.find("ncx:navLabel/ncx:text", NCX_NS)
    target = node.find("ncx:content", NCX_NS)

    title = ""
    if label is not None and label.text:
        title = label.text.strip()

    src = "" if target is None else target.attrib.get("src", "")
    href = resolve_relative_href(base_href, src)

    nested = [_parse_navpoint(child, base_href) for child in node.findall("ncx:navPoint", NCX_NS)]
    return TocEntry(title=title, href=href, children=nested)
epub2pdf_cli/errors.py ADDED
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import IntEnum
4
+
5
+
6
class ExitCode(IntEnum):
    """Integer exit codes carried by the package's error types."""

    OK = 0
    UNEXPECTED = 1  # default exit code of Epub2PdfError
    USAGE = 2
    STAGE = 3  # default exit code of StageError
    OUTPUT_EXISTS = 5
12
+
13
+
14
class Epub2PdfError(Exception):
    """Root of the package's exception hierarchy; carries the exit code to report."""

    def __init__(self, message: str, *, exit_code: ExitCode = ExitCode.UNEXPECTED) -> None:
        # Stored on the instance so callers can read it after catching the error.
        self.exit_code = exit_code
        super().__init__(message)
20
+
21
+
22
class StageError(Epub2PdfError):
    """Failure attributed to a named pipeline stage.

    The message is prefixed with ``[stage]`` and the stage name is kept on
    ``self.stage``.
    """

    def __init__(self, stage: str, message: str, *, exit_code: ExitCode = ExitCode.STAGE) -> None:
        self.stage = stage
        tagged_message = f"[{stage}] {message}"
        super().__init__(tagged_message, exit_code=exit_code)
@@ -0,0 +1,3 @@
1
+ from epub2pdf_cli.html.builder import BuildResult, build_html
2
+
3
+ __all__ = ["BuildResult", "build_html"]
@@ -0,0 +1,190 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from html import escape
5
+ from typing import Any, cast
6
+
7
+ from bs4 import BeautifulSoup
8
+
9
+ from epub2pdf_cli.config import ConvertConfig
10
+ from epub2pdf_cli.html.css import rewrite_css_item
11
+ from epub2pdf_cli.html.links import (
12
+ render_toc_items,
13
+ rewrite_resources,
14
+ )
15
+ from epub2pdf_cli.html.template import base_css, wrap_document
16
+ from epub2pdf_cli.models import Chapter, EpubBook
17
+
18
+
19
@dataclass(frozen=True, slots=True)
class BuildResult:
    """Immutable bundle returned by build_html."""

    # Complete HTML document produced by wrap_document.
    html: str
    # One dict per chapter: chapter.to_dict() plus "section_id" and "anchors".
    chapters: list[dict[str, Any]]
    # Records of resources embedded during the build (href, media_type, rewritten_as, usage).
    assets: list[dict[str, Any]]
    # Messages collected while rewriting stylesheets and chapter resources.
    warnings: list[str]
25
+
26
+
27
def build_html(book: EpubBook, config: ConvertConfig) -> BuildResult:
    """Assemble the whole book into one HTML document.

    Body sections are emitted in this order: the optional cover page (when
    ``config.cover == "first"`` and the book has a cover), the generated table
    of contents (when the book has TOC entries that render to something), then
    one section per chapter. Manifest stylesheets are rewritten and placed
    after the base page CSS.

    Args:
        book: Parsed EPUB.
        config: Conversion settings; ``cover``, ``page_size`` and ``margin_mm``
            are read here.

    Returns:
        BuildResult with the document HTML, per-chapter sidecar records, the
        embedded-asset records and any warnings raised while rewriting.
    """
    chapter_lookup = {chapter.href: chapter for chapter in book.chapters}
    chapter_section_ids = {chapter.href: f"chapter-{index + 1}" for index, chapter in enumerate(book.chapters)}
    soups = {chapter.href: BeautifulSoup(chapter.html, "lxml") for chapter in book.chapters}
    element_id_map = _build_element_id_map(soups)
    assets: dict[str, dict[str, Any]] = {}
    warnings: list[str] = []

    stylesheet_blocks = [
        rewrite_css_item(item.href, item.content.decode("utf-8", errors="replace"), book, assets, warnings)
        for item in book.manifest.values()
        if item.media_type == "text/css" and item.content
    ]
    stylesheet_blocks = [block for block in stylesheet_blocks if block.strip()]

    rendered_sections: list[str] = []
    sidecar_chapters: list[dict[str, Any]] = []

    if config.cover == "first" and book.cover is not None:
        cover_src = _data_uri(book.cover.content, book.cover.media_type)
        assets[book.cover.href] = {
            "href": book.cover.href,
            "media_type": book.cover.media_type,
            "rewritten_as": "data-uri",
            "usage": "cover",
        }
        # The media type comes straight from the EPUB manifest, so the URI is
        # escaped before it is placed inside the attribute; a well-formed data
        # URI passes through unchanged.
        rendered_sections.append(
            "\n".join(
                [
                    '<section class="epub-cover page-break" id="cover-page">',
                    f'<img alt="Cover image" src="{escape(cover_src)}" />',
                    "</section>",
                ]
            )
        )

    if book.toc:
        toc_section = _render_generated_toc(book.toc, chapter_section_ids, element_id_map)
        # _render_generated_toc returns "" when no entry renders; don't emit
        # an empty body section for it.
        if toc_section:
            rendered_sections.append(toc_section)

    for index, chapter in enumerate(book.chapters, start=1):
        section_id = chapter_section_ids[chapter.href]
        section_html, chapter_info = _render_chapter(
            chapter,
            soup=soups[chapter.href],
            chapter_index=index,
            section_id=section_id,
            chapter_lookup=chapter_lookup,
            chapter_section_ids=chapter_section_ids,
            element_id_map=element_id_map,
            book=book,
            assets=assets,
            warnings=warnings,
        )
        rendered_sections.append(section_html)
        sidecar_chapters.append(chapter_info)

    title = book.metadata.get("title") or "Untitled EPUB"
    author = ", ".join(book.metadata.get("creators", []))

    html = wrap_document(
        title=title,
        language=book.metadata.get("language", ""),
        author=author,
        stylesheets=[base_css(config.page_size, config.margin_mm), *stylesheet_blocks],
        body_sections=rendered_sections,
    )

    return BuildResult(
        html=html,
        chapters=sidecar_chapters,
        assets=list(assets.values()),
        warnings=warnings,
    )
100
+
101
+
102
def _build_element_id_map(soups: dict[str, BeautifulSoup]) -> dict[tuple[str, str], str]:
    """Map each ``(chapter href, original element id)`` to a document-unique id.

    The new id is ``chapter-<n>-<original>``, where ``n`` is the 1-based
    position of the chapter in ``soups``. Elements with an empty id are left
    out.
    """
    mapping: dict[tuple[str, str], str] = {}
    for position, (href, soup) in enumerate(soups.items(), start=1):
        prefix = f"chapter-{position}-"
        element_ids = (cast(str, node.get("id")) for node in soup.find_all(id=True))
        mapping.update({(href, element_id): prefix + element_id for element_id in element_ids if element_id})
    return mapping
111
+
112
+
113
def _render_chapter(
    chapter: Chapter,
    *,
    soup: BeautifulSoup,
    chapter_index: int,
    section_id: str,
    chapter_lookup: dict[str, Chapter],
    chapter_section_ids: dict[str, str],
    element_id_map: dict[tuple[str, str], str],
    book: EpubBook,
    assets: dict[str, dict[str, Any]],
    warnings: list[str],
) -> tuple[str, dict[str, Any]]:
    """Render one chapter as a ``<section>`` and build its sidecar record.

    ``soup`` is modified in place: stylesheet links are removed, element ids
    are replaced by their mapped values, and resource references are rewritten
    by ``rewrite_resources``.

    Returns:
        ``(section_html, chapter_info)``. ``chapter_info`` is
        ``chapter.to_dict()`` extended with ``section_id`` and the sorted list
        of mapped anchor ids belonging to this chapter.
    """
    # Drop <link> elements whose first rel value is "stylesheet".
    for link in soup.find_all("link"):
        if (link.get("rel") or [""])[0].lower() == "stylesheet":
            link.decompose()

    body = soup.body
    if body is None:
        body = soup
        # If there is no body, avoid wrapping the entire document including head
        for tag in list(body.find_all()):
            if tag.name in {"head", "title", "meta", "link", "style", "script"}:
                tag.decompose()

    # Swap each element id for its mapped value; ids without a mapping are kept.
    for node in body.find_all(id=True):
        original = cast(str, node.get("id"))
        if not original:
            continue
        node["id"] = element_id_map.get((chapter.href, original), original)

    rewrite_resources(body, chapter.href, chapter_lookup, chapter_section_ids, element_id_map, book, assets, warnings)
    # Chapters without a title get a positional placeholder.
    title = chapter.title or f"Chapter {chapter_index}"
    chapter_info = chapter.to_dict()
    chapter_info.update(
        {
            "section_id": section_id,
            "anchors": sorted(
                mapped_id for (href, _), mapped_id in element_id_map.items() if href == chapter.href
            ),
        }
    )
    # Only the body's children are emitted, wrapped in a new section with a generated heading.
    section_html = "\n".join(
        [
            f'<section class="epub-chapter page-break" id="{escape(section_id)}" data-source-href="{escape(chapter.href)}">',
            f'<h1 class="chapter-title">{escape(title)}</h1>',
            "".join(str(child) for child in body.contents),
            "</section>",
        ]
    )
    return section_html, chapter_info
164
+
165
+
166
def _render_generated_toc(
    toc: list[Any],
    chapter_section_ids: dict[str, str],
    element_id_map: dict[tuple[str, str], str],
) -> str:
    """Wrap the rendered TOC list items in the generated table-of-contents section.

    Returns ``""`` when ``render_toc_items`` produces nothing.
    """
    list_markup = render_toc_items(toc, chapter_section_ids, element_id_map)
    if list_markup:
        section_lines = (
            '<section class="generated-toc page-break" id="generated-toc">',
            "<h1>Table of Contents</h1>",
            "<nav>",
            f"<ol>{list_markup}</ol>",
            "</nav>",
            "</section>",
        )
        return "\n".join(section_lines)
    return ""
184
+
185
+
186
def _data_uri(content: bytes, media_type: str) -> str:
    """Return ``content`` as a base64 ``data:`` URI labelled with ``media_type``."""
    from base64 import b64encode

    payload = str(b64encode(content), "ascii")
    return "data:" + media_type + ";base64," + payload
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any
5
+
6
+ from epub2pdf_cli.epub.href import split_href
7
+ from epub2pdf_cli.models import EpubBook
8
+
9
# Matches a CSS url(...) token whose target is optionally wrapped in a matching
# pair of single or double quotes; the "target" group captures the reference.
URL_PATTERN = re.compile(r"url\((?P<quote>['\"]?)(?P<target>[^)'\"]+)(?P=quote)\)")
# Prefixes of url() targets that rewrite_css_item returns unchanged.
DATA_SCHEMES = ("http://", "https://", "mailto:", "data:")
11
+
12
+
13
def rewrite_css_item(
    css_href: str,
    css_text: str,
    book: EpubBook,
    assets: dict[str, dict[str, Any]],
    warnings: list[str],
) -> str:
    """Inline the resources referenced by ``url(...)`` tokens in a stylesheet.

    Targets starting with one of ``DATA_SCHEMES``, and targets with no path
    part, are left as written. Every other target is resolved relative to the
    stylesheet's directory and looked up in ``book.manifest_by_href``. A hit
    with content is replaced by a data URI and recorded in ``assets``; a miss
    is replaced by ``url()`` and a message is appended to ``warnings``.

    Returns:
        The stylesheet text with the substitutions applied.
    """
    import posixpath

    css_dir = posixpath.dirname(css_href)

    def replace(match: re.Match[str]) -> str:
        untouched = match.group(0)
        target = match.group("target").strip()
        if target.startswith(DATA_SCHEMES):
            return untouched
        path, _fragment = split_href(target)
        if not path:
            return untouched

        resolved = posixpath.normpath(posixpath.join(css_dir, path))
        item = book.manifest_by_href.get(resolved)
        if item and item.content:
            assets[resolved] = {
                "href": resolved,
                "media_type": item.media_type,
                "rewritten_as": "data-uri",
                "usage": "css-url",
            }
            return f"url('{_data_uri(item.content, item.media_type)}')"

        warnings.append(f"Missing CSS asset during normalization: {resolved}")
        return "url()"

    return URL_PATTERN.sub(replace, css_text)
43
+
44
+
45
def _data_uri(content: bytes, media_type: str) -> str:
    """Encode ``content`` as a base64 ``data:`` URI carrying ``media_type``."""
    import base64

    b64_text = base64.b64encode(content).decode("ascii")
    return ";base64,".join((f"data:{media_type}", b64_text))