epub-generator 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epub_generator-0.1.5 → epub_generator-0.1.7}/PKG-INFO +1 -1
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/__init__.py +3 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/context.py +5 -2
- epub_generator-0.1.7/epub_generator/generation/__init__.py +1 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/generation/gen_asset.py +20 -15
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/generation/gen_chapter.py +6 -6
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/generation/gen_content.py +3 -4
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/generation/gen_epub.py +9 -31
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/generation/gen_toc.py +6 -11
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/generation/xml_utils.py +16 -20
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/html_tag.py +4 -2
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/types.py +22 -3
- epub_generator-0.1.7/epub_generator/validate.py +226 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/pyproject.toml +2 -1
- epub_generator-0.1.5/epub_generator/generation/__init__.py +0 -1
- {epub_generator-0.1.5 → epub_generator-0.1.7}/LICENSE +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/README.md +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/data/container.xml.jinja +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/data/content.opf.jinja +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/data/cover.xhtml.jinja +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/data/mimetype.jinja +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/data/nav.xhtml.jinja +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/data/part.xhtml.jinja +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/data/style.css.jinja +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/generation/gen_nav.py +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/i18n.py +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/options.py +0 -0
- {epub_generator-0.1.5 → epub_generator-0.1.7}/epub_generator/template.py +0 -0
|
@@ -17,10 +17,13 @@ from .types import (
|
|
|
17
17
|
TextKind,
|
|
18
18
|
TocItem,
|
|
19
19
|
)
|
|
20
|
+
from .validate import InvalidUnicodeError
|
|
20
21
|
|
|
21
22
|
__all__ = [
|
|
22
23
|
# Main API function
|
|
23
24
|
"generate_epub",
|
|
25
|
+
# Validation
|
|
26
|
+
"InvalidUnicodeError",
|
|
24
27
|
# Options
|
|
25
28
|
"TableRender",
|
|
26
29
|
"LaTeXRender",
|
|
@@ -18,6 +18,7 @@ class _AssetNode:
|
|
|
18
18
|
media_type: str
|
|
19
19
|
content_hash: str
|
|
20
20
|
|
|
21
|
+
|
|
21
22
|
class Context:
|
|
22
23
|
def __init__(
|
|
23
24
|
self,
|
|
@@ -55,7 +56,7 @@ class Context:
|
|
|
55
56
|
nodes = list(self._hash_to_node.values())
|
|
56
57
|
nodes.sort(key=lambda node: node.file_name)
|
|
57
58
|
return [(node.file_name, node.media_type) for node in nodes]
|
|
58
|
-
|
|
59
|
+
|
|
59
60
|
@property
|
|
60
61
|
def chapters_with_mathml(self) -> set[str]:
|
|
61
62
|
return self._chapters_with_mathml
|
|
@@ -117,6 +118,7 @@ class Context:
|
|
|
117
118
|
)
|
|
118
119
|
return file_name
|
|
119
120
|
|
|
121
|
+
|
|
120
122
|
class Template:
|
|
121
123
|
def __init__(self):
|
|
122
124
|
templates_path = cast(Path, files("epub_generator")) / "data"
|
|
@@ -134,7 +136,8 @@ class Template:
|
|
|
134
136
|
self._templates[name] = template
|
|
135
137
|
return template
|
|
136
138
|
|
|
139
|
+
|
|
137
140
|
def _sha256_hash(data: bytes) -> str:
|
|
138
141
|
hash256 = sha256()
|
|
139
142
|
hash256.update(data)
|
|
140
|
-
return hash256.hexdigest()
|
|
143
|
+
return hash256.hexdigest()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .gen_epub import generate_epub
|
|
@@ -22,13 +22,15 @@ _MEDIA_TYPE_MAP = {
|
|
|
22
22
|
|
|
23
23
|
def render_inline_formula(context: Context, formula: Formula) -> Element | None:
|
|
24
24
|
return _render_formula(
|
|
25
|
-
context=context,
|
|
26
|
-
formula=formula,
|
|
25
|
+
context=context,
|
|
26
|
+
formula=formula,
|
|
27
27
|
inline_mode=True,
|
|
28
28
|
)
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
def render_asset_block(
|
|
31
|
+
def render_asset_block(
|
|
32
|
+
context: Context, block: Table | Formula | Image
|
|
33
|
+
) -> Element | None:
|
|
32
34
|
element: Element | None = None
|
|
33
35
|
if isinstance(block, Table):
|
|
34
36
|
element = _render_table(context, block)
|
|
@@ -44,17 +46,17 @@ def _render_table(context: Context, table: Table) -> Element | None:
|
|
|
44
46
|
return None
|
|
45
47
|
|
|
46
48
|
return _wrap_asset_content(
|
|
47
|
-
context=context,
|
|
48
|
-
asset=table,
|
|
49
|
+
context=context,
|
|
50
|
+
asset=table,
|
|
49
51
|
content_element=render_html_tag(context, table.html_content),
|
|
50
52
|
)
|
|
51
53
|
|
|
52
54
|
|
|
53
55
|
def _render_formula(
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
56
|
+
context: Context,
|
|
57
|
+
formula: Formula,
|
|
58
|
+
inline_mode: bool,
|
|
59
|
+
) -> Element | None:
|
|
58
60
|
|
|
59
61
|
if context.latex_render == LaTeXRender.CLIPPING:
|
|
60
62
|
return None
|
|
@@ -88,7 +90,7 @@ def _render_formula(
|
|
|
88
90
|
|
|
89
91
|
return _wrap_asset_content(
|
|
90
92
|
context=context,
|
|
91
|
-
asset=formula,
|
|
93
|
+
asset=formula,
|
|
92
94
|
content_element=content_element,
|
|
93
95
|
inline_mode=inline_mode,
|
|
94
96
|
)
|
|
@@ -106,11 +108,12 @@ def _process_image(context: Context, image: Image) -> Element:
|
|
|
106
108
|
img_element.set("alt", "") # Empty alt text, use caption instead
|
|
107
109
|
|
|
108
110
|
return _wrap_asset_content(
|
|
109
|
-
context=context,
|
|
110
|
-
asset=image,
|
|
111
|
+
context=context,
|
|
112
|
+
asset=image,
|
|
111
113
|
content_element=img_element,
|
|
112
114
|
)
|
|
113
115
|
|
|
116
|
+
|
|
114
117
|
def _normalize_expression(expression: str) -> str:
|
|
115
118
|
expression = expression.replace("\n", "")
|
|
116
119
|
expression = expression.strip()
|
|
@@ -159,7 +162,9 @@ def _latex_formula2svg(latex: str, font_size: int = 12):
|
|
|
159
162
|
plt.rc("text", usetex=True)
|
|
160
163
|
plt.rc("font", size=font_size)
|
|
161
164
|
fig, ax = plt.subplots()
|
|
162
|
-
txt = ax.text(
|
|
165
|
+
txt = ax.text(
|
|
166
|
+
0.5, 0.5, f"${latex}$", ha="center", va="center", transform=ax.transAxes
|
|
167
|
+
)
|
|
163
168
|
ax.axis("off")
|
|
164
169
|
fig.canvas.draw()
|
|
165
170
|
bbox = txt.get_window_extent(cast(Any, fig.canvas).get_renderer())
|
|
@@ -174,7 +179,7 @@ def _latex_formula2svg(latex: str, font_size: int = 12):
|
|
|
174
179
|
return output.getvalue()
|
|
175
180
|
except Exception:
|
|
176
181
|
return None
|
|
177
|
-
|
|
182
|
+
|
|
178
183
|
|
|
179
184
|
def _wrap_asset_content(
|
|
180
185
|
context: Context,
|
|
@@ -182,7 +187,7 @@ def _wrap_asset_content(
|
|
|
182
187
|
content_element: Element,
|
|
183
188
|
inline_mode: bool = False,
|
|
184
189
|
) -> Element:
|
|
185
|
-
|
|
190
|
+
|
|
186
191
|
if inline_mode:
|
|
187
192
|
wrapper = Element("span", attrib={"class": "formula-inline"})
|
|
188
193
|
else:
|
|
@@ -16,7 +16,7 @@ from .gen_asset import render_asset_block
|
|
|
16
16
|
from .gen_content import render_inline_content
|
|
17
17
|
from .xml_utils import serialize_element, set_epub_type
|
|
18
18
|
|
|
19
|
-
_MAX_HEADING_LEVEL = 6
|
|
19
|
+
_MAX_HEADING_LEVEL = 6 # HTML standard defines heading levels from h1 to h6
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
def generate_chapter(
|
|
@@ -28,15 +28,14 @@ def generate_chapter(
|
|
|
28
28
|
template="part.xhtml",
|
|
29
29
|
i18n=i18n,
|
|
30
30
|
content=[
|
|
31
|
-
serialize_element(child)
|
|
32
|
-
for child in _render_contents(context, chapter)
|
|
31
|
+
serialize_element(child) for child in _render_contents(context, chapter)
|
|
33
32
|
],
|
|
34
33
|
citations=[
|
|
35
|
-
serialize_element(child)
|
|
36
|
-
for child in _render_footnotes(context, chapter)
|
|
34
|
+
serialize_element(child) for child in _render_footnotes(context, chapter)
|
|
37
35
|
],
|
|
38
36
|
)
|
|
39
37
|
|
|
38
|
+
|
|
40
39
|
def _render_contents(
|
|
41
40
|
context: Context,
|
|
42
41
|
chapter: Chapter,
|
|
@@ -46,6 +45,7 @@ def _render_contents(
|
|
|
46
45
|
if layout is not None:
|
|
47
46
|
yield layout
|
|
48
47
|
|
|
48
|
+
|
|
49
49
|
def _render_footnotes(
|
|
50
50
|
context: Context,
|
|
51
51
|
chapter: Chapter,
|
|
@@ -115,6 +115,6 @@ def _render_content_block(context: Context, block: ContentBlock) -> Element | No
|
|
|
115
115
|
return blockquote
|
|
116
116
|
|
|
117
117
|
return container
|
|
118
|
-
|
|
118
|
+
|
|
119
119
|
else:
|
|
120
120
|
return None
|
|
@@ -6,9 +6,7 @@ from .xml_utils import set_epub_type
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def render_inline_content(
|
|
9
|
-
context: Context,
|
|
10
|
-
parent: Element,
|
|
11
|
-
content: list[str | Mark | Formula | HTMLTag]
|
|
9
|
+
context: Context, parent: Element, content: list[str | Mark | Formula | HTMLTag]
|
|
12
10
|
) -> None:
|
|
13
11
|
current_element = parent
|
|
14
12
|
for item in content:
|
|
@@ -31,6 +29,7 @@ def render_inline_content(
|
|
|
31
29
|
|
|
32
30
|
elif isinstance(item, Formula):
|
|
33
31
|
from .gen_asset import render_inline_formula # avoid circular import
|
|
32
|
+
|
|
34
33
|
formula_element = render_inline_formula(context, item)
|
|
35
34
|
if formula_element is not None:
|
|
36
35
|
parent.append(formula_element)
|
|
@@ -56,4 +55,4 @@ def render_html_tag(context: Context, tag: HTMLTag) -> Element:
|
|
|
56
55
|
for attr, value in tag.attributes:
|
|
57
56
|
element.set(attr, value)
|
|
58
57
|
render_inline_content(context, element, tag.content)
|
|
59
|
-
return element
|
|
58
|
+
return element
|
|
@@ -6,13 +6,14 @@ from uuid import uuid4
|
|
|
6
6
|
from zipfile import ZipFile
|
|
7
7
|
|
|
8
8
|
from ..context import Context, Template
|
|
9
|
-
from ..html_tag import search_content
|
|
10
9
|
from ..i18n import I18N
|
|
11
10
|
from ..options import LaTeXRender, TableRender
|
|
12
|
-
from ..types import
|
|
11
|
+
from ..types import EpubData
|
|
12
|
+
from ..validate import validate_chapter, validate_epub_data
|
|
13
13
|
from .gen_chapter import generate_chapter
|
|
14
14
|
from .gen_nav import gen_nav
|
|
15
15
|
from .gen_toc import TocPoint, gen_toc, iter_toc
|
|
16
|
+
from .xml_utils import MATHML_NS
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
def generate_epub(
|
|
@@ -23,6 +24,9 @@ def generate_epub(
|
|
|
23
24
|
latex_render: LaTeXRender = LaTeXRender.MATHML,
|
|
24
25
|
assert_not_aborted: Callable[[], None] = lambda: None,
|
|
25
26
|
) -> None:
|
|
27
|
+
# Validate epub_data for invalid Unicode characters before processing
|
|
28
|
+
validate_epub_data(epub_data)
|
|
29
|
+
|
|
26
30
|
i18n = I18N(lan)
|
|
27
31
|
template = Template()
|
|
28
32
|
epub_file_path = Path(epub_file_path)
|
|
@@ -114,12 +118,14 @@ def _write_chapters_from_data(
|
|
|
114
118
|
):
|
|
115
119
|
for file_name, get_chapter in _search_chapters(epub_data, toc_points):
|
|
116
120
|
chapter = get_chapter()
|
|
121
|
+
# Validate chapter content for invalid Unicode characters
|
|
122
|
+
validate_chapter(chapter, context=f"Chapter '{file_name}'")
|
|
117
123
|
data = generate_chapter(context, chapter, i18n)
|
|
118
124
|
context.file.writestr(
|
|
119
125
|
zinfo_or_arcname="OEBPS/Text/" + file_name,
|
|
120
126
|
data=data.encode("utf-8"),
|
|
121
127
|
)
|
|
122
|
-
if latex_render == LaTeXRender.MATHML and
|
|
128
|
+
if latex_render == LaTeXRender.MATHML and MATHML_NS in data:
|
|
123
129
|
context.mark_chapter_has_mathml(file_name)
|
|
124
130
|
assert_not_aborted()
|
|
125
131
|
|
|
@@ -131,34 +137,6 @@ def _search_chapters(epub_data: EpubData, toc_points: list[TocPoint]):
|
|
|
131
137
|
yield ref.file_name, ref.get_chapter
|
|
132
138
|
|
|
133
139
|
|
|
134
|
-
def _chapter_has_formula(chapter: Chapter) -> bool:
|
|
135
|
-
for element in chapter.elements:
|
|
136
|
-
if _content_block_has_formula(element):
|
|
137
|
-
return True
|
|
138
|
-
for footnote in chapter.footnotes:
|
|
139
|
-
for content_block in footnote.contents:
|
|
140
|
-
if _content_block_has_formula(content_block):
|
|
141
|
-
return True
|
|
142
|
-
return False
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
def _content_block_has_formula(content_block: ContentBlock) -> bool:
|
|
146
|
-
if isinstance(content_block, Formula):
|
|
147
|
-
return True
|
|
148
|
-
if isinstance(content_block, TextBlock):
|
|
149
|
-
for item in search_content(content_block.content):
|
|
150
|
-
if isinstance(item, Formula):
|
|
151
|
-
return True
|
|
152
|
-
if isinstance(content_block, BasicAsset):
|
|
153
|
-
for item in search_content(content_block.title):
|
|
154
|
-
if isinstance(item, Formula):
|
|
155
|
-
return True
|
|
156
|
-
for item in search_content(content_block.caption):
|
|
157
|
-
if isinstance(item, Formula):
|
|
158
|
-
return True
|
|
159
|
-
return False
|
|
160
|
-
|
|
161
|
-
|
|
162
140
|
def _write_basic_files(
|
|
163
141
|
context: Context,
|
|
164
142
|
i18n: I18N,
|
|
@@ -21,6 +21,7 @@ class TocPoint:
|
|
|
21
21
|
"""是否有对应的 XHTML 文件"""
|
|
22
22
|
return self.ref is not None
|
|
23
23
|
|
|
24
|
+
|
|
24
25
|
@dataclass
|
|
25
26
|
class TocPointRef:
|
|
26
27
|
part_id: str
|
|
@@ -40,10 +41,7 @@ def gen_toc(epub_data: EpubData) -> list[TocPoint]:
|
|
|
40
41
|
chapters = epub_data.chapters
|
|
41
42
|
|
|
42
43
|
toc_point_generation = _TocPointGenerator(
|
|
43
|
-
chapters_count=(
|
|
44
|
-
_count_toc_items(prefaces) +
|
|
45
|
-
_count_toc_items(chapters)
|
|
46
|
-
),
|
|
44
|
+
chapters_count=(_count_toc_items(prefaces) + _count_toc_items(chapters)),
|
|
47
45
|
)
|
|
48
46
|
toc_points: list[TocPoint] = []
|
|
49
47
|
for chapters_list in (prefaces, chapters):
|
|
@@ -91,15 +89,12 @@ class _TocPointGenerator:
|
|
|
91
89
|
file_name=f"part{part_id}.xhtml",
|
|
92
90
|
get_chapter=toc_item.get_chapter,
|
|
93
91
|
)
|
|
94
|
-
order = self._next_order
|
|
92
|
+
order = self._next_order # 确保 order 以中序遍历为顺序
|
|
95
93
|
self._next_order += 1
|
|
96
94
|
|
|
97
95
|
return TocPoint(
|
|
98
|
-
title=toc_item.title,
|
|
96
|
+
title=toc_item.title,
|
|
99
97
|
order=order,
|
|
100
|
-
ref=ref,
|
|
101
|
-
children=[
|
|
102
|
-
self._create_toc_point(child)
|
|
103
|
-
for child in toc_item.children
|
|
104
|
-
],
|
|
98
|
+
ref=ref,
|
|
99
|
+
children=[self._create_toc_point(child) for child in toc_item.children],
|
|
105
100
|
)
|
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from typing import Container
|
|
3
2
|
from xml.etree.ElementTree import Element, tostring
|
|
4
3
|
|
|
4
|
+
MATHML_NS = "http://www.w3.org/1998/Math/MathML"
|
|
5
5
|
_EPUB_NS = "http://www.idpf.org/2007/ops"
|
|
6
|
-
_MATHML_NS = "http://www.w3.org/1998/Math/MathML"
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
def set_epub_type(element: Element, epub_type: str) -> None:
|
|
10
9
|
element.set(f"{{{_EPUB_NS}}}type", epub_type)
|
|
11
10
|
|
|
11
|
+
|
|
12
12
|
def serialize_element(element: Element) -> str:
|
|
13
13
|
xml_string = tostring(element, encoding="unicode")
|
|
14
14
|
for prefix, namespace_uri, keep_xmlns in (
|
|
15
15
|
("epub", _EPUB_NS, False), # EPUB namespace: remove xmlns (declared at root)
|
|
16
|
-
("m",
|
|
16
|
+
("m", MATHML_NS, True), # MathML namespace: keep xmlns with clean prefix
|
|
17
17
|
):
|
|
18
18
|
xml_string = xml_string.replace(f"{{{namespace_uri}}}", f"{prefix}:")
|
|
19
19
|
pattern = r"xmlns:(ns\d+)=\"" + re.escape(namespace_uri) + r"\""
|
|
@@ -22,33 +22,29 @@ def serialize_element(element: Element) -> str:
|
|
|
22
22
|
for ns_prefix in matches:
|
|
23
23
|
if keep_xmlns:
|
|
24
24
|
xml_string = xml_string.replace(
|
|
25
|
-
f
|
|
26
|
-
f
|
|
25
|
+
f' xmlns:{ns_prefix}="{namespace_uri}"',
|
|
26
|
+
f' xmlns:{prefix}="{namespace_uri}"',
|
|
27
27
|
)
|
|
28
28
|
else:
|
|
29
|
-
xml_string = xml_string.replace(
|
|
29
|
+
xml_string = xml_string.replace(
|
|
30
|
+
f' xmlns:{ns_prefix}="{namespace_uri}"', ""
|
|
31
|
+
)
|
|
30
32
|
xml_string = xml_string.replace(f"{ns_prefix}:", f"{prefix}:")
|
|
31
33
|
|
|
32
34
|
return xml_string
|
|
33
35
|
|
|
34
|
-
|
|
36
|
+
|
|
37
|
+
def indent(elem: Element, level: int = 0) -> Element:
|
|
35
38
|
indent_str = " " * level
|
|
36
39
|
next_indent_str = " " * (level + 1)
|
|
37
|
-
|
|
38
|
-
if elem.tag in skip_tags:
|
|
39
|
-
if level > 0 and (not elem.tail or not elem.tail.strip()):
|
|
40
|
-
elem.tail = "\n" + indent_str
|
|
41
|
-
return elem
|
|
42
|
-
|
|
43
40
|
if len(elem):
|
|
44
41
|
if not elem.text or not elem.text.strip():
|
|
45
42
|
elem.text = "\n" + next_indent_str
|
|
46
43
|
for i, child in enumerate(elem):
|
|
47
|
-
indent(child, level + 1
|
|
48
|
-
if
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
elem.tail = "\n" + indent_str
|
|
44
|
+
indent(child, level + 1)
|
|
45
|
+
if not child.tail or not child.tail.strip():
|
|
46
|
+
if i == len(elem) - 1:
|
|
47
|
+
child.tail = "\n" + indent_str
|
|
48
|
+
else:
|
|
49
|
+
child.tail = "\n" + next_indent_str
|
|
54
50
|
return elem
|
|
@@ -3,9 +3,11 @@ from typing import Generator
|
|
|
3
3
|
from .types import Formula, HTMLTag, Mark
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
def search_content(
|
|
6
|
+
def search_content(
|
|
7
|
+
content: list[str | Mark | Formula | HTMLTag],
|
|
8
|
+
) -> Generator[str | Mark | Formula, None, None]:
|
|
7
9
|
for child in content:
|
|
8
10
|
if isinstance(child, HTMLTag):
|
|
9
11
|
yield from search_content(child.content)
|
|
10
12
|
else:
|
|
11
|
-
yield child
|
|
13
|
+
yield child
|
|
@@ -24,6 +24,7 @@ class EpubData:
|
|
|
24
24
|
cover_image_path: Path | None = None
|
|
25
25
|
"""Cover image file path (optional, absolute path)"""
|
|
26
26
|
|
|
27
|
+
|
|
27
28
|
@dataclass
|
|
28
29
|
class BookMeta:
|
|
29
30
|
"""Book metadata information."""
|
|
@@ -57,9 +58,11 @@ class BookMeta:
|
|
|
57
58
|
# Table of Contents structure
|
|
58
59
|
# ============================================================================
|
|
59
60
|
|
|
61
|
+
|
|
60
62
|
@dataclass
|
|
61
63
|
class TocItem:
|
|
62
64
|
"""Table of contents item with title, content, and optional nested children."""
|
|
65
|
+
|
|
63
66
|
title: str
|
|
64
67
|
"""Chapter title displayed in table of contents"""
|
|
65
68
|
|
|
@@ -69,6 +72,7 @@ class TocItem:
|
|
|
69
72
|
children: "list[TocItem]" = field(default_factory=list)
|
|
70
73
|
"""Nested sub-chapters (recursive, optional)"""
|
|
71
74
|
|
|
75
|
+
|
|
72
76
|
class TextKind(Enum):
|
|
73
77
|
BODY = "body"
|
|
74
78
|
"""Regular paragraph."""
|
|
@@ -77,21 +81,29 @@ class TextKind(Enum):
|
|
|
77
81
|
QUOTE = "quote"
|
|
78
82
|
"""Quoted text."""
|
|
79
83
|
|
|
84
|
+
|
|
80
85
|
@dataclass
|
|
81
86
|
class Mark:
|
|
82
87
|
"""Citation reference marker."""
|
|
88
|
+
|
|
83
89
|
id: int
|
|
84
90
|
"""Citation ID, matches Footnote.id"""
|
|
85
91
|
|
|
92
|
+
|
|
86
93
|
@dataclass
|
|
87
94
|
class BasicAsset:
|
|
88
95
|
"""Asset as a base class for other assets."""
|
|
89
96
|
|
|
90
|
-
title: list["str | Mark | Formula | HTMLTag"] = field(
|
|
97
|
+
title: list["str | Mark | Formula | HTMLTag"] = field(
|
|
98
|
+
default_factory=list, kw_only=True
|
|
99
|
+
)
|
|
91
100
|
"""Asset title (before content)"""
|
|
92
|
-
caption: list["str | Mark | Formula | HTMLTag"] = field(
|
|
101
|
+
caption: list["str | Mark | Formula | HTMLTag"] = field(
|
|
102
|
+
default_factory=list, kw_only=True
|
|
103
|
+
)
|
|
93
104
|
"""Asset caption (after content)"""
|
|
94
105
|
|
|
106
|
+
|
|
95
107
|
@dataclass
|
|
96
108
|
class Table(BasicAsset):
|
|
97
109
|
"""Table representation."""
|
|
@@ -115,6 +127,7 @@ class Image(BasicAsset):
|
|
|
115
127
|
path: Path
|
|
116
128
|
"""Absolute path to the image file"""
|
|
117
129
|
|
|
130
|
+
|
|
118
131
|
@dataclass
|
|
119
132
|
class TextBlock:
|
|
120
133
|
"""Text block representation."""
|
|
@@ -126,9 +139,11 @@ class TextBlock:
|
|
|
126
139
|
content: list["str | Mark | Formula | HTMLTag"]
|
|
127
140
|
"""Text content with optional citation marks."""
|
|
128
141
|
|
|
142
|
+
|
|
129
143
|
@dataclass
|
|
130
144
|
class Footnote:
|
|
131
145
|
"""Footnote/citation section."""
|
|
146
|
+
|
|
132
147
|
id: int
|
|
133
148
|
"""Footnote ID"""
|
|
134
149
|
|
|
@@ -142,17 +157,21 @@ class Footnote:
|
|
|
142
157
|
ContentBlock = TextBlock | Table | Formula | Image
|
|
143
158
|
"""Union of all content blocks that appear in main chapter content."""
|
|
144
159
|
|
|
160
|
+
|
|
145
161
|
@dataclass
|
|
146
162
|
class Chapter:
|
|
147
163
|
"""Complete content of a single chapter."""
|
|
164
|
+
|
|
148
165
|
elements: list[ContentBlock] = field(default_factory=list)
|
|
149
166
|
"""Main content blocks"""
|
|
150
167
|
|
|
151
168
|
footnotes: list[Footnote] = field(default_factory=list)
|
|
152
169
|
"""Footnotes"""
|
|
153
170
|
|
|
171
|
+
|
|
154
172
|
ChapterGetter = Callable[[], Chapter]
|
|
155
173
|
|
|
174
|
+
|
|
156
175
|
@dataclass
|
|
157
176
|
class HTMLTag:
|
|
158
177
|
"""Generic HTML tag representation."""
|
|
@@ -164,4 +183,4 @@ class HTMLTag:
|
|
|
164
183
|
"""List of (attribute, value) pairs"""
|
|
165
184
|
|
|
166
185
|
content: list["str | Mark | Formula | HTMLTag"] = field(default_factory=list)
|
|
167
|
-
"""Inner HTML content"""
|
|
186
|
+
"""Inner HTML content"""
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
from .types import (
|
|
2
|
+
BasicAsset,
|
|
3
|
+
Chapter,
|
|
4
|
+
ContentBlock,
|
|
5
|
+
EpubData,
|
|
6
|
+
Footnote,
|
|
7
|
+
Formula,
|
|
8
|
+
HTMLTag,
|
|
9
|
+
Image,
|
|
10
|
+
Mark,
|
|
11
|
+
Table,
|
|
12
|
+
TextBlock,
|
|
13
|
+
TocItem,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class InvalidUnicodeError(Exception):
    """Raised when invalid Unicode characters (surrogates) are detected in EPUB data.

    Attributes:
        field_path: Dot-separated path to the field containing invalid characters.
        invalid_char_info: Information about the invalid character(s).
    """

    def __init__(self, field_path: str, invalid_char_info: str):
        """Initialize with field path and character information.

        Args:
            field_path: Dot-separated path to the field containing invalid characters
            invalid_char_info: Information about the invalid character(s)
        """
        self.field_path = field_path
        self.invalid_char_info = invalid_char_info
        super().__init__(
            f"Invalid Unicode character detected in {field_path}: {invalid_char_info}"
        )

    def __reduce__(self):
        """Describe how to rebuild this exception for ``pickle`` and ``copy``.

        The default ``BaseException.__reduce__`` replays ``self.args`` into
        ``__init__``. Here ``args`` is the single formatted message while
        ``__init__`` requires two arguments, so the default round-trip raises
        ``TypeError``. Rebuilding from the two original arguments avoids that.
        """
        return (
            type(self),
            (self.field_path, self.invalid_char_info),
            self.__dict__,
        )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def validate_epub_data(epub_data: EpubData) -> None:
    """Scan the book-level strings of an EpubData object for surrogates.

    Checked, in this order:
    - metadata scalars: title, description, publisher, isbn
    - metadata name lists: authors, editors, translators
    - table-of-contents titles of prefaces, then chapters (recursively)

    Chapter bodies are not inspected here; use validate_chapter for those.

    Args:
        epub_data: EPUB data to validate

    Raises:
        InvalidUnicodeError: If surrogate characters are detected in any string field
    """
    book_meta = epub_data.meta
    if book_meta:
        # Single-valued metadata fields.
        for attr in ("title", "description", "publisher", "isbn"):
            _check_string(getattr(book_meta, attr), f"EpubData.meta.{attr}")

        # List-valued metadata fields; each entry is reported with its index.
        for attr in ("authors", "editors", "translators"):
            for idx, person in enumerate(getattr(book_meta, attr)):
                _check_string(person, f"EpubData.meta.{attr}[{idx}]")

    # TOC trees: prefaces first, then chapters.
    for section in ("prefaces", "chapters"):
        for idx, toc_entry in enumerate(getattr(epub_data, section)):
            _check_toc_item(toc_entry, f"EpubData.{section}[{idx}]")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def validate_chapter(chapter: Chapter, context: str = "Chapter") -> None:
    """Scan every content block and footnote of a chapter for surrogates.

    Args:
        chapter: Chapter to validate
        context: Prefix used when building field paths for error reporting
            (e.g., "Chapter", "chapters[0]")

    Raises:
        InvalidUnicodeError: If surrogate characters are detected in any string field
    """
    # Main content blocks come first, then the footnotes.
    for position, block in enumerate(chapter.elements):
        block_path = f"{context}.elements[{position}]"
        _check_content_block(block, block_path)

    for position, note in enumerate(chapter.footnotes):
        note_path = f"{context}.footnotes[{position}]"
        _check_footnote(note, note_path)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _check_string(value: str | None, field_path: str) -> None:
|
|
94
|
+
"""Check if a string contains surrogate characters.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
value: String to check
|
|
98
|
+
field_path: Path to the field for error reporting
|
|
99
|
+
|
|
100
|
+
Raises:
|
|
101
|
+
InvalidUnicodeError: If surrogate characters are detected
|
|
102
|
+
"""
|
|
103
|
+
if value is None:
|
|
104
|
+
return
|
|
105
|
+
|
|
106
|
+
for i, char in enumerate(value):
|
|
107
|
+
code_point = ord(char)
|
|
108
|
+
# Check for surrogate pair range (U+D800 to U+DFFF)
|
|
109
|
+
if 0xD800 <= code_point <= 0xDFFF:
|
|
110
|
+
raise InvalidUnicodeError(
|
|
111
|
+
field_path=field_path,
|
|
112
|
+
invalid_char_info=f"surrogate character U+{code_point:04X} at position {i}",
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _check_string_list(
    values: list[str | Mark | Formula | HTMLTag], field_path: str
) -> None:
    """Check each entry of an inline-content list, descending into nested items.

    Args:
        values: List of strings, marks, formulas, or HTML tags to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    for index, entry in enumerate(values):
        entry_path = f"{field_path}[{index}]"

        if isinstance(entry, str):
            _check_string(entry, entry_path)
            continue

        if isinstance(entry, Mark):
            # A Mark carries only an integer id, so there is no text to scan.
            continue

        if isinstance(entry, Formula):
            # The LaTeX source is checked first, then the title and caption lists.
            _check_string(entry.latex_expression, f"{entry_path}.latex_expression")
            _check_string_list(entry.title, f"{entry_path}.title")
            _check_string_list(entry.caption, f"{entry_path}.caption")
            continue

        if isinstance(entry, HTMLTag):
            _check_html_tag(entry, entry_path)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _check_html_tag(tag: HTMLTag, field_path: str) -> None:
    """Check the name, attributes, and content of an HTML tag.

    Args:
        tag: HTML tag to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    _check_string(tag.name, f"{field_path}.name")

    # Attributes are (name, value) pairs; [0] and [1] in the reported path
    # identify which half of the pair was invalid.
    for index, pair in enumerate(tag.attributes):
        attr_name, attr_value = pair
        pair_path = f"{field_path}.attributes[{index}]"
        _check_string(attr_name, f"{pair_path}[0]")
        _check_string(attr_value, f"{pair_path}[1]")

    _check_string_list(tag.content, f"{field_path}.content")
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _check_basic_asset(asset: BasicAsset, field_path: str) -> None:
    """Check an asset's title and caption, then its type-specific payload.

    Args:
        asset: Asset to check (a BasicAsset or one of its subclasses)
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    # Title and caption are checked for every asset.
    _check_string_list(asset.title, f"{field_path}.title")
    _check_string_list(asset.caption, f"{field_path}.caption")

    if isinstance(asset, Formula):
        _check_string(asset.latex_expression, f"{field_path}.latex_expression")
        return

    if isinstance(asset, Table):
        _check_html_tag(asset.html_content, f"{field_path}.html_content")
        return

    # Image only contains a Path, so there is no string content left to check.
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _check_content_block(block: ContentBlock, field_path: str) -> None:
    """Send a content block to the checker that matches its type.

    Args:
        block: Content block to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    if isinstance(block, TextBlock):
        _check_string_list(block.content, f"{field_path}.content")
        return

    # Tables, formulas and images go through the shared asset checker.
    if isinstance(block, (Table, Formula, Image)):
        _check_basic_asset(block, field_path)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _check_footnote(footnote: Footnote, field_path: str) -> None:
    """Check every content block inside a footnote.

    Args:
        footnote: Footnote to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    for position, inner_block in enumerate(footnote.contents):
        inner_path = f"{field_path}.contents[{position}]"
        _check_content_block(inner_block, inner_path)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _check_toc_item(item: TocItem, field_path: str) -> None:
    """Check the title of a TOC item and of all its nested children.

    The tree is walked with an explicit stack in pre-order: a node's title is
    checked before its children, and children are visited in list order.

    Args:
        item: TOC item to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    pending = [(item, field_path)]
    while pending:
        node, node_path = pending.pop()
        _check_string(node.title, f"{node_path}.title")

        # Children are pushed in reverse so the first child is popped next.
        numbered_children = list(enumerate(node.children))
        for index, child in reversed(numbered_children):
            pending.append((child, f"{node_path}.children[{index}]"))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "epub-generator"
|
|
3
|
-
version = "0.1.5"
|
|
3
|
+
version = "0.1.7"
|
|
4
4
|
description = "A simple Python EPUB 3.0 generator with a single API call"
|
|
5
5
|
authors = ["Tao Zeyu <i@taozeyu.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -32,6 +32,7 @@ matplotlib = ">=3.10.1,<3.11.0"
|
|
|
32
32
|
pylint = ">=3.0.0,<4.0.0"
|
|
33
33
|
autopep8 = ">=2.0.0,<3.0.0"
|
|
34
34
|
isort = ">=5.0.0,<6.0.0"
|
|
35
|
+
ruff = "^0.15.0"
|
|
35
36
|
|
|
36
37
|
[build-system]
|
|
37
38
|
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
from .gen_epub import generate_epub
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|