epub-generator 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_generator/__init__.py +3 -0
- epub_generator/generation/gen_epub.py +6 -0
- epub_generator/generation/xml_utils.py +7 -15
- epub_generator/validate.py +224 -0
- {epub_generator-0.1.5.dist-info → epub_generator-0.1.6.dist-info}/METADATA +1 -1
- {epub_generator-0.1.5.dist-info → epub_generator-0.1.6.dist-info}/RECORD +8 -7
- {epub_generator-0.1.5.dist-info → epub_generator-0.1.6.dist-info}/LICENSE +0 -0
- {epub_generator-0.1.5.dist-info → epub_generator-0.1.6.dist-info}/WHEEL +0 -0
epub_generator/__init__.py
CHANGED
|
@@ -17,10 +17,13 @@ from .types import (
|
|
|
17
17
|
TextKind,
|
|
18
18
|
TocItem,
|
|
19
19
|
)
|
|
20
|
+
from .validate import InvalidUnicodeError
|
|
20
21
|
|
|
21
22
|
__all__ = [
|
|
22
23
|
# Main API function
|
|
23
24
|
"generate_epub",
|
|
25
|
+
# Validation
|
|
26
|
+
"InvalidUnicodeError",
|
|
24
27
|
# Options
|
|
25
28
|
"TableRender",
|
|
26
29
|
"LaTeXRender",
|
|
@@ -10,6 +10,7 @@ from ..html_tag import search_content
|
|
|
10
10
|
from ..i18n import I18N
|
|
11
11
|
from ..options import LaTeXRender, TableRender
|
|
12
12
|
from ..types import BasicAsset, Chapter, ContentBlock, EpubData, Formula, TextBlock
|
|
13
|
+
from ..validate import validate_chapter, validate_epub_data
|
|
13
14
|
from .gen_chapter import generate_chapter
|
|
14
15
|
from .gen_nav import gen_nav
|
|
15
16
|
from .gen_toc import TocPoint, gen_toc, iter_toc
|
|
@@ -23,6 +24,9 @@ def generate_epub(
|
|
|
23
24
|
latex_render: LaTeXRender = LaTeXRender.MATHML,
|
|
24
25
|
assert_not_aborted: Callable[[], None] = lambda: None,
|
|
25
26
|
) -> None:
|
|
27
|
+
# Validate epub_data for invalid Unicode characters before processing
|
|
28
|
+
validate_epub_data(epub_data)
|
|
29
|
+
|
|
26
30
|
i18n = I18N(lan)
|
|
27
31
|
template = Template()
|
|
28
32
|
epub_file_path = Path(epub_file_path)
|
|
@@ -114,6 +118,8 @@ def _write_chapters_from_data(
|
|
|
114
118
|
):
|
|
115
119
|
for file_name, get_chapter in _search_chapters(epub_data, toc_points):
|
|
116
120
|
chapter = get_chapter()
|
|
121
|
+
# Validate chapter content for invalid Unicode characters
|
|
122
|
+
validate_chapter(chapter, context=f"Chapter '{file_name}'")
|
|
117
123
|
data = generate_chapter(context, chapter, i18n)
|
|
118
124
|
context.file.writestr(
|
|
119
125
|
zinfo_or_arcname="OEBPS/Text/" + file_name,
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from typing import Container
|
|
3
2
|
from xml.etree.ElementTree import Element, tostring
|
|
4
3
|
|
|
5
4
|
_EPUB_NS = "http://www.idpf.org/2007/ops"
|
|
@@ -31,24 +30,17 @@ def serialize_element(element: Element) -> str:
|
|
|
31
30
|
|
|
32
31
|
return xml_string
|
|
33
32
|
|
|
34
|
-
def indent(elem: Element, level: int = 0
|
|
33
|
+
def indent(elem: Element, level: int = 0) -> Element:
|
|
35
34
|
indent_str = " " * level
|
|
36
35
|
next_indent_str = " " * (level + 1)
|
|
37
|
-
|
|
38
|
-
if elem.tag in skip_tags:
|
|
39
|
-
if level > 0 and (not elem.tail or not elem.tail.strip()):
|
|
40
|
-
elem.tail = "\n" + indent_str
|
|
41
|
-
return elem
|
|
42
|
-
|
|
43
36
|
if len(elem):
|
|
44
37
|
if not elem.text or not elem.text.strip():
|
|
45
38
|
elem.text = "\n" + next_indent_str
|
|
46
39
|
for i, child in enumerate(elem):
|
|
47
|
-
indent(child, level + 1
|
|
48
|
-
if
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
elem.tail = "\n" + indent_str
|
|
40
|
+
indent(child, level + 1)
|
|
41
|
+
if not child.tail or not child.tail.strip():
|
|
42
|
+
if i == len(elem) - 1:
|
|
43
|
+
child.tail = "\n" + indent_str
|
|
44
|
+
else:
|
|
45
|
+
child.tail = "\n" + next_indent_str
|
|
54
46
|
return elem
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from .types import (
|
|
2
|
+
BasicAsset,
|
|
3
|
+
Chapter,
|
|
4
|
+
ContentBlock,
|
|
5
|
+
EpubData,
|
|
6
|
+
Footnote,
|
|
7
|
+
Formula,
|
|
8
|
+
HTMLTag,
|
|
9
|
+
Image,
|
|
10
|
+
Mark,
|
|
11
|
+
Table,
|
|
12
|
+
TextBlock,
|
|
13
|
+
TocItem,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class InvalidUnicodeError(Exception):
    """Raised when invalid Unicode characters (surrogates) are detected in EPUB data.

    The exception message is built from the two constructor arguments; both
    are also kept as attributes so callers can inspect them programmatically.
    """

    def __init__(self, field_path: str, invalid_char_info: str):
        """Initialize with field path and character information.

        Args:
            field_path: Dot-separated path to the field containing invalid characters
            invalid_char_info: Information about the invalid character(s)
        """
        # Path of the offending field, e.g. "EpubData.meta.title".
        self.field_path = field_path
        # Human-readable description of the offending character(s).
        self.invalid_char_info = invalid_char_info
        super().__init__(
            f"Invalid Unicode character detected in {field_path}: {invalid_char_info}"
        )

    def __reduce__(self):
        """Describe how to rebuild this exception for ``copy`` and ``pickle``.

        The default ``BaseException.__reduce__`` replays ``self.args``, which
        holds only the single formatted message. Rebuilding from that would
        call ``__init__`` with one argument instead of two and fail with
        ``TypeError``, so the original constructor arguments are returned
        explicitly.
        """
        return (type(self), (self.field_path, self.invalid_char_info))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def validate_epub_data(epub_data: EpubData) -> None:
    """Scan book-level metadata and TOC titles for surrogate characters.

    Covered fields:
    - ``meta.title``, ``meta.description``, ``meta.publisher``, ``meta.isbn``
    - every entry of ``meta.authors``, ``meta.editors``, ``meta.translators``
    - every item of ``prefaces`` and ``chapters`` (via ``_check_toc_item``)

    The metadata checks are skipped when ``epub_data.meta`` is falsy.
    Chapter bodies are not inspected here; ``validate_chapter`` does that.

    Args:
        epub_data: EPUB data to validate

    Raises:
        InvalidUnicodeError: If surrogate characters are detected in any string field
    """
    book_meta = epub_data.meta
    if book_meta:
        # Scalar metadata fields, checked in declaration order.
        scalar_paths = {
            "title": "EpubData.meta.title",
            "description": "EpubData.meta.description",
            "publisher": "EpubData.meta.publisher",
            "isbn": "EpubData.meta.isbn",
        }
        for attr_name, path in scalar_paths.items():
            _check_string(getattr(book_meta, attr_name), path)

        # Contributor lists: each entry is checked under its own index.
        for idx, person in enumerate(book_meta.authors):
            _check_string(person, f"EpubData.meta.authors[{idx}]")
        for idx, person in enumerate(book_meta.editors):
            _check_string(person, f"EpubData.meta.editors[{idx}]")
        for idx, person in enumerate(book_meta.translators):
            _check_string(person, f"EpubData.meta.translators[{idx}]")

    # Table-of-contents entries: prefaces first, then chapters.
    for idx, toc_entry in enumerate(epub_data.prefaces):
        _check_toc_item(toc_entry, f"EpubData.prefaces[{idx}]")
    for idx, toc_entry in enumerate(epub_data.chapters):
        _check_toc_item(toc_entry, f"EpubData.chapters[{idx}]")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def validate_chapter(chapter: Chapter, context: str = "Chapter") -> None:
    """Scan a chapter's elements and footnotes for surrogate characters.

    Args:
        chapter: Chapter to validate
        context: Prefix used when building field paths for error reporting
            (e.g., "Chapter", "chapters[0]")

    Raises:
        InvalidUnicodeError: If surrogate characters are detected in any string field
    """
    # Body content comes first, so an error there is reported before footnotes.
    for position, block in enumerate(chapter.elements):
        block_path = f"{context}.elements[{position}]"
        _check_content_block(block, block_path)

    for position, note in enumerate(chapter.footnotes):
        note_path = f"{context}.footnotes[{position}]"
        _check_footnote(note, note_path)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _check_string(value: str | None, field_path: str) -> None:
|
|
94
|
+
"""Check if a string contains surrogate characters.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
value: String to check
|
|
98
|
+
field_path: Path to the field for error reporting
|
|
99
|
+
|
|
100
|
+
Raises:
|
|
101
|
+
InvalidUnicodeError: If surrogate characters are detected
|
|
102
|
+
"""
|
|
103
|
+
if value is None:
|
|
104
|
+
return
|
|
105
|
+
|
|
106
|
+
for i, char in enumerate(value):
|
|
107
|
+
code_point = ord(char)
|
|
108
|
+
# Check for surrogate pair range (U+D800 to U+DFFF)
|
|
109
|
+
if 0xD800 <= code_point <= 0xDFFF:
|
|
110
|
+
raise InvalidUnicodeError(
|
|
111
|
+
field_path=field_path,
|
|
112
|
+
invalid_char_info=f"surrogate character U+{code_point:04X} at position {i}",
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _check_string_list(values: list[str | Mark | Formula | HTMLTag], field_path: str) -> None:
    """Walk a mixed content list and check every string reachable from it.

    Each entry is dispatched on its type: plain strings are checked directly,
    ``Mark`` entries are skipped, ``Formula`` entries have their LaTeX source,
    title and caption checked, and ``HTMLTag`` entries are handed to
    ``_check_html_tag``. Entries of any other type are ignored.

    Args:
        values: List to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    for index, entry in enumerate(values):
        entry_path = f"{field_path}[{index}]"

        if isinstance(entry, str):
            _check_string(entry, entry_path)
            continue

        if isinstance(entry, Mark):
            # Mark only contains int ID
            continue

        if isinstance(entry, Formula):
            _check_string(entry.latex_expression, f"{entry_path}.latex_expression")
            _check_string_list(entry.title, f"{entry_path}.title")
            _check_string_list(entry.caption, f"{entry_path}.caption")
            continue

        if isinstance(entry, HTMLTag):
            _check_html_tag(entry, entry_path)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _check_html_tag(tag: HTMLTag, field_path: str) -> None:
    """Check an HTML tag's name, attributes and content for surrogate characters.

    Args:
        tag: HTML tag to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    _check_string(tag.name, f"{field_path}.name")

    # Each attribute is a (name, value) pair; both halves are checked,
    # reported as index 0 and index 1 of the pair respectively.
    for index, pair in enumerate(tag.attributes):
        key, val = pair
        _check_string(key, f"{field_path}.attributes[{index}][0]")
        _check_string(val, f"{field_path}.attributes[{index}][1]")

    # Content goes through the list checker, which handles the mixed entry types.
    _check_string_list(tag.content, f"{field_path}.content")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _check_basic_asset(asset: BasicAsset, field_path: str) -> None:
    """Check a BasicAsset (or subclass) for surrogate characters.

    Title and caption are checked for every asset. After that, a ``Formula``
    has its LaTeX source checked and a ``Table`` has its HTML content checked.
    Other assets get no further checks.

    Args:
        asset: Asset to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    _check_string_list(asset.title, f"{field_path}.title")
    _check_string_list(asset.caption, f"{field_path}.caption")

    if isinstance(asset, Formula):
        _check_string(asset.latex_expression, f"{field_path}.latex_expression")
        return

    if isinstance(asset, Table):
        _check_html_tag(asset.html_content, f"{field_path}.html_content")

    # Image only contains Path, no string content to check
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _check_content_block(block: ContentBlock, field_path: str) -> None:
    """Dispatch a content block to the checker that matches its type.

    ``TextBlock`` content is checked as a mixed list; ``Table``, ``Formula``
    and ``Image`` blocks are checked as assets. Any other type is ignored.

    Args:
        block: Content block to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    if isinstance(block, TextBlock):
        _check_string_list(block.content, f"{field_path}.content")
        return

    if isinstance(block, (Table, Formula, Image)):
        _check_basic_asset(block, field_path)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _check_footnote(footnote: Footnote, field_path: str) -> None:
    """Check every content block of a footnote for surrogate characters.

    Args:
        footnote: Footnote to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    for position, part in enumerate(footnote.contents):
        part_path = f"{field_path}.contents[{position}]"
        _check_content_block(part, part_path)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _check_toc_item(item: TocItem, field_path: str) -> None:
    """Check a TOC item's title, then descend into its children.

    Args:
        item: TOC item to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    _check_string(item.title, f"{field_path}.title")

    # Depth-first: each child is fully checked before its next sibling.
    for position, sub_item in enumerate(item.children):
        child_path = f"{field_path}.children[{position}]"
        _check_toc_item(sub_item, child_path)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
epub_generator/__init__.py,sha256=
|
|
1
|
+
epub_generator/__init__.py,sha256=G1P_GAUym94iv56PPK31641vlYrukUoMJZgWtmKscog,768
|
|
2
2
|
epub_generator/context.py,sha256=9jHRpnQsNooRUSBoY_tiQ7aQ_AMZmyKUO22gPoO8Koc,4324
|
|
3
3
|
epub_generator/data/container.xml.jinja,sha256=SkACyZgsAVUS5lmiCEhq3SpbFspYdyCnRNjWnLztLt0,252
|
|
4
4
|
epub_generator/data/content.opf.jinja,sha256=DDaR9GZnSBcpNk2BWUu56Uo_248TA91AxE4tKsBuKnQ,2839
|
|
@@ -11,16 +11,17 @@ epub_generator/generation/__init__.py,sha256=UIscwHa8ocr2D1mk1KaP-zi3P1x9eYJzxTo
|
|
|
11
11
|
epub_generator/generation/gen_asset.py,sha256=WYwfGUvHM_CrwTuIIH7dYm-SL-vdhkTnvaZDymZxXzg,5978
|
|
12
12
|
epub_generator/generation/gen_chapter.py,sha256=P6kmB8hdQnJB6SCheHzu5cOmZrC5H0LqNV-uuuigX1M,3425
|
|
13
13
|
epub_generator/generation/gen_content.py,sha256=2ojjTgalveRnk1MXQaKsY53hPCgb7NHTwbMpLOXVrss,2018
|
|
14
|
-
epub_generator/generation/gen_epub.py,sha256=
|
|
14
|
+
epub_generator/generation/gen_epub.py,sha256=rxHBp4nP5OFi9SJBfiCrncV1fmhb0j3WKfUqofxJykc,6487
|
|
15
15
|
epub_generator/generation/gen_nav.py,sha256=_cjOP18C1CoTn_DELIB06pyMPZZ0CPbkk4oPEvICdKs,1955
|
|
16
16
|
epub_generator/generation/gen_toc.py,sha256=MK2iTYBpF8VUtPHpwz5JB_H6nWsKRKpVuLzRPYGy0nw,2864
|
|
17
|
-
epub_generator/generation/xml_utils.py,sha256=
|
|
17
|
+
epub_generator/generation/xml_utils.py,sha256=AVnU3AN6lmqWrdgaZTV7v77L9LonI7DX59BxkMZlef8,1822
|
|
18
18
|
epub_generator/html_tag.py,sha256=P_Y0uRStCEEh7cCtpvK4t432NEcY9OLntAznvdxUF5k,343
|
|
19
19
|
epub_generator/i18n.py,sha256=-L6J6hsy796_IQ4nLpNtAeXIkRM6oFSWSHDlRZXW8aA,705
|
|
20
20
|
epub_generator/options.py,sha256=Er1dnaNvzDSnZRSRJGSqhkJsv1XtsCW2Ym_hUc8o_QI,181
|
|
21
21
|
epub_generator/template.py,sha256=RdN2QRICIrYMzpxCU_x4m4V9WWZEP9VvT6QLp2YCm90,1556
|
|
22
22
|
epub_generator/types.py,sha256=gBrdi1KYOVEnI0qEp1slLsyUw_Sd7v09uHvN8_Hf9Z8,4440
|
|
23
|
-
epub_generator
|
|
24
|
-
epub_generator-0.1.
|
|
25
|
-
epub_generator-0.1.
|
|
26
|
-
epub_generator-0.1.
|
|
23
|
+
epub_generator/validate.py,sha256=KBgvBsBuVnWTc4N-29cr2P92X0w_tGR4pMemk_KHy78,7544
|
|
24
|
+
epub_generator-0.1.6.dist-info/LICENSE,sha256=9Zt_a4mrzkvR2rc0UbqTgbboIjWuumDFgeQyKos0H2E,1066
|
|
25
|
+
epub_generator-0.1.6.dist-info/METADATA,sha256=JziMt9LukPRKo8rPy10qf9sIiiv98CgSxKoi7juHcYE,16555
|
|
26
|
+
epub_generator-0.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
27
|
+
epub_generator-0.1.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|