epub-generator 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,10 +17,13 @@ from .types import (
17
17
  TextKind,
18
18
  TocItem,
19
19
  )
20
+ from .validate import InvalidUnicodeError
20
21
 
21
22
  __all__ = [
22
23
  # Main API function
23
24
  "generate_epub",
25
+ # Validation
26
+ "InvalidUnicodeError",
24
27
  # Options
25
28
  "TableRender",
26
29
  "LaTeXRender",
@@ -10,6 +10,7 @@ from ..html_tag import search_content
10
10
  from ..i18n import I18N
11
11
  from ..options import LaTeXRender, TableRender
12
12
  from ..types import BasicAsset, Chapter, ContentBlock, EpubData, Formula, TextBlock
13
+ from ..validate import validate_chapter, validate_epub_data
13
14
  from .gen_chapter import generate_chapter
14
15
  from .gen_nav import gen_nav
15
16
  from .gen_toc import TocPoint, gen_toc, iter_toc
@@ -23,6 +24,9 @@ def generate_epub(
23
24
  latex_render: LaTeXRender = LaTeXRender.MATHML,
24
25
  assert_not_aborted: Callable[[], None] = lambda: None,
25
26
  ) -> None:
27
+ # Validate epub_data for invalid Unicode characters before processing
28
+ validate_epub_data(epub_data)
29
+
26
30
  i18n = I18N(lan)
27
31
  template = Template()
28
32
  epub_file_path = Path(epub_file_path)
@@ -114,6 +118,8 @@ def _write_chapters_from_data(
114
118
  ):
115
119
  for file_name, get_chapter in _search_chapters(epub_data, toc_points):
116
120
  chapter = get_chapter()
121
+ # Validate chapter content for invalid Unicode characters
122
+ validate_chapter(chapter, context=f"Chapter '{file_name}'")
117
123
  data = generate_chapter(context, chapter, i18n)
118
124
  context.file.writestr(
119
125
  zinfo_or_arcname="OEBPS/Text/" + file_name,
@@ -1,5 +1,4 @@
1
1
  import re
2
- from typing import Container
3
2
  from xml.etree.ElementTree import Element, tostring
4
3
 
5
4
  _EPUB_NS = "http://www.idpf.org/2007/ops"
@@ -31,24 +30,17 @@ def serialize_element(element: Element) -> str:
31
30
 
32
31
  return xml_string
33
32
 
34
- def indent(elem: Element, level: int = 0, skip_tags: Container[str] = ()) -> Element:
33
+ def indent(elem: Element, level: int = 0) -> Element:
35
34
  indent_str = " " * level
36
35
  next_indent_str = " " * (level + 1)
37
-
38
- if elem.tag in skip_tags:
39
- if level > 0 and (not elem.tail or not elem.tail.strip()):
40
- elem.tail = "\n" + indent_str
41
- return elem
42
-
43
36
  if len(elem):
44
37
  if not elem.text or not elem.text.strip():
45
38
  elem.text = "\n" + next_indent_str
46
39
  for i, child in enumerate(elem):
47
- indent(child, level + 1, skip_tags)
48
- if i < len(elem) - 1:
49
- child.tail = "\n" + next_indent_str
50
- else:
51
- child.tail = "\n" + indent_str
52
- elif level > 0 and (not elem.tail or not elem.tail.strip()):
53
- elem.tail = "\n" + indent_str
40
+ indent(child, level + 1)
41
+ if not child.tail or not child.tail.strip():
42
+ if i == len(elem) - 1:
43
+ child.tail = "\n" + indent_str
44
+ else:
45
+ child.tail = "\n" + next_indent_str
54
46
  return elem
@@ -0,0 +1,224 @@
1
+ from .types import (
2
+ BasicAsset,
3
+ Chapter,
4
+ ContentBlock,
5
+ EpubData,
6
+ Footnote,
7
+ Formula,
8
+ HTMLTag,
9
+ Image,
10
+ Mark,
11
+ Table,
12
+ TextBlock,
13
+ TocItem,
14
+ )
15
+
16
+
17
class InvalidUnicodeError(Exception):
    """Raised when invalid Unicode characters (surrogates) are detected in EPUB data.

    The constructor arguments are kept on the instance as ``field_path`` and
    ``invalid_char_info``; the exception message is built from both.
    """

    def __init__(self, field_path: str, invalid_char_info: str):
        """Initialize with field path and character information.

        Args:
            field_path: Dot-separated path to the field containing invalid characters
            invalid_char_info: Information about the invalid character(s)
        """
        self.field_path = field_path
        self.invalid_char_info = invalid_char_info
        super().__init__(
            f"Invalid Unicode character detected in {field_path}: {invalid_char_info}"
        )

    def __reduce__(self):
        """Rebuild the exception from its two constructor arguments.

        The default exception reduction re-calls the class with ``self.args``,
        which here is the single formatted message. That does not match the
        two-argument constructor, so pickling or copying would fail with
        TypeError without this override.
        """
        return (type(self), (self.field_path, self.invalid_char_info))
32
+
33
+
34
def validate_epub_data(epub_data: EpubData) -> None:
    """Validate the book-level strings of an EpubData object.

    Fields covered:
    - Book metadata (title, description, publisher, ISBN, authors, editors,
      translators), when metadata is present
    - Table of contents titles of prefaces and chapters (recursively)

    Chapter content is NOT validated here (use validate_chapter separately).

    Args:
        epub_data: EPUB data to validate

    Raises:
        InvalidUnicodeError: If surrogate characters are detected in any string field
    """
    meta = epub_data.meta
    if meta:
        # Scalar metadata fields, checked in a fixed order.
        for name in ("title", "description", "publisher", "isbn"):
            _check_string(getattr(meta, name), f"EpubData.meta.{name}")

        # Contributor lists; each entry is reported with its index.
        for role in ("authors", "editors", "translators"):
            for idx, person in enumerate(getattr(meta, role)):
                _check_string(person, f"EpubData.meta.{role}[{idx}]")

    # TOC trees: prefaces first, then chapters.
    for section in ("prefaces", "chapters"):
        for idx, toc_item in enumerate(getattr(epub_data, section)):
            _check_toc_item(toc_item, f"EpubData.{section}[{idx}]")
72
+
73
+
74
def validate_chapter(chapter: Chapter, context: str = "Chapter") -> None:
    """Validate the content of a Chapter for invalid Unicode characters.

    Args:
        chapter: Chapter to validate
        context: Prefix used in reported field paths (e.g., "Chapter", "chapters[0]")

    Raises:
        InvalidUnicodeError: If surrogate characters are detected in any string field
    """
    # Main content elements are checked before footnotes.
    sections = (
        ("elements", _check_content_block),
        ("footnotes", _check_footnote),
    )
    for attr_name, checker in sections:
        for idx, entry in enumerate(getattr(chapter, attr_name)):
            checker(entry, f"{context}.{attr_name}[{idx}]")
91
+
92
+
93
+ def _check_string(value: str | None, field_path: str) -> None:
94
+ """Check if a string contains surrogate characters.
95
+
96
+ Args:
97
+ value: String to check
98
+ field_path: Path to the field for error reporting
99
+
100
+ Raises:
101
+ InvalidUnicodeError: If surrogate characters are detected
102
+ """
103
+ if value is None:
104
+ return
105
+
106
+ for i, char in enumerate(value):
107
+ code_point = ord(char)
108
+ # Check for surrogate pair range (U+D800 to U+DFFF)
109
+ if 0xD800 <= code_point <= 0xDFFF:
110
+ raise InvalidUnicodeError(
111
+ field_path=field_path,
112
+ invalid_char_info=f"surrogate character U+{code_point:04X} at position {i}",
113
+ )
114
+
115
+
116
def _check_string_list(values: list[str | Mark | Formula | HTMLTag], field_path: str) -> None:
    """Check every entry of a mixed inline-content list.

    Entries may be plain strings, marks, formulas, or HTML tags; formulas and
    HTML tags are checked recursively.

    Args:
        values: List to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    for index, entry in enumerate(values):
        entry_path = f"{field_path}[{index}]"

        if isinstance(entry, str):
            _check_string(entry, entry_path)
            continue

        if isinstance(entry, Mark):
            # Mark only contains int ID
            continue

        if isinstance(entry, Formula):
            _check_string(entry.latex_expression, f"{entry_path}.latex_expression")
            _check_string_list(entry.title, f"{entry_path}.title")
            _check_string_list(entry.caption, f"{entry_path}.caption")
            continue

        if isinstance(entry, HTMLTag):
            _check_html_tag(entry, entry_path)
138
+
139
+
140
def _check_html_tag(tag: HTMLTag, field_path: str) -> None:
    """Check the name, attributes, and content of an HTML tag.

    Args:
        tag: HTML tag to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    _check_string(tag.name, f"{field_path}.name")

    for index, pair in enumerate(tag.attributes):
        key, text = pair
        pair_path = f"{field_path}.attributes[{index}]"
        # Index 0 is the attribute name, index 1 its value.
        _check_string(key, f"{pair_path}[0]")
        _check_string(text, f"{pair_path}[1]")

    _check_string_list(tag.content, f"{field_path}.content")
157
+
158
+
159
def _check_basic_asset(asset: BasicAsset, field_path: str) -> None:
    """Check a BasicAsset (or one of its subclasses) for invalid characters.

    The title and caption are always checked; formulas and tables
    additionally have their own payload checked.

    Args:
        asset: Asset to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    for part in ("title", "caption"):
        _check_string_list(getattr(asset, part), f"{field_path}.{part}")

    if isinstance(asset, Formula):
        _check_string(asset.latex_expression, f"{field_path}.latex_expression")
        return

    if isinstance(asset, Table):
        _check_html_tag(asset.html_content, f"{field_path}.html_content")

    # Image needs nothing further: it only contains a Path, no string content.
178
+
179
+
180
def _check_content_block(block: ContentBlock, field_path: str) -> None:
    """Dispatch a content block to the matching checker.

    Args:
        block: Content block to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    if isinstance(block, TextBlock):
        _check_string_list(block.content, f"{field_path}.content")
        return

    if isinstance(block, (Table, Formula, Image)):
        _check_basic_asset(block, field_path)
194
+
195
+
196
def _check_footnote(footnote: Footnote, field_path: str) -> None:
    """Check every content block of a footnote for invalid characters.

    Args:
        footnote: Footnote to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    contents_path = f"{field_path}.contents"
    for position, entry in enumerate(footnote.contents):
        _check_content_block(entry, f"{contents_path}[{position}]")
208
+
209
+
210
def _check_toc_item(item: TocItem, field_path: str) -> None:
    """Check a TOC item's title, then recurse into its children.

    Args:
        item: TOC item to check
        field_path: Path to the field for error reporting

    Raises:
        InvalidUnicodeError: If surrogate characters are detected
    """
    _check_string(item.title, f"{field_path}.title")

    # Depth-first walk: each child is fully checked before its next sibling.
    children_path = f"{field_path}.children"
    for position, nested in enumerate(item.children):
        _check_toc_item(nested, f"{children_path}[{position}]")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: epub-generator
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: A simple Python EPUB 3.0 generator with a single API call
5
5
  License: MIT
6
6
  Keywords: epub,epub3,ebook,generator,publishing
@@ -1,4 +1,4 @@
1
- epub_generator/__init__.py,sha256=5fFpZdgB4-FfXgCpE5IshBfrfrMaxNQK4SRKaKV2RdI,682
1
+ epub_generator/__init__.py,sha256=G1P_GAUym94iv56PPK31641vlYrukUoMJZgWtmKscog,768
2
2
  epub_generator/context.py,sha256=9jHRpnQsNooRUSBoY_tiQ7aQ_AMZmyKUO22gPoO8Koc,4324
3
3
  epub_generator/data/container.xml.jinja,sha256=SkACyZgsAVUS5lmiCEhq3SpbFspYdyCnRNjWnLztLt0,252
4
4
  epub_generator/data/content.opf.jinja,sha256=DDaR9GZnSBcpNk2BWUu56Uo_248TA91AxE4tKsBuKnQ,2839
@@ -11,16 +11,17 @@ epub_generator/generation/__init__.py,sha256=UIscwHa8ocr2D1mk1KaP-zi3P1x9eYJzxTo
11
11
  epub_generator/generation/gen_asset.py,sha256=WYwfGUvHM_CrwTuIIH7dYm-SL-vdhkTnvaZDymZxXzg,5978
12
12
  epub_generator/generation/gen_chapter.py,sha256=P6kmB8hdQnJB6SCheHzu5cOmZrC5H0LqNV-uuuigX1M,3425
13
13
  epub_generator/generation/gen_content.py,sha256=2ojjTgalveRnk1MXQaKsY53hPCgb7NHTwbMpLOXVrss,2018
14
- epub_generator/generation/gen_epub.py,sha256=I7u8rrrslF9xoyDUsALarB2iWzY9zjKM9ZOR1wLMX1E,6184
14
+ epub_generator/generation/gen_epub.py,sha256=rxHBp4nP5OFi9SJBfiCrncV1fmhb0j3WKfUqofxJykc,6487
15
15
  epub_generator/generation/gen_nav.py,sha256=_cjOP18C1CoTn_DELIB06pyMPZZ0CPbkk4oPEvICdKs,1955
16
16
  epub_generator/generation/gen_toc.py,sha256=MK2iTYBpF8VUtPHpwz5JB_H6nWsKRKpVuLzRPYGy0nw,2864
17
- epub_generator/generation/xml_utils.py,sha256=kyHBWUihT5se5n_425BcEvBpsIK6yC52W25t012QUn0,2084
17
+ epub_generator/generation/xml_utils.py,sha256=AVnU3AN6lmqWrdgaZTV7v77L9LonI7DX59BxkMZlef8,1822
18
18
  epub_generator/html_tag.py,sha256=P_Y0uRStCEEh7cCtpvK4t432NEcY9OLntAznvdxUF5k,343
19
19
  epub_generator/i18n.py,sha256=-L6J6hsy796_IQ4nLpNtAeXIkRM6oFSWSHDlRZXW8aA,705
20
20
  epub_generator/options.py,sha256=Er1dnaNvzDSnZRSRJGSqhkJsv1XtsCW2Ym_hUc8o_QI,181
21
21
  epub_generator/template.py,sha256=RdN2QRICIrYMzpxCU_x4m4V9WWZEP9VvT6QLp2YCm90,1556
22
22
  epub_generator/types.py,sha256=gBrdi1KYOVEnI0qEp1slLsyUw_Sd7v09uHvN8_Hf9Z8,4440
23
- epub_generator-0.1.5.dist-info/LICENSE,sha256=9Zt_a4mrzkvR2rc0UbqTgbboIjWuumDFgeQyKos0H2E,1066
24
- epub_generator-0.1.5.dist-info/METADATA,sha256=cwIGyOGFrt0hvtw_FHaaTjeoy-l-FP-SGZC4zP0MJyw,16555
25
- epub_generator-0.1.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
26
- epub_generator-0.1.5.dist-info/RECORD,,
23
+ epub_generator/validate.py,sha256=KBgvBsBuVnWTc4N-29cr2P92X0w_tGR4pMemk_KHy78,7544
24
+ epub_generator-0.1.6.dist-info/LICENSE,sha256=9Zt_a4mrzkvR2rc0UbqTgbboIjWuumDFgeQyKos0H2E,1066
25
+ epub_generator-0.1.6.dist-info/METADATA,sha256=JziMt9LukPRKo8rPy10qf9sIiiv98CgSxKoi7juHcYE,16555
26
+ epub_generator-0.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
27
+ epub_generator-0.1.6.dist-info/RECORD,,