PyPI - epub-generator - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

epub-generator 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

epub_generator/__init__.py CHANGED Viewed

@@ -17,10 +17,13 @@ from .types import (
     TextKind,
     TocItem,
 )
+from .validate import InvalidUnicodeError
 __all__ = [
     # Main API function
     "generate_epub",
+    # Validation
+    "InvalidUnicodeError",
     # Options
     "TableRender",
     "LaTeXRender",

epub_generator/generation/gen_epub.py CHANGED Viewed

@@ -10,6 +10,7 @@ from ..html_tag import search_content
 from ..i18n import I18N
 from ..options import LaTeXRender, TableRender
 from ..types import BasicAsset, Chapter, ContentBlock, EpubData, Formula, TextBlock
+from ..validate import validate_chapter, validate_epub_data
 from .gen_chapter import generate_chapter
 from .gen_nav import gen_nav
 from .gen_toc import TocPoint, gen_toc, iter_toc
@@ -23,6 +24,9 @@ def generate_epub(
     latex_render: LaTeXRender = LaTeXRender.MATHML,
     assert_not_aborted: Callable[[], None] = lambda: None,
 ) -> None:
+    # Validate epub_data for invalid Unicode characters before processing
+    validate_epub_data(epub_data)
     i18n = I18N(lan)
     template = Template()
     epub_file_path = Path(epub_file_path)
@@ -114,6 +118,8 @@ def _write_chapters_from_data(
 ):
     for file_name, get_chapter in _search_chapters(epub_data, toc_points):
         chapter = get_chapter()
+        # Validate chapter content for invalid Unicode characters
+        validate_chapter(chapter, context=f"Chapter '{file_name}'")
         data = generate_chapter(context, chapter, i18n)
         context.file.writestr(
             zinfo_or_arcname="OEBPS/Text/" + file_name,

epub_generator/generation/xml_utils.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import re
-from typing import Container
 from xml.etree.ElementTree import Element, tostring
 _EPUB_NS = "http://www.idpf.org/2007/ops"
@@ -31,24 +30,17 @@ def serialize_element(element: Element) -> str:
     return xml_string
-def indent(elem: Element, level: int = 0, skip_tags: Container[str] = ()) -> Element:
+def indent(elem: Element, level: int = 0) -> Element:
     indent_str = "  " * level
     next_indent_str = "  " * (level + 1)
-    if elem.tag in skip_tags:
-        if level > 0 and (not elem.tail or not elem.tail.strip()):
-            elem.tail = "\n" + indent_str
-        return elem
     if len(elem):
         if not elem.text or not elem.text.strip():
             elem.text = "\n" + next_indent_str
         for i, child in enumerate(elem):
-            indent(child, level + 1, skip_tags)
-            if i < len(elem) - 1:
-                child.tail = "\n" + next_indent_str
-            else:
-                child.tail = "\n" + indent_str
-    elif level > 0 and (not elem.tail or not elem.tail.strip()):
-        elem.tail = "\n" + indent_str
+            indent(child, level + 1)
+            if not child.tail or not child.tail.strip():
+                if i == len(elem) - 1:
+                    child.tail = "\n" + indent_str
+                else:
+                    child.tail = "\n" + next_indent_str
     return elem

epub_generator/validate.py ADDED Viewed

@@ -0,0 +1,224 @@
+from .types import (
+    BasicAsset,
+    Chapter,
+    ContentBlock,
+    EpubData,
+    Footnote,
+    Formula,
+    HTMLTag,
+    Image,
+    Mark,
+    Table,
+    TextBlock,
+    TocItem,
+)
+class InvalidUnicodeError(Exception):
+    """Raised when invalid Unicode characters (surrogates) are detected in EPUB data."""
+    def __init__(self, field_path: str, invalid_char_info: str):
+        """Initialize with field path and character information.
+        Args:
+            field_path: Dot-separated path to the field containing invalid characters
+            invalid_char_info: Information about the invalid character(s)
+        """
+        self.field_path = field_path
+        self.invalid_char_info = invalid_char_info
+        super().__init__(
+            f"Invalid Unicode character detected in {field_path}: {invalid_char_info}"
+        )
+def validate_epub_data(epub_data: EpubData) -> None:
+    """Validate an EpubData object for invalid Unicode characters.
+    This function checks all string fields in the EPUB data structure including:
+    - Book metadata (title, description, authors, etc.)
+    - Table of contents titles (recursively)
+    - Chapter content is NOT validated here (use validate_chapter separately)
+    Args:
+        epub_data: EPUB data to validate
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected in any string field
+    """
+    # Check metadata
+    if epub_data.meta:
+        meta = epub_data.meta
+        _check_string(meta.title, "EpubData.meta.title")
+        _check_string(meta.description, "EpubData.meta.description")
+        _check_string(meta.publisher, "EpubData.meta.publisher")
+        _check_string(meta.isbn, "EpubData.meta.isbn")
+        for i, author in enumerate(meta.authors):
+            _check_string(author, f"EpubData.meta.authors[{i}]")
+        for i, editor in enumerate(meta.editors):
+            _check_string(editor, f"EpubData.meta.editors[{i}]")
+        for i, translator in enumerate(meta.translators):
+            _check_string(translator, f"EpubData.meta.translators[{i}]")
+    # Check prefaces TOC
+    for i, preface in enumerate(epub_data.prefaces):
+        _check_toc_item(preface, f"EpubData.prefaces[{i}]")
+    # Check chapters TOC
+    for i, chapter_toc in enumerate(epub_data.chapters):
+        _check_toc_item(chapter_toc, f"EpubData.chapters[{i}]")
+def validate_chapter(chapter: Chapter, context: str = "Chapter") -> None:
+    """Validate a Chapter object for invalid Unicode characters.
+    Args:
+        chapter: Chapter to validate
+        context: Context string for error reporting (e.g., "Chapter", "chapters[0]")
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected in any string field
+    """
+    # Check main content elements
+    for i, element in enumerate(chapter.elements):
+        _check_content_block(element, f"{context}.elements[{i}]")
+    # Check footnotes
+    for i, footnote in enumerate(chapter.footnotes):
+        _check_footnote(footnote, f"{context}.footnotes[{i}]")
+def _check_string(value: str | None, field_path: str) -> None:
+    """Check if a string contains surrogate characters.
+    Args:
+        value: String to check
+        field_path: Path to the field for error reporting
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected
+    """
+    if value is None:
+        return
+    for i, char in enumerate(value):
+        code_point = ord(char)
+        # Check for surrogate pair range (U+D800 to U+DFFF)
+        if 0xD800 <= code_point <= 0xDFFF:
+            raise InvalidUnicodeError(
+                field_path=field_path,
+                invalid_char_info=f"surrogate character U+{code_point:04X} at position {i}",
+            )
+def _check_string_list(values: list[str | Mark | Formula | HTMLTag], field_path: str) -> None:
+    """Recursively check a list that may contain strings, marks, formulas, or HTML tags.
+    Args:
+        values: List to check
+        field_path: Path to the field for error reporting
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected
+    """
+    for i, item in enumerate(values):
+        item_path = f"{field_path}[{i}]"
+        if isinstance(item, str):
+            _check_string(item, item_path)
+        elif isinstance(item, Mark):
+            pass  # Mark only contains int ID
+        elif isinstance(item, Formula):
+            _check_string(item.latex_expression, f"{item_path}.latex_expression")
+            _check_string_list(item.title, f"{item_path}.title")
+            _check_string_list(item.caption, f"{item_path}.caption")
+        elif isinstance(item, HTMLTag):
+            _check_html_tag(item, item_path)
+def _check_html_tag(tag: HTMLTag, field_path: str) -> None:
+    """Check an HTML tag for invalid characters.
+    Args:
+        tag: HTML tag to check
+        field_path: Path to the field for error reporting
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected
+    """
+    _check_string(tag.name, f"{field_path}.name")
+    for i, (attr_name, attr_value) in enumerate(tag.attributes):
+        _check_string(attr_name, f"{field_path}.attributes[{i}][0]")
+        _check_string(attr_value, f"{field_path}.attributes[{i}][1]")
+    _check_string_list(tag.content, f"{field_path}.content")
+def _check_basic_asset(asset: BasicAsset, field_path: str) -> None:
+    """Check BasicAsset (and subclasses) for invalid characters.
+    Args:
+        asset: Asset to check
+        field_path: Path to the field for error reporting
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected
+    """
+    _check_string_list(asset.title, f"{field_path}.title")
+    _check_string_list(asset.caption, f"{field_path}.caption")
+    if isinstance(asset, Formula):
+        _check_string(asset.latex_expression, f"{field_path}.latex_expression")
+    elif isinstance(asset, Table):
+        _check_html_tag(asset.html_content, f"{field_path}.html_content")
+    elif isinstance(asset, Image):
+        pass  # Image only contains Path, no string content to check
+def _check_content_block(block: ContentBlock, field_path: str) -> None:
+    """Check a content block for invalid characters.
+    Args:
+        block: Content block to check
+        field_path: Path to the field for error reporting
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected
+    """
+    if isinstance(block, TextBlock):
+        _check_string_list(block.content, f"{field_path}.content")
+    elif isinstance(block, (Table, Formula, Image)):
+        _check_basic_asset(block, field_path)
+def _check_footnote(footnote: Footnote, field_path: str) -> None:
+    """Check a footnote for invalid characters.
+    Args:
+        footnote: Footnote to check
+        field_path: Path to the field for error reporting
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected
+    """
+    for i, content_block in enumerate(footnote.contents):
+        _check_content_block(content_block, f"{field_path}.contents[{i}]")
+def _check_toc_item(item: TocItem, field_path: str) -> None:
+    """Recursively check a TOC item for invalid characters.
+    Args:
+        item: TOC item to check
+        field_path: Path to the field for error reporting
+    Raises:
+        InvalidUnicodeError: If surrogate characters are detected
+    """
+    _check_string(item.title, f"{field_path}.title")
+    # Check nested children recursively
+    for i, child in enumerate(item.children):
+        _check_toc_item(child, f"{field_path}.children[{i}]")

{epub_generator-0.1.5.dist-info → epub_generator-0.1.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-generator
-Version: 0.1.5
+Version: 0.1.6
 Summary: A simple Python EPUB 3.0 generator with a single API call
 License: MIT
 Keywords: epub,epub3,ebook,generator,publishing

{epub_generator-0.1.5.dist-info → epub_generator-0.1.6.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-epub_generator/__init__.py,sha256=5fFpZdgB4-FfXgCpE5IshBfrfrMaxNQK4SRKaKV2RdI,682
+epub_generator/__init__.py,sha256=G1P_GAUym94iv56PPK31641vlYrukUoMJZgWtmKscog,768
 epub_generator/context.py,sha256=9jHRpnQsNooRUSBoY_tiQ7aQ_AMZmyKUO22gPoO8Koc,4324
 epub_generator/data/container.xml.jinja,sha256=SkACyZgsAVUS5lmiCEhq3SpbFspYdyCnRNjWnLztLt0,252
 epub_generator/data/content.opf.jinja,sha256=DDaR9GZnSBcpNk2BWUu56Uo_248TA91AxE4tKsBuKnQ,2839
@@ -11,16 +11,17 @@ epub_generator/generation/__init__.py,sha256=UIscwHa8ocr2D1mk1KaP-zi3P1x9eYJzxTo
 epub_generator/generation/gen_asset.py,sha256=WYwfGUvHM_CrwTuIIH7dYm-SL-vdhkTnvaZDymZxXzg,5978
 epub_generator/generation/gen_chapter.py,sha256=P6kmB8hdQnJB6SCheHzu5cOmZrC5H0LqNV-uuuigX1M,3425
 epub_generator/generation/gen_content.py,sha256=2ojjTgalveRnk1MXQaKsY53hPCgb7NHTwbMpLOXVrss,2018
-epub_generator/generation/gen_epub.py,sha256=I7u8rrrslF9xoyDUsALarB2iWzY9zjKM9ZOR1wLMX1E,6184
+epub_generator/generation/gen_epub.py,sha256=rxHBp4nP5OFi9SJBfiCrncV1fmhb0j3WKfUqofxJykc,6487
 epub_generator/generation/gen_nav.py,sha256=_cjOP18C1CoTn_DELIB06pyMPZZ0CPbkk4oPEvICdKs,1955
 epub_generator/generation/gen_toc.py,sha256=MK2iTYBpF8VUtPHpwz5JB_H6nWsKRKpVuLzRPYGy0nw,2864
-epub_generator/generation/xml_utils.py,sha256=kyHBWUihT5se5n_425BcEvBpsIK6yC52W25t012QUn0,2084
+epub_generator/generation/xml_utils.py,sha256=AVnU3AN6lmqWrdgaZTV7v77L9LonI7DX59BxkMZlef8,1822
 epub_generator/html_tag.py,sha256=P_Y0uRStCEEh7cCtpvK4t432NEcY9OLntAznvdxUF5k,343
 epub_generator/i18n.py,sha256=-L6J6hsy796_IQ4nLpNtAeXIkRM6oFSWSHDlRZXW8aA,705
 epub_generator/options.py,sha256=Er1dnaNvzDSnZRSRJGSqhkJsv1XtsCW2Ym_hUc8o_QI,181
 epub_generator/template.py,sha256=RdN2QRICIrYMzpxCU_x4m4V9WWZEP9VvT6QLp2YCm90,1556
 epub_generator/types.py,sha256=gBrdi1KYOVEnI0qEp1slLsyUw_Sd7v09uHvN8_Hf9Z8,4440
-epub_generator-0.1.5.dist-info/LICENSE,sha256=9Zt_a4mrzkvR2rc0UbqTgbboIjWuumDFgeQyKos0H2E,1066
-epub_generator-0.1.5.dist-info/METADATA,sha256=cwIGyOGFrt0hvtw_FHaaTjeoy-l-FP-SGZC4zP0MJyw,16555
-epub_generator-0.1.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-epub_generator-0.1.5.dist-info/RECORD,,
+epub_generator/validate.py,sha256=KBgvBsBuVnWTc4N-29cr2P92X0w_tGR4pMemk_KHy78,7544
+epub_generator-0.1.6.dist-info/LICENSE,sha256=9Zt_a4mrzkvR2rc0UbqTgbboIjWuumDFgeQyKos0H2E,1066
+epub_generator-0.1.6.dist-info/METADATA,sha256=JziMt9LukPRKo8rPy10qf9sIiiv98CgSxKoi7juHcYE,16555
+epub_generator-0.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+epub_generator-0.1.6.dist-info/RECORD,,

{epub_generator-0.1.5.dist-info → epub_generator-0.1.6.dist-info}/LICENSE RENAMED Viewed

File without changes

{epub_generator-0.1.5.dist-info → epub_generator-0.1.6.dist-info}/WHEEL RENAMED Viewed

File without changes

epub-generator 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

epub-generator 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl