epub-translator 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. epub_translator/__init__.py +9 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +32 -113
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/punctuation.py +34 -0
  15. epub_translator/segment/__init__.py +26 -0
  16. epub_translator/segment/block_segment.py +124 -0
  17. epub_translator/segment/common.py +29 -0
  18. epub_translator/segment/inline_segment.py +356 -0
  19. epub_translator/{xml_translator → segment}/text_segment.py +7 -72
  20. epub_translator/segment/utils.py +43 -0
  21. epub_translator/translator.py +152 -184
  22. epub_translator/utils.py +33 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/const.py +1 -0
  25. epub_translator/xml/deduplication.py +3 -3
  26. epub_translator/xml/inline.py +67 -0
  27. epub_translator/xml/self_closing.py +182 -0
  28. epub_translator/xml/utils.py +42 -0
  29. epub_translator/xml/xml.py +7 -0
  30. epub_translator/xml/xml_like.py +8 -33
  31. epub_translator/xml_interrupter.py +165 -0
  32. epub_translator/xml_translator/__init__.py +3 -3
  33. epub_translator/xml_translator/callbacks.py +34 -0
  34. epub_translator/xml_translator/{const.py → common.py} +0 -1
  35. epub_translator/xml_translator/hill_climbing.py +104 -0
  36. epub_translator/xml_translator/stream_mapper.py +253 -0
  37. epub_translator/xml_translator/submitter.py +352 -91
  38. epub_translator/xml_translator/translator.py +182 -114
  39. epub_translator/xml_translator/validation.py +458 -0
  40. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/METADATA +134 -21
  41. epub_translator-0.1.4.dist-info/RECORD +68 -0
  42. epub_translator/epub/placeholder.py +0 -53
  43. epub_translator/iter_sync.py +0 -24
  44. epub_translator/xml_translator/fill.py +0 -128
  45. epub_translator/xml_translator/format.py +0 -282
  46. epub_translator/xml_translator/fragmented.py +0 -125
  47. epub_translator/xml_translator/group.py +0 -183
  48. epub_translator/xml_translator/progressive_locking.py +0 -256
  49. epub_translator/xml_translator/utils.py +0 -29
  50. epub_translator-0.1.1.dist-info/RECORD +0 -58
  51. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/LICENSE +0 -0
  52. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/WHEEL +0 -0
@@ -1,104 +1,18 @@
  import datetime
- import hashlib
- import json
- import uuid
- from collections.abc import Callable, Generator
+ from collections.abc import Generator
  from importlib.resources import files
  from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
  from os import PathLike
  from pathlib import Path
- from typing import Self

  from jinja2 import Environment, Template
  from tiktoken import Encoding, get_encoding

  from ..template import create_env
+ from .context import LLMContext
  from .executor import LLMExecutor
  from .increasable import Increasable
- from .types import Message, MessageRole, R
-
-
- class LLMContext:
-     """Context manager for LLM requests with transactional caching."""
-
-     def __init__(
-         self,
-         executor: LLMExecutor,
-         cache_path: Path | None,
-     ) -> None:
-         self._executor = executor
-         self._cache_path = cache_path
-         self._context_id = uuid.uuid4().hex[:12]
-         self._temp_files: list[Path] = []
-
-     def __enter__(self) -> Self:
-         return self
-
-     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-         if exc_type is None:
-             # Success: commit all temporary cache files
-             self._commit()
-         else:
-             # Failure: rollback (delete) all temporary cache files
-             self._rollback()
-
-     def request(
-         self,
-         input: str | list[Message],
-         parser: Callable[[str], R] = lambda x: x,
-         max_tokens: int | None = None,
-     ) -> R:
-         messages: list[Message]
-         if isinstance(input, str):
-             messages = [Message(role=MessageRole.USER, message=input)]
-         else:
-             messages = input
-
-         cache_key: str | None = None
-         if self._cache_path is not None:
-             cache_key = self._compute_messages_hash(messages)
-             permanent_cache_file = self._cache_path / f"{cache_key}.txt"
-             if permanent_cache_file.exists():
-                 cached_content = permanent_cache_file.read_text(encoding="utf-8")
-                 return parser(cached_content)
-
-             temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
-             if temp_cache_file.exists():
-                 cached_content = temp_cache_file.read_text(encoding="utf-8")
-                 return parser(cached_content)
-
-         # Make the actual request
-         response = self._executor.request(
-             messages=messages,
-             parser=lambda x: x,
-             max_tokens=max_tokens,
-         )
-
-         # Save to temporary cache if cache_path is set
-         if self._cache_path is not None and cache_key is not None:
-             temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
-             temp_cache_file.write_text(response, encoding="utf-8")
-             self._temp_files.append(temp_cache_file)
-
-         return parser(response)
-
-     def _compute_messages_hash(self, messages: list[Message]) -> str:
-         messages_dict = [{"role": msg.role.value, "message": msg.message} for msg in messages]
-         messages_json = json.dumps(messages_dict, ensure_ascii=False, sort_keys=True)
-         return hashlib.sha512(messages_json.encode("utf-8")).hexdigest()
-
-     def _commit(self) -> None:
-         for temp_file in self._temp_files:
-             if temp_file.exists():
-                 # Remove the .[context-id].txt suffix to get permanent name
-                 permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
-                 permanent_file = temp_file.parent / permanent_name
-                 temp_file.rename(permanent_file)
-
-     def _rollback(self) -> None:
-         for temp_file in self._temp_files:
-             if temp_file.exists():
-                 temp_file.unlink()
+ from .types import Message


  class LLM:
@@ -108,42 +22,28 @@ class LLM:
          url: str,
          model: str,
          token_encoding: str,
-         cache_path: PathLike | None = None,
          timeout: float | None = None,
          top_p: float | tuple[float, float] | None = None,
          temperature: float | tuple[float, float] | None = None,
          retry_times: int = 5,
          retry_interval_seconds: float = 6.0,
-         log_dir_path: PathLike | None = None,
+         cache_path: PathLike | str | None = None,
+         log_dir_path: PathLike | str | None = None,
      ) -> None:
          prompts_path = Path(str(files("epub_translator"))) / "data"
          self._templates: dict[str, Template] = {}
          self._encoding: Encoding = get_encoding(token_encoding)
          self._env: Environment = create_env(prompts_path)
-         self._logger_save_path: Path | None = None
-         self._cache_path: Path | None = None
-
-         if cache_path is not None:
-             self._cache_path = Path(cache_path)
-             if not self._cache_path.exists():
-                 self._cache_path.mkdir(parents=True, exist_ok=True)
-             elif not self._cache_path.is_dir():
-                 self._cache_path = None
-
-         if log_dir_path is not None:
-             self._logger_save_path = Path(log_dir_path)
-             if not self._logger_save_path.exists():
-                 self._logger_save_path.mkdir(parents=True, exist_ok=True)
-             elif not self._logger_save_path.is_dir():
-                 self._logger_save_path = None
+         self._top_p: Increasable = Increasable(top_p)
+         self._temperature: Increasable = Increasable(temperature)
+         self._cache_path: Path | None = self._ensure_dir_path(cache_path)
+         self._logger_save_path: Path | None = self._ensure_dir_path(log_dir_path)

          self._executor = LLMExecutor(
              url=url,
              model=model,
              api_key=key,
              timeout=timeout,
-             top_p=Increasable(top_p),
-             temperature=Increasable(temperature),
              retry_times=retry_times,
              retry_interval_seconds=retry_interval_seconds,
              create_logger=self._create_logger,
@@ -153,20 +53,29 @@
      def encoding(self) -> Encoding:
          return self._encoding

-     def context(self) -> LLMContext:
+     def context(self, cache_seed_content: str | None = None) -> LLMContext:
          return LLMContext(
              executor=self._executor,
              cache_path=self._cache_path,
+             cache_seed_content=cache_seed_content,
+             top_p=self._top_p,
+             temperature=self._temperature,
          )

      def request(
          self,
          input: str | list[Message],
-         parser: Callable[[str], R] = lambda x: x,
          max_tokens: int | None = None,
-     ) -> R:
+         temperature: float | None = None,
+         top_p: float | None = None,
+     ) -> str:
          with self.context() as ctx:
-             return ctx.request(input=input, parser=parser, max_tokens=max_tokens)
+             return ctx.request(
+                 input=input,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+             )

      def template(self, template_name: str) -> Template:
          template = self._templates.get(template_name, None)
@@ -175,6 +84,16 @@
              self._templates[template_name] = template
          return template

+     def _ensure_dir_path(self, path: PathLike | str | None) -> Path | None:
+         if path is None:
+             return None
+         dir_path = Path(path)
+         if not dir_path.exists():
+             dir_path.mkdir(parents=True, exist_ok=True)
+         elif not dir_path.is_dir():
+             return None
+         return dir_path.resolve()
+
      def _create_logger(self) -> Logger | None:
          if self._logger_save_path is None:
              return None
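
Taken together, these hunks move the transactional cache out of this module and into the new epub_translator/llm/context.py, and LLM.request() now returns the raw response string instead of accepting a parser. Below is a minimal usage sketch of the new surface. It assumes epub_translator.llm re-exports LLM (llm/__init__.py gains one line in this release), that the `key` parameter name matches the `api_key=key` wiring above, and that the relocated LLMContext keeps the commit-on-success / rollback-on-failure behaviour of the removed in-module version; credentials, endpoint, and model names are placeholders. The new cache_seed_content argument of context() is not exercised here.

    # Sketch only, under the assumptions stated above.
    from epub_translator.llm import LLM

    llm = LLM(
        key="sk-placeholder",                  # placeholder credentials
        url="https://api.example.com/v1",      # placeholder endpoint
        model="some-chat-model",               # placeholder model name
        token_encoding="cl100k_base",
        cache_path="./llm-cache",              # str paths are now accepted alongside PathLike
        log_dir_path="./llm-logs",
    )

    # Responses made inside the block are written to temporary cache files and,
    # per the removed LLMContext shown above, only committed to permanent cache
    # entries if the block exits without an exception.
    with llm.context() as ctx:
        translated = ctx.request(
            input="Translate this paragraph.",
            max_tokens=512,
            temperature=0.7,
            top_p=0.9,
        )
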
@@ -2,14 +2,12 @@ from collections.abc import Callable
  from io import StringIO
  from logging import Logger
  from time import sleep
- from typing import cast

  from openai import OpenAI
  from openai.types.chat import ChatCompletionMessageParam

  from .error import is_retry_error
- from .increasable import Increasable, Increaser
- from .types import Message, MessageRole, R
+ from .types import Message, MessageRole


  class LLMExecutor:
@@ -19,16 +17,12 @@ class LLMExecutor:
          url: str,
          model: str,
          timeout: float | None,
-         top_p: Increasable,
-         temperature: Increasable,
          retry_times: int,
          retry_interval_seconds: float,
          create_logger: Callable[[], Logger | None],
      ) -> None:
          self._model_name: str = model
          self._timeout: float | None = timeout
-         self._top_p: Increasable = top_p
-         self._temperature: Increasable = temperature
          self._retry_times: int = retry_times
          self._retry_interval_seconds: float = retry_interval_seconds
          self._create_logger: Callable[[], Logger | None] = create_logger
@@ -38,15 +32,29 @@
              timeout=timeout,
          )

-     def request(self, messages: list[Message], parser: Callable[[str], R], max_tokens: int | None) -> R:
-         result: R | None = None
+     def request(
+         self,
+         messages: list[Message],
+         max_tokens: int | None,
+         temperature: float | None,
+         top_p: float | None,
+         cache_key: str | None,
+     ) -> str:
+         response: str = ""
          last_error: Exception | None = None
          did_success = False
-         top_p: Increaser = self._top_p.context()
-         temperature: Increaser = self._temperature.context()
         logger = self._create_logger()

         if logger is not None:
+             parameters: list[str] = [
+                 f"\t\ntemperature={temperature}",
+                 f"\t\ntop_p={top_p}",
+                 f"\t\nmax_tokens={max_tokens}",
+             ]
+             if cache_key is not None:
+                 parameters.append(f"\t\ncache_key={cache_key}")
+
+             logger.debug(f"[[Parameters]]:{''.join(parameters)}\n")
             logger.debug(f"[[Request]]:\n{self._input2str(messages)}\n")

         try:
@@ -54,8 +62,8 @@
                 try:
                     response = self._invoke_model(
                         input_messages=messages,
-                         top_p=top_p.current,
-                         temperature=temperature.current,
+                         temperature=temperature,
+                         top_p=top_p,
                         max_tokens=max_tokens,
                     )
                     if logger is not None:
@@ -71,22 +79,8 @@
                         sleep(self._retry_interval_seconds)
                     continue

-                 try:
-                     result = parser(response)
-                     did_success = True
-                     break
-
-                 except Exception as err:
-                     last_error = err
-                     warn_message = f"request failed with parsing error, retrying... ({i + 1} times)"
-                     if logger is not None:
-                         logger.warning(warn_message)
-                     print(warn_message)
-                     top_p.increase()
-                     temperature.increase()
-                     if self._retry_interval_seconds > 0.0 and i < self._retry_times:
-                         sleep(self._retry_interval_seconds)
-                     continue
+                 did_success = True
+                 break

         except KeyboardInterrupt as err:
             if last_error is not None and logger is not None:
@@ -99,7 +93,7 @@
         else:
             raise last_error

-         return cast(R, result)
+         return response

     def _input2str(self, input: str | list[Message]) -> str:
         if isinstance(input, str):
@@ -133,7 +127,7 @@
          top_p: float | None,
          temperature: float | None,
          max_tokens: int | None,
-     ):
+     ) -> str:
          messages: list[ChatCompletionMessageParam] = []
          for item in input_messages:
              if item.role == MessageRole.SYSTEM:
@@ -21,7 +21,7 @@ class Increasable:
             param = float(param)
         if isinstance(param, float):
             param = (param, param)
-         if isinstance(param, tuple):
+         if isinstance(param, (tuple, list)):
             if len(param) != 2:
                 raise ValueError(f"Expected a tuple of length 2, got {len(param)}")
             begin, end = param
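
This relaxes Increasable to accept a two-element list as well as a tuple for the (begin, end) range. A small sketch of the constructor forms this permits; the .context()/.current/.increase() calls are taken from the pre-0.1.4 executor code above, and the begin-to-end escalation is inferred from the variable names rather than confirmed by this diff.

    from epub_translator.llm.increasable import Increasable

    temperature = Increasable((0.3, 0.9))  # a single float or [0.3, 0.9] is also accepted now
    increaser = temperature.context()      # per the removed executor code: .current / .increase()
    print(increaser.current)               # presumably starts at the begin value, 0.3
    increaser.increase()                   # presumably nudges the value toward the end value, 0.9
    print(increaser.current)
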
@@ -1,8 +1,5 @@
  from dataclasses import dataclass
  from enum import Enum, auto
- from typing import TypeVar
-
- R = TypeVar("R")


  @dataclass
@@ -0,0 +1,34 @@
+ from xml.etree.ElementTree import Element
+
+ from .xml import iter_with_stack
+
+ _QUOTE_MAPPING = {
+     # French guillemets
+     "«": "",
+     "»": "",
+     "‹": "«",
+     "›": "»",
+     # Chinese book-title marks
+     "《": "",
+     "》": "",
+     "〈": "《",
+     "〉": "》",
+ }
+
+
+ def _strip_quotes(text: str):
+     for char in text:
+         mapped = _QUOTE_MAPPING.get(char, None)
+         if mapped is None:
+             yield char
+         elif mapped:
+             yield mapped
+
+
+ def unwrap_french_quotes(element: Element) -> Element:
+     for _, child_element in iter_with_stack(element):
+         if child_element.text:
+             child_element.text = "".join(_strip_quotes(child_element.text))
+         if child_element.tail:
+             child_element.tail = "".join(_strip_quotes(child_element.tail))
+     return element
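
The mapping drops the outer guillemets « » and title marks 《 》 and promotes the nested single forms ‹ › and 〈 〉 to their double counterparts, in both element text and tails. A quick illustration, assuming iter_with_stack walks descendant elements (the module path follows the new epub_translator/punctuation.py file):

    from xml.etree.ElementTree import fromstring

    from epub_translator.punctuation import unwrap_french_quotes

    element = fromstring("<div><p>« Il a dit ‹ bonjour › » et 《红楼梦》中的〈引文〉</p></div>")
    unwrap_french_quotes(element)
    print(element[0].text)
    # -> " Il a dit « bonjour »  et 红楼梦中的《引文》"
    # (outer marks are removed, nested ones promoted; spaces around dropped marks are kept)
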
@@ -0,0 +1,26 @@
+ from .block_segment import (
+     BlockContentError,
+     BlockError,
+     BlockExpectedIDsError,
+     BlockSegment,
+     BlockSubmitter,
+     BlockUnexpectedIDError,
+     BlockWrongTagError,
+ )
+ from .common import FoundInvalidIDError
+ from .inline_segment import (
+     InlineError,
+     InlineExpectedIDsError,
+     InlineLostIDError,
+     InlineSegment,
+     InlineUnexpectedIDError,
+     InlineWrongTagCountError,
+     search_inline_segments,
+ )
+ from .text_segment import (
+     TextPosition,
+     TextSegment,
+     combine_text_segments,
+     incision_between,
+     search_text_segments,
+ )
@@ -0,0 +1,124 @@
+ from collections.abc import Generator
+ from dataclasses import dataclass
+ from typing import cast
+ from xml.etree.ElementTree import Element
+
+ from .common import FoundInvalidIDError, validate_id_in_element
+ from .inline_segment import InlineError, InlineSegment
+ from .text_segment import TextSegment
+ from .utils import IDGenerator, id_in_element
+
+
+ @dataclass
+ class BlockSubmitter:
+     id: int
+     origin_text_segments: list[TextSegment]
+     submitted_element: Element
+
+
+ @dataclass
+ class BlockWrongTagError:
+     block: tuple[int, Element] | None  # (block_id, block_element); None means the root element
+     expected_tag: str
+     instead_tag: str
+
+
+ @dataclass
+ class BlockUnexpectedIDError:
+     id: int
+     element: Element
+
+
+ @dataclass
+ class BlockExpectedIDsError:
+     id2element: dict[int, Element]
+
+
+ @dataclass
+ class BlockContentError:
+     id: int
+     element: Element
+     errors: list[InlineError | FoundInvalidIDError]
+
+
+ BlockError = BlockWrongTagError | BlockUnexpectedIDError | BlockExpectedIDsError | BlockContentError
+
+
+ class BlockSegment:
+     def __init__(self, root_tag: str, inline_segments: list[InlineSegment]) -> None:
+         id_generator = IDGenerator()
+         for inline_segment in inline_segments:
+             inline_segment.id = id_generator.next_id()
+             inline_segment.recreate_ids(id_generator)
+
+         self._root_tag: str = root_tag
+         self._inline_segments: list[InlineSegment] = inline_segments
+         self._id2inline_segment: dict[int, InlineSegment] = dict((cast(int, s.id), s) for s in self._inline_segments)
+
+     def __iter__(self) -> Generator[InlineSegment, None, None]:
+         yield from self._inline_segments
+
+     def create_element(self) -> Element:
+         root_element = Element(self._root_tag)
+         for inline_segment in self._inline_segments:
+             root_element.append(inline_segment.create_element())
+         return root_element
+
+     def validate(self, validated_element: Element) -> Generator[BlockError | FoundInvalidIDError, None, None]:
+         if validated_element.tag != self._root_tag:
+             yield BlockWrongTagError(
+                 block=None,
+                 expected_tag=self._root_tag,
+                 instead_tag=validated_element.tag,
+             )
+
+         remain_expected_elements: dict[int, Element] = dict(
+             (id, inline_segment.parent) for id, inline_segment in self._id2inline_segment.items()
+         )
+         for child_validated_element in validated_element:
+             element_id = validate_id_in_element(child_validated_element)
+             if isinstance(element_id, FoundInvalidIDError):
+                 yield element_id
+             else:
+                 inline_segment = self._id2inline_segment.get(element_id, None)
+                 if inline_segment is None:
+                     yield BlockUnexpectedIDError(
+                         id=element_id,
+                         element=child_validated_element,
+                     )
+                 else:
+                     if inline_segment.parent.tag != child_validated_element.tag:
+                         yield BlockWrongTagError(
+                             block=(cast(int, inline_segment.id), inline_segment.parent),
+                             expected_tag=inline_segment.parent.tag,
+                             instead_tag=child_validated_element.tag,
+                         )
+
+                     remain_expected_elements.pop(element_id, None)
+                     inline_errors = list(inline_segment.validate(child_validated_element))
+
+                     if inline_errors:
+                         yield BlockContentError(
+                             id=element_id,
+                             element=child_validated_element,
+                             errors=inline_errors,
+                         )
+
+         if remain_expected_elements:
+             yield BlockExpectedIDsError(id2element=remain_expected_elements)
+
+     def submit(self, target: Element) -> Generator[BlockSubmitter, None, None]:
+         for child_element in target:
+             element_id = id_in_element(child_element)
+             if element_id is None:
+                 continue
+             inline_segment = self._id2inline_segment.get(element_id, None)
+             if inline_segment is None:
+                 continue
+             inline_segment_id = inline_segment.id
+             assert inline_segment_id is not None
+             yield BlockSubmitter(
+                 id=inline_segment_id,
+                 origin_text_segments=list(inline_segment),
+                 submitted_element=inline_segment.assign_attributes(child_element),
+             )
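
BlockSegment.validate() yields a union of error dataclasses rather than raising, so callers dispatch on the concrete type. Below is a hypothetical reporter illustrating that dispatch; it uses only names re-exported by the new epub_translator/segment/__init__.py shown earlier, and leaves out the construction of a real BlockSegment, since InlineSegment instances normally come from search_inline_segments.

    # Hypothetical helper, not part of the package: formats the BlockError union above.
    from xml.etree.ElementTree import Element

    from epub_translator.segment import (
        BlockContentError,
        BlockExpectedIDsError,
        BlockSegment,
        BlockUnexpectedIDError,
        BlockWrongTagError,
        FoundInvalidIDError,
    )


    def describe_block_errors(segment: BlockSegment, element: Element) -> list[str]:
        messages: list[str] = []
        for error in segment.validate(element):
            if isinstance(error, BlockWrongTagError):
                where = "root" if error.block is None else f"block #{error.block[0]}"
                messages.append(f"{where}: expected <{error.expected_tag}>, got <{error.instead_tag}>")
            elif isinstance(error, BlockUnexpectedIDError):
                messages.append(f"unexpected id {error.id}")
            elif isinstance(error, BlockExpectedIDsError):
                messages.append(f"missing ids: {sorted(error.id2element)}")
            elif isinstance(error, BlockContentError):
                messages.append(f"block #{error.id} has {len(error.errors)} inline error(s)")
            elif isinstance(error, FoundInvalidIDError):
                messages.append(f"invalid id attribute: {error.invalid_id!r}")
        return messages
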
@@ -0,0 +1,29 @@
+ from dataclasses import dataclass
+ from xml.etree.ElementTree import Element
+
+ from ..xml import ID_KEY
+
+
+ @dataclass
+ class FoundInvalidIDError(Exception):
+     invalid_id: str | None
+     element: Element
+
+
+ def validate_id_in_element(element: Element, enable_no_id: bool = False) -> int | FoundInvalidIDError:
+     id_str = element.get(ID_KEY, None)
+     if id_str is None:
+         if enable_no_id:
+             return -1
+         else:
+             return FoundInvalidIDError(
+                 invalid_id=None,
+                 element=element,
+             )
+     try:
+         return int(id_str)
+     except ValueError:
+         return FoundInvalidIDError(
+             invalid_id=id_str,
+             element=element,
+         )
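
validate_id_in_element returns either the parsed integer ID or a FoundInvalidIDError value; it never raises, despite the Exception base class. A small sketch of the three cases, assuming ID_KEY is importable from epub_translator.xml, as the relative import above implies:

    from xml.etree.ElementTree import Element

    from epub_translator.segment.common import FoundInvalidIDError, validate_id_in_element
    from epub_translator.xml import ID_KEY  # assumed re-export; common.py imports it from ..xml

    element = Element("span")
    element.set(ID_KEY, "12")
    assert validate_id_in_element(element) == 12

    element.set(ID_KEY, "twelve")
    result = validate_id_in_element(element)
    assert isinstance(result, FoundInvalidIDError) and result.invalid_id == "twelve"

    missing = Element("span")
    assert validate_id_in_element(missing, enable_no_id=True) == -1
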