epub-translator 0.1.0__tar.gz → 0.1.1__tar.gz

This diff compares two publicly released versions of the package as published to its registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (59)
  1. {epub_translator-0.1.0 → epub_translator-0.1.1}/PKG-INFO +1 -1
  2. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/llm/core.py +95 -37
  3. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/translator.py +5 -2
  4. epub_translator-0.1.1/epub_translator/xml/xml_like.py +231 -0
  5. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/translator.py +48 -47
  6. {epub_translator-0.1.0 → epub_translator-0.1.1}/pyproject.toml +1 -1
  7. epub_translator-0.1.0/epub_translator/xml/xml_like.py +0 -176
  8. {epub_translator-0.1.0 → epub_translator-0.1.1}/LICENSE +0 -0
  9. {epub_translator-0.1.0 → epub_translator-0.1.1}/README.md +0 -0
  10. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/__init__.py +0 -0
  11. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/fill.jinja +0 -0
  12. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/mmltex/README.md +0 -0
  13. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/mmltex/cmarkup.xsl +0 -0
  14. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/mmltex/entities.xsl +0 -0
  15. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/mmltex/glayout.xsl +0 -0
  16. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/mmltex/mmltex.xsl +0 -0
  17. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/mmltex/scripts.xsl +0 -0
  18. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/mmltex/tables.xsl +0 -0
  19. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/mmltex/tokens.xsl +0 -0
  20. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/data/translate.jinja +0 -0
  21. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/epub/__init__.py +0 -0
  22. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/epub/common.py +0 -0
  23. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/epub/math.py +0 -0
  24. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/epub/placeholder.py +0 -0
  25. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/epub/spines.py +0 -0
  26. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/epub/toc.py +0 -0
  27. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/epub/zip.py +0 -0
  28. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/iter_sync.py +0 -0
  29. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/language.py +0 -0
  30. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/llm/__init__.py +0 -0
  31. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/llm/error.py +0 -0
  32. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/llm/executor.py +0 -0
  33. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/llm/increasable.py +0 -0
  34. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/llm/types.py +0 -0
  35. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/serial/__init__.py +0 -0
  36. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/serial/chunk.py +0 -0
  37. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/serial/segment.py +0 -0
  38. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/serial/splitter.py +0 -0
  39. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/template.py +0 -0
  40. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/utils.py +0 -0
  41. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/__init__.py +0 -0
  42. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/deduplication.py +0 -0
  43. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/firendly/__init__.py +0 -0
  44. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/firendly/decoder.py +0 -0
  45. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/firendly/encoder.py +0 -0
  46. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/firendly/parser.py +0 -0
  47. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/firendly/tag.py +0 -0
  48. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/firendly/transform.py +0 -0
  49. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml/xml.py +0 -0
  50. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/__init__.py +0 -0
  51. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/const.py +0 -0
  52. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/fill.py +0 -0
  53. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/format.py +0 -0
  54. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/fragmented.py +0 -0
  55. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/group.py +0 -0
  56. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/progressive_locking.py +0 -0
  57. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/submitter.py +0 -0
  58. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/text_segment.py +0 -0
  59. {epub_translator-0.1.0 → epub_translator-0.1.1}/epub_translator/xml_translator/utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: epub-translator
- Version: 0.1.0
+ Version: 0.1.1
  Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
  License: MIT
  Keywords: epub,llm,translation,translator
@@ -1,11 +1,13 @@
  import datetime
  import hashlib
  import json
+ import uuid
  from collections.abc import Callable, Generator
  from importlib.resources import files
  from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
  from os import PathLike
  from pathlib import Path
+ from typing import Self

  from jinja2 import Environment, Template
  from tiktoken import Encoding, get_encoding
@@ -16,6 +18,89 @@ from .increasable import Increasable
  from .types import Message, MessageRole, R


+ class LLMContext:
+   """Context manager for LLM requests with transactional caching."""
+
+   def __init__(
+     self,
+     executor: LLMExecutor,
+     cache_path: Path | None,
+   ) -> None:
+     self._executor = executor
+     self._cache_path = cache_path
+     self._context_id = uuid.uuid4().hex[:12]
+     self._temp_files: list[Path] = []
+
+   def __enter__(self) -> Self:
+     return self
+
+   def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+     if exc_type is None:
+       # Success: commit all temporary cache files
+       self._commit()
+     else:
+       # Failure: rollback (delete) all temporary cache files
+       self._rollback()
+
+   def request(
+     self,
+     input: str | list[Message],
+     parser: Callable[[str], R] = lambda x: x,
+     max_tokens: int | None = None,
+   ) -> R:
+     messages: list[Message]
+     if isinstance(input, str):
+       messages = [Message(role=MessageRole.USER, message=input)]
+     else:
+       messages = input
+
+     cache_key: str | None = None
+     if self._cache_path is not None:
+       cache_key = self._compute_messages_hash(messages)
+       permanent_cache_file = self._cache_path / f"{cache_key}.txt"
+       if permanent_cache_file.exists():
+         cached_content = permanent_cache_file.read_text(encoding="utf-8")
+         return parser(cached_content)
+
+       temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
+       if temp_cache_file.exists():
+         cached_content = temp_cache_file.read_text(encoding="utf-8")
+         return parser(cached_content)
+
+     # Make the actual request
+     response = self._executor.request(
+       messages=messages,
+       parser=lambda x: x,
+       max_tokens=max_tokens,
+     )
+
+     # Save to temporary cache if cache_path is set
+     if self._cache_path is not None and cache_key is not None:
+       temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
+       temp_cache_file.write_text(response, encoding="utf-8")
+       self._temp_files.append(temp_cache_file)
+
+     return parser(response)
+
+   def _compute_messages_hash(self, messages: list[Message]) -> str:
+     messages_dict = [{"role": msg.role.value, "message": msg.message} for msg in messages]
+     messages_json = json.dumps(messages_dict, ensure_ascii=False, sort_keys=True)
+     return hashlib.sha512(messages_json.encode("utf-8")).hexdigest()
+
+   def _commit(self) -> None:
+     for temp_file in self._temp_files:
+       if temp_file.exists():
+         # Remove the .[context-id].txt suffix to get permanent name
+         permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
+         permanent_file = temp_file.parent / permanent_name
+         temp_file.rename(permanent_file)
+
+   def _rollback(self) -> None:
+     for temp_file in self._temp_files:
+       if temp_file.exists():
+         temp_file.unlink()
+
+
  class LLM:
    def __init__(
      self,
@@ -30,7 +115,7 @@ class LLM:
      retry_times: int = 5,
      retry_interval_seconds: float = 6.0,
      log_dir_path: PathLike | None = None,
-   ):
+   ) -> None:
      prompts_path = Path(str(files("epub_translator"))) / "data"
      self._templates: dict[str, Template] = {}
      self._encoding: Encoding = get_encoding(token_encoding)
@@ -68,41 +153,20 @@ class LLM:
    def encoding(self) -> Encoding:
      return self._encoding

+   def context(self) -> LLMContext:
+     return LLMContext(
+       executor=self._executor,
+       cache_path=self._cache_path,
+     )
+
    def request(
      self,
      input: str | list[Message],
      parser: Callable[[str], R] = lambda x: x,
      max_tokens: int | None = None,
    ) -> R:
-     messages: list[Message]
-     if isinstance(input, str):
-       messages = [Message(role=MessageRole.USER, message=input)]
-     else:
-       messages = input
-
-     # Check cache if cache_path is set
-     if self._cache_path is not None:
-       cache_key = self._compute_messages_hash(messages)
-       cache_file = self._cache_path / f"{cache_key}.txt"
-
-       if cache_file.exists():
-         cached_content = cache_file.read_text(encoding="utf-8")
-         return parser(cached_content)
-
-     # Make the actual request
-     response = self._executor.request(
-       messages=messages,
-       parser=lambda x: x,
-       max_tokens=max_tokens,
-     )
-
-     # Save to cache if cache_path is set
-     if self._cache_path is not None:
-       cache_key = self._compute_messages_hash(messages)
-       cache_file = self._cache_path / f"{cache_key}.txt"
-       cache_file.write_text(response, encoding="utf-8")
-
-     return parser(response)
+     with self.context() as ctx:
+       return ctx.request(input=input, parser=parser, max_tokens=max_tokens)

    def template(self, template_name: str) -> Template:
      template = self._templates.get(template_name, None)
@@ -111,17 +175,11 @@ class LLM:
        self._templates[template_name] = template
      return template

-   def _compute_messages_hash(self, messages: list[Message]) -> str:
-     """Compute SHA-512 hash of messages for cache key."""
-     messages_dict = [{"role": msg.role.value, "message": msg.message} for msg in messages]
-     messages_json = json.dumps(messages_dict, ensure_ascii=False, sort_keys=True)
-     return hashlib.sha512(messages_json.encode("utf-8")).hexdigest()
-

    def _create_logger(self) -> Logger | None:
      if self._logger_save_path is None:
        return None
-     now = datetime.datetime.now(datetime.timezone.utc)
+     now = datetime.datetime.now(datetime.UTC)
      timestamp = now.strftime("%Y-%m-%d %H-%M-%S %f")
      file_path = self._logger_save_path / f"request {timestamp}.log"
      logger = getLogger(f"LLM Request {timestamp}")
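
The new `LLMContext` makes response caching transactional: within one context, every response is first written to a temporary `<sha512>.<context-id>.txt` file, and those files are only renamed to their permanent `<sha512>.txt` form when the context exits without an exception. A minimal usage sketch (the `llm` instance and the prompts are assumed here, not part of the diff):

```python
def translate_prompts(llm, prompts: list[str]) -> list[str]:
  results: list[str] = []
  # All requests in this block share one 12-hex-digit context id.
  with llm.context() as ctx:
    for prompt in prompts:
      # Served from the permanent cache, the context's own temp cache,
      # or a fresh executor request, in that order.
      results.append(ctx.request(input=prompt))
  # Leaving the block normally commits (renames) the temp files;
  # an exception rolls them back (deletes them) instead.
  return results
```

`LLM.request` keeps its old signature but now just wraps a single call in a throwaway context, so one-off requests still commit immediately.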
@@ -71,7 +71,7 @@ def translate(
      placeholder.recover()
      deduplicate_ids_in_element(xml.element)
      with zip.replace(chapter_path) as target_file:
-       xml.save(target_file, is_html_like=True)
+       xml.save(target_file)

      # Update progress after each chapter
      processed_chapters += 1
@@ -198,7 +198,10 @@ def _count_chapters(zip: Zip) -> int:
  def _search_chapter_items(zip: Zip):
    for chapter_path in search_spine_paths(zip):
      with zip.read(chapter_path) as chapter_file:
-       xml = XMLLikeNode(chapter_file)
+       xml = XMLLikeNode(
+         file=chapter_file,
+         is_html_like=chapter_path.suffix.lower() in (".html", ".htm"),
+       )
        body_element = find_first(xml.element, "body")
        if body_element is not None:
          placeholder = Placeholder(body_element)
@@ -0,0 +1,231 @@
+ import io
+ import re
+ import warnings
+ from typing import IO
+ from xml.etree.ElementTree import Element, fromstring, tostring
+
+ from .xml import iter_with_stack
+
+ _XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace"
+
+ _COMMON_NAMESPACES = {
+   "http://www.w3.org/1999/xhtml": "xhtml",
+   "http://www.idpf.org/2007/ops": "epub",
+   "http://www.w3.org/1998/Math/MathML": "m",
+   "http://purl.org/dc/elements/1.1/": "dc",
+   "http://www.daisy.org/z3986/2005/ncx/": "ncx",
+   "http://www.idpf.org/2007/opf": "opf",
+   "http://www.w3.org/2000/svg": "svg",
+   "urn:oasis:names:tc:opendocument:xmlns:container": "container",
+   "http://www.w3.org/XML/1998/namespace": "xml",  # Reserved XML namespace
+ }
+
+ _ROOT_NAMESPACES = {
+   "http://www.w3.org/1999/xhtml",  # XHTML
+   "http://www.daisy.org/z3986/2005/ncx/",  # NCX
+   "http://www.idpf.org/2007/opf",  # OPF
+   "urn:oasis:names:tc:opendocument:xmlns:container",  # Container
+ }
+
+ _ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
+ _FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
+ _NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")
+
+ # Some non-standard EPUB generators use HTML-style tags without self-closing syntax.
+ # We need to convert them to XML-compatible format before parsing.
+ _EMPTY_TAGS = (
+   "br",
+   "hr",
+   "input",
+   "col",
+   "base",
+   "meta",
+   "area",
+ )
+
+ # For reading: match tags like <br> or <br class="x"> (but not <br/> or <body>)
+ _EMPTY_TAG_OPEN_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^/>]*)>")
+
+ # For saving: match self-closing tags like <br />
+ _EMPTY_TAG_CLOSE_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/>")
+
+
+ class XMLLikeNode:
+   def __init__(self, file: IO[bytes], is_html_like: bool = False) -> None:
+     raw_content = file.read()
+     self._encoding: str = self._detect_encoding(raw_content)
+     content = raw_content.decode(self._encoding)
+     self._header, xml_content = self._extract_header(content)
+     self._namespaces: dict[str, str] = {}
+     self._tag_to_namespace: dict[str, str] = {}
+     self._attr_to_namespace: dict[str, str] = {}
+
+     # For non-standard HTML files, convert <br> to <br/> before parsing
+     self._is_html_like = is_html_like
+     if is_html_like:
+       xml_content = re.sub(
+         pattern=_EMPTY_TAG_OPEN_PATTERN,
+         repl=lambda m: f"<{m.group(1)}{m.group(2)} />",
+         string=xml_content,
+       )
+
+     try:
+       self.element = self._extract_and_clean_namespaces(
+         element=fromstring(xml_content),
+       )
+     except Exception as error:
+       raise ValueError("Failed to parse XML-like content") from error
+
+   @property
+   def encoding(self) -> str:
+     return self._encoding
+
+   @property
+   def namespaces(self) -> list[str]:
+     return list(self._namespaces.keys())
+
+   def save(self, file: IO[bytes]) -> None:
+     writer = io.TextIOWrapper(file, encoding=self._encoding, write_through=True)
+     try:
+       if self._header:
+         writer.write(self._header)
+
+       content = self._serialize_with_namespaces(self.element)
+
+       # For non-standard HTML files, convert back from <br/> to <br>
+       if self._is_html_like:
+         content = re.sub(
+           pattern=_EMPTY_TAG_CLOSE_PATTERN,
+           repl=lambda m: f"<{m.group(1)}{m.group(2)}>",
+           string=content,
+         )
+
+       writer.write(content)
+
+     finally:
+       writer.detach()
+
+   def _detect_encoding(self, raw_content: bytes) -> str:
+     if raw_content.startswith(b"\xef\xbb\xbf"):
+       return "utf-8-sig"
+     elif raw_content.startswith(b"\xff\xfe"):
+       return "utf-16-le"
+     elif raw_content.startswith(b"\xfe\xff"):
+       return "utf-16-be"
+
+     # Try to extract the encoding from the XML declaration: read only the first 1024 bytes to find it
+     header_bytes = raw_content[:1024]
+     for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
+       try:
+         header_str = header_bytes.decode(try_encoding)
+         match = _ENCODING_PATTERN.search(header_str)
+         if match:
+           declared_encoding = match.group(1).lower()
+           try:
+             raw_content.decode(declared_encoding)
+             return declared_encoding
+           except (LookupError, UnicodeDecodeError):
+             pass
+       except UnicodeDecodeError:
+         continue
+
+     try:
+       raw_content.decode("utf-8")
+       return "utf-8"
+     except UnicodeDecodeError:
+       pass
+     return "iso-8859-1"
+
+   def _extract_header(self, content: str) -> tuple[str, str]:
+     match = _FIRST_ELEMENT_PATTERN.search(content)
+     if match:
+       split_pos = match.start()
+       header = content[:split_pos]
+       xml_content = content[split_pos:]
+       return header, xml_content
+     return "", content
+
+   def _extract_and_clean_namespaces(self, element: Element) -> Element:
+     for _, elem in iter_with_stack(element):
+       match = _NAMESPACE_IN_TAG.match(elem.tag)
+       if match:
+         namespace_uri = match.group(1)
+         if namespace_uri not in self._namespaces:
+           prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(self._namespaces)}")
+           self._namespaces[namespace_uri] = prefix
+
+         tag_name = elem.tag[len(match.group(0)) :]
+
+         # Record tag -> namespace mapping (warn if conflict)
+         if tag_name in self._tag_to_namespace and self._tag_to_namespace[tag_name] != namespace_uri:
+           warnings.warn(
+             f"Tag '{tag_name}' has multiple namespaces: "
+             f"{self._tag_to_namespace[tag_name]} and {namespace_uri}. "
+             f"Using the first one.",
+             stacklevel=2,
+           )
+         else:
+           self._tag_to_namespace[tag_name] = namespace_uri
+
+         # Clean: remove namespace URI completely
+         elem.tag = tag_name
+
+       for attr_key in list(elem.attrib.keys()):
+         match = _NAMESPACE_IN_TAG.match(attr_key)
+         if match:
+           namespace_uri = match.group(1)
+           if namespace_uri not in self._namespaces:
+             prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(self._namespaces)}")
+             self._namespaces[namespace_uri] = prefix
+
+           attr_name = attr_key[len(match.group(0)) :]
+           attr_value = elem.attrib.pop(attr_key)
+
+           # Record attr -> namespace mapping (warn if conflict)
+           if attr_name in self._attr_to_namespace and self._attr_to_namespace[attr_name] != namespace_uri:
+             warnings.warn(
+               f"Attribute '{attr_name}' has multiple namespaces: "
+               f"{self._attr_to_namespace[attr_name]} and {namespace_uri}. "
+               f"Using the first one.",
+               stacklevel=2,
+             )
+           else:
+             self._attr_to_namespace[attr_name] = namespace_uri
+
+           # Clean: remove namespace URI completely
+           elem.attrib[attr_name] = attr_value
+     return element
+
+   def _serialize_with_namespaces(self, element: Element) -> str:
+     # First, add namespace declarations to root element (before serialization)
+     for namespace_uri, prefix in self._namespaces.items():
+       # Skip the reserved xml namespace - it's implicit
+       if namespace_uri == _XML_NAMESPACE_URI:
+         continue
+       if namespace_uri in _ROOT_NAMESPACES:
+         element.attrib["xmlns"] = namespace_uri
+       else:
+         element.attrib[f"xmlns:{prefix}"] = namespace_uri
+
+     # Serialize the element tree as-is (tags are simple names without prefixes)
+     xml_string = tostring(element, encoding="unicode")
+
+     # Now restore namespace prefixes in the serialized string.
+     # For each tag that should have a namespace prefix, wrap it with the prefix
+     for tag_name, namespace_uri in self._tag_to_namespace.items():
+       if namespace_uri not in _ROOT_NAMESPACES:
+         # Get the prefix for this namespace
+         prefix = self._namespaces[namespace_uri]
+         # Replace opening and closing tags
+         xml_string = xml_string.replace(f"<{tag_name} ", f"<{prefix}:{tag_name} ")
+         xml_string = xml_string.replace(f"<{tag_name}>", f"<{prefix}:{tag_name}>")
+         xml_string = xml_string.replace(f"</{tag_name}>", f"</{prefix}:{tag_name}>")
+         xml_string = xml_string.replace(f"<{tag_name}/>", f"<{prefix}:{tag_name}/>")
+
+     # Similarly for attributes (though less common in EPUB)
+     for attr_name, namespace_uri in self._attr_to_namespace.items():
+       if namespace_uri not in _ROOT_NAMESPACES:
+         prefix = self._namespaces[namespace_uri]
+         xml_string = xml_string.replace(f' {attr_name}="', f' {prefix}:{attr_name}="')
+
+     return xml_string
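
The rewritten `XMLLikeNode` now owns the namespace bookkeeping: parsing strips `{uri}` prefixes from tags and attributes into the `_tag_to_namespace` / `_attr_to_namespace` maps, and `save` re-declares the namespaces and restores prefixes textually. A round-trip sketch (the file names and the `<title>` edit are illustrative only, not from the diff):

```python
from epub_translator.xml.xml_like import XMLLikeNode

with open("chapter.xhtml", "rb") as source:
  node = XMLLikeNode(file=source, is_html_like=False)

# Tags are plain names here ("title", not "{http://www.w3.org/1999/xhtml}title")
# because the namespaces were extracted during parsing.
title = node.element.find("head/title")
if title is not None:
  title.text = "Translated Title"

with open("chapter_out.xhtml", "wb") as target:
  node.save(target)  # re-adds xmlns declarations and namespace prefixes
```

Note the trade-off: prefixes come back via plain string replacement over the serialized output, which assumes each tag or attribute name maps to a single namespace (hence the `warnings.warn` on conflicts).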
@@ -126,53 +126,54 @@ class XMLTranslator:
      conversation_history: list[Message] = []
      latest_error: ValidationError | None = None

-     for _ in range(self._max_retries):
-       # Request LLM response
-       response = self._llm.request(
-         input=fixed_messages + conversation_history,
-       )
-
-       try:
-         # Extract XML from response
-         validated_element = _extract_xml_element(response)
-
-         # Validate with progressive locking
-         is_complete, error_message, newly_locked = validator.validate_with_locking(
-           template_ele=fill.request_element,
-           validated_ele=validated_element,
-           errors_limit=self._max_fill_displaying_errors,
+     with self._llm.context() as llm_context:
+       for _ in range(self._max_retries):
+         # Request LLM response
+         response = llm_context.request(
+           input=fixed_messages + conversation_history,
          )

-         if is_complete:
-           # All nodes locked, fill successful
-           fill._fill_submitted_texts(  # pylint: disable=protected-access
-             generated_ids_stack=[],
-             element=validated_element,
+         try:
+           # Extract XML from response
+           validated_element = _extract_xml_element(response)
+
+           # Validate with progressive locking
+           is_complete, error_message, newly_locked = validator.validate_with_locking(
+             template_ele=fill.request_element,
+             validated_ele=validated_element,
+             errors_limit=self._max_fill_displaying_errors,
            )
-           return validated_element
-
-         # Not complete yet, construct error message with progress info
-         progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
-         if newly_locked:
-           progress_msg += f", {len(newly_locked)} newly locked this round"
-
-         full_error_message = f"{progress_msg}\n\n{error_message}"
-
-         conversation_history = [
-           Message(role=MessageRole.ASSISTANT, message=response),
-           Message(role=MessageRole.USER, message=full_error_message),
-         ]
-
-       except ValidationError as error:
-         # XML extraction or basic validation failed
-         latest_error = error
-         conversation_history = [
-           Message(role=MessageRole.ASSISTANT, message=response),
-           Message(role=MessageRole.USER, message=str(error)),
-         ]
-
-     message = f"Failed to get valid XML structure after {self._max_retries} attempts"
-     if latest_error is None:
-       raise ValueError(message)
-     else:
-       raise ValueError(message) from latest_error
+
+           if is_complete:
+             # All nodes locked, fill successful
+             fill._fill_submitted_texts(  # pylint: disable=protected-access
+               generated_ids_stack=[],
+               element=validated_element,
+             )
+             return validated_element
+
+           # Not complete yet, construct error message with progress info
+           progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
+           if newly_locked:
+             progress_msg += f", {len(newly_locked)} newly locked this round"
+
+           full_error_message = f"{progress_msg}\n\n{error_message}"
+
+           conversation_history = [
+             Message(role=MessageRole.ASSISTANT, message=response),
+             Message(role=MessageRole.USER, message=full_error_message),
+           ]
+
+         except ValidationError as error:
+           # XML extraction or basic validation failed
+           latest_error = error
+           conversation_history = [
+             Message(role=MessageRole.ASSISTANT, message=response),
+             Message(role=MessageRole.USER, message=str(error)),
+           ]
+
+       message = f"Failed to get valid XML structure after {self._max_retries} attempts"
+       if latest_error is None:
+         raise ValueError(message)
+       else:
+         raise ValueError(message) from latest_error
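
Wrapping the whole retry loop in one `llm.context()` ties the cache lifetime to the fill attempt: every intermediate response is cached under the same context id, committed together once a valid XML structure is returned, and rolled back when the loop exhausts its retries and raises. A condensed sketch of the pattern (with `validate` standing in for the progressive-locking validator):

```python
def request_until_valid(llm, messages, validate, max_retries: int = 5):
  with llm.context() as ctx:  # one context id for the whole retry loop
    for _ in range(max_retries):
      response = ctx.request(input=messages)
      if validate(response):
        return response  # normal exit commits all cached responses
    raise ValueError("no valid response")  # exception rolls the cache back
```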
@@ -1,6 +1,6 @@
  [project]
  name = "epub-translator"
- version = "0.1.0"
+ version = "0.1.1"
  description = "Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text."
  keywords = ["epub", "llm", "translation", "translator"]
  authors = [
@@ -1,176 +0,0 @@
- import io
- import re
- from typing import IO
- from xml.etree.ElementTree import Element, fromstring, tostring
-
- from .xml import iter_with_stack
-
- _COMMON_NAMESPACES = {
-   "http://www.w3.org/1999/xhtml": "xhtml",
-   "http://www.idpf.org/2007/ops": "epub",
-   "http://www.w3.org/1998/Math/MathML": "m",
-   "http://purl.org/dc/elements/1.1/": "dc",
-   "http://www.daisy.org/z3986/2005/ncx/": "ncx",
-   "http://www.idpf.org/2007/opf": "opf",
-   "http://www.w3.org/2000/svg": "svg",
-   "urn:oasis:names:tc:opendocument:xmlns:container": "container",
- }
-
- _ROOT_NAMESPACES = {
-   "http://www.w3.org/1999/xhtml",  # XHTML
-   "http://www.daisy.org/z3986/2005/ncx/",  # NCX
-   "http://www.idpf.org/2007/opf",  # OPF
-   "urn:oasis:names:tc:opendocument:xmlns:container",  # Container
- }
-
- _ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
- _FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
- _NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")
-
- # HTML defines a set of self-closing tags; they must be converted to non-self-closing form, because the EPUB format does not support them
- # https://www.tutorialspoint.com/which-html-tags-are-self-closing
- _EMPTY_TAGS = (
-   "br",
-   "hr",
-   "input",
-   "col",
-   "base",
-   "meta",
-   "area",
- )
-
- _EMPTY_TAG_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/?>")
-
-
- class XMLLikeNode:
-   def __init__(self, file: IO[bytes]) -> None:
-     raw_content = file.read()
-     self._encoding: str = _detect_encoding(raw_content)
-     content = raw_content.decode(self._encoding)
-     self._header, xml_content = _extract_header(content)
-     try:
-       self.element = fromstring(xml_content)
-     except Exception as error:
-       raise ValueError("Failed to parse XML-like content") from error
-     self._namespaces: dict[str, str] = _extract_and_clean_namespaces(self.element)
-
-   @property
-   def encoding(self) -> str:
-     return self._encoding
-
-   @property
-   def namespaces(self) -> list[str]:
-     return list(self._namespaces.keys())
-
-   def save(self, file: IO[bytes], is_html_like: bool = False) -> None:
-     writer = io.TextIOWrapper(file, encoding=self._encoding, write_through=True)
-     try:
-       if self._header:
-         writer.write(self._header)
-
-       content = _serialize_with_namespaces(element=self.element, namespaces=self._namespaces)
-       if is_html_like:
-         content = re.sub(
-           pattern=_EMPTY_TAG_PATTERN,
-           repl=lambda m: f"<{m.group(1)}{m.group(2)}>",
-           string=content,
-         )
-       else:
-         content = re.sub(
-           pattern=_EMPTY_TAG_PATTERN,
-           repl=lambda m: f"<{m.group(1)}{m.group(2)} />",
-           string=content,
-         )
-       writer.write(content)
-
-     finally:
-       writer.detach()
-
-
- def _detect_encoding(raw_content: bytes) -> str:
-   if raw_content.startswith(b"\xef\xbb\xbf"):
-     return "utf-8-sig"
-   elif raw_content.startswith(b"\xff\xfe"):
-     return "utf-16-le"
-   elif raw_content.startswith(b"\xfe\xff"):
-     return "utf-16-be"
-
-   # Try to extract the encoding from the XML declaration: read only the first 1024 bytes to find it
-   header_bytes = raw_content[:1024]
-   for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
-     try:
-       header_str = header_bytes.decode(try_encoding)
-       match = _ENCODING_PATTERN.search(header_str)
-       if match:
-         declared_encoding = match.group(1).lower()
-         try:
-           raw_content.decode(declared_encoding)
-           return declared_encoding
-         except (LookupError, UnicodeDecodeError):
-           pass
-     except UnicodeDecodeError:
-       continue
-
-   try:
-     raw_content.decode("utf-8")
-     return "utf-8"
-   except UnicodeDecodeError:
-     pass
-   return "iso-8859-1"
-
-
- def _extract_header(content: str) -> tuple[str, str]:
-   match = _FIRST_ELEMENT_PATTERN.search(content)
-   if match:
-     split_pos = match.start()
-     header = content[:split_pos]
-     xml_content = content[split_pos:]
-     return header, xml_content
-   return "", content
-
-
- def _extract_and_clean_namespaces(element: Element):
-   namespaces: dict[str, str] = {}
-   for _, elem in iter_with_stack(element):
-     match = _NAMESPACE_IN_TAG.match(elem.tag)
-     if match:
-       namespace_uri = match.group(1)
-       if namespace_uri not in namespaces:
-         prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(namespaces)}")
-         namespaces[namespace_uri] = prefix
-
-       tag_name = elem.tag[len(match.group(0)) :]
-       elem.tag = tag_name
-
-     for attr_key in list(elem.attrib.keys()):
-       match = _NAMESPACE_IN_TAG.match(attr_key)
-       if match:
-         namespace_uri = match.group(1)
-         if namespace_uri not in namespaces:
-           prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(namespaces)}")
-           namespaces[namespace_uri] = prefix
-
-         attr_name = attr_key[len(match.group(0)) :]
-         attr_value = elem.attrib.pop(attr_key)
-         elem.attrib[attr_name] = attr_value
-   return namespaces
-
-
- def _serialize_with_namespaces(
-   element: Element,
-   namespaces: dict[str, str],
- ) -> str:
-   for namespace_uri, prefix in namespaces.items():
-     if namespace_uri in _ROOT_NAMESPACES:
-       element.attrib["xmlns"] = namespace_uri
-     else:
-       element.attrib[f"xmlns:{prefix}"] = namespace_uri
-   xml_string = tostring(element, encoding="unicode")
-   for namespace_uri, prefix in namespaces.items():
-     if namespace_uri in _ROOT_NAMESPACES:
-       xml_string = xml_string.replace(f"{{{namespace_uri}}}", "")
-     else:
-       xml_string = xml_string.replace(f"{{{namespace_uri}}}", f"{prefix}:")
-       pattern = r'\s+xmlns:(ns\d+)="' + re.escape(namespace_uri) + r'"'
-       xml_string = re.sub(pattern, "", xml_string)
-   return xml_string