epub-translator 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff compares the contents of two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
--- a/epub_translator/llm/core.py
+++ b/epub_translator/llm/core.py
@@ -1,11 +1,13 @@
 import datetime
 import hashlib
 import json
+import uuid
 from collections.abc import Callable, Generator
 from importlib.resources import files
 from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
 from os import PathLike
 from pathlib import Path
+from typing import Self
 
 from jinja2 import Environment, Template
 from tiktoken import Encoding, get_encoding
@@ -16,6 +18,89 @@ from .increasable import Increasable
 from .types import Message, MessageRole, R
 
 
+class LLMContext:
+  """Context manager for LLM requests with transactional caching."""
+
+  def __init__(
+    self,
+    executor: LLMExecutor,
+    cache_path: Path | None,
+  ) -> None:
+    self._executor = executor
+    self._cache_path = cache_path
+    self._context_id = uuid.uuid4().hex[:12]
+    self._temp_files: list[Path] = []
+
+  def __enter__(self) -> Self:
+    return self
+
+  def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+    if exc_type is None:
+      # Success: commit all temporary cache files
+      self._commit()
+    else:
+      # Failure: rollback (delete) all temporary cache files
+      self._rollback()
+
+  def request(
+    self,
+    input: str | list[Message],
+    parser: Callable[[str], R] = lambda x: x,
+    max_tokens: int | None = None,
+  ) -> R:
+    messages: list[Message]
+    if isinstance(input, str):
+      messages = [Message(role=MessageRole.USER, message=input)]
+    else:
+      messages = input
+
+    cache_key: str | None = None
+    if self._cache_path is not None:
+      cache_key = self._compute_messages_hash(messages)
+      permanent_cache_file = self._cache_path / f"{cache_key}.txt"
+      if permanent_cache_file.exists():
+        cached_content = permanent_cache_file.read_text(encoding="utf-8")
+        return parser(cached_content)
+
+      temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
+      if temp_cache_file.exists():
+        cached_content = temp_cache_file.read_text(encoding="utf-8")
+        return parser(cached_content)
+
+    # Make the actual request
+    response = self._executor.request(
+      messages=messages,
+      parser=lambda x: x,
+      max_tokens=max_tokens,
+    )
+
+    # Save to temporary cache if cache_path is set
+    if self._cache_path is not None and cache_key is not None:
+      temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
+      temp_cache_file.write_text(response, encoding="utf-8")
+      self._temp_files.append(temp_cache_file)
+
+    return parser(response)
+
+  def _compute_messages_hash(self, messages: list[Message]) -> str:
+    messages_dict = [{"role": msg.role.value, "message": msg.message} for msg in messages]
+    messages_json = json.dumps(messages_dict, ensure_ascii=False, sort_keys=True)
+    return hashlib.sha512(messages_json.encode("utf-8")).hexdigest()
+
+  def _commit(self) -> None:
+    for temp_file in self._temp_files:
+      if temp_file.exists():
+        # Remove the .[context-id].txt suffix to get permanent name
+        permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
+        permanent_file = temp_file.parent / permanent_name
+        temp_file.rename(permanent_file)
+
+  def _rollback(self) -> None:
+    for temp_file in self._temp_files:
+      if temp_file.exists():
+        temp_file.unlink()
+
+
 class LLM:
   def __init__(
     self,
@@ -30,7 +115,7 @@ class LLM:
     retry_times: int = 5,
     retry_interval_seconds: float = 6.0,
     log_dir_path: PathLike | None = None,
-  ):
+  ) -> None:
     prompts_path = Path(str(files("epub_translator"))) / "data"
     self._templates: dict[str, Template] = {}
    self._encoding: Encoding = get_encoding(token_encoding)
@@ -68,41 +153,20 @@ class LLM:
   def encoding(self) -> Encoding:
     return self._encoding
 
+  def context(self) -> LLMContext:
+    return LLMContext(
+      executor=self._executor,
+      cache_path=self._cache_path,
+    )
+
   def request(
     self,
     input: str | list[Message],
     parser: Callable[[str], R] = lambda x: x,
     max_tokens: int | None = None,
   ) -> R:
-    messages: list[Message]
-    if isinstance(input, str):
-      messages = [Message(role=MessageRole.USER, message=input)]
-    else:
-      messages = input
-
-    # Check cache if cache_path is set
-    if self._cache_path is not None:
-      cache_key = self._compute_messages_hash(messages)
-      cache_file = self._cache_path / f"{cache_key}.txt"
-
-      if cache_file.exists():
-        cached_content = cache_file.read_text(encoding="utf-8")
-        return parser(cached_content)
-
-    # Make the actual request
-    response = self._executor.request(
-      messages=messages,
-      parser=lambda x: x,
-      max_tokens=max_tokens,
-    )
-
-    # Save to cache if cache_path is set
-    if self._cache_path is not None:
-      cache_key = self._compute_messages_hash(messages)
-      cache_file = self._cache_path / f"{cache_key}.txt"
-      cache_file.write_text(response, encoding="utf-8")
-
-    return parser(response)
+    with self.context() as ctx:
+      return ctx.request(input=input, parser=parser, max_tokens=max_tokens)
 
   def template(self, template_name: str) -> Template:
     template = self._templates.get(template_name, None)
@@ -111,17 +175,11 @@
     self._templates[template_name] = template
     return template
 
-  def _compute_messages_hash(self, messages: list[Message]) -> str:
-    """Compute SHA-512 hash of messages for cache key."""
-    messages_dict = [{"role": msg.role.value, "message": msg.message} for msg in messages]
-    messages_json = json.dumps(messages_dict, ensure_ascii=False, sort_keys=True)
-    return hashlib.sha512(messages_json.encode("utf-8")).hexdigest()
-
   def _create_logger(self) -> Logger | None:
     if self._logger_save_path is None:
       return None
 
-    now = datetime.datetime.now(datetime.timezone.utc)
+    now = datetime.datetime.now(datetime.UTC)
     timestamp = now.strftime("%Y-%m-%d %H-%M-%S %f")
     file_path = self._logger_save_path / f"request {timestamp}.log"
     logger = getLogger(f"LLM Request {timestamp}")
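
Taken together, the core.py changes move caching out of `LLM.request` and into the new `LLMContext`: cache writes become transactional, with temp files named `<sha512>.<context-id>.txt` that are renamed to permanent `<sha512>.txt` entries only when the context exits cleanly. A minimal usage sketch, assuming an `llm` instance configured with a cache directory (the prompt strings are illustrative):

```python
with llm.context() as ctx:
    first = ctx.request("Translate: hello")  # cache miss: response written to a temp file
    again = ctx.request("Translate: hello")  # cache hit on this context's own temp file
# Clean exit commits: temp files are renamed to permanent <sha512>.txt entries.
# Had the block raised, __exit__ would have unlinked every temp file instead.
```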
--- a/epub_translator/translator.py
+++ b/epub_translator/translator.py
@@ -71,7 +71,7 @@ def translate(
     placeholder.recover()
     deduplicate_ids_in_element(xml.element)
     with zip.replace(chapter_path) as target_file:
-      xml.save(target_file, is_html_like=True)
+      xml.save(target_file)
 
     # Update progress after each chapter
     processed_chapters += 1
@@ -198,7 +198,10 @@ def _count_chapters(zip: Zip) -> int:
 def _search_chapter_items(zip: Zip):
   for chapter_path in search_spine_paths(zip):
     with zip.read(chapter_path) as chapter_file:
-      xml = XMLLikeNode(chapter_file)
+      xml = XMLLikeNode(
+        file=chapter_file,
+        is_html_like=chapter_path.suffix.lower() in (".html", ".htm"),
+      )
       body_element = find_first(xml.element, "body")
       if body_element is not None:
         placeholder = Placeholder(body_element)
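
Instead of unconditionally saving every chapter as HTML-like, the caller now decides per spine item whether HTML-style void-tag handling is needed, keyed purely on file suffix. A small illustration of the predicate (file names hypothetical):

```python
from pathlib import Path

for name in ("ch1.xhtml", "ch2.html", "notes.HTM"):
    is_html_like = Path(name).suffix.lower() in (".html", ".htm")
    print(name, is_html_like)  # ch1.xhtml False, ch2.html True, notes.HTM True
```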
--- a/epub_translator/xml/xml_like.py
+++ b/epub_translator/xml/xml_like.py
@@ -1,10 +1,13 @@
 import io
 import re
+import warnings
 from typing import IO
 from xml.etree.ElementTree import Element, fromstring, tostring
 
 from .xml import iter_with_stack
 
+_XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace"
+
 _COMMON_NAMESPACES = {
   "http://www.w3.org/1999/xhtml": "xhtml",
   "http://www.idpf.org/2007/ops": "epub",
@@ -14,6 +17,7 @@ _COMMON_NAMESPACES = {
   "http://www.idpf.org/2007/opf": "opf",
   "http://www.w3.org/2000/svg": "svg",
   "urn:oasis:names:tc:opendocument:xmlns:container": "container",
+  "http://www.w3.org/XML/1998/namespace": "xml",  # Reserved XML namespace
 }
 
 _ROOT_NAMESPACES = {
@@ -27,8 +31,8 @@ _ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNOREC
 _FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
 _NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")
 
-# HTML defines a set of self-closing tags; these must be rewritten as non-self-closing because the EPub format does not support them
-# https://www.tutorialspoint.com/which-html-tags-are-self-closing
+# Some non-standard EPUB generators use HTML-style tags without self-closing syntax
+# We need to convert them to XML-compatible format before parsing
 _EMPTY_TAGS = (
   "br",
   "hr",
@@ -39,20 +43,38 @@ _EMPTY_TAGS = (
   "area",
 )
 
-_EMPTY_TAG_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/?>")
+# For reading: match tags like <br> or <br class="x"> (but not <br/> or <body>)
+_EMPTY_TAG_OPEN_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^/>]*)>")
+
+# For saving: match self-closing tags like <br />
+_EMPTY_TAG_CLOSE_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/>")
 
 
 class XMLLikeNode:
-  def __init__(self, file: IO[bytes]) -> None:
+  def __init__(self, file: IO[bytes], is_html_like: bool = False) -> None:
     raw_content = file.read()
-    self._encoding: str = _detect_encoding(raw_content)
+    self._encoding: str = self._detect_encoding(raw_content)
     content = raw_content.decode(self._encoding)
-    self._header, xml_content = _extract_header(content)
+    self._header, xml_content = self._extract_header(content)
+    self._namespaces: dict[str, str] = {}
+    self._tag_to_namespace: dict[str, str] = {}
+    self._attr_to_namespace: dict[str, str] = {}
+
+    # For non-standard HTML files, convert <br> to <br/> before parsing
+    self._is_html_like = is_html_like
+    if is_html_like:
+      xml_content = re.sub(
+        pattern=_EMPTY_TAG_OPEN_PATTERN,
+        repl=lambda m: f"<{m.group(1)}{m.group(2)} />",
+        string=xml_content,
+      )
+
     try:
-      self.element = fromstring(xml_content)
+      self.element = self._extract_and_clean_namespaces(
+        element=fromstring(xml_content),
+      )
     except Exception as error:
       raise ValueError("Failed to parse XML-like content") from error
-    self._namespaces: dict[str, str] = _extract_and_clean_namespaces(self.element)
 
   @property
   def encoding(self) -> str:
@@ -62,115 +84,148 @@ class XMLLikeNode:
   def namespaces(self) -> list[str]:
     return list(self._namespaces.keys())
 
-  def save(self, file: IO[bytes], is_html_like: bool = False) -> None:
+  def save(self, file: IO[bytes]) -> None:
     writer = io.TextIOWrapper(file, encoding=self._encoding, write_through=True)
     try:
       if self._header:
         writer.write(self._header)
 
-      content = _serialize_with_namespaces(element=self.element, namespaces=self._namespaces)
-      if is_html_like:
+      content = self._serialize_with_namespaces(self.element)
+
+      # For non-standard HTML files, convert back from <br/> to <br>
+      if self._is_html_like:
         content = re.sub(
-          pattern=_EMPTY_TAG_PATTERN,
+          pattern=_EMPTY_TAG_CLOSE_PATTERN,
           repl=lambda m: f"<{m.group(1)}{m.group(2)}>",
           string=content,
         )
-      else:
-        content = re.sub(
-          pattern=_EMPTY_TAG_PATTERN,
-          repl=lambda m: f"<{m.group(1)}{m.group(2)} />",
-          string=content,
-        )
+
       writer.write(content)
 
     finally:
       writer.detach()
 
+  def _detect_encoding(self, raw_content: bytes) -> str:
+    if raw_content.startswith(b"\xef\xbb\xbf"):
+      return "utf-8-sig"
+    elif raw_content.startswith(b"\xff\xfe"):
+      return "utf-16-le"
+    elif raw_content.startswith(b"\xfe\xff"):
+      return "utf-16-be"
+
+    # Try to extract the encoding from the XML declaration: read only the first 1024 bytes to find it
+    header_bytes = raw_content[:1024]
+    for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
+      try:
+        header_str = header_bytes.decode(try_encoding)
+        match = _ENCODING_PATTERN.search(header_str)
+        if match:
+          declared_encoding = match.group(1).lower()
+          try:
+            raw_content.decode(declared_encoding)
+            return declared_encoding
+          except (LookupError, UnicodeDecodeError):
+            pass
+      except UnicodeDecodeError:
+        continue
 
-def _detect_encoding(raw_content: bytes) -> str:
-  if raw_content.startswith(b"\xef\xbb\xbf"):
-    return "utf-8-sig"
-  elif raw_content.startswith(b"\xff\xfe"):
-    return "utf-16-le"
-  elif raw_content.startswith(b"\xfe\xff"):
-    return "utf-16-be"
-
-  # Try to extract the encoding from the XML declaration: read only the first 1024 bytes to find it
-  header_bytes = raw_content[:1024]
-  for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
     try:
-      header_str = header_bytes.decode(try_encoding)
-      match = _ENCODING_PATTERN.search(header_str)
-      if match:
-        declared_encoding = match.group(1).lower()
-        try:
-          raw_content.decode(declared_encoding)
-          return declared_encoding
-        except (LookupError, UnicodeDecodeError):
-          pass
+      raw_content.decode("utf-8")
+      return "utf-8"
     except UnicodeDecodeError:
-      continue
-
-  try:
-    raw_content.decode("utf-8")
-    return "utf-8"
-  except UnicodeDecodeError:
-    pass
-  return "iso-8859-1"
-
-
-def _extract_header(content: str) -> tuple[str, str]:
-  match = _FIRST_ELEMENT_PATTERN.search(content)
-  if match:
-    split_pos = match.start()
-    header = content[:split_pos]
-    xml_content = content[split_pos:]
-    return header, xml_content
-  return "", content
-
-
-def _extract_and_clean_namespaces(element: Element):
-  namespaces: dict[str, str] = {}
-  for _, elem in iter_with_stack(element):
-    match = _NAMESPACE_IN_TAG.match(elem.tag)
-    if match:
-      namespace_uri = match.group(1)
-      if namespace_uri not in namespaces:
-        prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(namespaces)}")
-        namespaces[namespace_uri] = prefix
+      pass
+    return "iso-8859-1"
 
-      tag_name = elem.tag[len(match.group(0)) :]
-      elem.tag = tag_name
-
-    for attr_key in list(elem.attrib.keys()):
-      match = _NAMESPACE_IN_TAG.match(attr_key)
+  def _extract_header(self, content: str) -> tuple[str, str]:
+    match = _FIRST_ELEMENT_PATTERN.search(content)
+    if match:
+      split_pos = match.start()
+      header = content[:split_pos]
+      xml_content = content[split_pos:]
+      return header, xml_content
+    return "", content
+
+  def _extract_and_clean_namespaces(self, element: Element) -> Element:
+    for _, elem in iter_with_stack(element):
+      match = _NAMESPACE_IN_TAG.match(elem.tag)
       if match:
         namespace_uri = match.group(1)
-      if namespace_uri not in namespaces:
-        prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(namespaces)}")
-        namespaces[namespace_uri] = prefix
-
-      attr_name = attr_key[len(match.group(0)) :]
-      attr_value = elem.attrib.pop(attr_key)
-      elem.attrib[attr_name] = attr_value
-  return namespaces
-
-
-def _serialize_with_namespaces(
-  element: Element,
-  namespaces: dict[str, str],
-) -> str:
-  for namespace_uri, prefix in namespaces.items():
-    if namespace_uri in _ROOT_NAMESPACES:
-      element.attrib["xmlns"] = namespace_uri
-    else:
-      element.attrib[f"xmlns:{prefix}"] = namespace_uri
-  xml_string = tostring(element, encoding="unicode")
-  for namespace_uri, prefix in namespaces.items():
-    if namespace_uri in _ROOT_NAMESPACES:
-      xml_string = xml_string.replace(f"{{{namespace_uri}}}", "")
-    else:
-      xml_string = xml_string.replace(f"{{{namespace_uri}}}", f"{prefix}:")
-      pattern = r'\s+xmlns:(ns\d+)="' + re.escape(namespace_uri) + r'"'
-      xml_string = re.sub(pattern, "", xml_string)
-  return xml_string
+        if namespace_uri not in self._namespaces:
+          prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(self._namespaces)}")
+          self._namespaces[namespace_uri] = prefix
+
+        tag_name = elem.tag[len(match.group(0)) :]
+
+        # Record tag -> namespace mapping (warn if conflict)
+        if tag_name in self._tag_to_namespace and self._tag_to_namespace[tag_name] != namespace_uri:
+          warnings.warn(
+            f"Tag '{tag_name}' has multiple namespaces: "
+            f"{self._tag_to_namespace[tag_name]} and {namespace_uri}. "
+            f"Using the first one.",
+            stacklevel=2,
+          )
+        else:
+          self._tag_to_namespace[tag_name] = namespace_uri
+
+        # Clean: remove namespace URI completely
+        elem.tag = tag_name
+
+      for attr_key in list(elem.attrib.keys()):
+        match = _NAMESPACE_IN_TAG.match(attr_key)
+        if match:
+          namespace_uri = match.group(1)
+          if namespace_uri not in self._namespaces:
+            prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(self._namespaces)}")
+            self._namespaces[namespace_uri] = prefix
+
+          attr_name = attr_key[len(match.group(0)) :]
+          attr_value = elem.attrib.pop(attr_key)
+
+          # Record attr -> namespace mapping (warn if conflict)
+          if attr_name in self._attr_to_namespace and self._attr_to_namespace[attr_name] != namespace_uri:
+            warnings.warn(
+              f"Attribute '{attr_name}' has multiple namespaces: "
+              f"{self._attr_to_namespace[attr_name]} and {namespace_uri}. "
+              f"Using the first one.",
+              stacklevel=2,
+            )
+          else:
+            self._attr_to_namespace[attr_name] = namespace_uri
+
+          # Clean: remove namespace URI completely
+          elem.attrib[attr_name] = attr_value
+    return element
+
+  def _serialize_with_namespaces(self, element: Element) -> str:
+    # First, add namespace declarations to root element (before serialization)
+    for namespace_uri, prefix in self._namespaces.items():
+      # Skip the reserved xml namespace - it's implicit
+      if namespace_uri == _XML_NAMESPACE_URI:
+        continue
+      if namespace_uri in _ROOT_NAMESPACES:
+        element.attrib["xmlns"] = namespace_uri
+      else:
+        element.attrib[f"xmlns:{prefix}"] = namespace_uri
+
+    # Serialize the element tree as-is (tags are simple names without prefixes)
+    xml_string = tostring(element, encoding="unicode")
+
+    # Now restore namespace prefixes in the serialized string
+    # For each tag that should have a namespace prefix, wrap it with the prefix
+    for tag_name, namespace_uri in self._tag_to_namespace.items():
+      if namespace_uri not in _ROOT_NAMESPACES:
+        # Get the prefix for this namespace
+        prefix = self._namespaces[namespace_uri]
+        # Replace opening and closing tags
+        xml_string = xml_string.replace(f"<{tag_name} ", f"<{prefix}:{tag_name} ")
+        xml_string = xml_string.replace(f"<{tag_name}>", f"<{prefix}:{tag_name}>")
+        xml_string = xml_string.replace(f"</{tag_name}>", f"</{prefix}:{tag_name}>")
+        xml_string = xml_string.replace(f"<{tag_name}/>", f"<{prefix}:{tag_name}/>")
+
+    # Similarly for attributes (though less common in EPUB)
+    for attr_name, namespace_uri in self._attr_to_namespace.items():
+      if namespace_uri not in _ROOT_NAMESPACES:
+        prefix = self._namespaces[namespace_uri]
+        xml_string = xml_string.replace(f' {attr_name}="', f' {prefix}:{attr_name}="')
+
+    return xml_string
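
The single read/write regex from 0.1.0 is now split into a directed pair: `_EMPTY_TAG_OPEN_PATTERN` normalizes HTML void tags into XML on input, and `_EMPTY_TAG_CLOSE_PATTERN` reverses the conversion on output. A self-contained round-trip sketch of that behavior (tag list abbreviated):

```python
import re

_EMPTY_TAGS = ("br", "hr", "img")  # abbreviated for illustration
OPEN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^/>]*)>")
CLOSE = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/>")

html = '<p>one<br class="x">two</p>'
xml = OPEN.sub(lambda m: f"<{m.group(1)}{m.group(2)} />", html)
assert xml == '<p>one<br class="x" />two</p>'  # now parseable by ElementTree
back = CLOSE.sub(lambda m: f"<{m.group(1)}{m.group(2)}>", xml)
assert back == html                             # HTML form restored on save
```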
--- a/epub_translator/xml_translator/translator.py
+++ b/epub_translator/xml_translator/translator.py
@@ -126,53 +126,54 @@ class XMLTranslator:
     conversation_history: list[Message] = []
     latest_error: ValidationError | None = None
 
-    for _ in range(self._max_retries):
-      # Request LLM response
-      response = self._llm.request(
-        input=fixed_messages + conversation_history,
-      )
-
-      try:
-        # Extract XML from response
-        validated_element = _extract_xml_element(response)
-
-        # Validate with progressive locking
-        is_complete, error_message, newly_locked = validator.validate_with_locking(
-          template_ele=fill.request_element,
-          validated_ele=validated_element,
-          errors_limit=self._max_fill_displaying_errors,
+    with self._llm.context() as llm_context:
+      for _ in range(self._max_retries):
+        # Request LLM response
+        response = llm_context.request(
+          input=fixed_messages + conversation_history,
         )
 
-        if is_complete:
-          # All nodes locked, fill successful
-          fill._fill_submitted_texts(  # pylint: disable=protected-access
-            generated_ids_stack=[],
-            element=validated_element,
+        try:
+          # Extract XML from response
+          validated_element = _extract_xml_element(response)
+
+          # Validate with progressive locking
+          is_complete, error_message, newly_locked = validator.validate_with_locking(
+            template_ele=fill.request_element,
+            validated_ele=validated_element,
+            errors_limit=self._max_fill_displaying_errors,
           )
-          return validated_element
-
-        # Not complete yet, construct error message with progress info
-        progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
-        if newly_locked:
-          progress_msg += f", {len(newly_locked)} newly locked this round"
-
-        full_error_message = f"{progress_msg}\n\n{error_message}"
-
-        conversation_history = [
-          Message(role=MessageRole.ASSISTANT, message=response),
-          Message(role=MessageRole.USER, message=full_error_message),
-        ]
-
-      except ValidationError as error:
-        # XML extraction or basic validation failed
-        latest_error = error
-        conversation_history = [
-          Message(role=MessageRole.ASSISTANT, message=response),
-          Message(role=MessageRole.USER, message=str(error)),
-        ]
-
-    message = f"Failed to get valid XML structure after {self._max_retries} attempts"
-    if latest_error is None:
-      raise ValueError(message)
-    else:
-      raise ValueError(message) from latest_error
+
+          if is_complete:
+            # All nodes locked, fill successful
+            fill._fill_submitted_texts(  # pylint: disable=protected-access
+              generated_ids_stack=[],
+              element=validated_element,
+            )
+            return validated_element
+
+          # Not complete yet, construct error message with progress info
+          progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
+          if newly_locked:
+            progress_msg += f", {len(newly_locked)} newly locked this round"
+
+          full_error_message = f"{progress_msg}\n\n{error_message}"
+
+          conversation_history = [
+            Message(role=MessageRole.ASSISTANT, message=response),
+            Message(role=MessageRole.USER, message=full_error_message),
+          ]
+
+        except ValidationError as error:
+          # XML extraction or basic validation failed
+          latest_error = error
+          conversation_history = [
+            Message(role=MessageRole.ASSISTANT, message=response),
+            Message(role=MessageRole.USER, message=str(error)),
+          ]
+
+      message = f"Failed to get valid XML structure after {self._max_retries} attempts"
+      if latest_error is None:
+        raise ValueError(message)
+      else:
+        raise ValueError(message) from latest_error
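
Wrapping the retry loop in a single `llm.context()` ties the cache lifetime to the whole fill attempt: responses from intermediate rounds are cached only within that context, committed once a round finally validates, and discarded when every retry fails. The shape of the pattern, reduced to a sketch (`validate`, `messages`, and `max_retries` are stand-ins):

```python
with llm.context() as ctx:
    for _ in range(max_retries):
        response = ctx.request(messages)
        try:
            return validate(response)  # returning exits the context cleanly: caches commit
        except ValidationError:
            continue                   # the retry reuses the same context (and its temp cache)
    raise ValueError("no valid response")  # raising inside the context rolls the caches back
```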
--- a/epub_translator-0.1.0.dist-info/METADATA
+++ b/epub_translator-0.1.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-translator
-Version: 0.1.0
+Version: 0.1.1
 Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
 License: MIT
 Keywords: epub,llm,translation,translator
--- a/epub_translator-0.1.0.dist-info/RECORD
+++ b/epub_translator-0.1.1.dist-info/RECORD
@@ -19,7 +19,7 @@ epub_translator/epub/zip.py,sha256=CUE50LrrVNeQVecNm2ZFionJz4k_vMTXTi8an7BiQ_c,2
 epub_translator/iter_sync.py,sha256=56m-bRPqc731alGenqLvCIM99J8NzNuie86FDGtJj8k,588
 epub_translator/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
 epub_translator/llm/__init__.py,sha256=QcAuTQpH0T7IMf-J3bRdtf8Tvyu6Z2CAe-wSzLJRLLw,43
-epub_translator/llm/core.py,sha256=YSc89c3BRFjJ7bDzrbfVBe1fX-AeBm9EJVcl2ihF_14,6155
+epub_translator/llm/core.py,sha256=nRNAVDQD7kxSl2EN7m5OQ7CvlBL4ENbzQThUcJSzMsk,8123
 epub_translator/llm/error.py,sha256=4efAIQL14DFSvAnSTUfgdAbZRqaWBqOfUGsSfvxa5zM,1503
 epub_translator/llm/executor.py,sha256=Oax07rwivDbB0T3i_waLAvXvfQoR9dnWPTvw475C9vQ,6081
 epub_translator/llm/increasable.py,sha256=vQka-bysKuFR-Vu-GziGZfQCasLn9q2GxGEoV2RiCec,1289
@@ -29,7 +29,7 @@ epub_translator/serial/chunk.py,sha256=FrTaHikVOd6bLYumnEriTaAQ_DIDLjHm16gh-wBVR
 epub_translator/serial/segment.py,sha256=uEz-ke1KcYrON-68FaUEzMG2CzHlMjvbC11F3ZT4yH0,446
 epub_translator/serial/splitter.py,sha256=Nq0sxPXos8ez7QBG01sOKjnYKbeBWUBHflZGtqenVm8,1726
 epub_translator/template.py,sha256=0CqRmj3nTtPshw0NmTr2ECqelops2MMyX94fMrE-HKs,1587
-epub_translator/translator.py,sha256=e7rnMWXi4Ctkr10gg_oGDHYGAhc1Ofk8YXyz0xdmBjM,6880
+epub_translator/translator.py,sha256=vEccCEFc-mArX4DzvUz09W_WFOxUv6dlQkwWDkbbVFs,6976
 epub_translator/utils.py,sha256=7lBWHNyv4GQiutqqqUhbAxc8gqVIkhS7B4rkL1EKOFs,144
 epub_translator/xml/__init__.py,sha256=te8vIRgG-2n1fEcTmNzCLc-WH9G0JUr_lJncJQvRbgw,96
 epub_translator/xml/deduplication.py,sha256=Vc7BtXXnAMQHNtE--o2Qkm_sYrjnJSh33reKFh9YUjo,1143
@@ -40,7 +40,7 @@ epub_translator/xml/firendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42N
 epub_translator/xml/firendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
 epub_translator/xml/firendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
 epub_translator/xml/xml.py,sha256=7NPinMOFGBeOHCG-hw0iQjL-p-_I4DmYL8lq0Ar8rag,1498
-epub_translator/xml/xml_like.py,sha256=UVx-Dvs4dXOnBcLX_qLOwyedj6fisZrrl1N_IibYHX8,6181
+epub_translator/xml/xml_like.py,sha256=tgzqDQFfql9-QMSRbLf9SVlNsvyZXJTCEWmksxd3TuI,9489
 epub_translator/xml_translator/__init__.py,sha256=yNgwIermFXaRfAfnqXaNFCEf5I95cBVUDxha-6xkLq0,117
 epub_translator/xml_translator/const.py,sha256=Q9pmLplUR71TqF4MN5oLtPNl_pBRWoOJwsC5eIQOOWE,57
 epub_translator/xml_translator/fill.py,sha256=LxkPxlfbDDB3gP1rciXEBFyi1QRj5vXWzdca5SBcd5o,4839
@@ -50,9 +50,9 @@ epub_translator/xml_translator/group.py,sha256=2GxJl3RojyHyMuTZ5cn5PITT-F2fdaBlv
 epub_translator/xml_translator/progressive_locking.py,sha256=2eoCzVNeV4e4TziYTk4UgKmBUGuFQFj7X24ejO75lUA,9613
 epub_translator/xml_translator/submitter.py,sha256=bIoxhUIDMScgnxnqfCKR8d3u1DaISXqIM2WuHzrNU7M,4022
 epub_translator/xml_translator/text_segment.py,sha256=Aue5XHKYKzTuinFExcdu0CqGY5TiuJoIIhbP9t5ubPg,7673
-epub_translator/xml_translator/translator.py,sha256=ZQlcVbF7kLbsi1UHwdwh7H-yO4p6B3heTzOlDhA0Lrg,6844
+epub_translator/xml_translator/translator.py,sha256=FGSXo2UWtcoIOWGzkI4emyqp1Q2Z8EoOBCBmdtty18A,7063
 epub_translator/xml_translator/utils.py,sha256=AIJOcB7Btad0yxxLwD3UC9NTk2gOPEM8qqx7sNO6tDc,626
-epub_translator-0.1.0.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
-epub_translator-0.1.0.dist-info/METADATA,sha256=JEUwAD6kLkFbFZ-2GKaNK5f_U3prxY65k5s7DCOK7W4,9655
-epub_translator-0.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-epub_translator-0.1.0.dist-info/RECORD,,
+epub_translator-0.1.1.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
+epub_translator-0.1.1.dist-info/METADATA,sha256=BJDV44wO93Nw7e1hqBV33HXK8KUa_JO2XJ1qQ22RGmc,9655
+epub_translator-0.1.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+epub_translator-0.1.1.dist-info/RECORD,,