epub-translator 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/llm/core.py +95 -37
- epub_translator/translator.py +5 -2
- epub_translator/xml/xml_like.py +155 -100
- epub_translator/xml_translator/translator.py +48 -47
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.1.dist-info}/METADATA +1 -1
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.1.dist-info}/RECORD +8 -8
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.1.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.1.dist-info}/WHEEL +0 -0
epub_translator/llm/core.py
CHANGED

@@ -1,11 +1,13 @@
 import datetime
 import hashlib
 import json
+import uuid
 from collections.abc import Callable, Generator
 from importlib.resources import files
 from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
 from os import PathLike
 from pathlib import Path
+from typing import Self

 from jinja2 import Environment, Template
 from tiktoken import Encoding, get_encoding
@@ -16,6 +18,89 @@ from .increasable import Increasable
 from .types import Message, MessageRole, R


+class LLMContext:
+    """Context manager for LLM requests with transactional caching."""
+
+    def __init__(
+        self,
+        executor: LLMExecutor,
+        cache_path: Path | None,
+    ) -> None:
+        self._executor = executor
+        self._cache_path = cache_path
+        self._context_id = uuid.uuid4().hex[:12]
+        self._temp_files: list[Path] = []
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if exc_type is None:
+            # Success: commit all temporary cache files
+            self._commit()
+        else:
+            # Failure: rollback (delete) all temporary cache files
+            self._rollback()
+
+    def request(
+        self,
+        input: str | list[Message],
+        parser: Callable[[str], R] = lambda x: x,
+        max_tokens: int | None = None,
+    ) -> R:
+        messages: list[Message]
+        if isinstance(input, str):
+            messages = [Message(role=MessageRole.USER, message=input)]
+        else:
+            messages = input
+
+        cache_key: str | None = None
+        if self._cache_path is not None:
+            cache_key = self._compute_messages_hash(messages)
+            permanent_cache_file = self._cache_path / f"{cache_key}.txt"
+            if permanent_cache_file.exists():
+                cached_content = permanent_cache_file.read_text(encoding="utf-8")
+                return parser(cached_content)
+
+            temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
+            if temp_cache_file.exists():
+                cached_content = temp_cache_file.read_text(encoding="utf-8")
+                return parser(cached_content)
+
+        # Make the actual request
+        response = self._executor.request(
+            messages=messages,
+            parser=lambda x: x,
+            max_tokens=max_tokens,
+        )
+
+        # Save to temporary cache if cache_path is set
+        if self._cache_path is not None and cache_key is not None:
+            temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
+            temp_cache_file.write_text(response, encoding="utf-8")
+            self._temp_files.append(temp_cache_file)
+
+        return parser(response)
+
+    def _compute_messages_hash(self, messages: list[Message]) -> str:
+        messages_dict = [{"role": msg.role.value, "message": msg.message} for msg in messages]
+        messages_json = json.dumps(messages_dict, ensure_ascii=False, sort_keys=True)
+        return hashlib.sha512(messages_json.encode("utf-8")).hexdigest()
+
+    def _commit(self) -> None:
+        for temp_file in self._temp_files:
+            if temp_file.exists():
+                # Remove the .[context-id].txt suffix to get permanent name
+                permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
+                permanent_file = temp_file.parent / permanent_name
+                temp_file.rename(permanent_file)
+
+    def _rollback(self) -> None:
+        for temp_file in self._temp_files:
+            if temp_file.exists():
+                temp_file.unlink()
+
+
 class LLM:
     def __init__(
         self,
@@ -30,7 +115,7 @@ class LLM:
         retry_times: int = 5,
         retry_interval_seconds: float = 6.0,
         log_dir_path: PathLike | None = None,
-    ):
+    ) -> None:
         prompts_path = Path(str(files("epub_translator"))) / "data"
         self._templates: dict[str, Template] = {}
         self._encoding: Encoding = get_encoding(token_encoding)
@@ -68,41 +153,20 @@ class LLM:
     def encoding(self) -> Encoding:
         return self._encoding

+    def context(self) -> LLMContext:
+        return LLMContext(
+            executor=self._executor,
+            cache_path=self._cache_path,
+        )
+
     def request(
         self,
         input: str | list[Message],
         parser: Callable[[str], R] = lambda x: x,
         max_tokens: int | None = None,
     ) -> R:
-        messages: list[Message]
-        if isinstance(input, str):
-            messages = [Message(role=MessageRole.USER, message=input)]
-        else:
-            messages = input
-
-        # Check cache if cache_path is set
-        if self._cache_path is not None:
-            cache_key = self._compute_messages_hash(messages)
-            cache_file = self._cache_path / f"{cache_key}.txt"
-
-            if cache_file.exists():
-                cached_content = cache_file.read_text(encoding="utf-8")
-                return parser(cached_content)
-
-        # Make the actual request
-        response = self._executor.request(
-            messages=messages,
-            parser=lambda x: x,
-            max_tokens=max_tokens,
-        )
-
-        # Save to cache if cache_path is set
-        if self._cache_path is not None:
-            cache_key = self._compute_messages_hash(messages)
-            cache_file = self._cache_path / f"{cache_key}.txt"
-            cache_file.write_text(response, encoding="utf-8")
-
-        return parser(response)
+        with self.context() as ctx:
+            return ctx.request(input=input, parser=parser, max_tokens=max_tokens)

     def template(self, template_name: str) -> Template:
         template = self._templates.get(template_name, None)
@@ -111,17 +175,11 @@ class LLM:
         self._templates[template_name] = template
         return template

-    def _compute_messages_hash(self, messages: list[Message]) -> str:
-        """Compute SHA-512 hash of messages for cache key."""
-        messages_dict = [{"role": msg.role.value, "message": msg.message} for msg in messages]
-        messages_json = json.dumps(messages_dict, ensure_ascii=False, sort_keys=True)
-        return hashlib.sha512(messages_json.encode("utf-8")).hexdigest()
-
     def _create_logger(self) -> Logger | None:
         if self._logger_save_path is None:
             return None

-        now = datetime.datetime.now(datetime.
+        now = datetime.datetime.now(datetime.UTC)
         timestamp = now.strftime("%Y-%m-%d %H-%M-%S %f")
         file_path = self._logger_save_path / f"request {timestamp}.log"
         logger = getLogger(f"LLM Request {timestamp}")
epub_translator/translator.py
CHANGED

@@ -71,7 +71,7 @@ def translate(
             placeholder.recover()
             deduplicate_ids_in_element(xml.element)
             with zip.replace(chapter_path) as target_file:
-                xml.save(target_file
+                xml.save(target_file)

             # Update progress after each chapter
             processed_chapters += 1
@@ -198,7 +198,10 @@ def _count_chapters(zip: Zip) -> int:
 def _search_chapter_items(zip: Zip):
     for chapter_path in search_spine_paths(zip):
         with zip.read(chapter_path) as chapter_file:
-            xml = XMLLikeNode(
+            xml = XMLLikeNode(
+                file=chapter_file,
+                is_html_like=chapter_path.suffix.lower() in (".html", ".htm"),
+            )
             body_element = find_first(xml.element, "body")
             if body_element is not None:
                 placeholder = Placeholder(body_element)
epub_translator/xml/xml_like.py
CHANGED

@@ -1,10 +1,13 @@
 import io
 import re
+import warnings
 from typing import IO
 from xml.etree.ElementTree import Element, fromstring, tostring

 from .xml import iter_with_stack

+_XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace"
+
 _COMMON_NAMESPACES = {
     "http://www.w3.org/1999/xhtml": "xhtml",
     "http://www.idpf.org/2007/ops": "epub",
@@ -14,6 +17,7 @@ _COMMON_NAMESPACES = {
     "http://www.idpf.org/2007/opf": "opf",
     "http://www.w3.org/2000/svg": "svg",
     "urn:oasis:names:tc:opendocument:xmlns:container": "container",
+    "http://www.w3.org/XML/1998/namespace": "xml",  # Reserved XML namespace
 }

 _ROOT_NAMESPACES = {
@@ -27,8 +31,8 @@ _ENCODING_PATTERN = re.compile(r'encoding\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
 _FIRST_ELEMENT_PATTERN = re.compile(r"<(?![?!])[a-zA-Z]")
 _NAMESPACE_IN_TAG = re.compile(r"\{([^}]+)\}")

-# HTML
-#
+# Some non-standard EPUB generators use HTML-style tags without self-closing syntax
+# We need to convert them to XML-compatible format before parsing
 _EMPTY_TAGS = (
     "br",
     "hr",
@@ -39,20 +43,38 @@ _EMPTY_TAGS = (
     "area",
 )

-
+# For reading: match tags like <br> or <br class="x"> (but not <br/> or <body>)
+_EMPTY_TAG_OPEN_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^/>]*)>")
+
+# For saving: match self-closing tags like <br />
+_EMPTY_TAG_CLOSE_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/>")


 class XMLLikeNode:
-    def __init__(self, file: IO[bytes]) -> None:
+    def __init__(self, file: IO[bytes], is_html_like: bool = False) -> None:
         raw_content = file.read()
-        self._encoding: str = _detect_encoding(raw_content)
+        self._encoding: str = self._detect_encoding(raw_content)
         content = raw_content.decode(self._encoding)
-        self._header, xml_content = _extract_header(content)
+        self._header, xml_content = self._extract_header(content)
+        self._namespaces: dict[str, str] = {}
+        self._tag_to_namespace: dict[str, str] = {}
+        self._attr_to_namespace: dict[str, str] = {}
+
+        # For non-standard HTML files, convert <br> to <br/> before parsing
+        self._is_html_like = is_html_like
+        if is_html_like:
+            xml_content = re.sub(
+                pattern=_EMPTY_TAG_OPEN_PATTERN,
+                repl=lambda m: f"<{m.group(1)}{m.group(2)} />",
+                string=xml_content,
+            )
+
         try:
-            self.element =
+            self.element = self._extract_and_clean_namespaces(
+                element=fromstring(xml_content),
+            )
         except Exception as error:
             raise ValueError("Failed to parse XML-like content") from error
-        self._namespaces: dict[str, str] = _extract_and_clean_namespaces(self.element)

     @property
     def encoding(self) -> str:
@@ -62,115 +84,148 @@ class XMLLikeNode:
     def namespaces(self) -> list[str]:
         return list(self._namespaces.keys())

-    def save(self, file: IO[bytes]
+    def save(self, file: IO[bytes]) -> None:
         writer = io.TextIOWrapper(file, encoding=self._encoding, write_through=True)
         try:
             if self._header:
                 writer.write(self._header)

-            content = _serialize_with_namespaces(
-
+            content = self._serialize_with_namespaces(self.element)
+
+            # For non-standard HTML files, convert back from <br/> to <br>
+            if self._is_html_like:
                 content = re.sub(
-                pattern=
+                    pattern=_EMPTY_TAG_CLOSE_PATTERN,
                     repl=lambda m: f"<{m.group(1)}{m.group(2)}>",
                     string=content,
                 )
-
-            content = re.sub(
-                pattern=_EMPTY_TAG_PATTERN,
-                repl=lambda m: f"<{m.group(1)}{m.group(2)} />",
-                string=content,
-            )
+
             writer.write(content)

         finally:
             writer.detach()

+    def _detect_encoding(self, raw_content: bytes) -> str:
+        if raw_content.startswith(b"\xef\xbb\xbf"):
+            return "utf-8-sig"
+        elif raw_content.startswith(b"\xff\xfe"):
+            return "utf-16-le"
+        elif raw_content.startswith(b"\xfe\xff"):
+            return "utf-16-be"
+
+        # Try to extract the encoding from the XML declaration: read only the first 1024 bytes to find it
+        header_bytes = raw_content[:1024]
+        for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
+            try:
+                header_str = header_bytes.decode(try_encoding)
+                match = _ENCODING_PATTERN.search(header_str)
+                if match:
+                    declared_encoding = match.group(1).lower()
+                    try:
+                        raw_content.decode(declared_encoding)
+                        return declared_encoding
+                    except (LookupError, UnicodeDecodeError):
+                        pass
+            except UnicodeDecodeError:
+                continue

-def _detect_encoding(raw_content: bytes) -> str:
-    if raw_content.startswith(b"\xef\xbb\xbf"):
-        return "utf-8-sig"
-    elif raw_content.startswith(b"\xff\xfe"):
-        return "utf-16-le"
-    elif raw_content.startswith(b"\xfe\xff"):
-        return "utf-16-be"
-
-    # Try to extract the encoding from the XML declaration: read only the first 1024 bytes to find it
-    header_bytes = raw_content[:1024]
-    for try_encoding in ("utf-8", "utf-16-le", "utf-16-be", "iso-8859-1"):
         try:
-            header_str = header_bytes.decode(try_encoding)
-            match = _ENCODING_PATTERN.search(header_str)
-            if match:
-                declared_encoding = match.group(1).lower()
-                try:
-                    raw_content.decode(declared_encoding)
-                    return declared_encoding
-                except (LookupError, UnicodeDecodeError):
-                    pass
+            raw_content.decode("utf-8")
+            return "utf-8"
         except UnicodeDecodeError:
-            continue
-
-    try:
-        raw_content.decode("utf-8")
-        return "utf-8"
-    except UnicodeDecodeError:
-        pass
-    return "iso-8859-1"
-
-
-def _extract_header(content: str) -> tuple[str, str]:
-    match = _FIRST_ELEMENT_PATTERN.search(content)
-    if match:
-        split_pos = match.start()
-        header = content[:split_pos]
-        xml_content = content[split_pos:]
-        return header, xml_content
-    return "", content
-
-
-def _extract_and_clean_namespaces(element: Element):
-    namespaces: dict[str, str] = {}
-    for _, elem in iter_with_stack(element):
-        match = _NAMESPACE_IN_TAG.match(elem.tag)
-        if match:
-            namespace_uri = match.group(1)
-            if namespace_uri not in namespaces:
-                prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(namespaces)}")
-                namespaces[namespace_uri] = prefix
+            pass
+        return "iso-8859-1"

-
-
-
-
-
+    def _extract_header(self, content: str) -> tuple[str, str]:
+        match = _FIRST_ELEMENT_PATTERN.search(content)
+        if match:
+            split_pos = match.start()
+            header = content[:split_pos]
+            xml_content = content[split_pos:]
+            return header, xml_content
+        return "", content
+
+    def _extract_and_clean_namespaces(self, element: Element) -> Element:
+        for _, elem in iter_with_stack(element):
+            match = _NAMESPACE_IN_TAG.match(elem.tag)
             if match:
                 namespace_uri = match.group(1)
-            if namespace_uri not in
-            prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if namespace_uri not in self._namespaces:
+                    prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(self._namespaces)}")
+                    self._namespaces[namespace_uri] = prefix
+
+                tag_name = elem.tag[len(match.group(0)) :]
+
+                # Record tag -> namespace mapping (warn if conflict)
+                if tag_name in self._tag_to_namespace and self._tag_to_namespace[tag_name] != namespace_uri:
+                    warnings.warn(
+                        f"Tag '{tag_name}' has multiple namespaces: "
+                        f"{self._tag_to_namespace[tag_name]} and {namespace_uri}. "
+                        f"Using the first one.",
+                        stacklevel=2,
+                    )
+                else:
+                    self._tag_to_namespace[tag_name] = namespace_uri
+
+                # Clean: remove namespace URI completely
+                elem.tag = tag_name
+
+            for attr_key in list(elem.attrib.keys()):
+                match = _NAMESPACE_IN_TAG.match(attr_key)
+                if match:
+                    namespace_uri = match.group(1)
+                    if namespace_uri not in self._namespaces:
+                        prefix = _COMMON_NAMESPACES.get(namespace_uri, f"ns{len(self._namespaces)}")
+                        self._namespaces[namespace_uri] = prefix
+
+                    attr_name = attr_key[len(match.group(0)) :]
+                    attr_value = elem.attrib.pop(attr_key)
+
+                    # Record attr -> namespace mapping (warn if conflict)
+                    if attr_name in self._attr_to_namespace and self._attr_to_namespace[attr_name] != namespace_uri:
+                        warnings.warn(
+                            f"Attribute '{attr_name}' has multiple namespaces: "
+                            f"{self._attr_to_namespace[attr_name]} and {namespace_uri}. "
+                            f"Using the first one.",
+                            stacklevel=2,
+                        )
+                    else:
+                        self._attr_to_namespace[attr_name] = namespace_uri
+
+                    # Clean: remove namespace URI completely
+                    elem.attrib[attr_name] = attr_value
+        return element
+
+    def _serialize_with_namespaces(self, element: Element) -> str:
+        # First, add namespace declarations to root element (before serialization)
+        for namespace_uri, prefix in self._namespaces.items():
+            # Skip the reserved xml namespace - it's implicit
+            if namespace_uri == _XML_NAMESPACE_URI:
+                continue
+            if namespace_uri in _ROOT_NAMESPACES:
+                element.attrib["xmlns"] = namespace_uri
+            else:
+                element.attrib[f"xmlns:{prefix}"] = namespace_uri
+
+        # Serialize the element tree as-is (tags are simple names without prefixes)
+        xml_string = tostring(element, encoding="unicode")
+
+        # Now restore namespace prefixes in the serialized string
+        # For each tag that should have a namespace prefix, wrap it with the prefix
+        for tag_name, namespace_uri in self._tag_to_namespace.items():
+            if namespace_uri not in _ROOT_NAMESPACES:
+                # Get the prefix for this namespace
+                prefix = self._namespaces[namespace_uri]
+                # Replace opening and closing tags
+                xml_string = xml_string.replace(f"<{tag_name} ", f"<{prefix}:{tag_name} ")
+                xml_string = xml_string.replace(f"<{tag_name}>", f"<{prefix}:{tag_name}>")
+                xml_string = xml_string.replace(f"</{tag_name}>", f"</{prefix}:{tag_name}>")
+                xml_string = xml_string.replace(f"<{tag_name}/>", f"<{prefix}:{tag_name}/>")
+
+        # Similarly for attributes (though less common in EPUB)
+        for attr_name, namespace_uri in self._attr_to_namespace.items():
+            if namespace_uri not in _ROOT_NAMESPACES:
+                prefix = self._namespaces[namespace_uri]
+                xml_string = xml_string.replace(f' {attr_name}="', f' {prefix}:{attr_name}="')
+
+        return xml_string
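
The two new module-level patterns are easy to sanity-check in isolation. A minimal round-trip using an abbreviated tag list (the wheel's _EMPTY_TAGS tuple contains more entries than the three shown here):

    import re

    _EMPTY_TAGS = ("br", "hr", "area")  # abbreviated for the demo
    _EMPTY_TAG_OPEN_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^/>]*)>")
    _EMPTY_TAG_CLOSE_PATTERN = re.compile(r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/>")

    html = '<p>one<br class="x">two<hr class="y"></p>'

    # Load path: rewrite HTML-style void tags into XML self-closing form
    as_xml = _EMPTY_TAG_OPEN_PATTERN.sub(lambda m: f"<{m.group(1)}{m.group(2)} />", html)
    print(as_xml)        # <p>one<br class="x" />two<hr class="y" /></p>

    # Save path: undo the rewrite so output matches the original HTML style
    back = _EMPTY_TAG_CLOSE_PATTERN.sub(lambda m: f"<{m.group(1)}{m.group(2)}>", as_xml)
    print(back == html)  # True

One caveat worth knowing: the open pattern's second group starts with \s, so a bare <br> carrying no attributes is left untouched; only void tags with attributes are rewritten.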
epub_translator/xml_translator/translator.py
CHANGED

@@ -126,53 +126,54 @@ class XMLTranslator:
         conversation_history: list[Message] = []
         latest_error: ValidationError | None = None

-
-
-
-
-
-
-            try:
-                # Extract XML from response
-                validated_element = _extract_xml_element(response)
-
-                # Validate with progressive locking
-                is_complete, error_message, newly_locked = validator.validate_with_locking(
-                    template_ele=fill.request_element,
-                    validated_ele=validated_element,
-                    errors_limit=self._max_fill_displaying_errors,
+        with self._llm.context() as llm_context:
+            for _ in range(self._max_retries):
+                # Request LLM response
+                response = llm_context.request(
+                    input=fixed_messages + conversation_history,
                 )

-
-                #
-
-
-
+                try:
+                    # Extract XML from response
+                    validated_element = _extract_xml_element(response)
+
+                    # Validate with progressive locking
+                    is_complete, error_message, newly_locked = validator.validate_with_locking(
+                        template_ele=fill.request_element,
+                        validated_ele=validated_element,
+                        errors_limit=self._max_fill_displaying_errors,
                     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+                    if is_complete:
+                        # All nodes locked, fill successful
+                        fill._fill_submitted_texts(  # pylint: disable=protected-access
+                            generated_ids_stack=[],
+                            element=validated_element,
+                        )
+                        return validated_element
+
+                    # Not complete yet, construct error message with progress info
+                    progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
+                    if newly_locked:
+                        progress_msg += f", {len(newly_locked)} newly locked this round"
+
+                    full_error_message = f"{progress_msg}\n\n{error_message}"
+
+                    conversation_history = [
+                        Message(role=MessageRole.ASSISTANT, message=response),
+                        Message(role=MessageRole.USER, message=full_error_message),
+                    ]
+
+                except ValidationError as error:
+                    # XML extraction or basic validation failed
+                    latest_error = error
+                    conversation_history = [
+                        Message(role=MessageRole.ASSISTANT, message=response),
+                        Message(role=MessageRole.USER, message=str(error)),
+                    ]
+
+        message = f"Failed to get valid XML structure after {self._max_retries} attempts"
+        if latest_error is None:
+            raise ValueError(message)
+        else:
+            raise ValueError(message) from latest_error
{epub_translator-0.1.0.dist-info → epub_translator-0.1.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-translator
-Version: 0.1.0
+Version: 0.1.1
 Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
 License: MIT
 Keywords: epub,llm,translation,translator
{epub_translator-0.1.0.dist-info → epub_translator-0.1.1.dist-info}/RECORD
CHANGED

@@ -19,7 +19,7 @@ epub_translator/epub/zip.py,sha256=CUE50LrrVNeQVecNm2ZFionJz4k_vMTXTi8an7BiQ_c,2
 epub_translator/iter_sync.py,sha256=56m-bRPqc731alGenqLvCIM99J8NzNuie86FDGtJj8k,588
 epub_translator/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
 epub_translator/llm/__init__.py,sha256=QcAuTQpH0T7IMf-J3bRdtf8Tvyu6Z2CAe-wSzLJRLLw,43
-epub_translator/llm/core.py,sha256=
+epub_translator/llm/core.py,sha256=nRNAVDQD7kxSl2EN7m5OQ7CvlBL4ENbzQThUcJSzMsk,8123
 epub_translator/llm/error.py,sha256=4efAIQL14DFSvAnSTUfgdAbZRqaWBqOfUGsSfvxa5zM,1503
 epub_translator/llm/executor.py,sha256=Oax07rwivDbB0T3i_waLAvXvfQoR9dnWPTvw475C9vQ,6081
 epub_translator/llm/increasable.py,sha256=vQka-bysKuFR-Vu-GziGZfQCasLn9q2GxGEoV2RiCec,1289
@@ -29,7 +29,7 @@ epub_translator/serial/chunk.py,sha256=FrTaHikVOd6bLYumnEriTaAQ_DIDLjHm16gh-wBVR
 epub_translator/serial/segment.py,sha256=uEz-ke1KcYrON-68FaUEzMG2CzHlMjvbC11F3ZT4yH0,446
 epub_translator/serial/splitter.py,sha256=Nq0sxPXos8ez7QBG01sOKjnYKbeBWUBHflZGtqenVm8,1726
 epub_translator/template.py,sha256=0CqRmj3nTtPshw0NmTr2ECqelops2MMyX94fMrE-HKs,1587
-epub_translator/translator.py,sha256=
+epub_translator/translator.py,sha256=vEccCEFc-mArX4DzvUz09W_WFOxUv6dlQkwWDkbbVFs,6976
 epub_translator/utils.py,sha256=7lBWHNyv4GQiutqqqUhbAxc8gqVIkhS7B4rkL1EKOFs,144
 epub_translator/xml/__init__.py,sha256=te8vIRgG-2n1fEcTmNzCLc-WH9G0JUr_lJncJQvRbgw,96
 epub_translator/xml/deduplication.py,sha256=Vc7BtXXnAMQHNtE--o2Qkm_sYrjnJSh33reKFh9YUjo,1143
@@ -40,7 +40,7 @@ epub_translator/xml/firendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42N
 epub_translator/xml/firendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
 epub_translator/xml/firendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
 epub_translator/xml/xml.py,sha256=7NPinMOFGBeOHCG-hw0iQjL-p-_I4DmYL8lq0Ar8rag,1498
-epub_translator/xml/xml_like.py,sha256=
+epub_translator/xml/xml_like.py,sha256=tgzqDQFfql9-QMSRbLf9SVlNsvyZXJTCEWmksxd3TuI,9489
 epub_translator/xml_translator/__init__.py,sha256=yNgwIermFXaRfAfnqXaNFCEf5I95cBVUDxha-6xkLq0,117
 epub_translator/xml_translator/const.py,sha256=Q9pmLplUR71TqF4MN5oLtPNl_pBRWoOJwsC5eIQOOWE,57
 epub_translator/xml_translator/fill.py,sha256=LxkPxlfbDDB3gP1rciXEBFyi1QRj5vXWzdca5SBcd5o,4839
@@ -50,9 +50,9 @@ epub_translator/xml_translator/group.py,sha256=2GxJl3RojyHyMuTZ5cn5PITT-F2fdaBlv
 epub_translator/xml_translator/progressive_locking.py,sha256=2eoCzVNeV4e4TziYTk4UgKmBUGuFQFj7X24ejO75lUA,9613
 epub_translator/xml_translator/submitter.py,sha256=bIoxhUIDMScgnxnqfCKR8d3u1DaISXqIM2WuHzrNU7M,4022
 epub_translator/xml_translator/text_segment.py,sha256=Aue5XHKYKzTuinFExcdu0CqGY5TiuJoIIhbP9t5ubPg,7673
-epub_translator/xml_translator/translator.py,sha256=
+epub_translator/xml_translator/translator.py,sha256=FGSXo2UWtcoIOWGzkI4emyqp1Q2Z8EoOBCBmdtty18A,7063
 epub_translator/xml_translator/utils.py,sha256=AIJOcB7Btad0yxxLwD3UC9NTk2gOPEM8qqx7sNO6tDc,626
-epub_translator-0.1.0.dist-info/LICENSE,sha256=
-epub_translator-0.1.0.dist-info/METADATA,sha256=
-epub_translator-0.1.0.dist-info/WHEEL,sha256=
-epub_translator-0.1.0.dist-info/RECORD,,
+epub_translator-0.1.1.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
+epub_translator-0.1.1.dist-info/METADATA,sha256=BJDV44wO93Nw7e1hqBV33HXK8KUa_JO2XJ1qQ22RGmc,9655
+epub_translator-0.1.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+epub_translator-0.1.1.dist-info/RECORD,,

{epub_translator-0.1.0.dist-info → epub_translator-0.1.1.dist-info}/LICENSE
File without changes

{epub_translator-0.1.0.dist-info → epub_translator-0.1.1.dist-info}/WHEEL
File without changes