epub-translator 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +2 -2
- epub_translator/data/fill.jinja +143 -38
- epub_translator/epub/__init__.py +1 -1
- epub_translator/epub/metadata.py +122 -0
- epub_translator/epub/spines.py +3 -2
- epub_translator/epub/zip.py +11 -9
- epub_translator/epub_transcode.py +108 -0
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/context.py +109 -0
- epub_translator/llm/core.py +39 -62
- epub_translator/llm/executor.py +25 -31
- epub_translator/llm/increasable.py +1 -1
- epub_translator/llm/types.py +0 -3
- epub_translator/segment/__init__.py +26 -0
- epub_translator/segment/block_segment.py +124 -0
- epub_translator/segment/common.py +29 -0
- epub_translator/segment/inline_segment.py +356 -0
- epub_translator/{xml_translator → segment}/text_segment.py +8 -8
- epub_translator/segment/utils.py +43 -0
- epub_translator/translator.py +150 -183
- epub_translator/utils.py +33 -0
- epub_translator/xml/__init__.py +2 -0
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/deduplication.py +3 -3
- epub_translator/xml/self_closing.py +182 -0
- epub_translator/xml/utils.py +42 -0
- epub_translator/xml/xml.py +7 -0
- epub_translator/xml/xml_like.py +145 -115
- epub_translator/xml_interrupter.py +165 -0
- epub_translator/xml_translator/__init__.py +1 -2
- epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator/xml_translator/{const.py → common.py} +0 -1
- epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator/xml_translator/submitter.py +26 -72
- epub_translator/xml_translator/translator.py +157 -107
- epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
- epub_translator-0.1.3.dist-info/RECORD +66 -0
- epub_translator/epub/placeholder.py +0 -53
- epub_translator/iter_sync.py +0 -24
- epub_translator/xml_translator/fill.py +0 -128
- epub_translator/xml_translator/format.py +0 -282
- epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator/xml_translator/group.py +0 -183
- epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator/xml_translator/utils.py +0 -29
- epub_translator-0.1.0.dist-info/RECORD +0 -58
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
|
@@ -1,15 +1,14 @@
|
|
|
1
|
-
from collections.abc import Generator, Iterable
|
|
1
|
+
from collections.abc import Callable, Generator, Iterable
|
|
2
2
|
from typing import TypeVar
|
|
3
3
|
from xml.etree.ElementTree import Element
|
|
4
4
|
|
|
5
|
-
from ..iter_sync import IterSync
|
|
6
5
|
from ..llm import LLM, Message, MessageRole
|
|
7
|
-
from ..
|
|
8
|
-
from
|
|
9
|
-
from .
|
|
10
|
-
from .
|
|
11
|
-
from .
|
|
12
|
-
from .
|
|
6
|
+
from ..segment import BlockSegment, InlineSegment, TextSegment
|
|
7
|
+
from ..xml import decode_friendly, encode_friendly
|
|
8
|
+
from .callbacks import Callbacks, FillFailedEvent, warp_callbacks
|
|
9
|
+
from .hill_climbing import HillClimbing
|
|
10
|
+
from .stream_mapper import InlineSegmentMapping, XMLStreamMapper
|
|
11
|
+
from .submitter import submit_text_segments
|
|
13
12
|
|
|
14
13
|
T = TypeVar("T")
|
|
15
14
|
|
|
@@ -17,66 +16,110 @@ T = TypeVar("T")
|
|
|
17
16
|
class XMLTranslator:
|
|
18
17
|
def __init__(
|
|
19
18
|
self,
|
|
20
|
-
|
|
21
|
-
|
|
19
|
+
translation_llm: LLM,
|
|
20
|
+
fill_llm: LLM,
|
|
22
21
|
target_language: str,
|
|
23
22
|
user_prompt: str | None,
|
|
24
23
|
ignore_translated_error: bool,
|
|
25
24
|
max_retries: int,
|
|
26
25
|
max_fill_displaying_errors: int,
|
|
26
|
+
max_group_tokens: int,
|
|
27
|
+
cache_seed_content: str | None = None,
|
|
27
28
|
) -> None:
|
|
28
|
-
self.
|
|
29
|
-
self.
|
|
29
|
+
self._translation_llm: LLM = translation_llm
|
|
30
|
+
self._fill_llm: LLM = fill_llm
|
|
30
31
|
self._target_language: str = target_language
|
|
31
32
|
self._user_prompt: str | None = user_prompt
|
|
32
33
|
self._ignore_translated_error: bool = ignore_translated_error
|
|
33
34
|
self._max_retries: int = max_retries
|
|
34
35
|
self._max_fill_displaying_errors: int = max_fill_displaying_errors
|
|
36
|
+
self._cache_seed_content: str | None = cache_seed_content
|
|
37
|
+
self._stream_mapper: XMLStreamMapper = XMLStreamMapper(
|
|
38
|
+
encoding=translation_llm.encoding,
|
|
39
|
+
max_group_tokens=max_group_tokens,
|
|
40
|
+
)
|
|
35
41
|
|
|
36
|
-
def
|
|
37
|
-
|
|
42
|
+
def translate_element(
|
|
43
|
+
self,
|
|
44
|
+
element: Element,
|
|
45
|
+
interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
46
|
+
interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
47
|
+
interrupt_block_element: Callable[[Element], Element] | None = None,
|
|
48
|
+
on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
|
|
49
|
+
) -> Element:
|
|
50
|
+
for translated in self.translate_elements(
|
|
51
|
+
elements=((element),),
|
|
52
|
+
interrupt_source_text_segments=interrupt_source_text_segments,
|
|
53
|
+
interrupt_translated_text_segments=interrupt_translated_text_segments,
|
|
54
|
+
interrupt_block_element=interrupt_block_element,
|
|
55
|
+
on_fill_failed=on_fill_failed,
|
|
56
|
+
):
|
|
38
57
|
return translated
|
|
39
|
-
raise RuntimeError("Translation failed unexpectedly")
|
|
40
58
|
|
|
41
|
-
|
|
42
|
-
self, items: Iterable[tuple[Element, T]]
|
|
43
|
-
) -> Generator[tuple[Element, list[TextSegment], T], None, None]:
|
|
44
|
-
sync: IterSync[tuple[Element, T]] = IterSync()
|
|
45
|
-
text_segments: list[TextSegment] = []
|
|
59
|
+
raise RuntimeError("Translation failed unexpectedly")
|
|
46
60
|
|
|
47
|
-
|
|
48
|
-
|
|
61
|
+
def translate_elements(
|
|
62
|
+
self,
|
|
63
|
+
elements: Iterable[Element],
|
|
64
|
+
interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
65
|
+
interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
66
|
+
interrupt_block_element: Callable[[Element], Element] | None = None,
|
|
67
|
+
on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
|
|
68
|
+
) -> Generator[Element, None, None]:
|
|
69
|
+
callbacks = warp_callbacks(
|
|
70
|
+
interrupt_source_text_segments=interrupt_source_text_segments,
|
|
71
|
+
interrupt_translated_text_segments=interrupt_translated_text_segments,
|
|
72
|
+
interrupt_block_element=interrupt_block_element,
|
|
73
|
+
on_fill_failed=on_fill_failed,
|
|
74
|
+
)
|
|
75
|
+
for element, mappings in self._stream_mapper.map_stream(
|
|
76
|
+
elements=iter(elements),
|
|
77
|
+
callbacks=callbacks,
|
|
78
|
+
map=lambda inline_segments: self._translate_inline_segments(
|
|
79
|
+
inline_segments=inline_segments,
|
|
80
|
+
callbacks=callbacks,
|
|
81
|
+
),
|
|
49
82
|
):
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
tail_element, _ = sync.tail
|
|
54
|
-
if id(tail_element) == id(text_segment.root):
|
|
55
|
-
break
|
|
56
|
-
tail_element, payload = sync.take()
|
|
57
|
-
yield tail_element, text_segments, payload
|
|
58
|
-
text_segments = []
|
|
59
|
-
text_segments.append(text_segment)
|
|
60
|
-
|
|
61
|
-
while sync.tail is not None:
|
|
62
|
-
tail_element, payload = sync.take()
|
|
63
|
-
yield tail_element, text_segments, payload
|
|
64
|
-
text_segments = []
|
|
65
|
-
|
|
66
|
-
def _translate_text_segments(self, elements: Iterable[Element]):
|
|
67
|
-
for group in self._group_context.split_groups(elements):
|
|
68
|
-
text_segments = list(group)
|
|
69
|
-
fill = XMLFill(text_segments)
|
|
70
|
-
source_text = "".join(self._render_text_segments(text_segments))
|
|
71
|
-
translated_text = self._translate_text(source_text)
|
|
72
|
-
self._fill_into_xml(
|
|
73
|
-
fill=fill,
|
|
74
|
-
source_text=source_text,
|
|
75
|
-
translated_text=translated_text,
|
|
83
|
+
yield submit_text_segments(
|
|
84
|
+
element=element,
|
|
85
|
+
mappings=mappings,
|
|
76
86
|
)
|
|
77
|
-
|
|
87
|
+
|
|
88
|
+
def _translate_inline_segments(
|
|
89
|
+
self,
|
|
90
|
+
inline_segments: list[InlineSegment],
|
|
91
|
+
callbacks: Callbacks,
|
|
92
|
+
) -> list[InlineSegmentMapping | None]:
|
|
93
|
+
hill_climbing = HillClimbing(
|
|
94
|
+
encoding=self._fill_llm.encoding,
|
|
95
|
+
max_fill_displaying_errors=self._max_fill_displaying_errors,
|
|
96
|
+
block_segment=BlockSegment(
|
|
97
|
+
root_tag="xml",
|
|
98
|
+
inline_segments=inline_segments,
|
|
99
|
+
),
|
|
100
|
+
)
|
|
101
|
+
text_segments = (text for inline in inline_segments for text in inline)
|
|
102
|
+
source_text = "".join(self._render_text_segments(text_segments))
|
|
103
|
+
translated_text = self._translate_text(source_text)
|
|
104
|
+
|
|
105
|
+
self._request_and_submit(
|
|
106
|
+
hill_climbing=hill_climbing,
|
|
107
|
+
source_text=source_text,
|
|
108
|
+
translated_text=translated_text,
|
|
109
|
+
callbacks=callbacks,
|
|
110
|
+
)
|
|
111
|
+
mappings: list[InlineSegmentMapping | None] = []
|
|
112
|
+
for mapping in hill_climbing.gen_mappings():
|
|
113
|
+
if mapping:
|
|
114
|
+
_, text_segments = mapping
|
|
115
|
+
if not text_segments:
|
|
116
|
+
mapping = None
|
|
117
|
+
mappings.append(mapping)
|
|
118
|
+
|
|
119
|
+
return mappings
|
|
78
120
|
|
|
79
121
|
def _render_text_segments(self, segments: Iterable[TextSegment]):
|
|
122
|
+
# TODO: 没必要,直接按照新的 inline segment 组织就行了
|
|
80
123
|
iterator = iter(segments)
|
|
81
124
|
segment = next(iterator, None)
|
|
82
125
|
if segment is None:
|
|
@@ -92,87 +135,94 @@ class XMLTranslator:
|
|
|
92
135
|
yield segment.text
|
|
93
136
|
|
|
94
137
|
def _translate_text(self, text: str) -> str:
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
138
|
+
with self._translation_llm.context(cache_seed_content=self._cache_seed_content) as ctx:
|
|
139
|
+
return ctx.request(
|
|
140
|
+
input=[
|
|
141
|
+
Message(
|
|
142
|
+
role=MessageRole.SYSTEM,
|
|
143
|
+
message=self._translation_llm.template("translate").render(
|
|
144
|
+
target_language=self._target_language,
|
|
145
|
+
user_prompt=self._user_prompt,
|
|
146
|
+
),
|
|
102
147
|
),
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
)
|
|
148
|
+
Message(role=MessageRole.USER, message=text),
|
|
149
|
+
]
|
|
150
|
+
)
|
|
107
151
|
|
|
108
|
-
def
|
|
152
|
+
def _request_and_submit(
|
|
153
|
+
self,
|
|
154
|
+
hill_climbing: HillClimbing,
|
|
155
|
+
source_text: str,
|
|
156
|
+
translated_text: str,
|
|
157
|
+
callbacks: Callbacks,
|
|
158
|
+
) -> None:
|
|
109
159
|
user_message = (
|
|
110
160
|
f"Source text:\n{source_text}\n\n"
|
|
111
|
-
f"XML template:\n```XML\n{encode_friendly(
|
|
161
|
+
f"XML template:\n```XML\n{encode_friendly(hill_climbing.request_element())}\n```\n\n"
|
|
112
162
|
f"Translated text:\n{translated_text}"
|
|
113
163
|
)
|
|
114
164
|
fixed_messages: list[Message] = [
|
|
115
165
|
Message(
|
|
116
166
|
role=MessageRole.SYSTEM,
|
|
117
|
-
message=self.
|
|
167
|
+
message=self._fill_llm.template("fill").render(),
|
|
118
168
|
),
|
|
119
169
|
Message(
|
|
120
170
|
role=MessageRole.USER,
|
|
121
171
|
message=user_message,
|
|
122
172
|
),
|
|
123
173
|
]
|
|
124
|
-
|
|
125
|
-
validator = ProgressiveLockingValidator()
|
|
126
174
|
conversation_history: list[Message] = []
|
|
127
|
-
latest_error: ValidationError | None = None
|
|
128
175
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
response = self._llm.request(
|
|
132
|
-
input=fixed_messages + conversation_history,
|
|
133
|
-
)
|
|
176
|
+
with self._fill_llm.context(cache_seed_content=self._cache_seed_content) as llm_context:
|
|
177
|
+
error_message: str | None = None
|
|
134
178
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
validated_element = _extract_xml_element(response)
|
|
179
|
+
for retry_count in range(self._max_retries):
|
|
180
|
+
response = llm_context.request(fixed_messages + conversation_history)
|
|
181
|
+
validated_element = self._extract_xml_element(response)
|
|
182
|
+
error_message = None
|
|
183
|
+
if isinstance(validated_element, str):
|
|
184
|
+
error_message = validated_element
|
|
185
|
+
elif isinstance(validated_element, Element):
|
|
186
|
+
error_message = hill_climbing.submit(validated_element)
|
|
138
187
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
template_ele=fill.request_element,
|
|
142
|
-
validated_ele=validated_element,
|
|
143
|
-
errors_limit=self._max_fill_displaying_errors,
|
|
144
|
-
)
|
|
188
|
+
if error_message is None:
|
|
189
|
+
break
|
|
145
190
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
191
|
+
callbacks.on_fill_failed(
|
|
192
|
+
FillFailedEvent(
|
|
193
|
+
error_message=error_message,
|
|
194
|
+
retried_count=retry_count + 1,
|
|
195
|
+
over_maximum_retries=False,
|
|
151
196
|
)
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
# Not complete yet, construct error message with progress info
|
|
155
|
-
progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
|
|
156
|
-
if newly_locked:
|
|
157
|
-
progress_msg += f", {len(newly_locked)} newly locked this round"
|
|
158
|
-
|
|
159
|
-
full_error_message = f"{progress_msg}\n\n{error_message}"
|
|
160
|
-
|
|
197
|
+
)
|
|
161
198
|
conversation_history = [
|
|
162
199
|
Message(role=MessageRole.ASSISTANT, message=response),
|
|
163
|
-
Message(role=MessageRole.USER, message=
|
|
200
|
+
Message(role=MessageRole.USER, message=error_message),
|
|
164
201
|
]
|
|
202
|
+
if error_message is not None:
|
|
203
|
+
callbacks.on_fill_failed(
|
|
204
|
+
FillFailedEvent(
|
|
205
|
+
error_message=error_message,
|
|
206
|
+
retried_count=self._max_retries,
|
|
207
|
+
over_maximum_retries=True,
|
|
208
|
+
)
|
|
209
|
+
)
|
|
165
210
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
211
|
+
def _extract_xml_element(self, text: str) -> Element | str:
|
|
212
|
+
first_xml_element: Element | None = None
|
|
213
|
+
all_xml_elements: int = 0
|
|
214
|
+
|
|
215
|
+
for xml_element in decode_friendly(text, tags="xml"):
|
|
216
|
+
if first_xml_element is None:
|
|
217
|
+
first_xml_element = xml_element
|
|
218
|
+
all_xml_elements += 1
|
|
173
219
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
220
|
+
if first_xml_element is None:
|
|
221
|
+
return "No complete <xml>...</xml> block found. Please ensure you have properly closed the XML with </xml> tag." # noqa: E501
|
|
222
|
+
|
|
223
|
+
if all_xml_elements > 1:
|
|
224
|
+
return (
|
|
225
|
+
f"Found {all_xml_elements} <xml>...</xml> blocks. "
|
|
226
|
+
"Please return only one XML block without any examples or explanations."
|
|
227
|
+
)
|
|
228
|
+
return first_xml_element
|