epub-translator 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +9 -2
- epub_translator/data/fill.jinja +143 -38
- epub_translator/epub/__init__.py +1 -1
- epub_translator/epub/metadata.py +122 -0
- epub_translator/epub/spines.py +3 -2
- epub_translator/epub/zip.py +11 -9
- epub_translator/epub_transcode.py +108 -0
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/context.py +109 -0
- epub_translator/llm/core.py +32 -113
- epub_translator/llm/executor.py +25 -31
- epub_translator/llm/increasable.py +1 -1
- epub_translator/llm/types.py +0 -3
- epub_translator/punctuation.py +34 -0
- epub_translator/segment/__init__.py +26 -0
- epub_translator/segment/block_segment.py +124 -0
- epub_translator/segment/common.py +29 -0
- epub_translator/segment/inline_segment.py +356 -0
- epub_translator/{xml_translator → segment}/text_segment.py +7 -72
- epub_translator/segment/utils.py +43 -0
- epub_translator/translator.py +152 -184
- epub_translator/utils.py +33 -0
- epub_translator/xml/__init__.py +3 -0
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/deduplication.py +3 -3
- epub_translator/xml/inline.py +67 -0
- epub_translator/xml/self_closing.py +182 -0
- epub_translator/xml/utils.py +42 -0
- epub_translator/xml/xml.py +7 -0
- epub_translator/xml/xml_like.py +8 -33
- epub_translator/xml_interrupter.py +165 -0
- epub_translator/xml_translator/__init__.py +3 -3
- epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator/xml_translator/{const.py → common.py} +0 -1
- epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator/xml_translator/submitter.py +352 -91
- epub_translator/xml_translator/translator.py +182 -114
- epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/METADATA +134 -21
- epub_translator-0.1.4.dist-info/RECORD +68 -0
- epub_translator/epub/placeholder.py +0 -53
- epub_translator/iter_sync.py +0 -24
- epub_translator/xml_translator/fill.py +0 -128
- epub_translator/xml_translator/format.py +0 -282
- epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator/xml_translator/group.py +0 -183
- epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator/xml_translator/utils.py +0 -29
- epub_translator-0.1.1.dist-info/RECORD +0 -58
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/WHEEL +0 -0
|
@@ -1,82 +1,144 @@
|
|
|
1
|
-
from collections.abc import Generator, Iterable
|
|
2
|
-
from
|
|
1
|
+
from collections.abc import Callable, Generator, Iterable
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Generic, TypeVar
|
|
3
4
|
from xml.etree.ElementTree import Element
|
|
4
5
|
|
|
5
|
-
from ..iter_sync import IterSync
|
|
6
6
|
from ..llm import LLM, Message, MessageRole
|
|
7
|
-
from ..
|
|
8
|
-
from
|
|
9
|
-
from .
|
|
10
|
-
from .
|
|
11
|
-
from .
|
|
12
|
-
from .
|
|
7
|
+
from ..segment import BlockSegment, InlineSegment, TextSegment
|
|
8
|
+
from ..xml import decode_friendly, encode_friendly
|
|
9
|
+
from .callbacks import Callbacks, FillFailedEvent, warp_callbacks
|
|
10
|
+
from .hill_climbing import HillClimbing
|
|
11
|
+
from .stream_mapper import InlineSegmentMapping, XMLStreamMapper
|
|
12
|
+
from .submitter import SubmitKind, submit
|
|
13
13
|
|
|
14
14
|
T = TypeVar("T")
|
|
15
15
|
|
|
16
16
|
|
|
17
|
+
@dataclass
|
|
18
|
+
class TranslationTask(Generic[T]):
|
|
19
|
+
element: Element
|
|
20
|
+
action: SubmitKind
|
|
21
|
+
payload: T
|
|
22
|
+
|
|
23
|
+
|
|
17
24
|
class XMLTranslator:
|
|
18
25
|
def __init__(
|
|
19
26
|
self,
|
|
20
|
-
|
|
21
|
-
|
|
27
|
+
translation_llm: LLM,
|
|
28
|
+
fill_llm: LLM,
|
|
22
29
|
target_language: str,
|
|
23
30
|
user_prompt: str | None,
|
|
24
31
|
ignore_translated_error: bool,
|
|
25
32
|
max_retries: int,
|
|
26
33
|
max_fill_displaying_errors: int,
|
|
34
|
+
max_group_tokens: int,
|
|
35
|
+
cache_seed_content: str | None = None,
|
|
27
36
|
) -> None:
|
|
28
|
-
self.
|
|
29
|
-
self.
|
|
37
|
+
self._translation_llm: LLM = translation_llm
|
|
38
|
+
self._fill_llm: LLM = fill_llm
|
|
30
39
|
self._target_language: str = target_language
|
|
31
40
|
self._user_prompt: str | None = user_prompt
|
|
32
41
|
self._ignore_translated_error: bool = ignore_translated_error
|
|
33
42
|
self._max_retries: int = max_retries
|
|
34
43
|
self._max_fill_displaying_errors: int = max_fill_displaying_errors
|
|
44
|
+
self._cache_seed_content: str | None = cache_seed_content
|
|
45
|
+
self._stream_mapper: XMLStreamMapper = XMLStreamMapper(
|
|
46
|
+
encoding=translation_llm.encoding,
|
|
47
|
+
max_group_tokens=max_group_tokens,
|
|
48
|
+
)
|
|
35
49
|
|
|
36
|
-
def
|
|
37
|
-
|
|
50
|
+
def translate_element(
|
|
51
|
+
self,
|
|
52
|
+
task: TranslationTask[T],
|
|
53
|
+
interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
54
|
+
interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
55
|
+
interrupt_block_element: Callable[[Element], Element] | None = None,
|
|
56
|
+
on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
|
|
57
|
+
) -> tuple[Element, T]:
|
|
58
|
+
for translated in self.translate_elements(
|
|
59
|
+
tasks=((task),),
|
|
60
|
+
interrupt_source_text_segments=interrupt_source_text_segments,
|
|
61
|
+
interrupt_translated_text_segments=interrupt_translated_text_segments,
|
|
62
|
+
interrupt_block_element=interrupt_block_element,
|
|
63
|
+
on_fill_failed=on_fill_failed,
|
|
64
|
+
):
|
|
38
65
|
return translated
|
|
66
|
+
|
|
39
67
|
raise RuntimeError("Translation failed unexpectedly")
|
|
40
68
|
|
|
41
|
-
def
|
|
42
|
-
self,
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
69
|
+
def translate_elements(
|
|
70
|
+
self,
|
|
71
|
+
tasks: Iterable[TranslationTask[T]],
|
|
72
|
+
interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
73
|
+
interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
74
|
+
interrupt_block_element: Callable[[Element], Element] | None = None,
|
|
75
|
+
on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
|
|
76
|
+
) -> Generator[tuple[Element, T], None, None]:
|
|
77
|
+
element2task: dict[int, TranslationTask[T]] = {}
|
|
78
|
+
callbacks = warp_callbacks(
|
|
79
|
+
interrupt_source_text_segments=interrupt_source_text_segments,
|
|
80
|
+
interrupt_translated_text_segments=interrupt_translated_text_segments,
|
|
81
|
+
interrupt_block_element=interrupt_block_element,
|
|
82
|
+
on_fill_failed=on_fill_failed,
|
|
83
|
+
)
|
|
46
84
|
|
|
47
|
-
|
|
48
|
-
|
|
85
|
+
def generate_elements():
|
|
86
|
+
for task in tasks:
|
|
87
|
+
element2task[id(task.element)] = task
|
|
88
|
+
yield task.element
|
|
89
|
+
|
|
90
|
+
for element, mappings in self._stream_mapper.map_stream(
|
|
91
|
+
elements=generate_elements(),
|
|
92
|
+
callbacks=callbacks,
|
|
93
|
+
map=lambda inline_segments: self._translate_inline_segments(
|
|
94
|
+
inline_segments=inline_segments,
|
|
95
|
+
callbacks=callbacks,
|
|
96
|
+
),
|
|
49
97
|
):
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
yield
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
98
|
+
task = element2task.get(id(element), None)
|
|
99
|
+
if task:
|
|
100
|
+
translated_element = submit(
|
|
101
|
+
element=element,
|
|
102
|
+
action=task.action,
|
|
103
|
+
mappings=mappings,
|
|
104
|
+
)
|
|
105
|
+
yield translated_element, task.payload
|
|
106
|
+
|
|
107
|
+
def _translate_inline_segments(
|
|
108
|
+
self,
|
|
109
|
+
inline_segments: list[InlineSegment],
|
|
110
|
+
callbacks: Callbacks,
|
|
111
|
+
) -> list[InlineSegmentMapping | None]:
|
|
112
|
+
hill_climbing = HillClimbing(
|
|
113
|
+
encoding=self._fill_llm.encoding,
|
|
114
|
+
max_fill_displaying_errors=self._max_fill_displaying_errors,
|
|
115
|
+
block_segment=BlockSegment(
|
|
116
|
+
root_tag="xml",
|
|
117
|
+
inline_segments=inline_segments,
|
|
118
|
+
),
|
|
119
|
+
)
|
|
120
|
+
text_segments = (text for inline in inline_segments for text in inline)
|
|
121
|
+
source_text = "".join(self._render_text_segments(text_segments))
|
|
122
|
+
translated_text = self._translate_text(source_text)
|
|
123
|
+
|
|
124
|
+
self._request_and_submit(
|
|
125
|
+
hill_climbing=hill_climbing,
|
|
126
|
+
source_text=source_text,
|
|
127
|
+
translated_text=translated_text,
|
|
128
|
+
callbacks=callbacks,
|
|
129
|
+
)
|
|
130
|
+
mappings: list[InlineSegmentMapping | None] = []
|
|
131
|
+
for mapping in hill_climbing.gen_mappings():
|
|
132
|
+
if mapping:
|
|
133
|
+
_, text_segments = mapping
|
|
134
|
+
if not text_segments:
|
|
135
|
+
mapping = None
|
|
136
|
+
mappings.append(mapping)
|
|
137
|
+
|
|
138
|
+
return mappings
|
|
78
139
|
|
|
79
140
|
def _render_text_segments(self, segments: Iterable[TextSegment]):
|
|
141
|
+
# TODO: 没必要,直接按照新的 inline segment 组织就行了
|
|
80
142
|
iterator = iter(segments)
|
|
81
143
|
segment = next(iterator, None)
|
|
82
144
|
if segment is None:
|
|
@@ -92,88 +154,94 @@ class XMLTranslator:
|
|
|
92
154
|
yield segment.text
|
|
93
155
|
|
|
94
156
|
def _translate_text(self, text: str) -> str:
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
157
|
+
with self._translation_llm.context(cache_seed_content=self._cache_seed_content) as ctx:
|
|
158
|
+
return ctx.request(
|
|
159
|
+
input=[
|
|
160
|
+
Message(
|
|
161
|
+
role=MessageRole.SYSTEM,
|
|
162
|
+
message=self._translation_llm.template("translate").render(
|
|
163
|
+
target_language=self._target_language,
|
|
164
|
+
user_prompt=self._user_prompt,
|
|
165
|
+
),
|
|
102
166
|
),
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
)
|
|
167
|
+
Message(role=MessageRole.USER, message=text),
|
|
168
|
+
]
|
|
169
|
+
)
|
|
107
170
|
|
|
108
|
-
def
|
|
171
|
+
def _request_and_submit(
|
|
172
|
+
self,
|
|
173
|
+
hill_climbing: HillClimbing,
|
|
174
|
+
source_text: str,
|
|
175
|
+
translated_text: str,
|
|
176
|
+
callbacks: Callbacks,
|
|
177
|
+
) -> None:
|
|
109
178
|
user_message = (
|
|
110
179
|
f"Source text:\n{source_text}\n\n"
|
|
111
|
-
f"XML template:\n```XML\n{encode_friendly(
|
|
180
|
+
f"XML template:\n```XML\n{encode_friendly(hill_climbing.request_element())}\n```\n\n"
|
|
112
181
|
f"Translated text:\n{translated_text}"
|
|
113
182
|
)
|
|
114
183
|
fixed_messages: list[Message] = [
|
|
115
184
|
Message(
|
|
116
185
|
role=MessageRole.SYSTEM,
|
|
117
|
-
message=self.
|
|
186
|
+
message=self._fill_llm.template("fill").render(),
|
|
118
187
|
),
|
|
119
188
|
Message(
|
|
120
189
|
role=MessageRole.USER,
|
|
121
190
|
message=user_message,
|
|
122
191
|
),
|
|
123
192
|
]
|
|
124
|
-
|
|
125
|
-
validator = ProgressiveLockingValidator()
|
|
126
193
|
conversation_history: list[Message] = []
|
|
127
|
-
latest_error: ValidationError | None = None
|
|
128
194
|
|
|
129
|
-
with self.
|
|
130
|
-
|
|
131
|
-
# Request LLM response
|
|
132
|
-
response = llm_context.request(
|
|
133
|
-
input=fixed_messages + conversation_history,
|
|
134
|
-
)
|
|
195
|
+
with self._fill_llm.context(cache_seed_content=self._cache_seed_content) as llm_context:
|
|
196
|
+
error_message: str | None = None
|
|
135
197
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
198
|
+
for retry_count in range(self._max_retries):
|
|
199
|
+
response = llm_context.request(fixed_messages + conversation_history)
|
|
200
|
+
validated_element = self._extract_xml_element(response)
|
|
201
|
+
error_message = None
|
|
202
|
+
if isinstance(validated_element, str):
|
|
203
|
+
error_message = validated_element
|
|
204
|
+
elif isinstance(validated_element, Element):
|
|
205
|
+
error_message = hill_climbing.submit(validated_element)
|
|
139
206
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
207
|
+
if error_message is None:
|
|
208
|
+
break
|
|
209
|
+
|
|
210
|
+
callbacks.on_fill_failed(
|
|
211
|
+
FillFailedEvent(
|
|
212
|
+
error_message=error_message,
|
|
213
|
+
retried_count=retry_count + 1,
|
|
214
|
+
over_maximum_retries=False,
|
|
215
|
+
)
|
|
216
|
+
)
|
|
217
|
+
conversation_history = [
|
|
218
|
+
Message(role=MessageRole.ASSISTANT, message=response),
|
|
219
|
+
Message(role=MessageRole.USER, message=error_message),
|
|
220
|
+
]
|
|
221
|
+
if error_message is not None:
|
|
222
|
+
callbacks.on_fill_failed(
|
|
223
|
+
FillFailedEvent(
|
|
224
|
+
error_message=error_message,
|
|
225
|
+
retried_count=self._max_retries,
|
|
226
|
+
over_maximum_retries=True,
|
|
145
227
|
)
|
|
228
|
+
)
|
|
146
229
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
]
|
|
166
|
-
|
|
167
|
-
except ValidationError as error:
|
|
168
|
-
# XML extraction or basic validation failed
|
|
169
|
-
latest_error = error
|
|
170
|
-
conversation_history = [
|
|
171
|
-
Message(role=MessageRole.ASSISTANT, message=response),
|
|
172
|
-
Message(role=MessageRole.USER, message=str(error)),
|
|
173
|
-
]
|
|
174
|
-
|
|
175
|
-
message = f"Failed to get valid XML structure after {self._max_retries} attempts"
|
|
176
|
-
if latest_error is None:
|
|
177
|
-
raise ValueError(message)
|
|
178
|
-
else:
|
|
179
|
-
raise ValueError(message) from latest_error
|
|
230
|
+
def _extract_xml_element(self, text: str) -> Element | str:
|
|
231
|
+
first_xml_element: Element | None = None
|
|
232
|
+
all_xml_elements: int = 0
|
|
233
|
+
|
|
234
|
+
for xml_element in decode_friendly(text, tags="xml"):
|
|
235
|
+
if first_xml_element is None:
|
|
236
|
+
first_xml_element = xml_element
|
|
237
|
+
all_xml_elements += 1
|
|
238
|
+
|
|
239
|
+
if first_xml_element is None:
|
|
240
|
+
return "No complete <xml>...</xml> block found. Please ensure you have properly closed the XML with </xml> tag." # noqa: E501
|
|
241
|
+
|
|
242
|
+
if all_xml_elements > 1:
|
|
243
|
+
return (
|
|
244
|
+
f"Found {all_xml_elements} <xml>...</xml> blocks. "
|
|
245
|
+
"Please return only one XML block without any examples or explanations."
|
|
246
|
+
)
|
|
247
|
+
return first_xml_element
|