epub-translator 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (52) hide show
  1. epub_translator/__init__.py +9 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +32 -113
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/punctuation.py +34 -0
  15. epub_translator/segment/__init__.py +26 -0
  16. epub_translator/segment/block_segment.py +124 -0
  17. epub_translator/segment/common.py +29 -0
  18. epub_translator/segment/inline_segment.py +356 -0
  19. epub_translator/{xml_translator → segment}/text_segment.py +7 -72
  20. epub_translator/segment/utils.py +43 -0
  21. epub_translator/translator.py +152 -184
  22. epub_translator/utils.py +33 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/const.py +1 -0
  25. epub_translator/xml/deduplication.py +3 -3
  26. epub_translator/xml/inline.py +67 -0
  27. epub_translator/xml/self_closing.py +182 -0
  28. epub_translator/xml/utils.py +42 -0
  29. epub_translator/xml/xml.py +7 -0
  30. epub_translator/xml/xml_like.py +8 -33
  31. epub_translator/xml_interrupter.py +165 -0
  32. epub_translator/xml_translator/__init__.py +3 -3
  33. epub_translator/xml_translator/callbacks.py +34 -0
  34. epub_translator/xml_translator/{const.py → common.py} +0 -1
  35. epub_translator/xml_translator/hill_climbing.py +104 -0
  36. epub_translator/xml_translator/stream_mapper.py +253 -0
  37. epub_translator/xml_translator/submitter.py +352 -91
  38. epub_translator/xml_translator/translator.py +182 -114
  39. epub_translator/xml_translator/validation.py +458 -0
  40. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/METADATA +134 -21
  41. epub_translator-0.1.4.dist-info/RECORD +68 -0
  42. epub_translator/epub/placeholder.py +0 -53
  43. epub_translator/iter_sync.py +0 -24
  44. epub_translator/xml_translator/fill.py +0 -128
  45. epub_translator/xml_translator/format.py +0 -282
  46. epub_translator/xml_translator/fragmented.py +0 -125
  47. epub_translator/xml_translator/group.py +0 -183
  48. epub_translator/xml_translator/progressive_locking.py +0 -256
  49. epub_translator/xml_translator/utils.py +0 -29
  50. epub_translator-0.1.1.dist-info/RECORD +0 -58
  51. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/LICENSE +0 -0
  52. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/WHEEL +0 -0
@@ -1,82 +1,144 @@
1
- from collections.abc import Generator, Iterable
2
- from typing import TypeVar
1
+ from collections.abc import Callable, Generator, Iterable
2
+ from dataclasses import dataclass
3
+ from typing import Generic, TypeVar
3
4
  from xml.etree.ElementTree import Element
4
5
 
5
- from ..iter_sync import IterSync
6
6
  from ..llm import LLM, Message, MessageRole
7
- from ..xml import encode_friendly
8
- from .fill import XMLFill
9
- from .format import ValidationError, _extract_xml_element
10
- from .group import XMLGroupContext
11
- from .progressive_locking import ProgressiveLockingValidator
12
- from .text_segment import TextSegment
7
+ from ..segment import BlockSegment, InlineSegment, TextSegment
8
+ from ..xml import decode_friendly, encode_friendly
9
+ from .callbacks import Callbacks, FillFailedEvent, warp_callbacks
10
+ from .hill_climbing import HillClimbing
11
+ from .stream_mapper import InlineSegmentMapping, XMLStreamMapper
12
+ from .submitter import SubmitKind, submit
13
13
 
14
14
  T = TypeVar("T")
15
15
 
16
16
 
17
+ @dataclass
18
+ class TranslationTask(Generic[T]):
19
+ element: Element
20
+ action: SubmitKind
21
+ payload: T
22
+
23
+
17
24
  class XMLTranslator:
18
25
  def __init__(
19
26
  self,
20
- llm: LLM,
21
- group_context: XMLGroupContext,
27
+ translation_llm: LLM,
28
+ fill_llm: LLM,
22
29
  target_language: str,
23
30
  user_prompt: str | None,
24
31
  ignore_translated_error: bool,
25
32
  max_retries: int,
26
33
  max_fill_displaying_errors: int,
34
+ max_group_tokens: int,
35
+ cache_seed_content: str | None = None,
27
36
  ) -> None:
28
- self._llm: LLM = llm
29
- self._group_context: XMLGroupContext = group_context
37
+ self._translation_llm: LLM = translation_llm
38
+ self._fill_llm: LLM = fill_llm
30
39
  self._target_language: str = target_language
31
40
  self._user_prompt: str | None = user_prompt
32
41
  self._ignore_translated_error: bool = ignore_translated_error
33
42
  self._max_retries: int = max_retries
34
43
  self._max_fill_displaying_errors: int = max_fill_displaying_errors
44
+ self._cache_seed_content: str | None = cache_seed_content
45
+ self._stream_mapper: XMLStreamMapper = XMLStreamMapper(
46
+ encoding=translation_llm.encoding,
47
+ max_group_tokens=max_group_tokens,
48
+ )
35
49
 
36
- def translate_to_element(self, element: Element) -> Element:
37
- for translated, _, _ in self.translate_to_text_segments(((element, None),)):
50
+ def translate_element(
51
+ self,
52
+ task: TranslationTask[T],
53
+ interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
54
+ interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
55
+ interrupt_block_element: Callable[[Element], Element] | None = None,
56
+ on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
57
+ ) -> tuple[Element, T]:
58
+ for translated in self.translate_elements(
59
+ tasks=((task),),
60
+ interrupt_source_text_segments=interrupt_source_text_segments,
61
+ interrupt_translated_text_segments=interrupt_translated_text_segments,
62
+ interrupt_block_element=interrupt_block_element,
63
+ on_fill_failed=on_fill_failed,
64
+ ):
38
65
  return translated
66
+
39
67
  raise RuntimeError("Translation failed unexpectedly")
40
68
 
41
- def translate_to_text_segments(
42
- self, items: Iterable[tuple[Element, T]]
43
- ) -> Generator[tuple[Element, list[TextSegment], T], None, None]:
44
- sync: IterSync[tuple[Element, T]] = IterSync()
45
- text_segments: list[TextSegment] = []
69
+ def translate_elements(
70
+ self,
71
+ tasks: Iterable[TranslationTask[T]],
72
+ interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
73
+ interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
74
+ interrupt_block_element: Callable[[Element], Element] | None = None,
75
+ on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
76
+ ) -> Generator[tuple[Element, T], None, None]:
77
+ element2task: dict[int, TranslationTask[T]] = {}
78
+ callbacks = warp_callbacks(
79
+ interrupt_source_text_segments=interrupt_source_text_segments,
80
+ interrupt_translated_text_segments=interrupt_translated_text_segments,
81
+ interrupt_block_element=interrupt_block_element,
82
+ on_fill_failed=on_fill_failed,
83
+ )
46
84
 
47
- for text_segment in self._translate_text_segments(
48
- elements=(e for e, _ in sync.iter(items)),
85
+ def generate_elements():
86
+ for task in tasks:
87
+ element2task[id(task.element)] = task
88
+ yield task.element
89
+
90
+ for element, mappings in self._stream_mapper.map_stream(
91
+ elements=generate_elements(),
92
+ callbacks=callbacks,
93
+ map=lambda inline_segments: self._translate_inline_segments(
94
+ inline_segments=inline_segments,
95
+ callbacks=callbacks,
96
+ ),
49
97
  ):
50
- while True:
51
- if sync.tail is None:
52
- break
53
- tail_element, _ = sync.tail
54
- if id(tail_element) == id(text_segment.root):
55
- break
56
- tail_element, payload = sync.take()
57
- yield tail_element, text_segments, payload
58
- text_segments = []
59
- text_segments.append(text_segment)
60
-
61
- while sync.tail is not None:
62
- tail_element, payload = sync.take()
63
- yield tail_element, text_segments, payload
64
- text_segments = []
65
-
66
- def _translate_text_segments(self, elements: Iterable[Element]):
67
- for group in self._group_context.split_groups(elements):
68
- text_segments = list(group)
69
- fill = XMLFill(text_segments)
70
- source_text = "".join(self._render_text_segments(text_segments))
71
- translated_text = self._translate_text(source_text)
72
- self._fill_into_xml(
73
- fill=fill,
74
- source_text=source_text,
75
- translated_text=translated_text,
76
- )
77
- yield from group.body
98
+ task = element2task.get(id(element), None)
99
+ if task:
100
+ translated_element = submit(
101
+ element=element,
102
+ action=task.action,
103
+ mappings=mappings,
104
+ )
105
+ yield translated_element, task.payload
106
+
107
+ def _translate_inline_segments(
108
+ self,
109
+ inline_segments: list[InlineSegment],
110
+ callbacks: Callbacks,
111
+ ) -> list[InlineSegmentMapping | None]:
112
+ hill_climbing = HillClimbing(
113
+ encoding=self._fill_llm.encoding,
114
+ max_fill_displaying_errors=self._max_fill_displaying_errors,
115
+ block_segment=BlockSegment(
116
+ root_tag="xml",
117
+ inline_segments=inline_segments,
118
+ ),
119
+ )
120
+ text_segments = (text for inline in inline_segments for text in inline)
121
+ source_text = "".join(self._render_text_segments(text_segments))
122
+ translated_text = self._translate_text(source_text)
123
+
124
+ self._request_and_submit(
125
+ hill_climbing=hill_climbing,
126
+ source_text=source_text,
127
+ translated_text=translated_text,
128
+ callbacks=callbacks,
129
+ )
130
+ mappings: list[InlineSegmentMapping | None] = []
131
+ for mapping in hill_climbing.gen_mappings():
132
+ if mapping:
133
+ _, text_segments = mapping
134
+ if not text_segments:
135
+ mapping = None
136
+ mappings.append(mapping)
137
+
138
+ return mappings
78
139
 
79
140
  def _render_text_segments(self, segments: Iterable[TextSegment]):
141
+ # TODO: 没必要,直接按照新的 inline segment 组织就行了
80
142
  iterator = iter(segments)
81
143
  segment = next(iterator, None)
82
144
  if segment is None:
@@ -92,88 +154,94 @@ class XMLTranslator:
92
154
  yield segment.text
93
155
 
94
156
  def _translate_text(self, text: str) -> str:
95
- return self._llm.request(
96
- input=[
97
- Message(
98
- role=MessageRole.SYSTEM,
99
- message=self._llm.template("translate").render(
100
- target_language=self._target_language,
101
- user_prompt=self._user_prompt,
157
+ with self._translation_llm.context(cache_seed_content=self._cache_seed_content) as ctx:
158
+ return ctx.request(
159
+ input=[
160
+ Message(
161
+ role=MessageRole.SYSTEM,
162
+ message=self._translation_llm.template("translate").render(
163
+ target_language=self._target_language,
164
+ user_prompt=self._user_prompt,
165
+ ),
102
166
  ),
103
- ),
104
- Message(role=MessageRole.USER, message=text),
105
- ]
106
- )
167
+ Message(role=MessageRole.USER, message=text),
168
+ ]
169
+ )
107
170
 
108
- def _fill_into_xml(self, fill: XMLFill, source_text: str, translated_text: str) -> Element:
171
+ def _request_and_submit(
172
+ self,
173
+ hill_climbing: HillClimbing,
174
+ source_text: str,
175
+ translated_text: str,
176
+ callbacks: Callbacks,
177
+ ) -> None:
109
178
  user_message = (
110
179
  f"Source text:\n{source_text}\n\n"
111
- f"XML template:\n```XML\n{encode_friendly(fill.request_element)}\n```\n\n"
180
+ f"XML template:\n```XML\n{encode_friendly(hill_climbing.request_element())}\n```\n\n"
112
181
  f"Translated text:\n{translated_text}"
113
182
  )
114
183
  fixed_messages: list[Message] = [
115
184
  Message(
116
185
  role=MessageRole.SYSTEM,
117
- message=self._llm.template("fill").render(),
186
+ message=self._fill_llm.template("fill").render(),
118
187
  ),
119
188
  Message(
120
189
  role=MessageRole.USER,
121
190
  message=user_message,
122
191
  ),
123
192
  ]
124
-
125
- validator = ProgressiveLockingValidator()
126
193
  conversation_history: list[Message] = []
127
- latest_error: ValidationError | None = None
128
194
 
129
- with self._llm.context() as llm_context:
130
- for _ in range(self._max_retries):
131
- # Request LLM response
132
- response = llm_context.request(
133
- input=fixed_messages + conversation_history,
134
- )
195
+ with self._fill_llm.context(cache_seed_content=self._cache_seed_content) as llm_context:
196
+ error_message: str | None = None
135
197
 
136
- try:
137
- # Extract XML from response
138
- validated_element = _extract_xml_element(response)
198
+ for retry_count in range(self._max_retries):
199
+ response = llm_context.request(fixed_messages + conversation_history)
200
+ validated_element = self._extract_xml_element(response)
201
+ error_message = None
202
+ if isinstance(validated_element, str):
203
+ error_message = validated_element
204
+ elif isinstance(validated_element, Element):
205
+ error_message = hill_climbing.submit(validated_element)
139
206
 
140
- # Validate with progressive locking
141
- is_complete, error_message, newly_locked = validator.validate_with_locking(
142
- template_ele=fill.request_element,
143
- validated_ele=validated_element,
144
- errors_limit=self._max_fill_displaying_errors,
207
+ if error_message is None:
208
+ break
209
+
210
+ callbacks.on_fill_failed(
211
+ FillFailedEvent(
212
+ error_message=error_message,
213
+ retried_count=retry_count + 1,
214
+ over_maximum_retries=False,
215
+ )
216
+ )
217
+ conversation_history = [
218
+ Message(role=MessageRole.ASSISTANT, message=response),
219
+ Message(role=MessageRole.USER, message=error_message),
220
+ ]
221
+ if error_message is not None:
222
+ callbacks.on_fill_failed(
223
+ FillFailedEvent(
224
+ error_message=error_message,
225
+ retried_count=self._max_retries,
226
+ over_maximum_retries=True,
145
227
  )
228
+ )
146
229
 
147
- if is_complete:
148
- # All nodes locked, fill successful
149
- fill._fill_submitted_texts( # pylint: disable=protected-access
150
- generated_ids_stack=[],
151
- element=validated_element,
152
- )
153
- return validated_element
154
-
155
- # Not complete yet, construct error message with progress info
156
- progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
157
- if newly_locked:
158
- progress_msg += f", {len(newly_locked)} newly locked this round"
159
-
160
- full_error_message = f"{progress_msg}\n\n{error_message}"
161
-
162
- conversation_history = [
163
- Message(role=MessageRole.ASSISTANT, message=response),
164
- Message(role=MessageRole.USER, message=full_error_message),
165
- ]
166
-
167
- except ValidationError as error:
168
- # XML extraction or basic validation failed
169
- latest_error = error
170
- conversation_history = [
171
- Message(role=MessageRole.ASSISTANT, message=response),
172
- Message(role=MessageRole.USER, message=str(error)),
173
- ]
174
-
175
- message = f"Failed to get valid XML structure after {self._max_retries} attempts"
176
- if latest_error is None:
177
- raise ValueError(message)
178
- else:
179
- raise ValueError(message) from latest_error
230
+ def _extract_xml_element(self, text: str) -> Element | str:
231
+ first_xml_element: Element | None = None
232
+ all_xml_elements: int = 0
233
+
234
+ for xml_element in decode_friendly(text, tags="xml"):
235
+ if first_xml_element is None:
236
+ first_xml_element = xml_element
237
+ all_xml_elements += 1
238
+
239
+ if first_xml_element is None:
240
+ return "No complete <xml>...</xml> block found. Please ensure you have properly closed the XML with </xml> tag." # noqa: E501
241
+
242
+ if all_xml_elements > 1:
243
+ return (
244
+ f"Found {all_xml_elements} <xml>...</xml> blocks. "
245
+ "Please return only one XML block without any examples or explanations."
246
+ )
247
+ return first_xml_element