epub-translator 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. epub_translator/__init__.py +2 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +39 -62
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/segment/__init__.py +26 -0
  15. epub_translator/segment/block_segment.py +124 -0
  16. epub_translator/segment/common.py +29 -0
  17. epub_translator/segment/inline_segment.py +356 -0
  18. epub_translator/{xml_translator → segment}/text_segment.py +8 -8
  19. epub_translator/segment/utils.py +43 -0
  20. epub_translator/translator.py +150 -183
  21. epub_translator/utils.py +33 -0
  22. epub_translator/xml/__init__.py +2 -0
  23. epub_translator/xml/const.py +1 -0
  24. epub_translator/xml/deduplication.py +3 -3
  25. epub_translator/xml/self_closing.py +182 -0
  26. epub_translator/xml/utils.py +42 -0
  27. epub_translator/xml/xml.py +7 -0
  28. epub_translator/xml/xml_like.py +145 -115
  29. epub_translator/xml_interrupter.py +165 -0
  30. epub_translator/xml_translator/__init__.py +1 -2
  31. epub_translator/xml_translator/callbacks.py +34 -0
  32. epub_translator/xml_translator/{const.py → common.py} +0 -1
  33. epub_translator/xml_translator/hill_climbing.py +104 -0
  34. epub_translator/xml_translator/stream_mapper.py +253 -0
  35. epub_translator/xml_translator/submitter.py +26 -72
  36. epub_translator/xml_translator/translator.py +157 -107
  37. epub_translator/xml_translator/validation.py +458 -0
  38. {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
  39. epub_translator-0.1.3.dist-info/RECORD +66 -0
  40. epub_translator/epub/placeholder.py +0 -53
  41. epub_translator/iter_sync.py +0 -24
  42. epub_translator/xml_translator/fill.py +0 -128
  43. epub_translator/xml_translator/format.py +0 -282
  44. epub_translator/xml_translator/fragmented.py +0 -125
  45. epub_translator/xml_translator/group.py +0 -183
  46. epub_translator/xml_translator/progressive_locking.py +0 -256
  47. epub_translator/xml_translator/utils.py +0 -29
  48. epub_translator-0.1.0.dist-info/RECORD +0 -58
  49. {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
  50. {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
@@ -1,15 +1,14 @@
1
- from collections.abc import Generator, Iterable
1
+ from collections.abc import Callable, Generator, Iterable
2
2
  from typing import TypeVar
3
3
  from xml.etree.ElementTree import Element
4
4
 
5
- from ..iter_sync import IterSync
6
5
  from ..llm import LLM, Message, MessageRole
7
- from ..xml import encode_friendly
8
- from .fill import XMLFill
9
- from .format import ValidationError, _extract_xml_element
10
- from .group import XMLGroupContext
11
- from .progressive_locking import ProgressiveLockingValidator
12
- from .text_segment import TextSegment
6
+ from ..segment import BlockSegment, InlineSegment, TextSegment
7
+ from ..xml import decode_friendly, encode_friendly
8
+ from .callbacks import Callbacks, FillFailedEvent, warp_callbacks
9
+ from .hill_climbing import HillClimbing
10
+ from .stream_mapper import InlineSegmentMapping, XMLStreamMapper
11
+ from .submitter import submit_text_segments
13
12
 
14
13
  T = TypeVar("T")
15
14
 
@@ -17,66 +16,110 @@ T = TypeVar("T")
17
16
  class XMLTranslator:
18
17
  def __init__(
19
18
  self,
20
- llm: LLM,
21
- group_context: XMLGroupContext,
19
+ translation_llm: LLM,
20
+ fill_llm: LLM,
22
21
  target_language: str,
23
22
  user_prompt: str | None,
24
23
  ignore_translated_error: bool,
25
24
  max_retries: int,
26
25
  max_fill_displaying_errors: int,
26
+ max_group_tokens: int,
27
+ cache_seed_content: str | None = None,
27
28
  ) -> None:
28
- self._llm: LLM = llm
29
- self._group_context: XMLGroupContext = group_context
29
+ self._translation_llm: LLM = translation_llm
30
+ self._fill_llm: LLM = fill_llm
30
31
  self._target_language: str = target_language
31
32
  self._user_prompt: str | None = user_prompt
32
33
  self._ignore_translated_error: bool = ignore_translated_error
33
34
  self._max_retries: int = max_retries
34
35
  self._max_fill_displaying_errors: int = max_fill_displaying_errors
36
+ self._cache_seed_content: str | None = cache_seed_content
37
+ self._stream_mapper: XMLStreamMapper = XMLStreamMapper(
38
+ encoding=translation_llm.encoding,
39
+ max_group_tokens=max_group_tokens,
40
+ )
35
41
 
36
- def translate_to_element(self, element: Element) -> Element:
37
- for translated, _, _ in self.translate_to_text_segments(((element, None),)):
42
+ def translate_element(
43
+ self,
44
+ element: Element,
45
+ interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
46
+ interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
47
+ interrupt_block_element: Callable[[Element], Element] | None = None,
48
+ on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
49
+ ) -> Element:
50
+ for translated in self.translate_elements(
51
+ elements=((element),),
52
+ interrupt_source_text_segments=interrupt_source_text_segments,
53
+ interrupt_translated_text_segments=interrupt_translated_text_segments,
54
+ interrupt_block_element=interrupt_block_element,
55
+ on_fill_failed=on_fill_failed,
56
+ ):
38
57
  return translated
39
- raise RuntimeError("Translation failed unexpectedly")
40
58
 
41
- def translate_to_text_segments(
42
- self, items: Iterable[tuple[Element, T]]
43
- ) -> Generator[tuple[Element, list[TextSegment], T], None, None]:
44
- sync: IterSync[tuple[Element, T]] = IterSync()
45
- text_segments: list[TextSegment] = []
59
+ raise RuntimeError("Translation failed unexpectedly")
46
60
 
47
- for text_segment in self._translate_text_segments(
48
- elements=(e for e, _ in sync.iter(items)),
61
+ def translate_elements(
62
+ self,
63
+ elements: Iterable[Element],
64
+ interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
65
+ interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
66
+ interrupt_block_element: Callable[[Element], Element] | None = None,
67
+ on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
68
+ ) -> Generator[Element, None, None]:
69
+ callbacks = warp_callbacks(
70
+ interrupt_source_text_segments=interrupt_source_text_segments,
71
+ interrupt_translated_text_segments=interrupt_translated_text_segments,
72
+ interrupt_block_element=interrupt_block_element,
73
+ on_fill_failed=on_fill_failed,
74
+ )
75
+ for element, mappings in self._stream_mapper.map_stream(
76
+ elements=iter(elements),
77
+ callbacks=callbacks,
78
+ map=lambda inline_segments: self._translate_inline_segments(
79
+ inline_segments=inline_segments,
80
+ callbacks=callbacks,
81
+ ),
49
82
  ):
50
- while True:
51
- if sync.tail is None:
52
- break
53
- tail_element, _ = sync.tail
54
- if id(tail_element) == id(text_segment.root):
55
- break
56
- tail_element, payload = sync.take()
57
- yield tail_element, text_segments, payload
58
- text_segments = []
59
- text_segments.append(text_segment)
60
-
61
- while sync.tail is not None:
62
- tail_element, payload = sync.take()
63
- yield tail_element, text_segments, payload
64
- text_segments = []
65
-
66
- def _translate_text_segments(self, elements: Iterable[Element]):
67
- for group in self._group_context.split_groups(elements):
68
- text_segments = list(group)
69
- fill = XMLFill(text_segments)
70
- source_text = "".join(self._render_text_segments(text_segments))
71
- translated_text = self._translate_text(source_text)
72
- self._fill_into_xml(
73
- fill=fill,
74
- source_text=source_text,
75
- translated_text=translated_text,
83
+ yield submit_text_segments(
84
+ element=element,
85
+ mappings=mappings,
76
86
  )
77
- yield from group.body
87
+
88
+ def _translate_inline_segments(
89
+ self,
90
+ inline_segments: list[InlineSegment],
91
+ callbacks: Callbacks,
92
+ ) -> list[InlineSegmentMapping | None]:
93
+ hill_climbing = HillClimbing(
94
+ encoding=self._fill_llm.encoding,
95
+ max_fill_displaying_errors=self._max_fill_displaying_errors,
96
+ block_segment=BlockSegment(
97
+ root_tag="xml",
98
+ inline_segments=inline_segments,
99
+ ),
100
+ )
101
+ text_segments = (text for inline in inline_segments for text in inline)
102
+ source_text = "".join(self._render_text_segments(text_segments))
103
+ translated_text = self._translate_text(source_text)
104
+
105
+ self._request_and_submit(
106
+ hill_climbing=hill_climbing,
107
+ source_text=source_text,
108
+ translated_text=translated_text,
109
+ callbacks=callbacks,
110
+ )
111
+ mappings: list[InlineSegmentMapping | None] = []
112
+ for mapping in hill_climbing.gen_mappings():
113
+ if mapping:
114
+ _, text_segments = mapping
115
+ if not text_segments:
116
+ mapping = None
117
+ mappings.append(mapping)
118
+
119
+ return mappings
78
120
 
79
121
  def _render_text_segments(self, segments: Iterable[TextSegment]):
122
+ # TODO: 没必要,直接按照新的 inline segment 组织就行了
80
123
  iterator = iter(segments)
81
124
  segment = next(iterator, None)
82
125
  if segment is None:
@@ -92,87 +135,94 @@ class XMLTranslator:
92
135
  yield segment.text
93
136
 
94
137
  def _translate_text(self, text: str) -> str:
95
- return self._llm.request(
96
- input=[
97
- Message(
98
- role=MessageRole.SYSTEM,
99
- message=self._llm.template("translate").render(
100
- target_language=self._target_language,
101
- user_prompt=self._user_prompt,
138
+ with self._translation_llm.context(cache_seed_content=self._cache_seed_content) as ctx:
139
+ return ctx.request(
140
+ input=[
141
+ Message(
142
+ role=MessageRole.SYSTEM,
143
+ message=self._translation_llm.template("translate").render(
144
+ target_language=self._target_language,
145
+ user_prompt=self._user_prompt,
146
+ ),
102
147
  ),
103
- ),
104
- Message(role=MessageRole.USER, message=text),
105
- ]
106
- )
148
+ Message(role=MessageRole.USER, message=text),
149
+ ]
150
+ )
107
151
 
108
- def _fill_into_xml(self, fill: XMLFill, source_text: str, translated_text: str) -> Element:
152
+ def _request_and_submit(
153
+ self,
154
+ hill_climbing: HillClimbing,
155
+ source_text: str,
156
+ translated_text: str,
157
+ callbacks: Callbacks,
158
+ ) -> None:
109
159
  user_message = (
110
160
  f"Source text:\n{source_text}\n\n"
111
- f"XML template:\n```XML\n{encode_friendly(fill.request_element)}\n```\n\n"
161
+ f"XML template:\n```XML\n{encode_friendly(hill_climbing.request_element())}\n```\n\n"
112
162
  f"Translated text:\n{translated_text}"
113
163
  )
114
164
  fixed_messages: list[Message] = [
115
165
  Message(
116
166
  role=MessageRole.SYSTEM,
117
- message=self._llm.template("fill").render(),
167
+ message=self._fill_llm.template("fill").render(),
118
168
  ),
119
169
  Message(
120
170
  role=MessageRole.USER,
121
171
  message=user_message,
122
172
  ),
123
173
  ]
124
-
125
- validator = ProgressiveLockingValidator()
126
174
  conversation_history: list[Message] = []
127
- latest_error: ValidationError | None = None
128
175
 
129
- for _ in range(self._max_retries):
130
- # Request LLM response
131
- response = self._llm.request(
132
- input=fixed_messages + conversation_history,
133
- )
176
+ with self._fill_llm.context(cache_seed_content=self._cache_seed_content) as llm_context:
177
+ error_message: str | None = None
134
178
 
135
- try:
136
- # Extract XML from response
137
- validated_element = _extract_xml_element(response)
179
+ for retry_count in range(self._max_retries):
180
+ response = llm_context.request(fixed_messages + conversation_history)
181
+ validated_element = self._extract_xml_element(response)
182
+ error_message = None
183
+ if isinstance(validated_element, str):
184
+ error_message = validated_element
185
+ elif isinstance(validated_element, Element):
186
+ error_message = hill_climbing.submit(validated_element)
138
187
 
139
- # Validate with progressive locking
140
- is_complete, error_message, newly_locked = validator.validate_with_locking(
141
- template_ele=fill.request_element,
142
- validated_ele=validated_element,
143
- errors_limit=self._max_fill_displaying_errors,
144
- )
188
+ if error_message is None:
189
+ break
145
190
 
146
- if is_complete:
147
- # All nodes locked, fill successful
148
- fill._fill_submitted_texts( # pylint: disable=protected-access
149
- generated_ids_stack=[],
150
- element=validated_element,
191
+ callbacks.on_fill_failed(
192
+ FillFailedEvent(
193
+ error_message=error_message,
194
+ retried_count=retry_count + 1,
195
+ over_maximum_retries=False,
151
196
  )
152
- return validated_element
153
-
154
- # Not complete yet, construct error message with progress info
155
- progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
156
- if newly_locked:
157
- progress_msg += f", {len(newly_locked)} newly locked this round"
158
-
159
- full_error_message = f"{progress_msg}\n\n{error_message}"
160
-
197
+ )
161
198
  conversation_history = [
162
199
  Message(role=MessageRole.ASSISTANT, message=response),
163
- Message(role=MessageRole.USER, message=full_error_message),
200
+ Message(role=MessageRole.USER, message=error_message),
164
201
  ]
202
+ if error_message is not None:
203
+ callbacks.on_fill_failed(
204
+ FillFailedEvent(
205
+ error_message=error_message,
206
+ retried_count=self._max_retries,
207
+ over_maximum_retries=True,
208
+ )
209
+ )
165
210
 
166
- except ValidationError as error:
167
- # XML extraction or basic validation failed
168
- latest_error = error
169
- conversation_history = [
170
- Message(role=MessageRole.ASSISTANT, message=response),
171
- Message(role=MessageRole.USER, message=str(error)),
172
- ]
211
+ def _extract_xml_element(self, text: str) -> Element | str:
212
+ first_xml_element: Element | None = None
213
+ all_xml_elements: int = 0
214
+
215
+ for xml_element in decode_friendly(text, tags="xml"):
216
+ if first_xml_element is None:
217
+ first_xml_element = xml_element
218
+ all_xml_elements += 1
173
219
 
174
- message = f"Failed to get valid XML structure after {self._max_retries} attempts"
175
- if latest_error is None:
176
- raise ValueError(message)
177
- else:
178
- raise ValueError(message) from latest_error
220
+ if first_xml_element is None:
221
+ return "No complete <xml>...</xml> block found. Please ensure you have properly closed the XML with </xml> tag." # noqa: E501
222
+
223
+ if all_xml_elements > 1:
224
+ return (
225
+ f"Found {all_xml_elements} <xml>...</xml> blocks. "
226
+ "Please return only one XML block without any examples or explanations."
227
+ )
228
+ return first_xml_element