epub-translator 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. epub_translator/__init__.py +3 -1
  2. epub_translator/data/fill.jinja +66 -0
  3. epub_translator/data/mmltex/README.md +67 -0
  4. epub_translator/data/mmltex/cmarkup.xsl +1106 -0
  5. epub_translator/data/mmltex/entities.xsl +459 -0
  6. epub_translator/data/mmltex/glayout.xsl +222 -0
  7. epub_translator/data/mmltex/mmltex.xsl +36 -0
  8. epub_translator/data/mmltex/scripts.xsl +375 -0
  9. epub_translator/data/mmltex/tables.xsl +130 -0
  10. epub_translator/data/mmltex/tokens.xsl +328 -0
  11. epub_translator/data/translate.jinja +15 -12
  12. epub_translator/epub/__init__.py +4 -2
  13. epub_translator/epub/common.py +43 -0
  14. epub_translator/epub/math.py +193 -0
  15. epub_translator/epub/placeholder.py +53 -0
  16. epub_translator/epub/spines.py +42 -0
  17. epub_translator/epub/toc.py +505 -0
  18. epub_translator/epub/zip.py +67 -0
  19. epub_translator/iter_sync.py +24 -0
  20. epub_translator/language.py +23 -0
  21. epub_translator/llm/__init__.py +2 -1
  22. epub_translator/llm/core.py +175 -0
  23. epub_translator/llm/error.py +38 -35
  24. epub_translator/llm/executor.py +159 -136
  25. epub_translator/llm/increasable.py +28 -28
  26. epub_translator/llm/types.py +17 -0
  27. epub_translator/serial/__init__.py +2 -0
  28. epub_translator/serial/chunk.py +52 -0
  29. epub_translator/serial/segment.py +17 -0
  30. epub_translator/serial/splitter.py +50 -0
  31. epub_translator/template.py +35 -33
  32. epub_translator/translator.py +205 -168
  33. epub_translator/utils.py +7 -0
  34. epub_translator/xml/__init__.py +4 -3
  35. epub_translator/xml/deduplication.py +38 -0
  36. epub_translator/xml/firendly/__init__.py +2 -0
  37. epub_translator/xml/firendly/decoder.py +75 -0
  38. epub_translator/xml/firendly/encoder.py +84 -0
  39. epub_translator/xml/firendly/parser.py +177 -0
  40. epub_translator/xml/firendly/tag.py +118 -0
  41. epub_translator/xml/firendly/transform.py +36 -0
  42. epub_translator/xml/xml.py +52 -0
  43. epub_translator/xml/xml_like.py +176 -0
  44. epub_translator/xml_translator/__init__.py +3 -0
  45. epub_translator/xml_translator/const.py +2 -0
  46. epub_translator/xml_translator/fill.py +128 -0
  47. epub_translator/xml_translator/format.py +282 -0
  48. epub_translator/xml_translator/fragmented.py +125 -0
  49. epub_translator/xml_translator/group.py +183 -0
  50. epub_translator/xml_translator/progressive_locking.py +256 -0
  51. epub_translator/xml_translator/submitter.py +102 -0
  52. epub_translator/xml_translator/text_segment.py +263 -0
  53. epub_translator/xml_translator/translator.py +178 -0
  54. epub_translator/xml_translator/utils.py +29 -0
  55. epub_translator-0.1.0.dist-info/METADATA +283 -0
  56. epub_translator-0.1.0.dist-info/RECORD +58 -0
  57. epub_translator/data/format.jinja +0 -33
  58. epub_translator/epub/content_parser.py +0 -162
  59. epub_translator/epub/html/__init__.py +0 -1
  60. epub_translator/epub/html/dom_operator.py +0 -62
  61. epub_translator/epub/html/empty_tags.py +0 -23
  62. epub_translator/epub/html/file.py +0 -80
  63. epub_translator/epub/html/texts_searcher.py +0 -46
  64. epub_translator/llm/node.py +0 -201
  65. epub_translator/translation/__init__.py +0 -2
  66. epub_translator/translation/chunk.py +0 -118
  67. epub_translator/translation/splitter.py +0 -78
  68. epub_translator/translation/store.py +0 -36
  69. epub_translator/translation/translation.py +0 -231
  70. epub_translator/translation/types.py +0 -45
  71. epub_translator/translation/utils.py +0 -11
  72. epub_translator/xml/decoder.py +0 -71
  73. epub_translator/xml/encoder.py +0 -95
  74. epub_translator/xml/parser.py +0 -172
  75. epub_translator/xml/tag.py +0 -93
  76. epub_translator/xml/transform.py +0 -34
  77. epub_translator/xml/utils.py +0 -12
  78. epub_translator/zip_context.py +0 -74
  79. epub_translator-0.0.6.dist-info/METADATA +0 -170
  80. epub_translator-0.0.6.dist-info/RECORD +0 -36
  81. {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/LICENSE +0 -0
  82. {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,263 @@
1
+ from collections.abc import Generator, Iterable
2
+ from dataclasses import dataclass
3
+ from enum import Enum, auto
4
+ from typing import Self
5
+ from xml.etree.ElementTree import Element
6
+
7
+ from .utils import expand_left_element_texts, expand_right_element_texts, normalize_text_in_element
8
+
9
# HTML inline-level elements
# Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
# Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
_HTML_INLINE_TAGS = frozenset((
    # Inline text semantics
    "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn",
    "em", "i", "kbd", "mark", "q", "rp", "rt", "ruby", "s", "samp",
    "small", "span", "strong", "sub", "sup", "time", "u", "var", "wbr",
    # Image and multimedia
    "img", "svg", "canvas", "audio", "video", "map", "area",
    # Form elements
    "input", "button", "select", "textarea", "label", "output", "progress", "meter",
    # Embedded content
    "iframe", "embed", "object",
    # Other inline elements
    "script", "del", "ins", "slot",
))
72
+
73
+
74
+ class TextPosition(Enum):
75
+ TEXT = auto()
76
+ TAIL = auto()
77
+
78
+
79
+ @dataclass
80
+ class TextSegment:
81
+ text: str
82
+ index: int # *.text is 0, the first *.tail is 1, and so on
83
+ parent_stack: list[Element]
84
+ left_common_depth: int
85
+ right_common_depth: int
86
+ block_depth: int
87
+ position: TextPosition
88
+
89
+ @property
90
+ def root(self) -> Element:
91
+ return self.parent_stack[0]
92
+
93
+ @property
94
+ def block_parent(self) -> Element:
95
+ return self.parent_stack[self.block_depth - 1]
96
+
97
+ @property
98
+ def xml_text(self) -> str:
99
+ return "".join(_expand_xml_texts(self))
100
+
101
+ def strip_block_parents(self) -> Self:
102
+ self.parent_stack = self.parent_stack[self.block_depth - 1 :]
103
+ self.block_depth = 1
104
+ return self
105
+
106
+ def clone(self) -> "TextSegment":
107
+ return TextSegment(
108
+ text=self.text,
109
+ index=self.index,
110
+ parent_stack=list(self.parent_stack),
111
+ left_common_depth=self.left_common_depth,
112
+ right_common_depth=self.right_common_depth,
113
+ block_depth=self.block_depth,
114
+ position=self.position,
115
+ )
116
+
117
+
118
def _expand_xml_texts(segment: TextSegment):
    """Yield the segment's text wrapped in synthetic tag pieces.

    Opening pieces are emitted for every ancestor at or below the left
    common depth, closing pieces for every ancestor at or below the right
    common depth (deepest closed first).
    """
    ancestors = segment.parent_stack
    for opened in ancestors[segment.left_common_depth:]:
        yield from expand_left_element_texts(opened)
    yield segment.text
    for closed in reversed(ancestors[segment.right_common_depth:]):
        yield from expand_right_element_texts(closed)
124
+
125
+
126
def incision_between(segment1: TextSegment, segment2: TextSegment, block_weight: int = 3) -> tuple[int, int]:
    """Score the split point between two adjacent segments.

    Returns the incision score of the right edge of ``segment1`` and the
    left edge of ``segment2``. ``block_weight`` controls how much heavier a
    block-level boundary weighs than an inline one (default 3, matching the
    original hard-coded constant, so existing callers are unaffected).
    """
    return (
        _incision_of(segment1, segment1.right_common_depth, block_weight),
        _incision_of(segment2, segment2.left_common_depth, block_weight),
    )


def _incision_of(segment: TextSegment, common_depth: int, block_weight: int = 3) -> int:
    """Compute the incision score of one segment edge.

    Depth differences above the block parent count as block-level crossings
    and are weighted by ``block_weight``; differences below it count once.
    """
    block_diff: int = 0
    inline_diff: int = 0
    if common_depth >= segment.block_depth:
        inline_diff = len(segment.parent_stack) - common_depth
    else:
        block_diff = segment.block_depth - common_depth
        inline_diff = len(segment.parent_stack) - segment.block_depth
    # The larger the score, the more readily the text is split here.
    return block_diff * block_weight + inline_diff
142
+
143
+
144
def search_text_segments(root: Element) -> Generator[TextSegment, None, None]:
    """Walk *root* and yield its text segments with common depths stitched in.

    For each adjacent pair of segments, the shared ancestor depth is stored
    as the left segment's ``right_common_depth`` and the right segment's
    ``left_common_depth`` before the left segment is yielded.
    """
    pending: TextSegment | None = None
    for segment in _search_text_segments([], root):
        if pending is not None:
            depth = _common_depth(
                stack1=pending.parent_stack,
                stack2=segment.parent_stack,
            )
            pending.right_common_depth = depth
            segment.left_common_depth = depth
            yield pending
        pending = segment
    if pending is not None:
        yield pending
164
+
165
+
166
def _search_text_segments(stack: list[Element], element: Element) -> Generator[TextSegment, None, None]:
    """Recursively yield raw segments for *element* and its subtree.

    All segments attached directly to *element* share the very same
    ``parents`` list object — identity matters later for common-depth checks.
    Common depths are left at 0 here; the caller stitches them afterwards.
    """
    parents = stack + [element]
    block_depth = _find_block_depth(parents)

    head_text = normalize_text_in_element(element.text)
    if head_text is not None:
        yield TextSegment(
            text=head_text,
            index=0,
            parent_stack=parents,
            left_common_depth=0,
            right_common_depth=0,
            block_depth=block_depth,
            position=TextPosition.TEXT,
        )
    # Tail indices are 1-based: *.text is 0, the first child's *.tail is 1.
    for tail_index, child in enumerate(element, start=1):
        yield from _search_text_segments(parents, child)
        tail_text = normalize_text_in_element(child.tail)
        if tail_text is not None:
            yield TextSegment(
                text=tail_text,
                index=tail_index,
                parent_stack=parents,
                left_common_depth=0,
                right_common_depth=0,
                block_depth=block_depth,
                position=TextPosition.TAIL,
            )
194
+
195
+
196
def _find_block_depth(parent_stack: list[Element]) -> int:
    """Return the 1-based depth of the nearest block-level ancestor.

    Scans from the deepest element upward; if every tag is inline (or the
    stack is empty) the root is treated as the block, giving depth 1.
    """
    for position in range(len(parent_stack) - 1, -1, -1):
        if parent_stack[position].tag.lower() not in _HTML_INLINE_TAGS:
            return position + 1  # depth is a count, not an index
    return 1
204
+
205
+
206
def combine_text_segments(segments: Iterable[TextSegment]) -> Generator[tuple[Element, dict[int, Element]], None, None]:
    """Rebuild XML trees from a stream of text segments.

    Segments that share ancestors (by object identity) are merged back under
    freshly created Elements that mirror the raw tags/attribs. Each time the
    stream moves to an unrelated root (common depth 0), the finished tree is
    yielded together with a mapping from ``id(raw_element)`` to its generated
    counterpart; a final tree, if any, is yielded after the loop.
    """
    stack: list[tuple[Element, Element]] = []  # (raw, generated)
    raw2generated: dict[int, Element] = {}
    # Most recently closed generated element; segment text after a close
    # becomes that element's tail rather than the parent's text.
    last_popped: Element | None = None

    for segment in segments:
        common_depth = _common_depth(
            stack1=(raw for raw, _ in stack),
            stack2=segment.parent_stack,
        )
        if stack and common_depth == 0:
            # New root entirely: flush the current tree and start over.
            yield stack[0][1], raw2generated
            stack = []
            raw2generated = {}
            last_popped = None

        # Close elements the new segment is not inside of.
        while len(stack) > common_depth:
            last_popped = stack.pop()[1]
        # Open (and generate) elements down to the segment's own parent.
        while len(stack) < len(segment.parent_stack):
            last_popped = None  # descending again, so text goes to *.text
            index = len(stack)
            raw = segment.parent_stack[index]
            generated = Element(raw.tag, raw.attrib)
            if stack:
                _, generated_parent = stack[-1]
                generated_parent.append(generated)
            stack.append((raw, generated))
            raw2generated[id(raw)] = generated

        if last_popped is None:
            if stack:
                stack[-1][1].text = _append_element_text(
                    text=stack[-1][1].text,
                    appended=segment.text,
                )
        else:
            last_popped.tail = _append_element_text(
                text=last_popped.tail,
                appended=segment.text,
            )
    if stack:
        yield stack[0][1], raw2generated
248
+
249
+
250
def _common_depth(stack1: Iterable[Element], stack2: Iterable[Element]) -> int:
    """Count leading positions where both stacks hold the very same Element.

    Comparison is by object identity, not equality.
    """
    shared = 0
    for left, right in zip(stack1, stack2):
        if left is not right:
            break
        shared += 1
    return shared
257
+
258
+
259
def _append_element_text(text: str | None, appended: str) -> str:
    """Concatenate onto possibly-absent element text (None acts as empty)."""
    return appended if text is None else text + appended
@@ -0,0 +1,178 @@
1
+ from collections.abc import Generator, Iterable
2
+ from typing import TypeVar
3
+ from xml.etree.ElementTree import Element
4
+
5
+ from ..iter_sync import IterSync
6
+ from ..llm import LLM, Message, MessageRole
7
+ from ..xml import encode_friendly
8
+ from .fill import XMLFill
9
+ from .format import ValidationError, _extract_xml_element
10
+ from .group import XMLGroupContext
11
+ from .progressive_locking import ProgressiveLockingValidator
12
+ from .text_segment import TextSegment
13
+
14
# Payload type carried alongside each element through translation, unchanged.
T = TypeVar("T")
15
+
16
+
17
class XMLTranslator:
    """Translates XML element trees via an LLM, then fills the translation
    back into the original XML structure with validated, retried requests.
    """

    def __init__(
        self,
        llm: LLM,
        group_context: XMLGroupContext,
        target_language: str,
        user_prompt: str | None,
        ignore_translated_error: bool,
        max_retries: int,
        max_fill_displaying_errors: int,
    ) -> None:
        """Store collaborators and limits.

        llm: client used for both the translate and fill conversations.
        group_context: splits element streams into translatable groups.
        target_language: language the translate template renders for.
        user_prompt: optional extra instruction passed to the template.
        ignore_translated_error: stored but not read anywhere in this
            class — presumably consumed elsewhere; TODO confirm.
        max_retries: attempts allowed in the fill/validate loop.
        max_fill_displaying_errors: cap on errors shown per retry round.
        """
        self._llm: LLM = llm
        self._group_context: XMLGroupContext = group_context
        self._target_language: str = target_language
        self._user_prompt: str | None = user_prompt
        self._ignore_translated_error: bool = ignore_translated_error
        self._max_retries: int = max_retries
        self._max_fill_displaying_errors: int = max_fill_displaying_errors

    def translate_to_element(self, element: Element) -> Element:
        """Translate a single element and return its translated tree.

        Raises RuntimeError if the pipeline unexpectedly yields nothing.
        """
        for translated, _, _ in self.translate_to_text_segments(((element, None),)):
            return translated
        raise RuntimeError("Translation failed unexpectedly")

    def translate_to_text_segments(
        self, items: Iterable[tuple[Element, T]]
    ) -> Generator[tuple[Element, list[TextSegment], T], None, None]:
        """Translate a stream of (element, payload) pairs.

        Yields (element, translated segments, payload) triples. IterSync
        pairs each translated segment back with the item it came from:
        items whose element precedes the segment's root are flushed with the
        segments accumulated so far, then the segment is buffered for its
        own element. Remaining items are flushed after translation ends.
        """
        sync: IterSync[tuple[Element, T]] = IterSync()
        text_segments: list[TextSegment] = []

        for text_segment in self._translate_text_segments(
            elements=(e for e, _ in sync.iter(items)),
        ):
            while True:
                if sync.tail is None:
                    break
                tail_element, _ = sync.tail
                # Stop flushing once we reach the element this segment belongs to.
                if id(tail_element) == id(text_segment.root):
                    break
                tail_element, payload = sync.take()
                yield tail_element, text_segments, payload
                text_segments = []
            text_segments.append(text_segment)

        # Flush items that produced no further segments.
        while sync.tail is not None:
            tail_element, payload = sync.take()
            yield tail_element, text_segments, payload
            text_segments = []

    def _translate_text_segments(self, elements: Iterable[Element]) -> Generator[TextSegment, None, None]:
        """Translate elements group by group, yielding each group's segments.

        For every group: render its text, translate it, fill the translation
        back into the group's XML template, then yield the group body.
        """
        for group in self._group_context.split_groups(elements):
            text_segments = list(group)
            fill = XMLFill(text_segments)
            source_text = "".join(self._render_text_segments(text_segments))
            translated_text = self._translate_text(source_text)
            self._fill_into_xml(
                fill=fill,
                source_text=source_text,
                translated_text=translated_text,
            )
            yield from group.body

    def _render_text_segments(self, segments: Iterable[TextSegment]) -> Generator[str, None, None]:
        """Yield segment texts, inserting a blank line between block parents."""
        iterator = iter(segments)
        segment = next(iterator, None)
        if segment is None:
            return
        while True:
            next_segment = next(iterator, None)
            if next_segment is None:
                break
            yield segment.text
            # Paragraph break whenever the block-level parent changes.
            if id(segment.block_parent) != id(next_segment.block_parent):
                yield "\n\n"
            segment = next_segment
        yield segment.text

    def _translate_text(self, text: str) -> str:
        """Run one translation request: system prompt from the template, user text."""
        return self._llm.request(
            input=[
                Message(
                    role=MessageRole.SYSTEM,
                    message=self._llm.template("translate").render(
                        target_language=self._target_language,
                        user_prompt=self._user_prompt,
                    ),
                ),
                Message(role=MessageRole.USER, message=text),
            ]
        )

    def _fill_into_xml(self, fill: XMLFill, source_text: str, translated_text: str) -> Element:
        """Ask the LLM to fill the translated text into the XML template.

        Retries up to ``self._max_retries`` times, feeding validation errors
        (or progressive-locking progress) back into the conversation. Returns
        the validated element on success; raises ValueError on exhaustion,
        chained from the last ValidationError if one occurred.
        """
        user_message = (
            f"Source text:\n{source_text}\n\n"
            f"XML template:\n```XML\n{encode_friendly(fill.request_element)}\n```\n\n"
            f"Translated text:\n{translated_text}"
        )
        fixed_messages: list[Message] = [
            Message(
                role=MessageRole.SYSTEM,
                message=self._llm.template("fill").render(),
            ),
            Message(
                role=MessageRole.USER,
                message=user_message,
            ),
        ]

        validator = ProgressiveLockingValidator()
        conversation_history: list[Message] = []
        latest_error: ValidationError | None = None

        for _ in range(self._max_retries):
            # Request LLM response
            response = self._llm.request(
                input=fixed_messages + conversation_history,
            )

            try:
                # Extract XML from response
                validated_element = _extract_xml_element(response)

                # Validate with progressive locking
                is_complete, error_message, newly_locked = validator.validate_with_locking(
                    template_ele=fill.request_element,
                    validated_ele=validated_element,
                    errors_limit=self._max_fill_displaying_errors,
                )

                if is_complete:
                    # All nodes locked, fill successful
                    fill._fill_submitted_texts(  # pylint: disable=protected-access
                        generated_ids_stack=[],
                        element=validated_element,
                    )
                    return validated_element

                # Not complete yet, construct error message with progress info
                progress_msg = f"Progress: {len(validator.locked_ids)} nodes locked"
                if newly_locked:
                    progress_msg += f", {len(newly_locked)} newly locked this round"

                full_error_message = f"{progress_msg}\n\n{error_message}"

                conversation_history = [
                    Message(role=MessageRole.ASSISTANT, message=response),
                    Message(role=MessageRole.USER, message=full_error_message),
                ]

            except ValidationError as error:
                # XML extraction or basic validation failed
                latest_error = error
                conversation_history = [
                    Message(role=MessageRole.ASSISTANT, message=response),
                    Message(role=MessageRole.USER, message=str(error)),
                ]

        message = f"Failed to get valid XML structure after {self._max_retries} attempts"
        if latest_error is None:
            raise ValueError(message)
        else:
            # `from latest_error` preserves the original cause in the traceback.
            raise ValueError(message) from latest_error
@@ -0,0 +1,29 @@
1
+ from xml.etree.ElementTree import Element
2
+
3
+ from ..utils import normalize_whitespace
4
+ from .const import DATA_ORIGIN_LEN_KEY, ID_KEY
5
+
6
+
7
def normalize_text_in_element(text: str | None) -> str | None:
    """Collapse whitespace; return None for absent or whitespace-only text."""
    if text is None:
        return None
    collapsed = normalize_whitespace(text)
    if collapsed.strip():
        return collapsed
    return None
14
+
15
+
16
def expand_left_element_texts(element: Element):
    """Yield the pieces of a synthetic opening tag for *element*.

    Emits `<tag ID_KEY="99" DATA_ORIGIN_LEN_KEY="999">` piece by piece.
    NOTE(review): "99"/"999" look like fixed-width placeholder values —
    confirm against the consumers that measure rendered length.
    """
    yield from ("<", element.tag, " ", ID_KEY, '="99" ', DATA_ORIGIN_LEN_KEY, '="999">')
24
+
25
+
26
def expand_right_element_texts(element: Element):
    """Yield the pieces of a closing tag for *element*: "</", tag, ">"."""
    yield from ("</", element.tag, ">")