epub-translator 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. epub_translator/__init__.py +9 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +32 -113
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/punctuation.py +34 -0
  15. epub_translator/segment/__init__.py +26 -0
  16. epub_translator/segment/block_segment.py +124 -0
  17. epub_translator/segment/common.py +29 -0
  18. epub_translator/segment/inline_segment.py +356 -0
  19. epub_translator/{xml_translator → segment}/text_segment.py +7 -72
  20. epub_translator/segment/utils.py +43 -0
  21. epub_translator/translator.py +152 -184
  22. epub_translator/utils.py +33 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/const.py +1 -0
  25. epub_translator/xml/deduplication.py +3 -3
  26. epub_translator/xml/inline.py +67 -0
  27. epub_translator/xml/self_closing.py +182 -0
  28. epub_translator/xml/utils.py +42 -0
  29. epub_translator/xml/xml.py +7 -0
  30. epub_translator/xml/xml_like.py +8 -33
  31. epub_translator/xml_interrupter.py +165 -0
  32. epub_translator/xml_translator/__init__.py +3 -3
  33. epub_translator/xml_translator/callbacks.py +34 -0
  34. epub_translator/xml_translator/{const.py → common.py} +0 -1
  35. epub_translator/xml_translator/hill_climbing.py +104 -0
  36. epub_translator/xml_translator/stream_mapper.py +253 -0
  37. epub_translator/xml_translator/submitter.py +352 -91
  38. epub_translator/xml_translator/translator.py +182 -114
  39. epub_translator/xml_translator/validation.py +458 -0
  40. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/METADATA +134 -21
  41. epub_translator-0.1.4.dist-info/RECORD +68 -0
  42. epub_translator/epub/placeholder.py +0 -53
  43. epub_translator/iter_sync.py +0 -24
  44. epub_translator/xml_translator/fill.py +0 -128
  45. epub_translator/xml_translator/format.py +0 -282
  46. epub_translator/xml_translator/fragmented.py +0 -125
  47. epub_translator/xml_translator/group.py +0 -183
  48. epub_translator/xml_translator/progressive_locking.py +0 -256
  49. epub_translator/xml_translator/utils.py +0 -29
  50. epub_translator-0.1.1.dist-info/RECORD +0 -58
  51. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/LICENSE +0 -0
  52. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/WHEEL +0 -0
@@ -1,102 +1,363 @@
1
- from collections.abc import Iterable
1
+ from collections.abc import Generator
2
+ from dataclasses import dataclass
3
+ from enum import Enum, auto
2
4
  from xml.etree.ElementTree import Element
3
5
 
4
- from ..xml import iter_with_stack
5
- from .text_segment import TextPosition, TextSegment, combine_text_segments
6
-
7
-
8
def submit_text_segments(element: Element, text_segments: Iterable[TextSegment]):
    """Write translated text segments into the tree rooted at `element`.

    The segments are grouped by the id() of their block parent. The groups
    returned by `_extract_flatten_text_segments` are taken out of that map and
    handed to `_replace_text_segments`; whatever stays in the map is handled
    by `_append_text_segments`.
    """
    segments_by_parent = _group_text_segments(text_segments)
    # Must run before the append step: the extraction removes its groups from
    # `segments_by_parent`, so the append step no longer sees them.
    flattened = dict(_extract_flatten_text_segments(element, segments_by_parent))
    _append_text_segments(element, segments_by_parent)
    _replace_text_segments(element, flattened)
13
-
14
-
15
- def _group_text_segments(text_segments: Iterable[TextSegment]):
16
- grouped_map: dict[int, list[TextSegment]] = {}
17
- for text_segment in text_segments:
18
- parent_id = id(text_segment.block_parent)
19
- grouped = grouped_map.get(parent_id, None)
20
- if grouped is None:
21
- grouped_map[parent_id] = grouped = []
22
- grouped_map[parent_id].append(text_segment)
23
- return grouped_map
24
-
25
-
26
- # 被覆盖的 block 表示一种偶然现象,由于它的子元素会触发 append 操作,若对它也进行 append 操作阅读顺序会混乱
27
- # 此时只能在它的所有文本后立即接上翻译后的文本
28
- def _extract_flatten_text_segments(element: Element, grouped_map: dict[int, list[TextSegment]]):
29
- override_parent_ids: set[int] = set()
30
- for parents, child_element in iter_with_stack(element):
31
- if id(child_element) not in grouped_map:
32
- continue
33
- for parent in parents[:-1]:
34
- parent_id = id(parent)
35
- if parent_id in grouped_map:
36
- override_parent_ids.add(parent_id)
37
-
38
- if id(element) in grouped_map:
39
- override_parent_ids.add(id(element)) # root 不会出现在 parents 中需单独添加
40
-
41
- for parent_id in override_parent_ids:
42
- yield parent_id, grouped_map.pop(parent_id)
43
-
44
-
45
- def _replace_text_segments(element: Element, text_segments: dict[int, list[TextSegment]]):
46
- for _, child_element in iter_with_stack(element):
47
- tail_text_segments: list[TextSegment] = []
48
- for text_segment in text_segments.get(id(child_element), ()):
49
- if text_segment.position == TextPosition.TEXT:
50
- child_element.text = _append_text(
51
- origin_text=child_element.text,
52
- append_text=text_segment.text,
53
- )
54
- elif text_segment.position == TextPosition.TAIL:
55
- tail_text_segments.append(text_segment)
6
+ from ..segment import TextSegment, combine_text_segments
7
+ from ..xml import index_of_parent, is_inline_tag, iter_with_stack
8
+ from .stream_mapper import InlineSegmentMapping
56
9
 
57
- tail_text_segments.sort(key=lambda t: t.index)
58
- tail_text_segments.reverse()
59
- for cc_element in child_element:
60
- if not tail_text_segments:
61
- break
62
- if cc_element.tail is not None:
63
- cc_element.tail = _append_text(
64
- origin_text=cc_element.tail,
65
- append_text=tail_text_segments.pop().text,
10
+
11
class SubmitKind(Enum):
    """How translated content is merged back into the XML tree."""

    # Member values are the ones `auto()` assigns (1, 2, 3 in definition order).

    # The original content is removed and the translation takes its place.
    REPLACE = 1
    # The translation is appended to the text already inside the element.
    APPEND_TEXT = 2
    # The translation is inserted as a new element right after the original
    # block; nodes that contain child blocks still use text appending.
    APPEND_BLOCK = 3
15
+
16
+
17
def submit(element: Element, action: SubmitKind, mappings: list[InlineSegmentMapping]) -> Element:
    """Write the translated segments in `mappings` into the tree of `element`.

    Args:
        element: Root of the tree that is modified in place.
        action: How the translation is merged with the original content.
        mappings: Pairs of (block element, translated text segments).

    Returns:
        The element reported by `_Submitter.do()` when it is not None,
        otherwise `element` itself.
    """
    replaced_root = _Submitter(element=element, action=action, mappings=mappings).do()
    return element if replaced_root is None else replaced_root
28
+
29
+
30
+ @dataclass
31
+ class _Node:
32
+ raw_element: Element
33
+ items: list[tuple[list[TextSegment], "_Node"]] # empty for peak, non-empty for platform
34
+ tail_text_segments: list[TextSegment]
35
+
36
+
37
+ class _Submitter:
38
+ def __init__(
39
+ self,
40
+ element: Element,
41
+ action: SubmitKind,
42
+ mappings: list[InlineSegmentMapping],
43
+ ) -> None:
44
+ self._action: SubmitKind = action
45
+ self._nodes: list[_Node] = list(_nest_nodes(mappings))
46
+ self._parents: dict[int, Element] = self._collect_parents(element, mappings)
47
+
48
+ def _collect_parents(self, element: Element, mappings: list[InlineSegmentMapping]):
49
+ ids: set[int] = set(id(e) for e, _ in mappings)
50
+ parents_dict: dict[int, Element] = {}
51
+ for parents, child in iter_with_stack(element):
52
+ if parents and id(child) in ids:
53
+ parents_dict[id(child)] = parents[-1]
54
+ return parents_dict
55
+
56
+ def do(self):
57
+ replaced_root: Element | None = None
58
+
59
+ for node in self._nodes:
60
+ submitted = self._submit_node(node)
61
+ if replaced_root is None:
62
+ replaced_root = submitted
63
+
64
+ return replaced_root
65
+
66
+ # @return replaced root element, or None if appended to parent
67
+ def _submit_node(self, node: _Node) -> Element | None:
68
+ if node.items or self._action == SubmitKind.APPEND_TEXT:
69
+ return self._submit_by_text(node)
70
+ else:
71
+ return self._submit_by_block(node)
72
+
73
+ def _submit_by_block(self, node: _Node) -> Element | None:
74
+ parent = self._parents.get(id(node.raw_element), None)
75
+ if parent is None:
76
+ return node.raw_element
77
+
78
+ preserved_elements: list[Element] = []
79
+ if self._action == SubmitKind.REPLACE:
80
+ for child in list(node.raw_element):
81
+ if not is_inline_tag(child.tag):
82
+ child.tail = None
83
+ preserved_elements.append(child)
84
+
85
+ index = index_of_parent(parent, node.raw_element)
86
+ combined = self._combine_text_segments(node.tail_text_segments)
87
+
88
+ if combined is not None:
89
+ # 在 APPEND_BLOCK 模式下,如果是 inline tag,则在文本前面加空格
90
+ if self._action == SubmitKind.APPEND_BLOCK and is_inline_tag(combined.tag) and combined.text:
91
+ combined.text = " " + combined.text
92
+ parent.insert(index + 1, combined)
93
+ index += 1
94
+
95
+ for elem in preserved_elements:
96
+ parent.insert(index + 1, elem)
97
+ index += 1
98
+
99
+ if combined is not None or preserved_elements:
100
+ if preserved_elements:
101
+ preserved_elements[-1].tail = node.raw_element.tail
102
+ elif combined is not None:
103
+ combined.tail = node.raw_element.tail
104
+ node.raw_element.tail = None
105
+
106
+ if self._action == SubmitKind.REPLACE:
107
+ parent.remove(node.raw_element)
108
+
109
+ return None
110
+
111
+ def _submit_by_text(self, node: _Node) -> Element | None:
112
+ replaced_root: Element | None = None
113
+ child_nodes = dict((id(node), node) for _, node in node.items)
114
+ last_tail_element: Element | None = None
115
+ tail_elements: dict[int, Element] = {}
116
+
117
+ for child_element in node.raw_element:
118
+ child_node = child_nodes.get(id(child_element), None)
119
+ if child_node is not None:
120
+ if last_tail_element is not None:
121
+ tail_elements[id(child_element)] = last_tail_element
122
+ last_tail_element = child_element
123
+
124
+ for text_segments, child_node in node.items:
125
+ tail_element = tail_elements.get(id(child_node.raw_element), None)
126
+ items_preserved_elements: list[Element] = []
127
+
128
+ if self._action == SubmitKind.REPLACE:
129
+ end_index = index_of_parent(node.raw_element, child_node.raw_element)
130
+ items_preserved_elements = self._remove_elements_after_tail(
131
+ node_element=node.raw_element,
132
+ tail_element=tail_element,
133
+ end_index=end_index,
66
134
  )
67
135
 
136
+ self._append_combined_after_tail(
137
+ node_element=node.raw_element,
138
+ text_segments=text_segments,
139
+ tail_element=tail_element,
140
+ append_to_end=False,
141
+ ref_element=child_node.raw_element,
142
+ )
143
+ if items_preserved_elements:
144
+ insert_position = index_of_parent(node.raw_element, child_node.raw_element)
145
+ for i, elem in enumerate(items_preserved_elements):
146
+ node.raw_element.insert(insert_position + i, elem)
147
+
148
+ for _, child_node in node.items:
149
+ submitted = self._submit_node(child_node)
150
+ if replaced_root is None:
151
+ replaced_root = submitted
68
152
 
69
- def _append_text_segments(element: Element, grouped_map: dict[int, list[TextSegment]]):
70
- for parents, child_element in iter_with_stack(element):
71
- if not parents:
72
- continue
73
- grouped = grouped_map.get(id(child_element))
74
- if not grouped:
75
- continue
76
- parent = parents[-1]
77
- index = _index_of_parent(parents[-1], child_element)
78
- combined = next(
79
- combine_text_segments(
80
- segments=(t.strip_block_parents() for t in grouped),
81
- ),
82
- None,
153
+ if node.raw_element:
154
+ last_tail_element = node.raw_element[-1]
155
+ else:
156
+ last_tail_element = None
157
+
158
+ tail_preserved_elements: list[Element] = []
159
+ if self._action == SubmitKind.REPLACE:
160
+ tail_preserved_elements = self._remove_elements_after_tail(
161
+ node_element=node.raw_element,
162
+ tail_element=last_tail_element,
163
+ end_index=None, # None 表示删除到末尾
164
+ )
165
+ self._append_combined_after_tail(
166
+ node_element=node.raw_element,
167
+ text_segments=node.tail_text_segments,
168
+ tail_element=last_tail_element,
169
+ ref_element=None,
170
+ append_to_end=True,
83
171
  )
84
- if combined is not None:
85
- combined_element, _ = combined
86
- parent.insert(index + 1, combined_element)
87
- combined_element.tail = child_element.tail
88
- child_element.tail = None
172
+ if tail_preserved_elements:
173
+ for elem in tail_preserved_elements:
174
+ node.raw_element.append(elem)
175
+
176
+ return replaced_root
177
+
178
+ def _remove_elements_after_tail(
179
+ self,
180
+ node_element: Element,
181
+ tail_element: Element | None,
182
+ end_index: int | None = None,
183
+ ) -> list[Element]:
184
+ if tail_element is None:
185
+ start_index = 0
186
+ node_element.text = None
187
+ else:
188
+ start_index = index_of_parent(node_element, tail_element) + 1
189
+ tail_element.tail = None
190
+
191
+ if end_index is None:
192
+ end_index = len(node_element)
193
+
194
+ preserved_elements: list[Element] = []
195
+ for i in range(start_index, end_index):
196
+ elem = node_element[i]
197
+ if not is_inline_tag(elem.tag):
198
+ elem.tail = None
199
+ preserved_elements.append(elem)
200
+
201
+ for i in range(end_index - 1, start_index - 1, -1):
202
+ node_element.remove(node_element[i])
203
+
204
+ return preserved_elements
205
+
206
+ def _append_combined_after_tail(
207
+ self,
208
+ node_element: Element,
209
+ text_segments: list[TextSegment],
210
+ tail_element: Element | None,
211
+ ref_element: Element | None,
212
+ append_to_end: bool,
213
+ ) -> None:
214
+ combined = self._combine_text_segments(text_segments)
215
+ if combined is None:
216
+ return
217
+
218
+ if combined.text:
219
+ will_inject_space = self._action == SubmitKind.APPEND_TEXT or (
220
+ is_inline_tag(combined.tag) and self._action == SubmitKind.APPEND_BLOCK
221
+ )
222
+ if tail_element is not None:
223
+ tail_element.tail = self._append_text_in_element(
224
+ origin_text=tail_element.tail,
225
+ append_text=combined.text,
226
+ will_inject_space=will_inject_space,
227
+ )
228
+ elif ref_element is None:
229
+ node_element.text = self._append_text_in_element(
230
+ origin_text=node_element.text,
231
+ append_text=combined.text,
232
+ will_inject_space=will_inject_space,
233
+ )
234
+ else:
235
+ ref_index = index_of_parent(node_element, ref_element)
236
+ if ref_index > 0:
237
+ # 添加到前一个元素的 tail
238
+ prev_element = node_element[ref_index - 1]
239
+ prev_element.tail = self._append_text_in_element(
240
+ origin_text=prev_element.tail,
241
+ append_text=combined.text,
242
+ will_inject_space=will_inject_space,
243
+ )
244
+ else:
245
+ # ref_element 是第一个元素,添加到 node_element.text
246
+ node_element.text = self._append_text_in_element(
247
+ origin_text=node_element.text,
248
+ append_text=combined.text,
249
+ will_inject_space=will_inject_space,
250
+ )
251
+
252
+ if tail_element is not None:
253
+ insert_position = index_of_parent(node_element, tail_element) + 1
254
+ elif append_to_end:
255
+ insert_position = len(node_element)
256
+ elif ref_element is not None:
257
+ # 使用 ref_element 来定位插入位置
258
+ # 如果文本被添加到前一个元素的 tail,则在前一个元素之后插入
259
+ ref_index = index_of_parent(node_element, ref_element)
260
+ if ref_index > 0:
261
+ # 在前一个元素之后插入
262
+ insert_position = ref_index
263
+ else:
264
+ # ref_element 是第一个元素,插入到开头
265
+ insert_position = 0
266
+ else:
267
+ insert_position = 0
268
+
269
+ for i, child in enumerate(combined):
270
+ node_element.insert(insert_position + i, child)
271
+
272
+ def _combine_text_segments(self, text_segments: list[TextSegment]) -> Element | None:
273
+ segments = (t.strip_block_parents() for t in text_segments)
274
+ combined = next(combine_text_segments(segments), None)
275
+ if combined is None:
276
+ return None
277
+ else:
278
+ return combined[0]
279
+
280
+ def _append_text_in_element(
281
+ self,
282
+ origin_text: str | None,
283
+ append_text: str,
284
+ will_inject_space: bool,
285
+ ) -> str:
286
+ if origin_text is None:
287
+ return append_text
288
+ elif will_inject_space:
289
+ return origin_text.rstrip() + " " + append_text.lstrip()
290
+ else:
291
+ return origin_text + append_text
292
+
293
+
294
def _nest_nodes(mappings: list[InlineSegmentMapping]) -> Generator[_Node, None, None]:
    """Rebuild the nesting of the mapped block elements and yield the top-level nodes.

    Text that needs translation ends up in one of two shapes.

    The common one is a "peak": a block without nested blocks (inline tags do
    not count as nested structure), whose text can simply be replaced or
    appended::

        <div>Some text <b>bold text</b> more text.</div>

    The rare one is a "platform": a block whose text is cut into pieces by
    other peaks/platforms inside it::

        <div>
          Some text before.
          <!-- the peak below interrupts the reading flow -->
          <div>Paragraph 1.</div>
          Some text in between.
        </div>

    Replacing or appending on a platform as a whole would break the reading
    flow. That is why the flat mappings are turned back into a tree here, so
    that platforms can be handled separately. The working assumption is that
    peaks provide about 95% of the reading experience; this step exists to
    cover the remaining platforms.
    """
    open_nodes: list[_Node] = []

    for block_element, text_segments in mappings:
        # Candidate depths from the deepest open node outwards; depth d
        # refers to open_nodes[d - 1].
        depths = range(len(open_nodes), 0, -1)

        resumed_depth = next(
            (d for d in depths if open_nodes[d - 1].raw_element is block_element),
            None,
        )
        if resumed_depth is not None:
            # The element is already open: close everything nested above it
            # and collect these segments as text following its children.
            while len(open_nodes) > resumed_depth:
                _fold_top_of_stack(open_nodes)
            open_nodes[resumed_depth - 1].tail_text_segments.extend(text_segments)
            continue

        # Otherwise keep only the open nodes that contain the new element.
        ancestor_depth = next(
            (d for d in depths if _check_includes(open_nodes[d - 1].raw_element, block_element)),
            0,
        )
        while len(open_nodes) > ancestor_depth:
            finished = _fold_top_of_stack(open_nodes)
            if finished is not None:
                yield finished

        open_nodes.append(
            _Node(
                raw_element=block_element,
                items=[],
                tail_text_segments=list(text_segments),
            )
        )

    # Close whatever is still open once all mappings are consumed.
    while open_nodes:
        finished = _fold_top_of_stack(open_nodes)
        if finished is not None:
            yield finished
89
347
 
90
348
 
91
- def _index_of_parent(parent: Element, checked_element: Element) -> int:
92
- for i, child in enumerate(parent):
93
- if child == checked_element:
94
- return i
95
- raise ValueError("Element not found in parent.")
349
+ def _fold_top_of_stack(stack: list[_Node]):
350
+ child_node = stack.pop()
351
+ if not stack:
352
+ return child_node
353
+ parent_node = stack[-1]
354
+ parent_node.items.append((parent_node.tail_text_segments, child_node))
355
+ parent_node.tail_text_segments = []
356
+ return None
96
357
 
97
358
 
98
- def _append_text(origin_text: str | None, append_text: str) -> str:
99
- if origin_text is None:
100
- return append_text
101
- else:
102
- return origin_text + append_text
359
def _check_includes(parent: Element, child: Element) -> bool:
    """Return True when `child` is among the elements `iter_with_stack(parent)` yields.

    Elements are compared by identity, not by equality.
    """
    return any(candidate is child for _, candidate in iter_with_stack(parent))