epub-translator 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. epub_translator/__init__.py +2 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +39 -62
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/segment/__init__.py +26 -0
  15. epub_translator/segment/block_segment.py +124 -0
  16. epub_translator/segment/common.py +29 -0
  17. epub_translator/segment/inline_segment.py +356 -0
  18. epub_translator/{xml_translator → segment}/text_segment.py +8 -8
  19. epub_translator/segment/utils.py +43 -0
  20. epub_translator/translator.py +150 -183
  21. epub_translator/utils.py +33 -0
  22. epub_translator/xml/__init__.py +2 -0
  23. epub_translator/xml/const.py +1 -0
  24. epub_translator/xml/deduplication.py +3 -3
  25. epub_translator/xml/self_closing.py +182 -0
  26. epub_translator/xml/utils.py +42 -0
  27. epub_translator/xml/xml.py +7 -0
  28. epub_translator/xml/xml_like.py +145 -115
  29. epub_translator/xml_interrupter.py +165 -0
  30. epub_translator/xml_translator/__init__.py +1 -2
  31. epub_translator/xml_translator/callbacks.py +34 -0
  32. epub_translator/xml_translator/{const.py → common.py} +0 -1
  33. epub_translator/xml_translator/hill_climbing.py +104 -0
  34. epub_translator/xml_translator/stream_mapper.py +253 -0
  35. epub_translator/xml_translator/submitter.py +26 -72
  36. epub_translator/xml_translator/translator.py +157 -107
  37. epub_translator/xml_translator/validation.py +458 -0
  38. {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
  39. epub_translator-0.1.3.dist-info/RECORD +66 -0
  40. epub_translator/epub/placeholder.py +0 -53
  41. epub_translator/iter_sync.py +0 -24
  42. epub_translator/xml_translator/fill.py +0 -128
  43. epub_translator/xml_translator/format.py +0 -282
  44. epub_translator/xml_translator/fragmented.py +0 -125
  45. epub_translator/xml_translator/group.py +0 -183
  46. epub_translator/xml_translator/progressive_locking.py +0 -256
  47. epub_translator/xml_translator/utils.py +0 -29
  48. epub_translator-0.1.0.dist-info/RECORD +0 -58
  49. {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
  50. {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
epub_translator/xml_translator/stream_mapper.py
@@ -0,0 +1,253 @@
+ from collections.abc import Callable, Generator, Iterable, Iterator
+ from xml.etree.ElementTree import Element
+
+ from resource_segmentation import Group, Resource, Segment, split
+ from tiktoken import Encoding
+
+ from ..segment import InlineSegment, TextSegment, search_inline_segments, search_text_segments
+ from .callbacks import Callbacks
+
+ _PAGE_INCISION = 0
+ _BLOCK_INCISION = 1
+
+ _ELLIPSIS = "..."
+
+
+ InlineSegmentMapping = tuple[Element, list[TextSegment]]
+ InlineSegmentGroupMap = Callable[[list[InlineSegment]], list[InlineSegmentMapping | None]]
+
+
+ class XMLStreamMapper:
+     def __init__(self, encoding: Encoding, max_group_tokens: int) -> None:
+         self._encoding: Encoding = encoding
+         self._max_group_tokens: int = max_group_tokens
+
+     def map_stream(
+         self,
+         elements: Iterator[Element],
+         callbacks: Callbacks,
+         map: InlineSegmentGroupMap,
+     ) -> Generator[tuple[Element, list[InlineSegmentMapping]], None, None]:
+         current_element: Element | None = None
+         mapping_buffer: list[InlineSegmentMapping] = []
+
+         for group in self._split_into_serial_groups(elements, callbacks):
+             head, body, tail = self._truncate_and_transform_group(group)
+             target_body = map(head + body + tail)[len(head) : len(head) + len(body)]
+             for origin, target in zip(body, target_body, strict=False):
+                 origin_element = origin.head.root
+                 if current_element is None:
+                     current_element = origin_element
+
+                 if id(current_element) != id(origin_element):
+                     yield current_element, mapping_buffer
+                     current_element = origin_element
+                     mapping_buffer = []
+
+                 if target:
+                     block_element, text_segments = target
+                     block_element = callbacks.interrupt_block_element(block_element)
+                     text_segments = list(callbacks.interrupt_translated_text_segments(text_segments))
+                     if text_segments:
+                         mapping_buffer.append((block_element, text_segments))
+
+         if current_element is not None:
+             yield current_element, mapping_buffer
+
+     def _split_into_serial_groups(self, elements: Iterable[Element], callbacks: Callbacks):
+         def generate():
+             for element in elements:
+                 yield from split(
+                     max_segment_count=self._max_group_tokens,
+                     border_incision=_PAGE_INCISION,
+                     resources=self._expand_to_resources(element, callbacks),
+                 )
+
+         generator = generate()
+         group = next(generator, None)
+         if group is None:
+             return
+
+         # head + body * N (without tail)
+         sum_count = group.head_remain_count + sum(x.count for x in self._expand_resource_segments(group.body))
+
+         while True:
+             next_group = next(generator, None)
+             if next_group is None:
+                 break
+
+             next_sum_body_count = sum(x.count for x in self._expand_resource_segments(next_group.body))
+             next_sum_count = sum_count + next_sum_body_count
+
+             if next_sum_count + next_group.tail_remain_count > self._max_group_tokens:
+                 yield group
+                 group = next_group
+                 sum_count = group.head_remain_count + next_sum_body_count
+             else:
+                 group.body.extend(next_group.body)
+                 group.tail = next_group.tail
+                 group.tail_remain_count = next_group.tail_remain_count
+                 sum_count = next_sum_count
+
+         yield group
+
+     def _truncate_and_transform_group(self, group: Group[InlineSegment]):
+         head = list(
+             self._truncate_inline_segments(
+                 inline_segments=self._expand_inline_segments(group.head),
+                 remain_head=False,
+                 remain_count=group.head_remain_count,
+             )
+         )
+         body = list(self._expand_inline_segments(group.body))
+         tail = list(
+             self._truncate_inline_segments(
+                 inline_segments=self._expand_inline_segments(group.tail),
+                 remain_head=True,
+                 remain_count=group.tail_remain_count,
+             )
+         )
+         return head, body, tail
+
+     def _expand_to_resources(self, element: Element, callbacks: Callbacks):
+         def expand(element: Element):
+             text_segments = search_text_segments(element)
+             text_segments = callbacks.interrupt_source_text_segments(text_segments)
+             yield from search_inline_segments(text_segments)
+
+         inline_segment_generator = expand(element)
+         start_incision = _PAGE_INCISION
+         inline_segment = next(inline_segment_generator, None)
+         if inline_segment is None:
+             return
+
+         while True:
+             next_inline_segment = next(inline_segment_generator, None)
+             if next_inline_segment is None:
+                 break
+
+             if next_inline_segment.head.root is inline_segment.tail.root:
+                 end_incision = _BLOCK_INCISION
+             else:
+                 end_incision = _PAGE_INCISION
+
+             yield Resource(
+                 count=sum(len(self._encoding.encode(t.xml_text)) for t in inline_segment),
+                 start_incision=start_incision,
+                 end_incision=end_incision,
+                 payload=inline_segment,
+             )
+             inline_segment = next_inline_segment
+             start_incision = end_incision
+
+         yield Resource(
+             count=sum(len(self._encoding.encode(t.xml_text)) for t in inline_segment),
+             start_incision=start_incision,
+             end_incision=_PAGE_INCISION,
+             payload=inline_segment,
+         )
+
+     def _truncate_inline_segments(self, inline_segments: Iterable[InlineSegment], remain_head: bool, remain_count: int):
+         def clone_and_expand(segments: Iterable[InlineSegment]):
+             for segment in segments:
+                 for child_segment in segment:
+                     yield child_segment.clone()  # the head/tail produced by splitting overlaps with other groups; clone to keep them from affecting each other
+
+         truncated_text_segments = self._truncate_text_segments(
+             text_segments=clone_and_expand(inline_segments),
+             remain_head=remain_head,
+             remain_count=remain_count,
+         )
+         yield from search_inline_segments(truncated_text_segments)
+
+     def _expand_inline_segments(self, items: list[Resource[InlineSegment] | Segment[InlineSegment]]):
+         for resource in self._expand_resource_segments(items):
+             yield resource.payload
+
+     def _expand_resource_segments(self, items: list[Resource[InlineSegment] | Segment[InlineSegment]]):
+         for item in items:
+             if isinstance(item, Resource):
+                 yield item
+             elif isinstance(item, Segment):
+                 yield from item.resources
+
+     def _truncate_text_segments(self, text_segments: Iterable[TextSegment], remain_head: bool, remain_count: int):
+         if remain_head:
+             yield from self._filter_and_remain_segments(
+                 segments=text_segments,
+                 remain_head=remain_head,
+                 remain_count=remain_count,
+             )
+         else:
+             yield from reversed(
+                 list(
+                     self._filter_and_remain_segments(
+                         segments=reversed(list(text_segments)),
+                         remain_head=remain_head,
+                         remain_count=remain_count,
+                     )
+                 )
+             )
+
+     def _filter_and_remain_segments(self, segments: Iterable[TextSegment], remain_head: bool, remain_count: int):
+         for segment in segments:
+             if remain_count <= 0:
+                 break
+             raw_xml_text = segment.xml_text
+             tokens = self._encoding.encode(raw_xml_text)
+             tokens_count = len(tokens)
+
+             if tokens_count > remain_count:
+                 truncated_segment = self._truncate_text_segment(
+                     segment=segment,
+                     tokens=tokens,
+                     raw_xml_text=raw_xml_text,
+                     remain_head=remain_head,
+                     remain_count=remain_count,
+                 )
+                 if truncated_segment is not None:
+                     yield truncated_segment
+                 break
+
+             yield segment
+             remain_count -= tokens_count
+
+     def _truncate_text_segment(
+         self,
+         segment: TextSegment,
+         tokens: list[int],
+         raw_xml_text: str,
+         remain_head: bool,
+         remain_count: int,
+     ) -> TextSegment | None:
+         # typical xml_text: <tag id="99" data-origin-len="999">Some text</tag>
+         # if the cut point falls in the leading XML region, drop the segment entirely
+         # if the cut point falls in the trailing XML region, keep the segment entirely
+         # only when the cut lands exactly in the text body is the text actually truncated
+         remain_text: str
+         xml_text_head_length = raw_xml_text.find(segment.text)
+
+         if remain_head:
+             remain_xml_text = self._encoding.decode(tokens[:remain_count])  # remain_count cannot be 0 here
+             if len(remain_xml_text) <= xml_text_head_length:
+                 return None
+             if len(remain_xml_text) >= xml_text_head_length + len(segment.text):
+                 return segment
+             remain_text = remain_xml_text[xml_text_head_length:]
+         else:
+             xml_text_tail_length = len(raw_xml_text) - (xml_text_head_length + len(segment.text))
+             remain_xml_text = self._encoding.decode(tokens[-remain_count:])
+             if len(remain_xml_text) <= xml_text_tail_length:
+                 return None
+             if len(remain_xml_text) >= xml_text_tail_length + len(segment.text):
+                 return segment
+             remain_text = remain_xml_text[: len(remain_xml_text) - xml_text_tail_length]
+
+         if not remain_text.strip():
+             return None
+
+         if remain_head:
+             segment.text = f"{remain_text} {_ELLIPSIS}"
+         else:
+             segment.text = f"{_ELLIPSIS} {remain_text}"
+         return segment
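
The _truncate_text_segment helper above cuts a segment's serialized form at a token boundary, then decides what to keep based on where the cut landed relative to the text body. A minimal, self-contained sketch of that decision using only tiktoken follows; the tag string and token budget are made-up example values, not package internals:

from tiktoken import get_encoding

encoding = get_encoding("cl100k_base")

# Hypothetical serialized segment, in the shape the comment above describes.
xml_text = '<tag id="99" data-origin-len="999">Some text to keep or drop</tag>'
text = "Some text to keep or drop"
head_length = xml_text.find(text)  # width of the leading XML region

remain_count = 14  # assumed token budget left for this segment
tokens = encoding.encode(xml_text)
kept = encoding.decode(tokens[:remain_count])

if len(kept) <= head_length:
    print("cut fell inside the leading XML region: drop the whole segment")
elif len(kept) >= head_length + len(text):
    print("cut fell inside the trailing XML region: keep the whole segment")
else:
    # The cut landed in the text body: truncate and mark with an ellipsis.
    print(f"truncated: {kept[head_length:].strip()} ...")
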
epub_translator/xml_translator/submitter.py
@@ -1,69 +1,37 @@
- from collections.abc import Iterable
  from xml.etree.ElementTree import Element

- from ..xml import iter_with_stack
- from .text_segment import TextPosition, TextSegment, combine_text_segments
+ from ..segment import TextSegment, combine_text_segments
+ from ..xml import index_of_parent, iter_with_stack
+ from .stream_mapper import InlineSegmentMapping


- def submit_text_segments(element: Element, text_segments: Iterable[TextSegment]):
-     grouped_map = _group_text_segments(text_segments)
-     flatten_text_segments = dict(_extract_flatten_text_segments(element, grouped_map))
+ def submit_text_segments(element: Element, mappings: list[InlineSegmentMapping]) -> Element:
+     grouped_map = _group_text_segments(mappings)
      _append_text_segments(element, grouped_map)
-     _replace_text_segments(element, flatten_text_segments)
+     return element


- def _group_text_segments(text_segments: Iterable[TextSegment]):
+ def _group_text_segments(mappings: list[InlineSegmentMapping]):
      grouped_map: dict[int, list[TextSegment]] = {}
-     for text_segment in text_segments:
-         parent_id = id(text_segment.block_parent)
-         grouped = grouped_map.get(parent_id, None)
-         if grouped is None:
-             grouped_map[parent_id] = grouped = []
-         grouped_map[parent_id].append(text_segment)
-     return grouped_map
-
-
- # An overridden block is an incidental case: its child elements already trigger append operations, so appending to it as well would scramble the reading order.
- # In that case the translated text can only be attached immediately after all of the block's own text.
- def _extract_flatten_text_segments(element: Element, grouped_map: dict[int, list[TextSegment]]):
-     override_parent_ids: set[int] = set()
-     for parents, child_element in iter_with_stack(element):
-         if id(child_element) not in grouped_map:
-             continue
-         for parent in parents[:-1]:
-             parent_id = id(parent)
-             if parent_id in grouped_map:
-                 override_parent_ids.add(parent_id)
-
-     if id(element) in grouped_map:
-         override_parent_ids.add(id(element))  # the root never appears in parents, so it must be added separately
-
-     for parent_id in override_parent_ids:
-         yield parent_id, grouped_map.pop(parent_id)
+     for block_element, text_segments in mappings:
+         parent_id = id(block_element)
+         grouped_map[parent_id] = text_segments
+
+     # TODO: the loop below removes blocks that embed text directly; the current version ignores the concept of text-embedding blocks.
+     # This is a situation that can appear in books, though it is uncommon.
+     # For example, a non-leaf block element may interleave text between its child blocks, and collect_next_inline_segment currently ignores that text:
+     # <div>
+     #     Some text before.
+     #     <!-- only text inside the leaf block element on the next line is processed -->
+     #     <div>Paragraph 1.</div>
+     #     Some text in between.
+     # </div>
+     for _, text_segments in mappings:
+         for text_segment in text_segments:
+             for parent_block in text_segment.parent_stack[: text_segment.block_depth - 1]:
+                 grouped_map.pop(id(parent_block), None)

-
- def _replace_text_segments(element: Element, text_segments: dict[int, list[TextSegment]]):
-     for _, child_element in iter_with_stack(element):
-         tail_text_segments: list[TextSegment] = []
-         for text_segment in text_segments.get(id(child_element), ()):
-             if text_segment.position == TextPosition.TEXT:
-                 child_element.text = _append_text(
-                     origin_text=child_element.text,
-                     append_text=text_segment.text,
-                 )
-             elif text_segment.position == TextPosition.TAIL:
-                 tail_text_segments.append(text_segment)
-
-         tail_text_segments.sort(key=lambda t: t.index)
-         tail_text_segments.reverse()
-         for cc_element in child_element:
-             if not tail_text_segments:
-                 break
-             if cc_element.tail is not None:
-                 cc_element.tail = _append_text(
-                     origin_text=cc_element.tail,
-                     append_text=tail_text_segments.pop().text,
-                 )
+     return grouped_map


  def _append_text_segments(element: Element, grouped_map: dict[int, list[TextSegment]]):
@@ -74,7 +42,7 @@ def _append_text_segments(element: Element, grouped_map: dict[int, list[TextSegm
          if not grouped:
              continue
          parent = parents[-1]
-         index = _index_of_parent(parents[-1], child_element)
+         index = index_of_parent(parents[-1], child_element)
          combined = next(
              combine_text_segments(
                  segments=(t.strip_block_parents() for t in grouped),
@@ -86,17 +54,3 @@ def _append_text_segments(element: Element, grouped_map: dict[int, list[TextSegm
          parent.insert(index + 1, combined_element)
          combined_element.tail = child_element.tail
          child_element.tail = None
-
-
- def _index_of_parent(parent: Element, checked_element: Element) -> int:
-     for i, child in enumerate(parent):
-         if child == checked_element:
-             return i
-     raise ValueError("Element not found in parent.")
-
-
- def _append_text(origin_text: str | None, append_text: str) -> str:
-     if origin_text is None:
-         return append_text
-     else:
-         return origin_text + append_text
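
For reference, the tail-splicing at the end of _append_text_segments (insert the combined element right after the source block, hand over its tail, clear the original) can be illustrated with plain ElementTree alone. Everything below is a made-up sketch of that move, not package code:

from xml.etree.ElementTree import Element, SubElement, tostring

parent = Element("div")
source = SubElement(parent, "p")
source.text = "Bonjour."
source.tail = "\n"  # document text that must stay after the pair

translated = Element("p")
translated.text = "Hello."

index = list(parent).index(source)    # the position index_of_parent computes
parent.insert(index + 1, translated)  # translated sibling follows the source block
translated.tail = source.tail         # the tail moves to the inserted element
source.tail = None                    # so trailing text is neither lost nor duplicated

print(tostring(parent, encoding="unicode"))
# <div><p>Bonjour.</p><p>Hello.</p>
# </div>
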