epub-translator 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. epub_translator/__init__.py +9 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +32 -113
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/punctuation.py +34 -0
  15. epub_translator/segment/__init__.py +26 -0
  16. epub_translator/segment/block_segment.py +124 -0
  17. epub_translator/segment/common.py +29 -0
  18. epub_translator/segment/inline_segment.py +356 -0
  19. epub_translator/{xml_translator → segment}/text_segment.py +7 -72
  20. epub_translator/segment/utils.py +43 -0
  21. epub_translator/translator.py +152 -184
  22. epub_translator/utils.py +33 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/const.py +1 -0
  25. epub_translator/xml/deduplication.py +3 -3
  26. epub_translator/xml/inline.py +67 -0
  27. epub_translator/xml/self_closing.py +182 -0
  28. epub_translator/xml/utils.py +42 -0
  29. epub_translator/xml/xml.py +7 -0
  30. epub_translator/xml/xml_like.py +8 -33
  31. epub_translator/xml_interrupter.py +165 -0
  32. epub_translator/xml_translator/__init__.py +3 -3
  33. epub_translator/xml_translator/callbacks.py +34 -0
  34. epub_translator/xml_translator/{const.py → common.py} +0 -1
  35. epub_translator/xml_translator/hill_climbing.py +104 -0
  36. epub_translator/xml_translator/stream_mapper.py +253 -0
  37. epub_translator/xml_translator/submitter.py +352 -91
  38. epub_translator/xml_translator/translator.py +182 -114
  39. epub_translator/xml_translator/validation.py +458 -0
  40. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/METADATA +134 -21
  41. epub_translator-0.1.4.dist-info/RECORD +68 -0
  42. epub_translator/epub/placeholder.py +0 -53
  43. epub_translator/iter_sync.py +0 -24
  44. epub_translator/xml_translator/fill.py +0 -128
  45. epub_translator/xml_translator/format.py +0 -282
  46. epub_translator/xml_translator/fragmented.py +0 -125
  47. epub_translator/xml_translator/group.py +0 -183
  48. epub_translator/xml_translator/progressive_locking.py +0 -256
  49. epub_translator/xml_translator/utils.py +0 -29
  50. epub_translator-0.1.1.dist-info/RECORD +0 -58
  51. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/LICENSE +0 -0
  52. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/WHEEL +0 -0
epub_translator/xml_translator/stream_mapper.py (new file)
@@ -0,0 +1,253 @@
+ from collections.abc import Callable, Generator, Iterable, Iterator
+ from xml.etree.ElementTree import Element
+
+ from resource_segmentation import Group, Resource, Segment, split
+ from tiktoken import Encoding
+
+ from ..segment import InlineSegment, TextSegment, search_inline_segments, search_text_segments
+ from .callbacks import Callbacks
+
+ _PAGE_INCISION = 0
+ _BLOCK_INCISION = 1
+
+ _ELLIPSIS = "..."
+
+
+ InlineSegmentMapping = tuple[Element, list[TextSegment]]
+ InlineSegmentGroupMap = Callable[[list[InlineSegment]], list[InlineSegmentMapping | None]]
+
+
+ class XMLStreamMapper:
+     def __init__(self, encoding: Encoding, max_group_tokens: int) -> None:
+         self._encoding: Encoding = encoding
+         self._max_group_tokens: int = max_group_tokens
+
+     def map_stream(
+         self,
+         elements: Iterator[Element],
+         callbacks: Callbacks,
+         map: InlineSegmentGroupMap,
+     ) -> Generator[tuple[Element, list[InlineSegmentMapping]], None, None]:
+         current_element: Element | None = None
+         mapping_buffer: list[InlineSegmentMapping] = []
+
+         for group in self._split_into_serial_groups(elements, callbacks):
+             head, body, tail = self._truncate_and_transform_group(group)
+             target_body = map(head + body + tail)[len(head) : len(head) + len(body)]
+             for origin, target in zip(body, target_body, strict=False):
+                 origin_element = origin.head.root
+                 if current_element is None:
+                     current_element = origin_element
+
+                 if id(current_element) != id(origin_element):
+                     yield current_element, mapping_buffer
+                     current_element = origin_element
+                     mapping_buffer = []
+
+                 if target:
+                     block_element, text_segments = target
+                     block_element = callbacks.interrupt_block_element(block_element)
+                     text_segments = list(callbacks.interrupt_translated_text_segments(text_segments))
+                     if text_segments:
+                         mapping_buffer.append((block_element, text_segments))
+
+         if current_element is not None:
+             yield current_element, mapping_buffer
+
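A minimal sketch of how map_stream might be driven, assuming epub-translator 0.1.4 is installed. Here `elements` and `my_callbacks` are hypothetical stand-ins for what the package wires up elsewhere, and the pass-through `map` function simply echoes each segment back. Note that head and tail ride along only as context: the slice `[len(head) : len(head) + len(body)]` above discards their results.

    import tiktoken

    # Illustrative driver only -- "elements" and "my_callbacks" are assumed
    # to come from the package's own XML parsing and callback wiring.
    mapper = XMLStreamMapper(
        encoding=tiktoken.get_encoding("cl100k_base"),
        max_group_tokens=4000,
    )

    def passthrough(inline_segments):
        # Echo every segment back unchanged as (block root element, text segments).
        return [(segment.head.root, list(segment)) for segment in inline_segments]

    for element, mappings in mapper.map_stream(iter(elements), my_callbacks, passthrough):
        for block_element, text_segments in mappings:
            ...  # write the (here untranslated) text segments back under block_element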
+     def _split_into_serial_groups(self, elements: Iterable[Element], callbacks: Callbacks):
+         def generate():
+             for element in elements:
+                 yield from split(
+                     max_segment_count=self._max_group_tokens,
+                     border_incision=_PAGE_INCISION,
+                     resources=self._expand_to_resources(element, callbacks),
+                 )
+
+         generator = generate()
+         group = next(generator, None)
+         if group is None:
+             return
+
+         # head + body * N (without tail)
+         sum_count = group.head_remain_count + sum(x.count for x in self._expand_resource_segments(group.body))
+
+         while True:
+             next_group = next(generator, None)
+             if next_group is None:
+                 break
+
+             next_sum_body_count = sum(x.count for x in self._expand_resource_segments(next_group.body))
+             next_sum_count = sum_count + next_sum_body_count
+
+             if next_sum_count + next_group.tail_remain_count > self._max_group_tokens:
+                 yield group
+                 group = next_group
+                 sum_count = group.head_remain_count + next_sum_body_count
+             else:
+                 group.body.extend(next_group.body)
+                 group.tail = next_group.tail
+                 group.tail_remain_count = next_group.tail_remain_count
+                 sum_count = next_sum_count
+
+         yield group
+
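To make the merge condition concrete (numbers invented for illustration): with max_group_tokens = 100, suppose the accumulated group costs head 10 + bodies 60 = 70 tokens and the next group arrives with a 20-token body and a 15-token tail. Since 70 + 20 + 15 = 105 > 100, the accumulated group is yielded and the new group starts the next run; had the tail cost only 10 tokens, the check would pass (100 <= 100) and the two bodies would be merged under the new tail.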
+     def _truncate_and_transform_group(self, group: Group[InlineSegment]):
+         head = list(
+             self._truncate_inline_segments(
+                 inline_segments=self._expand_inline_segments(group.head),
+                 remain_head=False,
+                 remain_count=group.head_remain_count,
+             )
+         )
+         body = list(self._expand_inline_segments(group.body))
+         tail = list(
+             self._truncate_inline_segments(
+                 inline_segments=self._expand_inline_segments(group.tail),
+                 remain_head=True,
+                 remain_count=group.tail_remain_count,
+             )
+         )
+         return head, body, tail
+
+     def _expand_to_resources(self, element: Element, callbacks: Callbacks):
+         def expand(element: Element):
+             text_segments = search_text_segments(element)
+             text_segments = callbacks.interrupt_source_text_segments(text_segments)
+             yield from search_inline_segments(text_segments)
+
+         inline_segment_generator = expand(element)
+         start_incision = _PAGE_INCISION
+         inline_segment = next(inline_segment_generator, None)
+         if inline_segment is None:
+             return
+
+         while True:
+             next_inline_segment = next(inline_segment_generator, None)
+             if next_inline_segment is None:
+                 break
+
+             if next_inline_segment.head.root is inline_segment.tail.root:
+                 end_incision = _BLOCK_INCISION
+             else:
+                 end_incision = _PAGE_INCISION
+
+             yield Resource(
+                 count=sum(len(self._encoding.encode(t.xml_text)) for t in inline_segment),
+                 start_incision=start_incision,
+                 end_incision=end_incision,
+                 payload=inline_segment,
+             )
+             inline_segment = next_inline_segment
+             start_incision = end_incision
+
+         yield Resource(
+             count=sum(len(self._encoding.encode(t.xml_text)) for t in inline_segment),
+             start_incision=start_incision,
+             end_incision=_PAGE_INCISION,
+             payload=inline_segment,
+         )
+
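Put concretely (an invented example): two inline runs under the same block root, say the same <p>, get a _BLOCK_INCISION boundary between their Resources, while the boundary between one block and the next, and both outer edges of each document element, are marked _PAGE_INCISION.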
+     def _truncate_inline_segments(self, inline_segments: Iterable[InlineSegment], remain_head: bool, remain_count: int):
+         def clone_and_expand(segments: Iterable[InlineSegment]):
+             for segment in segments:
+                 for child_segment in segment:
+                     yield child_segment.clone()  # the head/tail being cut overlaps with other groups; clone to keep them from affecting each other
+
+         truncated_text_segments = self._truncate_text_segments(
+             text_segments=clone_and_expand(inline_segments),
+             remain_head=remain_head,
+             remain_count=remain_count,
+         )
+         yield from search_inline_segments(truncated_text_segments)
+
+     def _expand_inline_segments(self, items: list[Resource[InlineSegment] | Segment[InlineSegment]]):
+         for resource in self._expand_resource_segments(items):
+             yield resource.payload
+
+     def _expand_resource_segments(self, items: list[Resource[InlineSegment] | Segment[InlineSegment]]):
+         for item in items:
+             if isinstance(item, Resource):
+                 yield item
+             elif isinstance(item, Segment):
+                 yield from item.resources
+
+     def _truncate_text_segments(self, text_segments: Iterable[TextSegment], remain_head: bool, remain_count: int):
+         if remain_head:
+             yield from self._filter_and_remain_segments(
+                 segments=text_segments,
+                 remain_head=remain_head,
+                 remain_count=remain_count,
+             )
+         else:
+             yield from reversed(
+                 list(
+                     self._filter_and_remain_segments(
+                         segments=reversed(list(text_segments)),
+                         remain_head=remain_head,
+                         remain_count=remain_count,
+                     )
+                 )
+             )
+
+     def _filter_and_remain_segments(self, segments: Iterable[TextSegment], remain_head: bool, remain_count: int):
+         for segment in segments:
+             if remain_count <= 0:
+                 break
+             raw_xml_text = segment.xml_text
+             tokens = self._encoding.encode(raw_xml_text)
+             tokens_count = len(tokens)
+
+             if tokens_count > remain_count:
+                 truncated_segment = self._truncate_text_segment(
+                     segment=segment,
+                     tokens=tokens,
+                     raw_xml_text=raw_xml_text,
+                     remain_head=remain_head,
+                     remain_count=remain_count,
+                 )
+                 if truncated_segment is not None:
+                     yield truncated_segment
+                 break
+
+             yield segment
+             remain_count -= tokens_count
+
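A short walk-through (numbers invented): with remain_count = 10 and segments costing 4, 5, and 8 tokens, the first two are yielded whole (budget 10 -> 6 -> 1); the third costs more than the remaining 1 token, so it is handed to _truncate_text_segment with that residual budget and iteration stops.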
+     def _truncate_text_segment(
+         self,
+         segment: TextSegment,
+         tokens: list[int],
+         raw_xml_text: str,
+         remain_head: bool,
+         remain_count: int,
+     ) -> TextSegment | None:
+         # typical xml_text: <tag id="99" data-origin-len="999">Some text</tag>
+         # if the cut point falls inside the leading XML region, discard the whole segment
+         # if the cut point falls inside the trailing XML region, keep the whole segment
+         # only when the cut lands exactly in the text body do we truncate the text
+         remain_text: str
+         xml_text_head_length = raw_xml_text.find(segment.text)
+
+         if remain_head:
+             remain_xml_text = self._encoding.decode(tokens[:remain_count])  # remain_count cannot be 0 here
+             if len(remain_xml_text) <= xml_text_head_length:
+                 return None
+             if len(remain_xml_text) >= xml_text_head_length + len(segment.text):
+                 return segment
+             remain_text = remain_xml_text[xml_text_head_length:]
+         else:
+             xml_text_tail_length = len(raw_xml_text) - (xml_text_head_length + len(segment.text))
+             remain_xml_text = self._encoding.decode(tokens[-remain_count:])
+             if len(remain_xml_text) <= xml_text_tail_length:
+                 return None
+             if len(remain_xml_text) >= xml_text_tail_length + len(segment.text):
+                 return segment
+             remain_text = remain_xml_text[: len(remain_xml_text) - xml_text_tail_length]
+
+         if not remain_text.strip():
+             return None
+
+         if remain_head:
+             segment.text = f"{remain_text} {_ELLIPSIS}"
+         else:
+             segment.text = f"{_ELLIPSIS} {remain_text}"
+         return segment
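To see the three cut-point cases from the comment above in action, a hypothetical walk-through (tag and text invented; character counts stand in for the decoded token prefix):

    raw_xml_text = '<tag id="9">Hello world</tag>'
    text = "Hello world"
    head_len = raw_xml_text.find(text)  # 12, the length of '<tag id="9">'

    # Case 1: the kept prefix decodes to 12 characters or fewer -- the cut
    #   falls inside the opening tag, so _truncate_text_segment returns None.
    # Case 2: it decodes to at least 12 + len(text) = 23 characters -- the cut
    #   falls inside the closing tag, so the segment is returned whole.
    # Case 3: it decodes to, say, 17 characters ('<tag id="9">Hello') -- only
    #   "Hello" survives, and segment.text becomes "Hello ...".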