epub-translator 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +2 -2
- epub_translator/data/fill.jinja +143 -38
- epub_translator/epub/__init__.py +1 -1
- epub_translator/epub/metadata.py +122 -0
- epub_translator/epub/spines.py +3 -2
- epub_translator/epub/zip.py +11 -9
- epub_translator/epub_transcode.py +108 -0
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/context.py +109 -0
- epub_translator/llm/core.py +39 -62
- epub_translator/llm/executor.py +25 -31
- epub_translator/llm/increasable.py +1 -1
- epub_translator/llm/types.py +0 -3
- epub_translator/segment/__init__.py +26 -0
- epub_translator/segment/block_segment.py +124 -0
- epub_translator/segment/common.py +29 -0
- epub_translator/segment/inline_segment.py +356 -0
- epub_translator/{xml_translator → segment}/text_segment.py +8 -8
- epub_translator/segment/utils.py +43 -0
- epub_translator/translator.py +150 -183
- epub_translator/utils.py +33 -0
- epub_translator/xml/__init__.py +2 -0
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/deduplication.py +3 -3
- epub_translator/xml/self_closing.py +182 -0
- epub_translator/xml/utils.py +42 -0
- epub_translator/xml/xml.py +7 -0
- epub_translator/xml/xml_like.py +145 -115
- epub_translator/xml_interrupter.py +165 -0
- epub_translator/xml_translator/__init__.py +1 -2
- epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator/xml_translator/{const.py → common.py} +0 -1
- epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator/xml_translator/submitter.py +26 -72
- epub_translator/xml_translator/translator.py +157 -107
- epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
- epub_translator-0.1.3.dist-info/RECORD +66 -0
- epub_translator/epub/placeholder.py +0 -53
- epub_translator/iter_sync.py +0 -24
- epub_translator/xml_translator/fill.py +0 -128
- epub_translator/xml_translator/format.py +0 -282
- epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator/xml_translator/group.py +0 -183
- epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator/xml_translator/utils.py +0 -29
- epub_translator-0.1.0.dist-info/RECORD +0 -58
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
epub_translator/xml_translator/stream_mapper.py
@@ -0,0 +1,253 @@
+from collections.abc import Callable, Generator, Iterable, Iterator
+from xml.etree.ElementTree import Element
+
+from resource_segmentation import Group, Resource, Segment, split
+from tiktoken import Encoding
+
+from ..segment import InlineSegment, TextSegment, search_inline_segments, search_text_segments
+from .callbacks import Callbacks
+
+_PAGE_INCISION = 0
+_BLOCK_INCISION = 1
+
+_ELLIPSIS = "..."
+
+
+InlineSegmentMapping = tuple[Element, list[TextSegment]]
+InlineSegmentGroupMap = Callable[[list[InlineSegment]], list[InlineSegmentMapping | None]]
+
+
+class XMLStreamMapper:
+    def __init__(self, encoding: Encoding, max_group_tokens: int) -> None:
+        self._encoding: Encoding = encoding
+        self._max_group_tokens: int = max_group_tokens
+
+    def map_stream(
+        self,
+        elements: Iterator[Element],
+        callbacks: Callbacks,
+        map: InlineSegmentGroupMap,
+    ) -> Generator[tuple[Element, list[InlineSegmentMapping]], None, None]:
+        current_element: Element | None = None
+        mapping_buffer: list[InlineSegmentMapping] = []
+
+        for group in self._split_into_serial_groups(elements, callbacks):
+            head, body, tail = self._truncate_and_transform_group(group)
+            target_body = map(head + body + tail)[len(head) : len(head) + len(body)]
+            for origin, target in zip(body, target_body, strict=False):
+                origin_element = origin.head.root
+                if current_element is None:
+                    current_element = origin_element
+
+                if id(current_element) != id(origin_element):
+                    yield current_element, mapping_buffer
+                    current_element = origin_element
+                    mapping_buffer = []
+
+                if target:
+                    block_element, text_segments = target
+                    block_element = callbacks.interrupt_block_element(block_element)
+                    text_segments = list(callbacks.interrupt_translated_text_segments(text_segments))
+                    if text_segments:
+                        mapping_buffer.append((block_element, text_segments))
+
+        if current_element is not None:
+            yield current_element, mapping_buffer
+
+    def _split_into_serial_groups(self, elements: Iterable[Element], callbacks: Callbacks):
+        def generate():
+            for element in elements:
+                yield from split(
+                    max_segment_count=self._max_group_tokens,
+                    border_incision=_PAGE_INCISION,
+                    resources=self._expand_to_resources(element, callbacks),
+                )
+
+        generator = generate()
+        group = next(generator, None)
+        if group is None:
+            return
+
+        # head + body * N (without tail)
+        sum_count = group.head_remain_count + sum(x.count for x in self._expand_resource_segments(group.body))
+
+        while True:
+            next_group = next(generator, None)
+            if next_group is None:
+                break
+
+            next_sum_body_count = sum(x.count for x in self._expand_resource_segments(next_group.body))
+            next_sum_count = sum_count + next_sum_body_count
+
+            if next_sum_count + next_group.tail_remain_count > self._max_group_tokens:
+                yield group
+                group = next_group
+                sum_count = group.head_remain_count + next_sum_body_count
+            else:
+                group.body.extend(next_group.body)
+                group.tail = next_group.tail
+                group.tail_remain_count = next_group.tail_remain_count
+                sum_count = next_sum_count
+
+        yield group
+
+    def _truncate_and_transform_group(self, group: Group[InlineSegment]):
+        head = list(
+            self._truncate_inline_segments(
+                inline_segments=self._expand_inline_segments(group.head),
+                remain_head=False,
+                remain_count=group.head_remain_count,
+            )
+        )
+        body = list(self._expand_inline_segments(group.body))
+        tail = list(
+            self._truncate_inline_segments(
+                inline_segments=self._expand_inline_segments(group.tail),
+                remain_head=True,
+                remain_count=group.tail_remain_count,
+            )
+        )
+        return head, body, tail
+
+    def _expand_to_resources(self, element: Element, callbacks: Callbacks):
+        def expand(element: Element):
+            text_segments = search_text_segments(element)
+            text_segments = callbacks.interrupt_source_text_segments(text_segments)
+            yield from search_inline_segments(text_segments)
+
+        inline_segment_generator = expand(element)
+        start_incision = _PAGE_INCISION
+        inline_segment = next(inline_segment_generator, None)
+        if inline_segment is None:
+            return
+
+        while True:
+            next_inline_segment = next(inline_segment_generator, None)
+            if next_inline_segment is None:
+                break
+
+            if next_inline_segment.head.root is inline_segment.tail.root:
+                end_incision = _BLOCK_INCISION
+            else:
+                end_incision = _PAGE_INCISION
+
+            yield Resource(
+                count=sum(len(self._encoding.encode(t.xml_text)) for t in inline_segment),
+                start_incision=start_incision,
+                end_incision=end_incision,
+                payload=inline_segment,
+            )
+            inline_segment = next_inline_segment
+            start_incision = end_incision
+
+        yield Resource(
+            count=sum(len(self._encoding.encode(t.xml_text)) for t in inline_segment),
+            start_incision=start_incision,
+            end_incision=_PAGE_INCISION,
+            payload=inline_segment,
+        )
+
+    def _truncate_inline_segments(self, inline_segments: Iterable[InlineSegment], remain_head: bool, remain_count: int):
+        def clone_and_expand(segments: Iterable[InlineSegment]):
+            for segment in segments:
+                for child_segment in segment:
+                    yield child_segment.clone()  # the cut head and tail overlap with other groups; clone to keep them independent
+
+        truncated_text_segments = self._truncate_text_segments(
+            text_segments=clone_and_expand(inline_segments),
+            remain_head=remain_head,
+            remain_count=remain_count,
+        )
+        yield from search_inline_segments(truncated_text_segments)
+
+    def _expand_inline_segments(self, items: list[Resource[InlineSegment] | Segment[InlineSegment]]):
+        for resource in self._expand_resource_segments(items):
+            yield resource.payload
+
+    def _expand_resource_segments(self, items: list[Resource[InlineSegment] | Segment[InlineSegment]]):
+        for item in items:
+            if isinstance(item, Resource):
+                yield item
+            elif isinstance(item, Segment):
+                yield from item.resources
+
+    def _truncate_text_segments(self, text_segments: Iterable[TextSegment], remain_head: bool, remain_count: int):
+        if remain_head:
+            yield from self._filter_and_remain_segments(
+                segments=text_segments,
+                remain_head=remain_head,
+                remain_count=remain_count,
+            )
+        else:
+            yield from reversed(
+                list(
+                    self._filter_and_remain_segments(
+                        segments=reversed(list(text_segments)),
+                        remain_head=remain_head,
+                        remain_count=remain_count,
+                    )
+                )
+            )
+
+    def _filter_and_remain_segments(self, segments: Iterable[TextSegment], remain_head: bool, remain_count: int):
+        for segment in segments:
+            if remain_count <= 0:
+                break
+            raw_xml_text = segment.xml_text
+            tokens = self._encoding.encode(raw_xml_text)
+            tokens_count = len(tokens)
+
+            if tokens_count > remain_count:
+                truncated_segment = self._truncate_text_segment(
+                    segment=segment,
+                    tokens=tokens,
+                    raw_xml_text=raw_xml_text,
+                    remain_head=remain_head,
+                    remain_count=remain_count,
+                )
+                if truncated_segment is not None:
+                    yield truncated_segment
+                break
+
+            yield segment
+            remain_count -= tokens_count
+
+    def _truncate_text_segment(
+        self,
+        segment: TextSegment,
+        tokens: list[int],
+        raw_xml_text: str,
+        remain_head: bool,
+        remain_count: int,
+    ) -> TextSegment | None:
+        # Typical xml_text: <tag id="99" data-origin-len="999">Some text</tag>
+        # If the cut point falls in the leading XML region, discard the segment entirely.
+        # If the cut point falls in the trailing XML region, keep the segment entirely.
+        # Only when the cut lands exactly in the body text is the text actually truncated.
+        remain_text: str
+        xml_text_head_length = raw_xml_text.find(segment.text)
+
+        if remain_head:
+            remain_xml_text = self._encoding.decode(tokens[:remain_count])  # remain_count cannot be 0 here
+            if len(remain_xml_text) <= xml_text_head_length:
+                return None
+            if len(remain_xml_text) >= xml_text_head_length + len(segment.text):
+                return segment
+            remain_text = remain_xml_text[xml_text_head_length:]
+        else:
+            xml_text_tail_length = len(raw_xml_text) - (xml_text_head_length + len(segment.text))
+            remain_xml_text = self._encoding.decode(tokens[-remain_count:])
+            if len(remain_xml_text) <= xml_text_tail_length:
+                return None
+            if len(remain_xml_text) >= xml_text_tail_length + len(segment.text):
+                return segment
+            remain_text = remain_xml_text[: len(remain_xml_text) - xml_text_tail_length]
+
+        if not remain_text.strip():
+            return None
+
+        if remain_head:
+            segment.text = f"{remain_text} {_ELLIPSIS}"
+        else:
+            segment.text = f"{_ELLIPSIS} {remain_text}"
+        return segment
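Note: the subtle part of this new module is the head/tail truncation rule documented in _truncate_text_segment. A segment's xml_text wraps its visible text in markup, so a token cut that lands in the leading markup drops the segment, a cut in the trailing markup keeps it whole, and only a cut inside the body text shortens it. Below is a minimal standalone sketch of that rule, using tiktoken directly instead of the package's TextSegment type; the function name and the sample tag are illustrative only, not part of the package.

from tiktoken import get_encoding

encoding = get_encoding("cl100k_base")

def truncate_head(xml_text: str, text: str, remain_count: int) -> str | None:
    # Keep roughly remain_count tokens from the start of xml_text.
    tokens = encoding.encode(xml_text)
    if len(tokens) <= remain_count:
        return text  # everything fits: no truncation needed
    remain_xml_text = encoding.decode(tokens[:remain_count])
    head_length = xml_text.find(text)
    if len(remain_xml_text) <= head_length:
        return None  # cut falls in the leading markup: discard the segment
    if len(remain_xml_text) >= head_length + len(text):
        return text  # cut falls in the trailing markup: keep the segment whole
    remain_text = remain_xml_text[head_length:]
    return f"{remain_text} ..." if remain_text.strip() else None

sample = '<tag id="99" data-origin-len="999">Some text that is long enough to cut.</tag>'
print(truncate_head(sample, "Some text that is long enough to cut.", remain_count=16))

Whether this prints a shortened string or None depends on where the sixteenth token boundary happens to fall; both outcomes exercise the rule above.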
epub_translator/xml_translator/submitter.py
@@ -1,69 +1,37 @@
-from collections.abc import Iterable
 from xml.etree.ElementTree import Element
 
-from ..
-from
+from ..segment import TextSegment, combine_text_segments
+from ..xml import index_of_parent, iter_with_stack
+from .stream_mapper import InlineSegmentMapping
 
 
-def submit_text_segments(element: Element,
-    grouped_map = _group_text_segments(
-    flatten_text_segments = dict(_extract_flatten_text_segments(element, grouped_map))
+def submit_text_segments(element: Element, mappings: list[InlineSegmentMapping]) -> Element:
+    grouped_map = _group_text_segments(mappings)
     _append_text_segments(element, grouped_map)
-
+    return element
 
 
-def _group_text_segments(
+def _group_text_segments(mappings: list[InlineSegmentMapping]):
     grouped_map: dict[int, list[TextSegment]] = {}
-    for
-        parent_id = id(
-
-
-
-
-
-
-
-    #
-    #
-
-
-    for
-
-
-
-        parent_id = id(parent)
-        if parent_id in grouped_map:
-            override_parent_ids.add(parent_id)
-
-    if id(element) in grouped_map:
-        override_parent_ids.add(id(element))  # the root never appears in parents, so it must be added separately
-
-    for parent_id in override_parent_ids:
-        yield parent_id, grouped_map.pop(parent_id)
+    for block_element, text_segments in mappings:
+        parent_id = id(block_element)
+        grouped_map[parent_id] = text_segments
+
+    # TODO: The loop below clears blocks that embed text; the current version ignores the concept of text-embedding blocks.
+    # This is a situation that can occur in books, though it is uncommon.
+    # For example, a non-leaf block element may have text interleaved between its child block elements; collect_next_inline_segment currently ignores that text:
+    # <div>
+    #   Some text before.
+    #   <!-- only the text inside the leaf block element on the next line is processed -->
+    #   <div>Paragraph 1.</div>
+    #   Some text in between.
+    # </div>
+    for _, text_segments in mappings:
+        for text_segment in text_segments:
+            for parent_block in text_segment.parent_stack[: text_segment.block_depth - 1]:
+                grouped_map.pop(id(parent_block), None)
 
-
-def _replace_text_segments(element: Element, text_segments: dict[int, list[TextSegment]]):
-    for _, child_element in iter_with_stack(element):
-        tail_text_segments: list[TextSegment] = []
-        for text_segment in text_segments.get(id(child_element), ()):
-            if text_segment.position == TextPosition.TEXT:
-                child_element.text = _append_text(
-                    origin_text=child_element.text,
-                    append_text=text_segment.text,
-                )
-            elif text_segment.position == TextPosition.TAIL:
-                tail_text_segments.append(text_segment)
-
-        tail_text_segments.sort(key=lambda t: t.index)
-        tail_text_segments.reverse()
-        for cc_element in child_element:
-            if not tail_text_segments:
-                break
-            if cc_element.tail is not None:
-                cc_element.tail = _append_text(
-                    origin_text=cc_element.tail,
-                    append_text=tail_text_segments.pop().text,
-                )
+    return grouped_map
 
 
 def _append_text_segments(element: Element, grouped_map: dict[int, list[TextSegment]]):
@@ -74,7 +42,7 @@ def _append_text_segments(element: Element, grouped_map: dict[int, list[TextSegm
         if not grouped:
             continue
         parent = parents[-1]
-        index =
+        index = index_of_parent(parents[-1], child_element)
         combined = next(
             combine_text_segments(
                 segments=(t.strip_block_parents() for t in grouped),
@@ -86,17 +54,3 @@ def _append_text_segments(element: Element, grouped_map: dict[int, list[TextSegm
         parent.insert(index + 1, combined_element)
         combined_element.tail = child_element.tail
         child_element.tail = None
-
-
-def _index_of_parent(parent: Element, checked_element: Element) -> int:
-    for i, child in enumerate(parent):
-        if child == checked_element:
-            return i
-    raise ValueError("Element not found in parent.")
-
-
-def _append_text(origin_text: str | None, append_text: str) -> str:
-    if origin_text is None:
-        return append_text
-    else:
-        return origin_text + append_text
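Note: the deleted local _index_of_parent helper shows the semantics behind the new `from ..xml import index_of_parent` import; the logic presumably now lives in the new shared epub_translator/xml/utils.py (+42 lines in the file list above), whose source is not part of this diff. A sketch reconstructed from the removed code:

from xml.etree.ElementTree import Element

def index_of_parent(parent: Element, checked_element: Element) -> int:
    # Position of a direct child within its parent. Element defines no __eq__,
    # so == compares by identity here, matching the removed local helper.
    for i, child in enumerate(parent):
        if child == checked_element:
            return i
    raise ValueError("Element not found in parent.")

root = Element("div")
child = Element("p")
root.append(child)
assert index_of_parent(root, child) == 0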