epub-translator 0.1.5-py3-none-any.whl → 0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +1 -2
- epub_translator/data/translate.jinja +3 -0
- epub_translator/epub/__init__.py +1 -1
- epub_translator/llm/context.py +10 -1
- epub_translator/llm/core.py +30 -3
- epub_translator/segment/__init__.py +1 -0
- epub_translator/segment/inline_segment.py +11 -1
- epub_translator/segment/text_segment.py +5 -10
- epub_translator/segment/utils.py +0 -16
- epub_translator/translation/__init__.py +2 -0
- epub_translator/{epub_transcode.py → translation/epub_transcode.py} +2 -2
- epub_translator/{punctuation.py → translation/punctuation.py} +1 -1
- epub_translator/{translator.py → translation/translator.py} +8 -6
- epub_translator/{xml_interrupter.py → translation/xml_interrupter.py} +52 -28
- epub_translator/xml/__init__.py +1 -1
- epub_translator/xml/inline.py +48 -2
- epub_translator/xml_translator/concurrency.py +52 -0
- epub_translator/xml_translator/score.py +164 -0
- epub_translator/xml_translator/stream_mapper.py +145 -114
- epub_translator/xml_translator/submitter.py +5 -5
- epub_translator/xml_translator/translator.py +12 -18
- {epub_translator-0.1.5.dist-info → epub_translator-0.1.7.dist-info}/METADATA +37 -9
- epub_translator-0.1.7.dist-info/RECORD +63 -0
- epub_translator/data/mmltex/README.md +0 -67
- epub_translator/data/mmltex/cmarkup.xsl +0 -1106
- epub_translator/data/mmltex/entities.xsl +0 -459
- epub_translator/data/mmltex/glayout.xsl +0 -222
- epub_translator/data/mmltex/mmltex.xsl +0 -36
- epub_translator/data/mmltex/scripts.xsl +0 -375
- epub_translator/data/mmltex/tables.xsl +0 -130
- epub_translator/data/mmltex/tokens.xsl +0 -328
- epub_translator-0.1.5.dist-info/RECORD +0 -68
- /epub_translator/{language.py → translation/language.py} +0 -0
- /epub_translator/xml/{firendly → friendly}/__init__.py +0 -0
- /epub_translator/xml/{firendly → friendly}/decoder.py +0 -0
- /epub_translator/xml/{firendly → friendly}/encoder.py +0 -0
- /epub_translator/xml/{firendly → friendly}/parser.py +0 -0
- /epub_translator/xml/{firendly → friendly}/tag.py +0 -0
- /epub_translator/xml/{firendly → friendly}/transform.py +0 -0
- {epub_translator-0.1.5.dist-info → epub_translator-0.1.7.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.5.dist-info → epub_translator-0.1.7.dist-info}/WHEEL +0 -0
epub_translator/xml_translator/score.py (new file)

@@ -0,0 +1,164 @@
+from collections.abc import Generator
+from dataclasses import dataclass
+from enum import Enum, auto
+
+from tiktoken import Encoding
+
+from ..segment import InlineSegment, TextSegment
+from .common import DATA_ORIGIN_LEN_KEY
+
+_ID_WEIGHT = 80
+_ELLIPSIS = "..."
+
+
+@dataclass
+class ScoreSegment:
+    text_segment: TextSegment
+    left_parents: list[InlineSegment]
+    right_parents: list[InlineSegment]
+    text_tokens: list[int]
+    score: int
+
+
+def expand_to_score_segments(encoding: Encoding, inline_segment: InlineSegment) -> Generator[ScoreSegment, None, None]:
+    for i, score_segment in enumerate(_do_expand_inline_segment(inline_segment)):
+        xml_text = "".join(
+            _render_score_segment(
+                score_segment=score_segment,
+                is_first=(i == 0),
+            )
+        )
+        score_segment.text_tokens = encoding.encode(score_segment.text_segment.text)
+        score_segment.score = len(encoding.encode(xml_text)) + sum(
+            _ID_WEIGHT for parent in score_segment.left_parents if parent.id is not None
+        )
+        yield score_segment
+
+
+def truncate_score_segment(
+    encoding: Encoding,
+    score_segment: ScoreSegment,
+    remain_head: bool,
+    remain_score: int,
+):
+    fixed_score = score_segment.score - len(score_segment.text_tokens)
+    if remain_score <= fixed_score:
+        # Truncation can only reduce the token count of the text portion.
+        # The tokens taken by the XML tags themselves, plus the weighted ID score, belong to fixed_score and cannot be cut.
+        # If the budget could only be met by deleting all of the text, drop the whole segment instead.
+        return None
+
+    remain_text_tokens_count = remain_score - fixed_score
+
+    # remain_text_tokens_count cannot be 0 here
+    if remain_head:
+        remain_text = encoding.decode(score_segment.text_tokens[:remain_text_tokens_count])
+    else:
+        remain_text = encoding.decode(score_segment.text_tokens[-remain_text_tokens_count:])
+
+    if not remain_text.strip():
+        return None
+
+    if remain_head:
+        remain_text = f"{remain_text} {_ELLIPSIS}"
+    else:
+        remain_text = f"{_ELLIPSIS} {remain_text}"
+
+    text_segment = score_segment.text_segment.clone()
+    text_segment.text = remain_text
+
+    return ScoreSegment(
+        text_segment=text_segment,
+        left_parents=score_segment.left_parents,
+        right_parents=score_segment.right_parents,
+        text_tokens=encoding.encode(remain_text),
+        score=remain_text_tokens_count + fixed_score,
+    )
+
+
+def _render_score_segment(score_segment: ScoreSegment, is_first: bool):
+    for i, parent in enumerate(score_segment.left_parents):
+        yield "<"
+        yield parent.parent.tag
+        if parent.id is not None:
+            yield ' id="99"'
+        if is_first and i == 0:
+            yield " "
+            yield DATA_ORIGIN_LEN_KEY
+            yield '="9999"'
+        yield ">"
+
+    yield score_segment.text_segment.text
+
+    for parent in reversed(score_segment.right_parents):
+        yield "</"
+        yield parent.parent.tag
+        yield ">"
+
+
+def _do_expand_inline_segment(inline_segment: InlineSegment):
+    text_segment: TextSegment | None = None
+    left_parents: list[InlineSegment] = []
+    right_parents: list[InlineSegment] = []
+
+    for item in _expand_as_wrapped(inline_segment):
+        if isinstance(item, TextSegment):
+            if text_segment is None:
+                text_segment = item
+            else:
+                yield ScoreSegment(
+                    text_segment=text_segment,
+                    left_parents=left_parents,
+                    right_parents=right_parents,
+                    text_tokens=[],
+                    score=0,
+                )
+                text_segment = item
+                left_parents = []
+                right_parents = []
+
+        elif isinstance(item, tuple):
+            child_inline_segment, orientation = item
+            if orientation == _Orientation.UP:
+                if text_segment is not None:
+                    yield ScoreSegment(
+                        text_segment=text_segment,
+                        left_parents=left_parents,
+                        right_parents=right_parents,
+                        text_tokens=[],
+                        score=0,
+                    )
+                    text_segment = None
+                    left_parents = []
+                    right_parents = []
+                left_parents.append(child_inline_segment)
+
+            elif orientation == _Orientation.DOWN:
+                if text_segment is None:
+                    left_parents.clear()
+                else:
+                    right_parents.append(child_inline_segment)
+
+    if text_segment is not None:
+        yield ScoreSegment(
+            text_segment=text_segment,
+            left_parents=left_parents,
+            right_parents=right_parents,
+            text_tokens=[],
+            score=0,
+        )
+
+
+class _Orientation(Enum):
+    DOWN = auto()
+    UP = auto()
+
+
+def _expand_as_wrapped(inline_segment: InlineSegment):
+    yield (inline_segment, _Orientation.UP)
+    for child in inline_segment.children:
+        if isinstance(child, InlineSegment):
+            yield from _expand_as_wrapped(child)
+        elif isinstance(child, TextSegment):
+            yield child
+    yield (inline_segment, _Orientation.DOWN)
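The score computed above is the token length of the segment rendered as XML plus a flat `_ID_WEIGHT` of 80 for every `id`-bearing parent, and `truncate_score_segment` can spend its budget only on the text tokens; the tag and ID overhead is fixed. A minimal standalone sketch of that arithmetic (the `cl100k_base` encoding and the `data-origin-len` attribute name are illustrative assumptions; the real attribute comes from `DATA_ORIGIN_LEN_KEY`):

```python
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # assumed encoding for this demo

text = "The quick brown fox jumps over the lazy dog."
# Rendered roughly as _render_score_segment does, with its placeholder id/length
# values; "data-origin-len" stands in for DATA_ORIGIN_LEN_KEY here.
xml_text = f'<span id="99" data-origin-len="9999">{text}</span>'

text_tokens = encoding.encode(text)
score = len(encoding.encode(xml_text)) + 80   # one id-bearing parent adds _ID_WEIGHT
fixed_score = score - len(text_tokens)        # tag + ID overhead that cannot be trimmed

# Truncate to a budget the way truncate_score_segment does, keeping the head:
remain_score = fixed_score + 5                # leaves room for only 5 text tokens
remain_text = encoding.decode(text_tokens[: remain_score - fixed_score])
print(f"{remain_text} ...")                   # roughly "The quick brown fox jumps ..."
```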
epub_translator/xml_translator/stream_mapper.py

@@ -1,4 +1,5 @@
 from collections.abc import Callable, Generator, Iterable, Iterator
+from typing import TypeVar
 from xml.etree.ElementTree import Element
 
 from resource_segmentation import Group, Resource, Segment, split
@@ -6,11 +7,14 @@ from tiktoken import Encoding
 
 from ..segment import InlineSegment, TextSegment, search_inline_segments, search_text_segments
 from .callbacks import Callbacks
+from .concurrency import run_concurrency
+from .score import ScoreSegment, expand_to_score_segments, truncate_score_segment
 
 _PAGE_INCISION = 0
 _BLOCK_INCISION = 1
+_T = TypeVar("_T")
 
-
+_ResourcePayload = tuple[InlineSegment, list[ScoreSegment]]
 
 
 InlineSegmentMapping = tuple[Element, list[TextSegment]]
@@ -18,23 +22,33 @@ InlineSegmentGroupMap = Callable[[list[InlineSegment]], list[InlineSegmentMapping]]
 
 
 class XMLStreamMapper:
-    def __init__(self, encoding: Encoding,
+    def __init__(self, encoding: Encoding, max_group_score: int) -> None:
         self._encoding: Encoding = encoding
-        self.
+        self._max_group_score: int = max_group_score
 
     def map_stream(
         self,
         elements: Iterator[Element],
         callbacks: Callbacks,
         map: InlineSegmentGroupMap,
+        concurrency: int,
     ) -> Generator[tuple[Element, list[InlineSegmentMapping]], None, None]:
         current_element: Element | None = None
         mapping_buffer: list[InlineSegmentMapping] = []
 
-
+        def execute(group: Group[_ResourcePayload]):
             head, body, tail = self._truncate_and_transform_group(group)
+            head = [segment.clone() for segment in head]
+            tail = [segment.clone() for segment in tail]
             target_body = map(head + body + tail)[len(head) : len(head) + len(body)]
-
+            return zip(body, target_body, strict=False)
+
+        for mapping_pairs in run_concurrency(
+            parameters=self._split_into_serial_groups(elements, callbacks),
+            execute=execute,
+            concurrency=concurrency,
+        ):
+            for origin, target in mapping_pairs:
                 origin_element = origin.head.root
                 if current_element is None:
                     current_element = origin_element
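`run_concurrency` is imported from the new `concurrency.py` module (+52 lines, whose hunks are not included in this diff body). The call site above implies its contract: run `execute` over each group, at most `concurrency` at a time, and yield results in input order so the element stream can be stitched back together. A minimal sketch of that contract under those assumptions, not the package's actual implementation:

```python
from collections.abc import Callable, Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor
from typing import TypeVar

_P = TypeVar("_P")
_R = TypeVar("_R")

def run_concurrency(parameters: Iterable[_P], execute: Callable[[_P], _R], concurrency: int) -> Iterator[_R]:
    if concurrency <= 1:
        # Serial fallback: preserves the pre-0.1.7 behavior.
        for parameter in parameters:
            yield execute(parameter)
        return
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        # pool.map yields results in submission order, which the stream
        # mapper relies on to reassemble translated groups in sequence.
        yield from pool.map(execute, parameters)
```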
@@ -58,7 +72,7 @@ class XMLStreamMapper:
         def generate():
             for element in elements:
                 yield from split(
-                    max_segment_count=self.
+                    max_segment_count=self._max_group_score,
                     border_incision=_PAGE_INCISION,
                     resources=self._expand_to_resources(element, callbacks),
                 )
@@ -79,7 +93,7 @@ class XMLStreamMapper:
             next_sum_body_count = sum(x.count for x in self._expand_resource_segments(next_group.body))
             next_sum_count = sum_count + next_sum_body_count
 
-            if next_sum_count + next_group.tail_remain_count > self.
+            if next_sum_count + next_group.tail_remain_count > self._max_group_score:
                 yield group
                 group = next_group
                 sum_count = group.head_remain_count + next_sum_body_count
@@ -91,23 +105,25 @@ class XMLStreamMapper:
 
         yield group
 
-    def _truncate_and_transform_group(
-
-
-
-
-
-
+    def _truncate_and_transform_group(
+        self, group: Group[_ResourcePayload]
+    ) -> tuple[list[InlineSegment], list[InlineSegment], list[InlineSegment]]:
+        head = self._truncate_group_gap(
+            gap=group.head,
+            remain_head=False,
+            remain_score=group.head_remain_count,
         )
-        body =
-        tail =
-
-
-
-
-
+        body = self._expand_inline_segments(group.body)
+        tail = self._truncate_group_gap(
+            gap=group.tail,
+            remain_head=True,
+            remain_score=group.tail_remain_count,
+        )
+        return (
+            [r.payload[0] for r in head],
+            [p[0] for p in body],
+            [r.payload[0] for r in tail],
         )
-        return head, body, tail
 
     def _expand_to_resources(self, element: Element, callbacks: Callbacks):
         def expand(element: Element):
@@ -131,123 +147,138 @@ class XMLStreamMapper:
             else:
                 end_incision = _PAGE_INCISION
 
-            yield
-
+            yield self._transform_to_resource(
+                inline_segment=inline_segment,
                 start_incision=start_incision,
                 end_incision=end_incision,
-                payload=inline_segment,
             )
             inline_segment = next_inline_segment
             start_incision = end_incision
 
-        yield
-
+        yield self._transform_to_resource(
+            inline_segment=inline_segment,
            start_incision=start_incision,
            end_incision=_PAGE_INCISION,
-            payload=inline_segment,
        )
 
-    def
-
-
-
-
-
-
-
-
-
+    def _transform_to_resource(
+        self,
+        inline_segment: InlineSegment,
+        start_incision: int,
+        end_incision: int,
+    ) -> Resource[_ResourcePayload]:
+        source_segments = list(
+            expand_to_score_segments(
+                encoding=self._encoding,
+                inline_segment=inline_segment,
+            )
+        )
+        return Resource(
+            count=sum(segment.score for segment in source_segments),
+            start_incision=start_incision,
+            end_incision=end_incision,
+            payload=(inline_segment, source_segments),
         )
-        yield from search_inline_segments(truncated_text_segments)
 
-    def _expand_inline_segments(self, items: list[Resource[
+    def _expand_inline_segments(self, items: list[Resource[_ResourcePayload] | Segment[_ResourcePayload]]):
         for resource in self._expand_resource_segments(items):
             yield resource.payload
 
-    def _expand_resource_segments(self, items: list[Resource[
+    def _expand_resource_segments(self, items: list[Resource[_ResourcePayload] | Segment[_ResourcePayload]]):
         for item in items:
             if isinstance(item, Resource):
                 yield item
             elif isinstance(item, Segment):
                 yield from item.resources
 
-    def
-
-
-
+    def _truncate_group_gap(
+        self,
+        gap: list[Resource[_ResourcePayload] | Segment[_ResourcePayload]],
+        remain_head: bool,
+        remain_score: int,
+    ):
+        def expand_resource_segments(items: list[Resource[_ResourcePayload] | Segment[_ResourcePayload]]):
+            for item in items:
+                if isinstance(item, Resource):
+                    yield item
+                elif isinstance(item, Segment):
+                    yield from item.resources
+
+        resources, remain_score = _truncate_items(
+            items=expand_resource_segments(gap),
+            score=lambda resource: resource.count,
+            remain_head=remain_head,
+            remain_score=remain_score,
+        )
+        if remain_score > 0:
+            resource = resources.pop() if remain_head else resources.pop(0)
+            inline_segment, score_segments = resource.payload
+            score_segments, remain_score = _truncate_items(
+                items=score_segments,
+                score=lambda score_segment: score_segment.score,
                remain_head=remain_head,
-
+                remain_score=remain_score,
            )
-
-
-
-
-
-
-
-
+            if remain_score > 0:
+                score_segment = score_segments.pop() if remain_head else score_segments.pop(0)
+                score_segment = truncate_score_segment(
+                    score_segment=score_segment,
+                    encoding=self._encoding,
+                    remain_head=remain_head,
+                    remain_score=remain_score,
+                )
+                if score_segment is not None:
+                    if remain_head:
+                        score_segments.append(score_segment)
+                    else:
+                        score_segments.insert(0, score_segment)
+
+            inline_segment = next(
+                search_inline_segments(s.text_segment for s in score_segments),
+                None,
            )
-            )
 
-
-
-
-
-
-
-            tokens_count = len(tokens)
-
-            if tokens_count > remain_count:
-                truncated_segment = self._truncate_text_segment(
-                    segment=segment,
-                    tokens=tokens,
-                    raw_xml_text=raw_xml_text,
-                    remain_head=remain_head,
-                    remain_count=remain_count,
+            if inline_segment is not None:
+                resource = Resource(
+                    count=sum(s.score for s in score_segments),
+                    start_incision=resource.start_incision,
+                    end_incision=resource.end_incision,
+                    payload=(inline_segment, score_segments),
                 )
-                if
-
-
+                if remain_head:
+                    resources.append(resource)
+                else:
+                    resources.insert(0, resource)
 
-
-            remain_count -= tokens_count
+        return resources
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if not remain_text.strip():
-                return None
-
-            if remain_head:
-                segment.text = f"{remain_text} {_ELLIPSIS}"
-            else:
-                segment.text = f"{_ELLIPSIS} {remain_text}"
-            return segment
+
+def _truncate_items(items: Iterable[_T], score: Callable[[_T], int], remain_head: bool, remain_score: int):
+    truncated_items = list(items)
+    if not truncated_items:
+        return truncated_items, 0
+
+    if not remain_head:
+        truncated_items.reverse()
+
+    truncated_index: int | None = None
+    for i, item in enumerate(truncated_items):
+        item_score = score(item)
+        remain_score -= item_score
+        if remain_score <= 0:
+            truncated_index = i
+            break
+
+    if truncated_index is not None:
+        while len(truncated_items) > truncated_index + 1:
+            truncated_items.pop()
+
+    if truncated_items and remain_score < 0:
+        remain_score = score(truncated_items[-1]) + remain_score
+    else:
+        remain_score = 0
+
+    if not remain_head:
+        truncated_items.reverse()
+
+    return truncated_items, remain_score
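The module-level `_truncate_items` helper added at the end is generic: it keeps items from the head (or the tail, when `remain_head` is false) until the score budget runs out, and returns the leftover budget that falls inside the last partially covered item so the caller can truncate that item further. A small demonstration using plain integers as both items and scores (the helper body is copied from the hunk above):

```python
from collections.abc import Callable, Iterable
from typing import TypeVar

_T = TypeVar("_T")

# Copied from the stream_mapper.py hunk above.
def _truncate_items(items: Iterable[_T], score: Callable[[_T], int], remain_head: bool, remain_score: int):
    truncated_items = list(items)
    if not truncated_items:
        return truncated_items, 0

    if not remain_head:
        truncated_items.reverse()

    truncated_index: int | None = None
    for i, item in enumerate(truncated_items):
        item_score = score(item)
        remain_score -= item_score
        if remain_score <= 0:
            truncated_index = i
            break

    if truncated_index is not None:
        while len(truncated_items) > truncated_index + 1:
            truncated_items.pop()

    if truncated_items and remain_score < 0:
        remain_score = score(truncated_items[-1]) + remain_score
    else:
        remain_score = 0

    if not remain_head:
        truncated_items.reverse()

    return truncated_items, remain_score

# Budget 9 from the head: 4 and 3 fit whole, the budget ends 2 into the 5.
kept, leftover = _truncate_items([4, 3, 5, 2], score=lambda n: n, remain_head=True, remain_score=9)
assert (kept, leftover) == ([4, 3, 5], 2)   # caller truncates the last item down to score 2

# Budget 6 from the tail: the 2 fits whole, the budget ends 4 into the 5.
kept, leftover = _truncate_items([4, 3, 5, 2], score=lambda n: n, remain_head=False, remain_score=6)
assert (kept, leftover) == ([5, 2], 4)      # caller truncates the first kept item, keeping its tail
```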
epub_translator/xml_translator/submitter.py

@@ -4,7 +4,7 @@ from enum import Enum, auto
 from xml.etree.ElementTree import Element
 
 from ..segment import TextSegment, combine_text_segments
-from ..xml import index_of_parent,
+from ..xml import index_of_parent, is_inline_element, iter_with_stack
 from .stream_mapper import InlineSegmentMapping
 
 
@@ -78,7 +78,7 @@ class _Submitter:
         preserved_elements: list[Element] = []
         if self._action == SubmitKind.REPLACE:
             for child in list(node.raw_element):
-                if not
+                if not is_inline_element(child):
                     child.tail = None
                     preserved_elements.append(child)
 
@@ -87,7 +87,7 @@ class _Submitter:
 
         if combined is not None:
             # In APPEND_BLOCK mode, prepend a space to the text when it is an inline tag
-            if self._action == SubmitKind.APPEND_BLOCK and
+            if self._action == SubmitKind.APPEND_BLOCK and is_inline_element(combined) and combined.text:
                 combined.text = " " + combined.text
             parent.insert(index + 1, combined)
             index += 1
@@ -200,7 +200,7 @@ class _Submitter:
         preserved_elements: list[Element] = []
         for i in range(start_index, end_index):
             elem = node_element[i]
-            if not
+            if not is_inline_element(elem):
                 elem.tail = None
                 preserved_elements.append(elem)
 
@@ -223,7 +223,7 @@ class _Submitter:
 
         if combined.text:
             will_inject_space = self._action == SubmitKind.APPEND_TEXT or (
-
+                is_inline_element(combined) and self._action == SubmitKind.APPEND_BLOCK
             )
             if tail_element is not None:
                 tail_element.tail = self._append_text_in_element(
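Every hunk in this file replaces an inlined check with the shared `is_inline_element` predicate now exported from `..xml` (the file list shows `epub_translator/xml/inline.py` growing by 48 added lines, but its hunks are not included in this view). A hypothetical sketch of such a predicate, with an assumed inline-tag set:

```python
from xml.etree.ElementTree import Element

# Assumed set of inline XHTML tags; the package's actual list may differ.
_INLINE_TAGS = {
    "a", "abbr", "b", "code", "em", "i",
    "small", "span", "strong", "sub", "sup",
}

def is_inline_element(element: Element) -> bool:
    # ElementTree qualifies tags as "{http://www.w3.org/1999/xhtml}span";
    # strip the namespace before comparing.
    tag = element.tag.rsplit("}", 1)[-1]
    return tag.lower() in _INLINE_TAGS
```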
epub_translator/xml_translator/translator.py

@@ -31,7 +31,7 @@ class XMLTranslator:
         ignore_translated_error: bool,
         max_retries: int,
         max_fill_displaying_errors: int,
-
+        max_group_score: int,
         cache_seed_content: str | None = None,
     ) -> None:
         self._translation_llm: LLM = translation_llm
@@ -44,12 +44,13 @@ class XMLTranslator:
         self._cache_seed_content: str | None = cache_seed_content
         self._stream_mapper: XMLStreamMapper = XMLStreamMapper(
             encoding=translation_llm.encoding,
-
+            max_group_score=max_group_score,
         )
 
     def translate_element(
         self,
         task: TranslationTask[T],
+        concurrency: int = 1,
         interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
         interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
         interrupt_block_element: Callable[[Element], Element] | None = None,
@@ -57,6 +58,7 @@ class XMLTranslator:
     ) -> tuple[Element, T]:
         for translated in self.translate_elements(
             tasks=((task),),
+            concurrency=concurrency,
             interrupt_source_text_segments=interrupt_source_text_segments,
             interrupt_translated_text_segments=interrupt_translated_text_segments,
             interrupt_block_element=interrupt_block_element,
@@ -69,6 +71,7 @@ class XMLTranslator:
     def translate_elements(
         self,
         tasks: Iterable[TranslationTask[T]],
+        concurrency: int = 1,
         interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
         interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
         interrupt_block_element: Callable[[Element], Element] | None = None,
@@ -90,6 +93,7 @@ class XMLTranslator:
         for element, mappings in self._stream_mapper.map_stream(
             elements=generate_elements(),
             callbacks=callbacks,
+            concurrency=concurrency,
             map=lambda inline_segments: self._translate_inline_segments(
                 inline_segments=inline_segments,
                 callbacks=callbacks,
@@ -117,8 +121,7 @@ class XMLTranslator:
                 inline_segments=inline_segments,
             ),
         )
-
-        source_text = "".join(self._render_text_segments(text_segments))
+        source_text = "".join(self._render_source_text_parts(inline_segments))
         translated_text = self._translate_text(source_text)
 
         self._request_and_submit(
@@ -137,21 +140,12 @@ class XMLTranslator:
 
         return mappings
 
-    def
-
-
-        segment = next(iterator, None)
-        if segment is None:
-            return
-        while True:
-            next_segment = next(iterator, None)
-            if next_segment is None:
-                break
-            yield segment.text
-            if id(segment.block_parent) != id(next_segment.block_parent):
+    def _render_source_text_parts(self, inline_segments: list[InlineSegment]):
+        for i, inline_segment in enumerate(inline_segments):
+            if i > 0:
                 yield "\n\n"
-
-
+            for text_segment in inline_segment:
+                yield text_segment.text
 
     def _translate_text(self, text: str) -> str:
         with self._translation_llm.context(cache_seed_content=self._cache_seed_content) as ctx: