epub-translator 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +1 -2
- epub_translator/epub/__init__.py +1 -1
- epub_translator/llm/context.py +10 -1
- epub_translator/llm/core.py +30 -3
- epub_translator/segment/inline_segment.py +11 -1
- epub_translator/segment/text_segment.py +0 -4
- epub_translator/translation/__init__.py +2 -0
- epub_translator/{epub_transcode.py → translation/epub_transcode.py} +2 -2
- epub_translator/{punctuation.py → translation/punctuation.py} +1 -1
- epub_translator/{translator.py → translation/translator.py} +8 -6
- epub_translator/{xml_interrupter.py → translation/xml_interrupter.py} +2 -2
- epub_translator/xml/__init__.py +1 -1
- epub_translator/xml_translator/concurrency.py +52 -0
- epub_translator/xml_translator/score.py +164 -0
- epub_translator/xml_translator/stream_mapper.py +145 -114
- epub_translator/xml_translator/submitter.py +28 -10
- epub_translator/xml_translator/translator.py +12 -18
- {epub_translator-0.1.4.dist-info → epub_translator-0.1.6.dist-info}/METADATA +58 -8
- {epub_translator-0.1.4.dist-info → epub_translator-0.1.6.dist-info}/RECORD +28 -25
- /epub_translator/{language.py → translation/language.py} +0 -0
- /epub_translator/xml/{firendly → friendly}/__init__.py +0 -0
- /epub_translator/xml/{firendly → friendly}/decoder.py +0 -0
- /epub_translator/xml/{firendly → friendly}/encoder.py +0 -0
- /epub_translator/xml/{firendly → friendly}/parser.py +0 -0
- /epub_translator/xml/{firendly → friendly}/tag.py +0 -0
- /epub_translator/xml/{firendly → friendly}/transform.py +0 -0
- {epub_translator-0.1.4.dist-info → epub_translator-0.1.6.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.4.dist-info → epub_translator-0.1.6.dist-info}/WHEEL +0 -0
epub_translator/__init__.py
CHANGED
epub_translator/epub/__init__.py
CHANGED
epub_translator/llm/context.py
CHANGED
@@ -1,5 +1,6 @@
 import hashlib
 import json
+import threading
 import uuid
 from pathlib import Path
 from typing import Self
@@ -8,6 +9,9 @@ from .executor import LLMExecutor
 from .increasable import Increasable, Increaser
 from .types import Message, MessageRole
 
+# Global lock for cache file commit operations
+_CACHE_COMMIT_LOCK = threading.Lock()
+
 
 class LLMContext:
     def __init__(
@@ -101,7 +105,12 @@ class LLMContext:
         # Remove the .[context-id].txt suffix to get permanent name
         permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
         permanent_file = temp_file.parent / permanent_name
-
+
+        with _CACHE_COMMIT_LOCK:  # thread safety when running multi-threaded
+            if permanent_file.exists():
+                temp_file.unlink()
+            else:
+                temp_file.rename(permanent_file)
 
     def _rollback(self) -> None:
         for temp_file in self._temp_files:
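The new `_CACHE_COMMIT_LOCK` serializes the rename-or-discard step so that two contexts finishing at the same moment cannot race on the same cache file. A minimal standalone sketch of the same pattern (hypothetical helper and file names, not the library's API):

```python
import threading
from pathlib import Path

_COMMIT_LOCK = threading.Lock()

def commit(temp_file: Path, permanent_file: Path) -> None:
    # First committer wins; later committers discard their temp file.
    with _COMMIT_LOCK:
        if permanent_file.exists():
            temp_file.unlink()
        else:
            temp_file.rename(permanent_file)
```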
epub_translator/llm/core.py
CHANGED
@@ -1,4 +1,5 @@
 import datetime
+import threading
 from collections.abc import Generator
 from importlib.resources import files
 from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
@@ -14,6 +15,11 @@ from .executor import LLMExecutor
 from .increasable import Increasable
 from .types import Message
 
+# Global state for logger filename generation
+_LOGGER_LOCK = threading.Lock()
+_LAST_TIMESTAMP: str | None = None
+_LOGGER_SUFFIX_ID: int = 1
+
 
 class LLM:
     def __init__(
@@ -95,13 +101,34 @@ class LLM:
         return dir_path.resolve()
 
     def _create_logger(self) -> Logger | None:
+        # pylint: disable=global-statement
+        global _LAST_TIMESTAMP, _LOGGER_SUFFIX_ID
+
        if self._logger_save_path is None:
            return None
 
         now = datetime.datetime.now(datetime.UTC)
-
-
-
+        # Use second-level precision for collision detection
+        timestamp_key = now.strftime("%Y-%m-%d %H-%M-%S")
+
+        with _LOGGER_LOCK:
+            if _LAST_TIMESTAMP == timestamp_key:
+                _LOGGER_SUFFIX_ID += 1
+                suffix_id = _LOGGER_SUFFIX_ID
+            else:
+                _LAST_TIMESTAMP = timestamp_key
+                _LOGGER_SUFFIX_ID = 1
+                suffix_id = 1
+
+        if suffix_id == 1:
+            file_name = f"request {timestamp_key}.log"
+            logger_name = f"LLM Request {timestamp_key}"
+        else:
+            file_name = f"request {timestamp_key}_{suffix_id}.log"
+            logger_name = f"LLM Request {timestamp_key}_{suffix_id}"
+
+        file_path = self._logger_save_path / file_name
+        logger = getLogger(logger_name)
         logger.setLevel(DEBUG)
         handler = FileHandler(file_path, encoding="utf-8")
         handler.setLevel(DEBUG)
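With concurrent requests, two loggers created within the same second would otherwise collide on one file name; the lock plus suffix counter yields `request <ts>.log`, `request <ts>_2.log`, and so on. A compressed sketch of that naming logic (hypothetical helper mirroring `_create_logger`):

```python
import threading

_lock = threading.Lock()
_last_key: str | None = None
_suffix_id = 1

def log_file_name(timestamp_key: str) -> str:
    # Same second as the previous logger -> append _2, _3, ...
    global _last_key, _suffix_id
    with _lock:
        if _last_key == timestamp_key:
            _suffix_id += 1
            suffix = _suffix_id
        else:
            _last_key, _suffix_id = timestamp_key, 1
            suffix = 1
    if suffix == 1:
        return f"request {timestamp_key}.log"
    return f"request {timestamp_key}_{suffix}.log"
```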
epub_translator/segment/inline_segment.py
CHANGED
@@ -47,6 +47,7 @@ def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["I
             inline_segment = _pop_stack_data(stack_data)
             stack_data = None
             if inline_segment:
+                inline_segment.id = 0
                 yield inline_segment
 
         if stack_data is None:
@@ -73,6 +74,7 @@ def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["I
     if stack_data is not None:
         inline_segment = _pop_stack_data(stack_data)
         if inline_segment:
+            inline_segment.id = 0
             yield inline_segment
 
 
@@ -115,7 +117,7 @@ class InlineSegment:
         self._child_tag2ids: dict[str, list[int]] = {}
         self._child_tag2count: dict[str, int] = {}
 
-        next_temp_id: int =
+        next_temp_id: int = 1
         terms = nest((child.parent.tag, child) for child in children if isinstance(child, InlineSegment))
 
         for tag, child_terms in terms.items():
@@ -162,6 +164,14 @@ class InlineSegment:
         elif isinstance(child, InlineSegment):
             yield from child
 
+    def clone(self) -> "InlineSegment":
+        cloned_segment = InlineSegment(
+            depth=len(self._parent_stack),
+            children=[child.clone() for child in self._children],
+        )
+        cloned_segment.id = self.id
+        return cloned_segment
+
     def recreate_ids(self, id_generator: IDGenerator) -> None:
         self._child_tag2count.clear()
         self._child_tag2ids.clear()
epub_translator/segment/text_segment.py
CHANGED
@@ -33,10 +33,6 @@ class TextSegment:
     def block_parent(self) -> Element:
         return self.parent_stack[self.block_depth - 1]
 
-    @property
-    def xml_text(self) -> str:
-        return "".join(_expand_xml_texts(self))
-
     def strip_block_parents(self) -> Self:
         self.parent_stack = self.parent_stack[self.block_depth - 1 :]
         self.block_depth = 1
epub_translator/{translator.py → translation/translator.py}
RENAMED
@@ -5,7 +5,7 @@ from importlib.metadata import version as get_package_version
 from os import PathLike
 from pathlib import Path
 
-from .epub import (
+from ..epub import (
     Zip,
     read_metadata,
     read_toc,
@@ -13,12 +13,12 @@ from .epub import (
     write_metadata,
     write_toc,
 )
+from ..llm import LLM
+from ..xml import XMLLikeNode, deduplicate_ids_in_element, find_first
+from ..xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
 from .epub_transcode import decode_metadata, decode_toc_list, encode_metadata, encode_toc_list
-from .llm import LLM
 from .punctuation import unwrap_french_quotes
-from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
 from .xml_interrupter import XMLInterrupter
-from .xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
 
 
 class _ElementType(Enum):
@@ -40,7 +40,8 @@ def translate(
     submit: SubmitKind,
     user_prompt: str | None = None,
     max_retries: int = 5,
-    max_group_tokens: int =
+    max_group_tokens: int = 2600,
+    concurrency: int = 1,
     llm: LLM | None = None,
     translation_llm: LLM | None = None,
     fill_llm: LLM | None = None,
@@ -62,7 +63,7 @@ def translate(
         ignore_translated_error=False,
         max_retries=max_retries,
         max_fill_displaying_errors=10,
-
+        max_group_score=max_group_tokens,
         cache_seed_content=f"{_get_version()}:{target_language}",
     )
     with Zip(
@@ -92,6 +93,7 @@ def translate(
     current_progress = 0.0
 
     for translated_elem, context in translator.translate_elements(
+        concurrency=concurrency,
        interrupt_source_text_segments=interrupter.interrupt_source_text_segments,
        interrupt_translated_text_segments=interrupter.interrupt_translated_text_segments,
        interrupt_block_element=interrupter.interrupt_block_element,
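For reference, the public entry point now accepts the two new keyword arguments; a call sketch using the defaults shown in this diff (paths are hypothetical):

```python
translate(
    source_path="book.epub",
    target_path="book.translated.epub",
    target_language="English",
    submit=SubmitKind.APPEND_BLOCK,
    llm=llm,
    max_group_tokens=2600,  # forwarded to XMLTranslator as max_group_score
    concurrency=1,          # >1 enables the new threaded pipeline
)
```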
epub_translator/{xml_interrupter.py → translation/xml_interrupter.py}
RENAMED
@@ -2,8 +2,8 @@ from collections.abc import Generator, Iterable
 from typing import cast
 from xml.etree.ElementTree import Element
 
-from .segment import TextSegment
-from .utils import ensure_list, normalize_whitespace
+from ..segment import TextSegment
+from ..utils import ensure_list, normalize_whitespace
 
 _ID_KEY = "__XML_INTERRUPTER_ID"
 _MATH_TAG = "math"
epub_translator/xml/__init__.py
CHANGED
epub_translator/xml_translator/concurrency.py
ADDED
@@ -0,0 +1,52 @@
+from collections import deque
+from collections.abc import Callable, Iterable
+from concurrent.futures import Future, ThreadPoolExecutor
+from typing import TypeVar
+
+P = TypeVar("P")
+R = TypeVar("R")
+
+
+def run_concurrency(
+    parameters: Iterable[P],
+    execute: Callable[[P], R],
+    concurrency: int,
+) -> Iterable[R]:
+    assert concurrency >= 1, "the concurrency must be at least 1"
+    # Fast path: concurrency == 1, no thread overhead
+    if concurrency == 1:
+        for param in parameters:
+            yield execute(param)
+        return
+
+    executor = ThreadPoolExecutor(max_workers=concurrency)
+    did_shutdown = False
+    try:
+        futures: deque[Future[R]] = deque()
+        params_iter = iter(parameters)
+        for _ in range(concurrency):
+            try:
+                param = next(params_iter)
+                future = executor.submit(execute, param)
+                futures.append(future)
+            except StopIteration:
+                break
+
+        while futures:
+            future = futures.popleft()
+            yield future.result()
+            try:
+                param = next(params_iter)
+                new_future = executor.submit(execute, param)
+                futures.append(new_future)
+            except StopIteration:
+                pass
+
+    except KeyboardInterrupt:
+        executor.shutdown(wait=False, cancel_futures=True)
+        did_shutdown = True
+        raise
+
+    finally:
+        if not did_shutdown:
+            executor.shutdown(wait=True)
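Because the helper always yields `future.result()` for the oldest outstanding future before submitting the next parameter, results come back in input order while at most `concurrency` calls are in flight. A minimal usage sketch with a hypothetical workload:

```python
import time

def slow_double(n: int) -> int:
    time.sleep(0.1)  # stand-in for a network call
    return n * 2

# Prints 0, 2, 4, ..., 14 in order, with up to four calls running at once.
for result in run_concurrency(parameters=range(8), execute=slow_double, concurrency=4):
    print(result)
```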
epub_translator/xml_translator/score.py
ADDED
@@ -0,0 +1,164 @@
+from collections.abc import Generator
+from dataclasses import dataclass
+from enum import Enum, auto
+
+from tiktoken import Encoding
+
+from ..segment import InlineSegment, TextSegment
+from .common import DATA_ORIGIN_LEN_KEY
+
+_ID_WEIGHT = 80
+_ELLIPSIS = "..."
+
+
+@dataclass
+class ScoreSegment:
+    text_segment: TextSegment
+    left_parents: list[InlineSegment]
+    right_parents: list[InlineSegment]
+    text_tokens: list[int]
+    score: int
+
+
+def expand_to_score_segments(encoding: Encoding, inline_segment: InlineSegment) -> Generator[ScoreSegment, None, None]:
+    for i, score_segment in enumerate(_do_expand_inline_segment(inline_segment)):
+        xml_text = "".join(
+            _render_score_segment(
+                score_segment=score_segment,
+                is_first=(i == 0),
+            )
+        )
+        score_segment.text_tokens = encoding.encode(score_segment.text_segment.text)
+        score_segment.score = len(encoding.encode(xml_text)) + sum(
+            _ID_WEIGHT for parent in score_segment.left_parents if parent.id is not None
+        )
+        yield score_segment
+
+
+def truncate_score_segment(
+    encoding: Encoding,
+    score_segment: ScoreSegment,
+    remain_head: bool,
+    remain_score: int,
+):
+    fixed_score = score_segment.score - len(score_segment.text_tokens)
+    if remain_score <= fixed_score:
+        # Truncation can only reduce the token count of the text portion.
+        # The tokens taken by the XML head/tail and the weighted ID cost belong to fixed_score and cannot be trimmed.
+        # If the budget could only be met by deleting all the text, drop the whole segment instead.
+        return None
+
+    remain_text_tokens_count = remain_score - fixed_score
+
+    # remain_text_tokens_count cannot be 0 here
+    if remain_head:
+        remain_text = encoding.decode(score_segment.text_tokens[:remain_text_tokens_count])
+    else:
+        remain_text = encoding.decode(score_segment.text_tokens[-remain_text_tokens_count:])
+
+    if not remain_text.strip():
+        return None
+
+    if remain_head:
+        remain_text = f"{remain_text} {_ELLIPSIS}"
+    else:
+        remain_text = f"{_ELLIPSIS} {remain_text}"
+
+    text_segment = score_segment.text_segment.clone()
+    text_segment.text = remain_text
+
+    return ScoreSegment(
+        text_segment=text_segment,
+        left_parents=score_segment.left_parents,
+        right_parents=score_segment.right_parents,
+        text_tokens=encoding.encode(remain_text),
+        score=remain_text_tokens_count + fixed_score,
+    )
+
+
+def _render_score_segment(score_segment: ScoreSegment, is_first: bool):
+    for i, parent in enumerate(score_segment.left_parents):
+        yield "<"
+        yield parent.parent.tag
+        if parent.id is not None:
+            yield ' id="99"'
+        if is_first and i == 0:
+            yield " "
+            yield DATA_ORIGIN_LEN_KEY
+            yield '="9999"'
+        yield ">"
+
+    yield score_segment.text_segment.text
+
+    for parent in reversed(score_segment.right_parents):
+        yield "</"
+        yield parent.parent.tag
+        yield ">"
+
+
+def _do_expand_inline_segment(inline_segment: InlineSegment):
+    text_segment: TextSegment | None = None
+    left_parents: list[InlineSegment] = []
+    right_parents: list[InlineSegment] = []
+
+    for item in _expand_as_wrapped(inline_segment):
+        if isinstance(item, TextSegment):
+            if text_segment is None:
+                text_segment = item
+            else:
+                yield ScoreSegment(
+                    text_segment=text_segment,
+                    left_parents=left_parents,
+                    right_parents=right_parents,
+                    text_tokens=[],
+                    score=0,
+                )
+                text_segment = item
+                left_parents = []
+                right_parents = []
+
+        elif isinstance(item, tuple):
+            child_inline_segment, orientation = item
+            if orientation == _Orientation.UP:
+                if text_segment is not None:
+                    yield ScoreSegment(
+                        text_segment=text_segment,
+                        left_parents=left_parents,
+                        right_parents=right_parents,
+                        text_tokens=[],
+                        score=0,
+                    )
+                    text_segment = None
+                    left_parents = []
+                    right_parents = []
+                left_parents.append(child_inline_segment)
+
+            elif orientation == _Orientation.DOWN:
+                if text_segment is None:
+                    left_parents.clear()
+                else:
+                    right_parents.append(child_inline_segment)
+
+    if text_segment is not None:
+        yield ScoreSegment(
+            text_segment=text_segment,
+            left_parents=left_parents,
+            right_parents=right_parents,
+            text_tokens=[],
+            score=0,
+        )
+
+
+class _Orientation(Enum):
+    DOWN = auto()
+    UP = auto()
+
+
+def _expand_as_wrapped(inline_segment: InlineSegment):
+    yield (inline_segment, _Orientation.UP)
+    for child in inline_segment.children:
+        if isinstance(child, InlineSegment):
+            yield from _expand_as_wrapped(child)
+        elif isinstance(child, TextSegment):
+            yield child
+    yield (inline_segment, _Orientation.DOWN)
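A segment's score is the token count of its rendered XML plus a flat `_ID_WEIGHT` of 80 for every id-bearing opening tag, so id-heavy markup is grouped more conservatively. A rough illustration (assuming the `cl100k_base` encoding here; the library uses the encoding of the configured LLM):

```python
from tiktoken import get_encoding

encoding = get_encoding("cl100k_base")
xml_text = '<em id="99">hello world</em>'   # "99" is the placeholder id from _render_score_segment
score = len(encoding.encode(xml_text)) + 80  # structural tokens + one ID weight
print(score)
```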
epub_translator/xml_translator/stream_mapper.py
CHANGED
@@ -1,4 +1,5 @@
 from collections.abc import Callable, Generator, Iterable, Iterator
+from typing import TypeVar
 from xml.etree.ElementTree import Element
 
 from resource_segmentation import Group, Resource, Segment, split
@@ -6,11 +7,14 @@ from tiktoken import Encoding
 
 from ..segment import InlineSegment, TextSegment, search_inline_segments, search_text_segments
 from .callbacks import Callbacks
+from .concurrency import run_concurrency
+from .score import ScoreSegment, expand_to_score_segments, truncate_score_segment
 
 _PAGE_INCISION = 0
 _BLOCK_INCISION = 1
+_T = TypeVar("_T")
 
-
+_ResourcePayload = tuple[InlineSegment, list[ScoreSegment]]
 
 
 InlineSegmentMapping = tuple[Element, list[TextSegment]]
@@ -18,23 +22,33 @@ InlineSegmentGroupMap = Callable[[list[InlineSegment]], list[InlineSegmentMappin
 
 
 class XMLStreamMapper:
-    def __init__(self, encoding: Encoding,
+    def __init__(self, encoding: Encoding, max_group_score: int) -> None:
         self._encoding: Encoding = encoding
-        self.
+        self._max_group_score: int = max_group_score
 
     def map_stream(
         self,
         elements: Iterator[Element],
         callbacks: Callbacks,
         map: InlineSegmentGroupMap,
+        concurrency: int,
     ) -> Generator[tuple[Element, list[InlineSegmentMapping]], None, None]:
         current_element: Element | None = None
         mapping_buffer: list[InlineSegmentMapping] = []
 
-
+        def execute(group: Group[_ResourcePayload]):
             head, body, tail = self._truncate_and_transform_group(group)
+            head = [segment.clone() for segment in head]
+            tail = [segment.clone() for segment in tail]
             target_body = map(head + body + tail)[len(head) : len(head) + len(body)]
-
+            return zip(body, target_body, strict=False)
+
+        for mapping_pairs in run_concurrency(
+            parameters=self._split_into_serial_groups(elements, callbacks),
+            execute=execute,
+            concurrency=concurrency,
+        ):
+            for origin, target in mapping_pairs:
                 origin_element = origin.head.root
                 if current_element is None:
                     current_element = origin_element
@@ -58,7 +72,7 @@ class XMLStreamMapper:
        def generate():
            for element in elements:
                yield from split(
-                    max_segment_count=self.
+                    max_segment_count=self._max_group_score,
                     border_incision=_PAGE_INCISION,
                     resources=self._expand_to_resources(element, callbacks),
                 )
@@ -79,7 +93,7 @@ class XMLStreamMapper:
             next_sum_body_count = sum(x.count for x in self._expand_resource_segments(next_group.body))
             next_sum_count = sum_count + next_sum_body_count
 
-            if next_sum_count + next_group.tail_remain_count > self.
+            if next_sum_count + next_group.tail_remain_count > self._max_group_score:
                 yield group
                 group = next_group
                 sum_count = group.head_remain_count + next_sum_body_count
@@ -91,23 +105,25 @@ class XMLStreamMapper:
 
         yield group
 
-    def _truncate_and_transform_group(
-
-
-
-
-
-
+    def _truncate_and_transform_group(
+        self, group: Group[_ResourcePayload]
+    ) -> tuple[list[InlineSegment], list[InlineSegment], list[InlineSegment]]:
+        head = self._truncate_group_gap(
+            gap=group.head,
+            remain_head=False,
+            remain_score=group.head_remain_count,
         )
-        body =
-        tail =
-
-
-
-
-
+        body = self._expand_inline_segments(group.body)
+        tail = self._truncate_group_gap(
+            gap=group.tail,
+            remain_head=True,
+            remain_score=group.tail_remain_count,
+        )
+        return (
+            [r.payload[0] for r in head],
+            [p[0] for p in body],
+            [r.payload[0] for r in tail],
         )
-        return head, body, tail
 
     def _expand_to_resources(self, element: Element, callbacks: Callbacks):
         def expand(element: Element):
@@ -131,123 +147,138 @@ class XMLStreamMapper:
             else:
                 end_incision = _PAGE_INCISION
 
-            yield
-
+            yield self._transform_to_resource(
+                inline_segment=inline_segment,
                 start_incision=start_incision,
                 end_incision=end_incision,
-                payload=inline_segment,
             )
             inline_segment = next_inline_segment
             start_incision = end_incision
 
-        yield
-
+        yield self._transform_to_resource(
+            inline_segment=inline_segment,
             start_incision=start_incision,
             end_incision=_PAGE_INCISION,
-            payload=inline_segment,
         )
 
-    def
-
-
-
-
-
-
-
-
-
+    def _transform_to_resource(
+        self,
+        inline_segment: InlineSegment,
+        start_incision: int,
+        end_incision: int,
+    ) -> Resource[_ResourcePayload]:
+        source_segments = list(
+            expand_to_score_segments(
+                encoding=self._encoding,
+                inline_segment=inline_segment,
+            )
+        )
+        return Resource(
+            count=sum(segment.score for segment in source_segments),
+            start_incision=start_incision,
+            end_incision=end_incision,
+            payload=(inline_segment, source_segments),
        )
-        yield from search_inline_segments(truncated_text_segments)
 
-    def _expand_inline_segments(self, items: list[Resource[
+    def _expand_inline_segments(self, items: list[Resource[_ResourcePayload] | Segment[_ResourcePayload]]):
         for resource in self._expand_resource_segments(items):
             yield resource.payload
 
-    def _expand_resource_segments(self, items: list[Resource[
+    def _expand_resource_segments(self, items: list[Resource[_ResourcePayload] | Segment[_ResourcePayload]]):
         for item in items:
             if isinstance(item, Resource):
                 yield item
             elif isinstance(item, Segment):
                 yield from item.resources
 
-    def
-
-
-
+    def _truncate_group_gap(
+        self,
+        gap: list[Resource[_ResourcePayload] | Segment[_ResourcePayload]],
+        remain_head: bool,
+        remain_score: int,
+    ):
+        def expand_resource_segments(items: list[Resource[_ResourcePayload] | Segment[_ResourcePayload]]):
+            for item in items:
+                if isinstance(item, Resource):
+                    yield item
+                elif isinstance(item, Segment):
+                    yield from item.resources
+
+        resources, remain_score = _truncate_items(
+            items=expand_resource_segments(gap),
+            score=lambda resource: resource.count,
+            remain_head=remain_head,
+            remain_score=remain_score,
+        )
+        if remain_score > 0:
+            resource = resources.pop() if remain_head else resources.pop(0)
+            inline_segment, score_segments = resource.payload
+            score_segments, remain_score = _truncate_items(
+                items=score_segments,
+                score=lambda score_segment: score_segment.score,
                 remain_head=remain_head,
-
+                remain_score=remain_score,
             )
-
-
-
-
-
-
-
-
+            if remain_score > 0:
+                score_segment = score_segments.pop() if remain_head else score_segments.pop(0)
+                score_segment = truncate_score_segment(
+                    score_segment=score_segment,
+                    encoding=self._encoding,
+                    remain_head=remain_head,
+                    remain_score=remain_score,
+                )
+                if score_segment is not None:
+                    if remain_head:
+                        score_segments.append(score_segment)
+                    else:
+                        score_segments.insert(0, score_segment)
+
+            inline_segment = next(
+                search_inline_segments(s.text_segment for s in score_segments),
+                None,
            )
-            )
 
-
-
-
-
-
-
-            tokens_count = len(tokens)
-
-            if tokens_count > remain_count:
-                truncated_segment = self._truncate_text_segment(
-                    segment=segment,
-                    tokens=tokens,
-                    raw_xml_text=raw_xml_text,
-                    remain_head=remain_head,
-                    remain_count=remain_count,
+            if inline_segment is not None:
+                resource = Resource(
+                    count=sum(s.score for s in score_segments),
+                    start_incision=resource.start_incision,
+                    end_incision=resource.end_incision,
+                    payload=(inline_segment, score_segments),
                 )
-                if
-
-
+                if remain_head:
+                    resources.append(resource)
+                else:
+                    resources.insert(0, resource)
 
-
-            remain_count -= tokens_count
+        return resources
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if not remain_text.strip():
-                return None
-
-            if remain_head:
-                segment.text = f"{remain_text} {_ELLIPSIS}"
-            else:
-                segment.text = f"{_ELLIPSIS} {remain_text}"
-            return segment
+
+def _truncate_items(items: Iterable[_T], score: Callable[[_T], int], remain_head: bool, remain_score: int):
+    truncated_items = list(items)
+    if not truncated_items:
+        return truncated_items, 0
+
+    if not remain_head:
+        truncated_items.reverse()
+
+    truncated_index: int | None = None
+    for i, item in enumerate(truncated_items):
+        item_score = score(item)
+        remain_score -= item_score
+        if remain_score <= 0:
+            truncated_index = i
+            break
+
+    if truncated_index is not None:
+        while len(truncated_items) > truncated_index + 1:
+            truncated_items.pop()
+
+        if truncated_items and remain_score < 0:
+            remain_score = score(truncated_items[-1]) + remain_score
+    else:
+        remain_score = 0
+
+    if not remain_head:
+        truncated_items.reverse()
+
+    return truncated_items, remain_score
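The module-level `_truncate_items` walks items from the kept end, spends the score budget, drops everything past the item where the budget runs out, and reports how much budget is left to spend inside that boundary item. A minimal illustration with integer scores:

```python
items, leftover = _truncate_items(
    items=[3, 5, 4],
    score=lambda n: n,   # each item's score is the item itself
    remain_head=True,
    remain_score=10,     # 3 + 5 = 8 fits; the 4 overshoots by 2
)
print(items)     # [3, 5, 4] -- the boundary item stays in the list
print(leftover)  # 2 -- score units that may survive inside the boundary item
```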
epub_translator/xml_translator/submitter.py
CHANGED
@@ -122,11 +122,17 @@ class _Submitter:
             last_tail_element = child_element
 
         for text_segments, child_node in node.items:
-
+            anchor_element = _find_anchor_in_parent(node.raw_element, child_node.raw_element)
+            if anchor_element is None:
+                # Defensive programming: in theory anchor_element should never be None,
+                # because _nest_nodes has already verified containment via _check_includes.
+                continue
+
+            tail_element = tail_elements.get(id(anchor_element), None)
             items_preserved_elements: list[Element] = []
 
             if self._action == SubmitKind.REPLACE:
-                end_index = index_of_parent(node.raw_element,
+                end_index = index_of_parent(node.raw_element, anchor_element)
                 items_preserved_elements = self._remove_elements_after_tail(
                     node_element=node.raw_element,
                     tail_element=tail_element,
@@ -137,11 +143,11 @@ class _Submitter:
                 node_element=node.raw_element,
                 text_segments=text_segments,
                 tail_element=tail_element,
+                anchor_element=anchor_element,
                 append_to_end=False,
-                ref_element=child_node.raw_element,
             )
             if items_preserved_elements:
-                insert_position = index_of_parent(node.raw_element,
+                insert_position = index_of_parent(node.raw_element, anchor_element)
                 for i, elem in enumerate(items_preserved_elements):
                     node.raw_element.insert(insert_position + i, elem)
 
@@ -166,7 +172,7 @@ class _Submitter:
             node_element=node.raw_element,
             text_segments=node.tail_text_segments,
             tail_element=last_tail_element,
-
+            anchor_element=None,
             append_to_end=True,
         )
         if tail_preserved_elements:
@@ -208,7 +214,7 @@ class _Submitter:
         node_element: Element,
         text_segments: list[TextSegment],
         tail_element: Element | None,
-
+        anchor_element: Element | None,
         append_to_end: bool,
     ) -> None:
         combined = self._combine_text_segments(text_segments)
@@ -225,14 +231,14 @@ class _Submitter:
                 append_text=combined.text,
                 will_inject_space=will_inject_space,
             )
-        elif
+        elif anchor_element is None:
             node_element.text = self._append_text_in_element(
                 origin_text=node_element.text,
                 append_text=combined.text,
                 will_inject_space=will_inject_space,
             )
         else:
-            ref_index = index_of_parent(node_element,
+            ref_index = index_of_parent(node_element, anchor_element)
             if ref_index > 0:
                 # Append to the previous element's tail
                 prev_element = node_element[ref_index - 1]
@@ -253,10 +259,10 @@ class _Submitter:
             insert_position = index_of_parent(node_element, tail_element) + 1
         elif append_to_end:
             insert_position = len(node_element)
-        elif
+        elif anchor_element is not None:
             # Use ref_element to locate the insert position.
             # If the text was appended to the previous element's tail, insert after that element.
-            ref_index = index_of_parent(node_element,
+            ref_index = index_of_parent(node_element, anchor_element)
             if ref_index > 0:
                 # Insert after the previous element
                 insert_position = ref_index
@@ -346,6 +352,18 @@ def _nest_nodes(mappings: list[InlineSegmentMapping]) -> Generator[_Node, None,
         yield child_node
 
 
+def _find_anchor_in_parent(parent: Element, descendant: Element) -> Element | None:
+    for child in parent:
+        if child is descendant:
+            return descendant
+
+    for child in parent:
+        if _check_includes(child, descendant):
+            return child
+
+    return None
+
+
 def _fold_top_of_stack(stack: list[_Node]):
     child_node = stack.pop()
     if not stack:
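`_find_anchor_in_parent` resolves the direct child of `node.raw_element` that either is the referenced element or contains it, which is what makes the later `index_of_parent` calls safe. A small orientation sketch with plain ElementTree (hypothetical markup):

```python
from xml.etree.ElementTree import fromstring

root = fromstring("<div><p><em>hi</em></p></div>")
p = root[0]
em = p[0]
# _find_anchor_in_parent(root, em) would return p: em is not a direct child of
# root, so the containment scan (_check_includes) resolves to the wrapping p.
print(p.tag, em.tag)  # p em
```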
epub_translator/xml_translator/translator.py
CHANGED
@@ -31,7 +31,7 @@ class XMLTranslator:
         ignore_translated_error: bool,
         max_retries: int,
         max_fill_displaying_errors: int,
-
+        max_group_score: int,
         cache_seed_content: str | None = None,
     ) -> None:
         self._translation_llm: LLM = translation_llm
@@ -44,12 +44,13 @@ class XMLTranslator:
         self._cache_seed_content: str | None = cache_seed_content
         self._stream_mapper: XMLStreamMapper = XMLStreamMapper(
             encoding=translation_llm.encoding,
-
+            max_group_score=max_group_score,
         )
 
     def translate_element(
         self,
         task: TranslationTask[T],
+        concurrency: int = 1,
         interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
         interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
         interrupt_block_element: Callable[[Element], Element] | None = None,
@@ -57,6 +58,7 @@ class XMLTranslator:
     ) -> tuple[Element, T]:
         for translated in self.translate_elements(
             tasks=((task),),
+            concurrency=concurrency,
             interrupt_source_text_segments=interrupt_source_text_segments,
             interrupt_translated_text_segments=interrupt_translated_text_segments,
             interrupt_block_element=interrupt_block_element,
@@ -69,6 +71,7 @@ class XMLTranslator:
     def translate_elements(
         self,
         tasks: Iterable[TranslationTask[T]],
+        concurrency: int = 1,
         interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
         interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
         interrupt_block_element: Callable[[Element], Element] | None = None,
@@ -90,6 +93,7 @@ class XMLTranslator:
         for element, mappings in self._stream_mapper.map_stream(
             elements=generate_elements(),
             callbacks=callbacks,
+            concurrency=concurrency,
             map=lambda inline_segments: self._translate_inline_segments(
                 inline_segments=inline_segments,
                 callbacks=callbacks,
@@ -117,8 +121,7 @@ class XMLTranslator:
                 inline_segments=inline_segments,
             ),
         )
-
-        source_text = "".join(self._render_text_segments(text_segments))
+        source_text = "".join(self._render_source_text_parts(inline_segments))
         translated_text = self._translate_text(source_text)
 
         self._request_and_submit(
@@ -137,21 +140,12 @@ class XMLTranslator:
 
         return mappings
 
-    def
-
-
-        segment = next(iterator, None)
-        if segment is None:
-            return
-        while True:
-            next_segment = next(iterator, None)
-            if next_segment is None:
-                break
-            yield segment.text
-            if id(segment.block_parent) != id(next_segment.block_parent):
+    def _render_source_text_parts(self, inline_segments: list[InlineSegment]):
+        for i, inline_segment in enumerate(inline_segments):
+            if i > 0:
                 yield "\n\n"
-
-
+            for text_segment in inline_segment:
+                yield text_segment.text
 
     def _translate_text(self, text: str) -> str:
         with self._translation_llm.context(cache_seed_content=self._cache_seed_content) as ctx:
{epub_translator-0.1.4.dist-info → epub_translator-0.1.6.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-translator
-Version: 0.1.4
+Version: 0.1.6
 Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
 License: MIT
 Keywords: epub,llm,translation,translator
@@ -156,7 +156,8 @@ translate(
     submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
     user_prompt: str | None = None, # Custom translation instructions
     max_retries: int = 5, # Maximum retries for failed translations
-    max_group_tokens: int =
+    max_group_tokens: int = 2600, # Maximum tokens per translation group
+    concurrency: int = 1, # Number of concurrent translation tasks (default: 1)
     llm: LLM | None = None, # Single LLM instance for both translation and filling
     translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
     fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
@@ -238,16 +239,17 @@ translate(
 
 ### Error Handling with `on_fill_failed`
 
-Monitor
+Monitor translation errors using the `on_fill_failed` callback. The system automatically retries failed translations up to `max_retries` times (default: 5). Most errors are recovered during retries and don't affect the final output.
 
 ```python
 from epub_translator import FillFailedEvent
 
 def handle_fill_error(event: FillFailedEvent):
-
-    print(f"  {event.error_message}")
+    # Only log critical errors that will affect the final EPUB
     if event.over_maximum_retries:
-        print("
+        print(f"Critical error after {event.retried_count} attempts:")
+        print(f"  {event.error_message}")
+        print("  This error will be present in the final EPUB file!")
 
 translate(
     source_path="source.epub",
@@ -259,10 +261,32 @@ translate(
 )
 ```
 
+**Understanding Error Severity:**
+
 The `FillFailedEvent` contains:
 - `error_message: str` - Description of the error
-- `retried_count: int` - Current retry attempt number
-- `over_maximum_retries: bool` - Whether
+- `retried_count: int` - Current retry attempt number (1 to max_retries)
+- `over_maximum_retries: bool` - Whether the error is critical
+
+**Error Categories:**
+
+- **Recoverable errors** (`over_maximum_retries=False`): Errors during retry attempts. The system will continue retrying and may resolve these automatically. Safe to ignore in most cases.
+
+- **Critical errors** (`over_maximum_retries=True`): Errors that persist after all retry attempts. These will appear in the final EPUB file and should be investigated.
+
+**Advanced Usage:**
+
+For verbose logging during translation debugging:
+
+```python
+def handle_fill_error(event: FillFailedEvent):
+    if event.over_maximum_retries:
+        # Critical: affects final output
+        print(f"❌ CRITICAL: {event.error_message}")
+    else:
+        # Informational: system is retrying
+        print(f"⚠️ Retry {event.retried_count}: {event.error_message}")
+```
 
 ### Dual-LLM Architecture
 
@@ -371,6 +395,32 @@ llm = LLM(
 )
 ```
 
+### Concurrent Translation
+
+Speed up translation by processing multiple text segments concurrently. Use the `concurrency` parameter to control how many translation tasks run in parallel:
+
+```python
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language="English",
+    submit=SubmitKind.APPEND_BLOCK,
+    llm=llm,
+    concurrency=4,  # Process 4 segments concurrently
+)
+```
+
+**Performance Tips:**
+
+- Start with `concurrency=4` and adjust based on your API rate limits and system resources
+- Higher concurrency values can significantly reduce translation time for large books
+- The translation order is preserved regardless of concurrency settings
+- Monitor your API provider's rate limits to avoid throttling
+
+**Thread Safety:**
+
+When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
+
 ## Related Projects
 
 ### PDF Craft
{epub_translator-0.1.4.dist-info → epub_translator-0.1.6.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
-epub_translator/__init__.py,sha256=
+epub_translator/__init__.py,sha256=JsiOUPpk5k7q8mXIgnRQWdVVnkJww_KDTg7jXsP7_C4,222
 epub_translator/data/fill.jinja,sha256=zSytA8Vhp2i6YBZ09F1z9iPJq1-jUaiphoXqTNZwnvo,6964
 epub_translator/data/mmltex/README.md,sha256=wwhe5yW1U_7_YZIFKnQVnCOmUl7Mu3gsr3lNnDSJ5Qs,2953
 epub_translator/data/mmltex/cmarkup.xsl,sha256=DkhimAATM0XSCfVOfY41-qTPoddqzOHjZ00Pynr4zQE,37707
@@ -9,60 +9,63 @@ epub_translator/data/mmltex/scripts.xsl,sha256=f4ei0cDCW3cV-Ra7rC3kC5tRcKdjJxbSp
 epub_translator/data/mmltex/tables.xsl,sha256=RxtNo8qDtVAg8_6BuYsafraB_0z7YDAB9D__fT9gmWs,4327
 epub_translator/data/mmltex/tokens.xsl,sha256=j3JZRcBhAiiY8o5K3640phfLwxO8JVspCFlSttwBzJk,12373
 epub_translator/data/translate.jinja,sha256=93d8kschm5HV-EfXd1kFSIVMObDqTMdoUrwDfce2bhU,820
-epub_translator/epub/__init__.py,sha256=
+epub_translator/epub/__init__.py,sha256=aZawPakdkEquL4kRRpyCTdoSQ82l7FGqY4Uw6-ndoGA,154
 epub_translator/epub/common.py,sha256=4-SpTe8iot9hMfyXILmlUFvYVNYqPAHL5hn1fr2wgis,1180
 epub_translator/epub/math.py,sha256=-Q2LJQxxjgQZQUe_WlJA9tjzLqgqtw2ZmbGbHsPRp2U,5422
 epub_translator/epub/metadata.py,sha256=DXSimY2iZNBA2juIaKtB-4CHHSYJiDK7PPhfenV4dto,3511
 epub_translator/epub/spines.py,sha256=bP2IsobZm7zs4z10iXGc9SmgAFSIq9pJc8HE-V0aW9Y,1331
 epub_translator/epub/toc.py,sha256=TKJfyDT4svFkXd6JCNZk2ZEYc9q-5DXnV3zY2UKo8nE,14891
 epub_translator/epub/zip.py,sha256=-3LI8f-ksgU8xCy28NjBOKyQPE8PhPEUPqIKZE1p8dw,2364
-epub_translator/epub_transcode.py,sha256=NzuvXXEZfAhIoMOSrgQRF0DPtaSpz4OY-NMSdC0Y2RM,2749
-epub_translator/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
 epub_translator/llm/__init__.py,sha256=YcFYYnXmXyX0RUyC-PDbj5k7Woygp_XOpTI3vDiNSPM,75
-epub_translator/llm/context.py,sha256=
-epub_translator/llm/core.py,sha256=
+epub_translator/llm/context.py,sha256=8-0UnrZIaNshR_imy_ed_UpOK7H1a6dOsG-boaYOX8k,4186
+epub_translator/llm/core.py,sha256=wQwt6oG68ZN_iQOaytBiPXOC7sI62XII_A4dOHdAt_s,5979
 epub_translator/llm/error.py,sha256=4efAIQL14DFSvAnSTUfgdAbZRqaWBqOfUGsSfvxa5zM,1503
 epub_translator/llm/executor.py,sha256=A0IjQ-s9wBJuhAZAAydneb9zBXWnu2J9inR2Q8F-GDE,5533
 epub_translator/llm/increasable.py,sha256=8XkKeI1hiHlpMHj8dQ4fW0BkViSx4hH8QfbQsy-5SDw,1297
 epub_translator/llm/types.py,sha256=c-dMAIvlG4R3la3mUTWEw5xei-sIYKmQeBja7mirxcI,219
-epub_translator/punctuation.py,sha256=Yrf3b_Pl36FPBaK96LR-EBjnztlcZZTWLSNaYoWIUSc,812
 epub_translator/segment/__init__.py,sha256=UYTv_IKQbEB0DzhFeiuqCvjoJLvB-7XRwlaFS90KmIw,573
 epub_translator/segment/block_segment.py,sha256=psNKA_HMIcwZtoug8AtnAcV9_mQ2WXLnXqFsekHzt2g,4570
 epub_translator/segment/common.py,sha256=gGWYQaJ0tGnWCuF1me9TOo-Q_DrZVakCu2patyFIOs0,714
-epub_translator/segment/inline_segment.py,sha256=
-epub_translator/segment/text_segment.py,sha256=
+epub_translator/segment/inline_segment.py,sha256=nrRKoJ-vblsNITJeixrCgIOkVQyUXrchMg0XYU_8pLo,14563
+epub_translator/segment/text_segment.py,sha256=LhGlugp6MeAB3tk2jxd1kBb2EA8G2ruN49mP_IZehA0,6295
 epub_translator/segment/utils.py,sha256=qMqUt33pDRN5Tnuydkodzu2gaQrwTzAnQmXpDuHen1o,1036
 epub_translator/serial/__init__.py,sha256=b3IMVmWcUwEqHKcGmey88b057pyz5ct946CaUZi4LB4,67
 epub_translator/serial/chunk.py,sha256=FrTaHikVOd6bLYumnEriTaAQ_DIDLjHm16gh-wBVR9k,1495
 epub_translator/serial/segment.py,sha256=uEz-ke1KcYrON-68FaUEzMG2CzHlMjvbC11F3ZT4yH0,446
 epub_translator/serial/splitter.py,sha256=Nq0sxPXos8ez7QBG01sOKjnYKbeBWUBHflZGtqenVm8,1726
 epub_translator/template.py,sha256=0CqRmj3nTtPshw0NmTr2ECqelops2MMyX94fMrE-HKs,1587
-epub_translator/
+epub_translator/translation/__init__.py,sha256=R0c0ZngocOC-Qczs0a8JYAdAcCu2gv3FLcSrUyhwDMo,74
+epub_translator/translation/epub_transcode.py,sha256=_pRzmQgDrlfsibalkUogVi0F0Qy_uuYfKhZk3nP5pkA,2747
+epub_translator/translation/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
+epub_translator/translation/punctuation.py,sha256=TPCGjEmlAyN3G11VuXdHn-pvUkuWDwWqbTNzw-ij60E,813
+epub_translator/translation/translator.py,sha256=WC4Yqx-ffhxBhqzMAujE_NQG7BsDwgn95UMNG7OkUSo,6487
+epub_translator/translation/xml_interrupter.py,sha256=QxrNpBoR4ZIAvWsa20jz1z_bE_5-G5-nBGjE6IKCTjw,7405
 epub_translator/utils.py,sha256=BfZWrYjzDNQ4cFrgvRNzd4i1CKLtPxS8Z4LBHhqEV78,914
-epub_translator/xml/__init__.py,sha256=
+epub_translator/xml/__init__.py,sha256=qluFTfZYlPmOie8nR2C5O0tZ3UbCQEoEoR-Fq-__79c,160
 epub_translator/xml/const.py,sha256=Re2TYmpwG7-jVVgSq3R_K-uYhvAYzcXcRmLFkwCPD9Y,19
 epub_translator/xml/deduplication.py,sha256=TaMbzeA70VvUQV0X1wcQFVbuMEPJUtj9Hq6iWlUmtAQ,1152
-epub_translator/xml/
-epub_translator/xml/
-epub_translator/xml/
-epub_translator/xml/
-epub_translator/xml/
-epub_translator/xml/
+epub_translator/xml/friendly/__init__.py,sha256=I5jhnhFWoHvojLsYXH4jfR4Gi8lKFZ3yQ56ze5hEe1M,74
+epub_translator/xml/friendly/decoder.py,sha256=xRQ5LnSunmYbba_0oT39oUr86-sLYAHYMUGmlseIu2U,2467
+epub_translator/xml/friendly/encoder.py,sha256=evjvw6oE-oCud44IsJ-YZVHn6dtUzjNYX25ljaZP6vY,2417
+epub_translator/xml/friendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42NXCauIFV-o,6560
+epub_translator/xml/friendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
+epub_translator/xml/friendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
 epub_translator/xml/inline.py,sha256=mwFho6wq2gYWmWcg5Cw6OQeteV-a-i6X9OE63fzblpE,1274
 epub_translator/xml/self_closing.py,sha256=41ofGUdss9yU51IVwI4It6hKfzh8YcxIR_j-ohD19LE,5240
 epub_translator/xml/utils.py,sha256=7tQ6L5P0_JXhxONeG64hEeeL5mKjA6NKS1H1Q9B1Cac,1062
 epub_translator/xml/xml.py,sha256=qQ5Wk1-KVVHE4TX25zGOR7fINsGkXnoq-qyKKNl5no4,1675
 epub_translator/xml/xml_like.py,sha256=jBK4UUgXXWRYnfYlCH1MUAjGHWBQAbUj8HsYqvTTWvA,8890
-epub_translator/xml_interrupter.py,sha256=IGLATr7zTIdhE54Gnroab4Xu_vLJ7kzPiQgk7WMXKTc,7403
 epub_translator/xml_translator/__init__.py,sha256=lqts1mJL_WfojDnMAQ5OM7TbT6u9X3H-X4C_avHzvXM,128
 epub_translator/xml_translator/callbacks.py,sha256=IoZrsaivd2W76cHFupwv6auVxgEWHcBN2MHQJYcWoJ8,1324
 epub_translator/xml_translator/common.py,sha256=hSPptgPp7j6dm47imELB5DgmEbzTEyJD6WEeELOOc50,38
+epub_translator/xml_translator/concurrency.py,sha256=ACwoDHNX3xChL0On5yvUSFT8By7aoHoKor94k6A8nuY,1502
 epub_translator/xml_translator/hill_climbing.py,sha256=1jvilOkTLzwljJA4Nrel8yU2XGvOXpueUJTK7RAp-XY,4272
-epub_translator/xml_translator/
-epub_translator/xml_translator/
-epub_translator/xml_translator/
+epub_translator/xml_translator/score.py,sha256=TkXDmr-29p8SzuAp68u_vFDE69y1TyId9S20HT1T_xs,5311
+epub_translator/xml_translator/stream_mapper.py,sha256=nk8iRUHAUQA2B35_y-JOCo6il8MSxXikWvyl-WA8WAA,10662
+epub_translator/xml_translator/submitter.py,sha256=6PGQTnEcOgL3zseDpSzDmU5d9Eg3eO5OfPIGmQp2DVY,14155
+epub_translator/xml_translator/translator.py,sha256=7Ja1jFbmjIgHcmI9V6gg_K0t7qb6in9mhRn54a7qhZ8,9497
 epub_translator/xml_translator/validation.py,sha256=-OKlSZuD__sjAiEpGAO93YQme4ZDSPmoPjRsAMOCEjc,16668
-epub_translator-0.1.
-epub_translator-0.1.
-epub_translator-0.1.
-epub_translator-0.1.
+epub_translator-0.1.6.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
+epub_translator-0.1.6.dist-info/METADATA,sha256=AcjUb1wmz6cN8PnbwgWJeGlOO9sH445B-qPugLW705M,15638
+epub_translator-0.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+epub_translator-0.1.6.dist-info/RECORD,,
The remaining files were renamed or moved without content changes:
- /epub_translator/{language.py → translation/language.py}
- /epub_translator/xml/{firendly → friendly}/__init__.py
- /epub_translator/xml/{firendly → friendly}/decoder.py
- /epub_translator/xml/{firendly → friendly}/encoder.py
- /epub_translator/xml/{firendly → friendly}/parser.py
- /epub_translator/xml/{firendly → friendly}/tag.py
- /epub_translator/xml/{firendly → friendly}/transform.py
- {epub_translator-0.1.4.dist-info → epub_translator-0.1.6.dist-info}/LICENSE
- {epub_translator-0.1.4.dist-info → epub_translator-0.1.6.dist-info}/WHEEL