epub-translator 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +8 -1
- epub_translator/punctuation.py +34 -0
- epub_translator/segment/text_segment.py +2 -67
- epub_translator/translator.py +33 -29
- epub_translator/xml/__init__.py +1 -0
- epub_translator/xml/inline.py +67 -0
- epub_translator/xml_translator/__init__.py +2 -1
- epub_translator/xml_translator/submitter.py +371 -46
- epub_translator/xml_translator/translator.py +31 -12
- {epub_translator-0.1.3.dist-info → epub_translator-0.1.5.dist-info}/METADATA +96 -23
- {epub_translator-0.1.3.dist-info → epub_translator-0.1.5.dist-info}/RECORD +13 -11
- {epub_translator-0.1.3.dist-info → epub_translator-0.1.5.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.3.dist-info → epub_translator-0.1.5.dist-info}/WHEEL +0 -0
epub_translator/__init__.py
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
from . import language
|
|
2
2
|
from .llm import LLM
|
|
3
3
|
from .translator import FillFailedEvent, translate
|
|
4
|
+
from .xml_translator import SubmitKind
|
|
4
5
|
|
|
5
|
-
__all__ = [
|
|
6
|
+
__all__ = [
|
|
7
|
+
"LLM",
|
|
8
|
+
"translate",
|
|
9
|
+
"language",
|
|
10
|
+
"FillFailedEvent",
|
|
11
|
+
"SubmitKind",
|
|
12
|
+
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from xml.etree.ElementTree import Element
|
|
2
|
+
|
|
3
|
+
from .xml import iter_with_stack
|
|
4
|
+
|
|
5
|
+
_QUOTE_MAPPING = {
|
|
6
|
+
# 法语引号
|
|
7
|
+
"«": "",
|
|
8
|
+
"»": "",
|
|
9
|
+
"‹": "«",
|
|
10
|
+
"›": "»",
|
|
11
|
+
# 中文书书名号
|
|
12
|
+
"《": "",
|
|
13
|
+
"》": "",
|
|
14
|
+
"〈": "《",
|
|
15
|
+
"〉": "》",
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _strip_quotes(text: str):
|
|
20
|
+
for char in text:
|
|
21
|
+
mapped = _QUOTE_MAPPING.get(char, None)
|
|
22
|
+
if mapped is None:
|
|
23
|
+
yield char
|
|
24
|
+
elif mapped:
|
|
25
|
+
yield mapped
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def unwrap_french_quotes(element: Element) -> Element:
    """Rewrite quote characters in the text carried by *element*'s tree.

    For every element produced by ``iter_with_stack(element)``, a non-empty
    ``text`` and a non-empty ``tail`` are each passed through
    ``_strip_quotes`` and stored back. Despite the name, the Chinese
    book-title marks listed in ``_QUOTE_MAPPING`` are rewritten as well.

    The tree is modified in place and the same *element* object is returned.
    """
    for _, visited in iter_with_stack(element):
        for attribute in ("text", "tail"):
            current = getattr(visited, attribute)
            if current:
                setattr(visited, attribute, "".join(_strip_quotes(current)))
    return element
|
|
@@ -4,71 +4,7 @@ from enum import Enum, auto
|
|
|
4
4
|
from typing import Self
|
|
5
5
|
from xml.etree.ElementTree import Element
|
|
6
6
|
|
|
7
|
-
from ..xml import expand_left_element_texts, expand_right_element_texts, normalize_text_in_element
|
|
8
|
-
|
|
9
|
-
# HTML inline-level elements
|
|
10
|
-
# Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
|
|
11
|
-
# Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
|
|
12
|
-
_HTML_INLINE_TAGS = frozenset(
|
|
13
|
-
(
|
|
14
|
-
# Inline text semantics
|
|
15
|
-
"a",
|
|
16
|
-
"abbr",
|
|
17
|
-
"b",
|
|
18
|
-
"bdi",
|
|
19
|
-
"bdo",
|
|
20
|
-
"br",
|
|
21
|
-
"cite",
|
|
22
|
-
"code",
|
|
23
|
-
"data",
|
|
24
|
-
"dfn",
|
|
25
|
-
"em",
|
|
26
|
-
"i",
|
|
27
|
-
"kbd",
|
|
28
|
-
"mark",
|
|
29
|
-
"q",
|
|
30
|
-
"rp",
|
|
31
|
-
"rt",
|
|
32
|
-
"ruby",
|
|
33
|
-
"s",
|
|
34
|
-
"samp",
|
|
35
|
-
"small",
|
|
36
|
-
"span",
|
|
37
|
-
"strong",
|
|
38
|
-
"sub",
|
|
39
|
-
"sup",
|
|
40
|
-
"time",
|
|
41
|
-
"u",
|
|
42
|
-
"var",
|
|
43
|
-
"wbr",
|
|
44
|
-
# Image and multimedia
|
|
45
|
-
"img",
|
|
46
|
-
"svg",
|
|
47
|
-
"canvas",
|
|
48
|
-
"audio",
|
|
49
|
-
"video",
|
|
50
|
-
"map",
|
|
51
|
-
"area",
|
|
52
|
-
# Form elements
|
|
53
|
-
"input",
|
|
54
|
-
"button",
|
|
55
|
-
"select",
|
|
56
|
-
"textarea",
|
|
57
|
-
"label",
|
|
58
|
-
"output",
|
|
59
|
-
"progress",
|
|
60
|
-
"meter",
|
|
61
|
-
# Embedded content
|
|
62
|
-
"iframe",
|
|
63
|
-
"embed",
|
|
64
|
-
"object",
|
|
65
|
-
# Other inline elements
|
|
66
|
-
"script",
|
|
67
|
-
"del",
|
|
68
|
-
"ins",
|
|
69
|
-
"slot",
|
|
70
|
-
)
|
|
71
|
-
)
|
|
7
|
+
from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_tag, normalize_text_in_element
|
|
72
8
|
|
|
73
9
|
|
|
74
10
|
class TextPosition(Enum):
|
|
@@ -196,8 +132,7 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
|
|
|
196
132
|
def _find_block_depth(parent_stack: list[Element]) -> int:
    """Return the depth (a count, not an index) of the innermost non-inline parent.

    The stack is scanned from its last entry towards its first. When every
    entry has an inline tag, or the stack is empty, the result is 1.
    """
    for depth in range(len(parent_stack), 0, -1):
        if not is_inline_tag(parent_stack[depth - 1].tag):
            return depth
    return 1
|
epub_translator/translator.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from collections.abc import Callable
|
|
1
|
+
from collections.abc import Callable, Generator
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from enum import Enum, auto
|
|
4
4
|
from importlib.metadata import version as get_package_version
|
|
@@ -15,9 +15,10 @@ from .epub import (
|
|
|
15
15
|
)
|
|
16
16
|
from .epub_transcode import decode_metadata, decode_toc_list, encode_metadata, encode_toc_list
|
|
17
17
|
from .llm import LLM
|
|
18
|
+
from .punctuation import unwrap_french_quotes
|
|
18
19
|
from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
|
|
19
20
|
from .xml_interrupter import XMLInterrupter
|
|
20
|
-
from .xml_translator import FillFailedEvent, XMLTranslator
|
|
21
|
+
from .xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
class _ElementType(Enum):
|
|
@@ -36,6 +37,7 @@ def translate(
|
|
|
36
37
|
source_path: PathLike | str,
|
|
37
38
|
target_path: PathLike | str,
|
|
38
39
|
target_language: str,
|
|
40
|
+
submit: SubmitKind,
|
|
39
41
|
user_prompt: str | None = None,
|
|
40
42
|
max_retries: int = 5,
|
|
41
43
|
max_group_tokens: int = 1200,
|
|
@@ -83,33 +85,26 @@ def translate(
|
|
|
83
85
|
return
|
|
84
86
|
|
|
85
87
|
interrupter = XMLInterrupter()
|
|
86
|
-
element_contexts: dict[int, _ElementContext] = {}
|
|
87
|
-
|
|
88
88
|
toc_weight = 0.05 if toc_has_items else 0
|
|
89
89
|
metadata_weight = 0.05 if metadata_has_items else 0
|
|
90
90
|
chapters_weight = 1.0 - toc_weight - metadata_weight
|
|
91
91
|
progress_per_chapter = chapters_weight / total_chapters if total_chapters > 0 else 0
|
|
92
92
|
current_progress = 0.0
|
|
93
93
|
|
|
94
|
-
for translated_elem in translator.translate_elements(
|
|
94
|
+
for translated_elem, context in translator.translate_elements(
|
|
95
95
|
interrupt_source_text_segments=interrupter.interrupt_source_text_segments,
|
|
96
96
|
interrupt_translated_text_segments=interrupter.interrupt_translated_text_segments,
|
|
97
97
|
interrupt_block_element=interrupter.interrupt_block_element,
|
|
98
98
|
on_fill_failed=on_fill_failed,
|
|
99
|
-
|
|
99
|
+
tasks=_generate_tasks_from_book(
|
|
100
100
|
zip=zip,
|
|
101
101
|
toc_list=toc_list,
|
|
102
102
|
metadata_fields=metadata_fields,
|
|
103
|
-
|
|
103
|
+
submit=submit,
|
|
104
104
|
),
|
|
105
105
|
):
|
|
106
|
-
elem_id = id(translated_elem)
|
|
107
|
-
context = element_contexts.pop(elem_id, None)
|
|
108
|
-
|
|
109
|
-
if context is None:
|
|
110
|
-
continue
|
|
111
|
-
|
|
112
106
|
if context.element_type == _ElementType.TOC:
|
|
107
|
+
translated_elem = unwrap_french_quotes(translated_elem)
|
|
113
108
|
decoded_toc = decode_toc_list(translated_elem)
|
|
114
109
|
write_toc(zip, decoded_toc)
|
|
115
110
|
|
|
@@ -118,6 +113,7 @@ def translate(
|
|
|
118
113
|
on_progress(current_progress)
|
|
119
114
|
|
|
120
115
|
elif context.element_type == _ElementType.METADATA:
|
|
116
|
+
translated_elem = unwrap_french_quotes(translated_elem)
|
|
121
117
|
decoded_metadata = decode_metadata(translated_elem)
|
|
122
118
|
write_metadata(zip, decoded_metadata)
|
|
123
119
|
|
|
@@ -137,23 +133,29 @@ def translate(
|
|
|
137
133
|
on_progress(current_progress)
|
|
138
134
|
|
|
139
135
|
|
|
140
|
-
def
|
|
136
|
+
def _generate_tasks_from_book(
|
|
141
137
|
zip: Zip,
|
|
142
138
|
toc_list: list,
|
|
143
139
|
metadata_fields: list,
|
|
144
|
-
|
|
145
|
-
):
|
|
140
|
+
submit: SubmitKind,
|
|
141
|
+
) -> Generator[TranslationTask[_ElementContext], None, None]:
|
|
142
|
+
head_submit = submit
|
|
143
|
+
if head_submit == SubmitKind.APPEND_BLOCK:
|
|
144
|
+
head_submit = SubmitKind.APPEND_TEXT
|
|
145
|
+
|
|
146
146
|
if toc_list:
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
147
|
+
yield TranslationTask(
|
|
148
|
+
element=encode_toc_list(toc_list),
|
|
149
|
+
action=head_submit,
|
|
150
|
+
payload=_ElementContext(element_type=_ElementType.TOC),
|
|
151
|
+
)
|
|
151
152
|
|
|
152
153
|
if metadata_fields:
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
154
|
+
yield TranslationTask(
|
|
155
|
+
element=encode_metadata(metadata_fields),
|
|
156
|
+
action=head_submit,
|
|
157
|
+
payload=_ElementContext(element_type=_ElementType.METADATA),
|
|
158
|
+
)
|
|
157
159
|
|
|
158
160
|
for chapter_path, media_type in search_spine_paths(zip):
|
|
159
161
|
with zip.read(chapter_path) as chapter_file:
|
|
@@ -163,12 +165,14 @@ def _generate_elements_from_book(
|
|
|
163
165
|
)
|
|
164
166
|
body_element = find_first(xml.element, "body")
|
|
165
167
|
if body_element is not None:
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
168
|
+
yield TranslationTask(
|
|
169
|
+
element=body_element,
|
|
170
|
+
action=submit,
|
|
171
|
+
payload=_ElementContext(
|
|
172
|
+
element_type=_ElementType.CHAPTER,
|
|
173
|
+
chapter_data=(chapter_path, xml),
|
|
174
|
+
),
|
|
170
175
|
)
|
|
171
|
-
yield body_element
|
|
172
176
|
|
|
173
177
|
|
|
174
178
|
def _get_version() -> str:
|
epub_translator/xml/__init__.py
CHANGED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# HTML inline-level elements
|
|
2
|
+
# Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
|
|
3
|
+
# Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
|
|
4
|
+
_HTML_INLINE_TAGS = frozenset(
|
|
5
|
+
(
|
|
6
|
+
# Inline text semantics
|
|
7
|
+
"a",
|
|
8
|
+
"abbr",
|
|
9
|
+
"b",
|
|
10
|
+
"bdi",
|
|
11
|
+
"bdo",
|
|
12
|
+
"br",
|
|
13
|
+
"cite",
|
|
14
|
+
"code",
|
|
15
|
+
"data",
|
|
16
|
+
"dfn",
|
|
17
|
+
"em",
|
|
18
|
+
"i",
|
|
19
|
+
"kbd",
|
|
20
|
+
"mark",
|
|
21
|
+
"q",
|
|
22
|
+
"rp",
|
|
23
|
+
"rt",
|
|
24
|
+
"ruby",
|
|
25
|
+
"s",
|
|
26
|
+
"samp",
|
|
27
|
+
"small",
|
|
28
|
+
"span",
|
|
29
|
+
"strong",
|
|
30
|
+
"sub",
|
|
31
|
+
"sup",
|
|
32
|
+
"time",
|
|
33
|
+
"u",
|
|
34
|
+
"var",
|
|
35
|
+
"wbr",
|
|
36
|
+
# Image and multimedia
|
|
37
|
+
"img",
|
|
38
|
+
"svg",
|
|
39
|
+
"canvas",
|
|
40
|
+
"audio",
|
|
41
|
+
"video",
|
|
42
|
+
"map",
|
|
43
|
+
"area",
|
|
44
|
+
# Form elements
|
|
45
|
+
"input",
|
|
46
|
+
"button",
|
|
47
|
+
"select",
|
|
48
|
+
"textarea",
|
|
49
|
+
"label",
|
|
50
|
+
"output",
|
|
51
|
+
"progress",
|
|
52
|
+
"meter",
|
|
53
|
+
# Embedded content
|
|
54
|
+
"iframe",
|
|
55
|
+
"embed",
|
|
56
|
+
"object",
|
|
57
|
+
# Other inline elements
|
|
58
|
+
"script",
|
|
59
|
+
"del",
|
|
60
|
+
"ins",
|
|
61
|
+
"slot",
|
|
62
|
+
)
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def is_inline_tag(tag: str) -> bool:
    """Tell whether *tag*, compared case-insensitively, is in ``_HTML_INLINE_TAGS``."""
    lowered = tag.lower()
    return lowered in _HTML_INLINE_TAGS
|
|
@@ -1,56 +1,381 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from enum import Enum, auto
|
|
1
4
|
from xml.etree.ElementTree import Element
|
|
2
5
|
|
|
3
6
|
from ..segment import TextSegment, combine_text_segments
|
|
4
|
-
from ..xml import index_of_parent, iter_with_stack
|
|
7
|
+
from ..xml import index_of_parent, is_inline_tag, iter_with_stack
|
|
5
8
|
from .stream_mapper import InlineSegmentMapping
|
|
6
9
|
|
|
7
10
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
+
class SubmitKind(Enum):
    """How translated content is written back into the document."""

    # Replace the original content with the translation (single-language output).
    REPLACE = auto()
    # Append the translation as inline text next to the original (bilingual output).
    APPEND_TEXT = auto()
    # Append the translation as separate block elements (bilingual output).
    APPEND_BLOCK = auto()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def submit(element: Element, action: SubmitKind, mappings: list[InlineSegmentMapping]) -> Element:
    """Write the translated *mappings* into *element* according to *action*.

    The work is delegated to a ``_Submitter``. When it reports a replaced
    root element, that element is returned; otherwise the *element* that was
    passed in is returned.
    """
    replaced_root = _Submitter(
        element=element,
        action=action,
        mappings=mappings,
    ).do()
    return element if replaced_root is None else replaced_root
|
|
12
28
|
|
|
13
29
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
30
|
+
@dataclass
class _Node:
    """One block element in the tree rebuilt by ``_nest_nodes``."""

    # The block element this node stands for.
    raw_element: Element
    # Pairs of (text segments collected before the child, child node), in the
    # order the children were folded in.
    # Empty for a "peak", non-empty for a "platform".
    items: list[tuple[list[TextSegment], "_Node"]]  # empty for peak, non-empty for platform
    # Text segments collected after the last folded child (all of the node's
    # segments when it has no children).
    tail_text_segments: list[TextSegment]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class _Submitter:
    """Applies translated text segments to an element tree for one ``SubmitKind``."""

    def __init__(
        self,
        element: Element,
        action: SubmitKind,
        mappings: list[InlineSegmentMapping],
    ) -> None:
        """Nest *mappings* into nodes and record the parent of every mapped element."""
        self._action: SubmitKind = action
        self._nodes: list[_Node] = list(_nest_nodes(mappings))
        self._parents: dict[int, Element] = self._collect_parents(element, mappings)

    def _collect_parents(self, element: Element, mappings: list[InlineSegmentMapping]) -> dict[int, Element]:
        """Map ``id(child)`` to its direct parent for every mapped element found under *element*.

        Elements visited with an empty parent stack get no entry.
        """
        ids: set[int] = set(id(e) for e, _ in mappings)
        parents_dict: dict[int, Element] = {}
        for parents, child in iter_with_stack(element):
            if parents and id(child) in ids:
                parents_dict[id(child)] = parents[-1]
        return parents_dict

    def do(self) -> Element | None:
        """Submit every top-level node.

        Returns the first non-None value produced by ``_submit_node``, or None
        when no node produced one.
        """
        replaced_root: Element | None = None

        for node in self._nodes:
            submitted = self._submit_node(node)
            if replaced_root is None:
                replaced_root = submitted

        return replaced_root

    # @return replaced root element, or None if appended to parent
    def _submit_node(self, node: _Node) -> Element | None:
        """Dispatch *node*: text submission for nodes with items or in APPEND_TEXT mode, block submission otherwise."""
        if node.items or self._action == SubmitKind.APPEND_TEXT:
            return self._submit_by_text(node)
        else:
            return self._submit_by_block(node)

    def _submit_by_block(self, node: _Node) -> Element | None:
        """Insert the combined translation as a sibling right after ``node.raw_element``.

        When no parent was recorded for the element, nothing is inserted and
        the element itself is returned. Otherwise the parent is modified in
        place and None is returned. In REPLACE mode the non-inline children of
        the original element are re-inserted after the translation and the
        original element is removed from its parent.
        """
        parent = self._parents.get(id(node.raw_element), None)
        if parent is None:
            return node.raw_element

        preserved_elements: list[Element] = []
        if self._action == SubmitKind.REPLACE:
            # Iterate over a copy of the children; keep the non-inline ones
            # (with their tail text dropped) so they can be re-inserted below.
            for child in list(node.raw_element):
                if not is_inline_tag(child.tag):
                    child.tail = None
                    preserved_elements.append(child)

        index = index_of_parent(parent, node.raw_element)
        combined = self._combine_text_segments(node.tail_text_segments)

        if combined is not None:
            # In APPEND_BLOCK mode, when the combined element is an inline tag,
            # prefix its text with a space.
            if self._action == SubmitKind.APPEND_BLOCK and is_inline_tag(combined.tag) and combined.text:
                combined.text = " " + combined.text
            parent.insert(index + 1, combined)
            index += 1

        for elem in preserved_elements:
            parent.insert(index + 1, elem)
            index += 1

        if combined is not None or preserved_elements:
            # Move the original element's tail onto the last element inserted
            # after it.
            if preserved_elements:
                preserved_elements[-1].tail = node.raw_element.tail
            elif combined is not None:
                combined.tail = node.raw_element.tail
            node.raw_element.tail = None

        if self._action == SubmitKind.REPLACE:
            parent.remove(node.raw_element)

        return None

    def _submit_by_text(self, node: _Node) -> Element | None:
        """Write translations into ``node.raw_element`` itself, piece by piece.

        For each ``(text_segments, child_node)`` item the combined translation
        is placed relative to the direct child that is, or contains, the child
        node's element. Each child node is then submitted recursively, and
        finally the node's tail segments are written at the end.

        Returns the first non-None value produced by submitting a child node,
        or None.
        """
        replaced_root: Element | None = None
        # NOTE(review): the keys here are id() of the child _Node objects,
        # while the lookup in the loop below uses id() of Element children.
        # That lookup looks like it can never hit, which would leave
        # tail_elements empty — confirm whether id(child.raw_element) was
        # intended as the key.
        child_nodes = dict((id(node), node) for _, node in node.items)
        last_tail_element: Element | None = None
        tail_elements: dict[int, Element] = {}

        for child_element in node.raw_element:
            child_node = child_nodes.get(id(child_element), None)
            if child_node is not None:
                if last_tail_element is not None:
                    tail_elements[id(child_element)] = last_tail_element
                last_tail_element = child_element

        for text_segments, child_node in node.items:
            anchor_element = _find_anchor_in_parent(node.raw_element, child_node.raw_element)
            if anchor_element is None:
                # Defensive programming: in theory anchor_element should never
                # be None, because _nest_nodes has already verified the
                # containment relationship through _check_includes.
                continue

            tail_element = tail_elements.get(id(anchor_element), None)
            items_preserved_elements: list[Element] = []

            if self._action == SubmitKind.REPLACE:
                end_index = index_of_parent(node.raw_element, anchor_element)
                items_preserved_elements = self._remove_elements_after_tail(
                    node_element=node.raw_element,
                    tail_element=tail_element,
                    end_index=end_index,
                )

            self._append_combined_after_tail(
                node_element=node.raw_element,
                text_segments=text_segments,
                tail_element=tail_element,
                anchor_element=anchor_element,
                append_to_end=False,
            )
            if items_preserved_elements:
                # Re-insert the preserved elements in front of the anchor,
                # keeping their relative order.
                insert_position = index_of_parent(node.raw_element, anchor_element)
                for i, elem in enumerate(items_preserved_elements):
                    node.raw_element.insert(insert_position + i, elem)

        for _, child_node in node.items:
            submitted = self._submit_node(child_node)
            if replaced_root is None:
                replaced_root = submitted

        # NOTE(review): this truth-tests an Element, i.e. "has at least one
        # child"; the ElementTree documentation deprecates truth-testing
        # elements in favour of len(elem) — confirm against supported Pythons.
        if node.raw_element:
            last_tail_element = node.raw_element[-1]
        else:
            last_tail_element = None

        tail_preserved_elements: list[Element] = []
        if self._action == SubmitKind.REPLACE:
            tail_preserved_elements = self._remove_elements_after_tail(
                node_element=node.raw_element,
                tail_element=last_tail_element,
                end_index=None,  # None means: remove through the end
            )
        self._append_combined_after_tail(
            node_element=node.raw_element,
            text_segments=node.tail_text_segments,
            tail_element=last_tail_element,
            anchor_element=None,
            append_to_end=True,
        )
        if tail_preserved_elements:
            for elem in tail_preserved_elements:
                node.raw_element.append(elem)

        return replaced_root

    def _remove_elements_after_tail(
        self,
        node_element: Element,
        tail_element: Element | None,
        end_index: int | None = None,
    ) -> list[Element]:
        """Remove the children of *node_element* in a half-open index range.

        The range starts right after *tail_element* (whose tail text is
        cleared), or at 0 when *tail_element* is None (in which case
        ``node_element.text`` is cleared). It ends before *end_index*, or at
        the end of the children when *end_index* is None.

        Returns the removed children whose tag is not inline, with their tail
        text cleared, in document order.
        """
        if tail_element is None:
            start_index = 0
            node_element.text = None
        else:
            start_index = index_of_parent(node_element, tail_element) + 1
            tail_element.tail = None

        if end_index is None:
            end_index = len(node_element)

        preserved_elements: list[Element] = []
        for i in range(start_index, end_index):
            elem = node_element[i]
            if not is_inline_tag(elem.tag):
                elem.tail = None
                preserved_elements.append(elem)

        # Remove from the back so the remaining indices stay valid.
        for i in range(end_index - 1, start_index - 1, -1):
            node_element.remove(node_element[i])

        return preserved_elements

    def _append_combined_after_tail(
        self,
        node_element: Element,
        text_segments: list[TextSegment],
        tail_element: Element | None,
        anchor_element: Element | None,
        append_to_end: bool,
    ) -> None:
        """Merge the combined translation of *text_segments* into *node_element*.

        The combined element's leading text is appended to, in order of
        preference: the tail of *tail_element*; ``node_element.text`` when
        there is no anchor; the tail of the child just before
        *anchor_element*; or ``node_element.text`` when the anchor is the
        first child. The combined element's children are then inserted at the
        matching position. Does nothing when there is nothing to combine.
        """
        combined = self._combine_text_segments(text_segments)
        if combined is None:
            return

        if combined.text:
            will_inject_space = self._action == SubmitKind.APPEND_TEXT or (
                is_inline_tag(combined.tag) and self._action == SubmitKind.APPEND_BLOCK
            )
            if tail_element is not None:
                tail_element.tail = self._append_text_in_element(
                    origin_text=tail_element.tail,
                    append_text=combined.text,
                    will_inject_space=will_inject_space,
                )
            elif anchor_element is None:
                node_element.text = self._append_text_in_element(
                    origin_text=node_element.text,
                    append_text=combined.text,
                    will_inject_space=will_inject_space,
                )
            else:
                ref_index = index_of_parent(node_element, anchor_element)
                if ref_index > 0:
                    # Append to the tail of the previous element.
                    prev_element = node_element[ref_index - 1]
                    prev_element.tail = self._append_text_in_element(
                        origin_text=prev_element.tail,
                        append_text=combined.text,
                        will_inject_space=will_inject_space,
                    )
                else:
                    # The anchor is the first child, so append to node_element.text.
                    node_element.text = self._append_text_in_element(
                        origin_text=node_element.text,
                        append_text=combined.text,
                        will_inject_space=will_inject_space,
                    )

        if tail_element is not None:
            insert_position = index_of_parent(node_element, tail_element) + 1
        elif append_to_end:
            insert_position = len(node_element)
        elif anchor_element is not None:
            # Use the anchor element to locate the insertion position.
            # If the text was appended to the previous element's tail, insert
            # right after that previous element.
            ref_index = index_of_parent(node_element, anchor_element)
            if ref_index > 0:
                # Insert right after the previous element.
                insert_position = ref_index
            else:
                # The anchor is the first child, so insert at the beginning.
                insert_position = 0
        else:
            insert_position = 0

        for i, child in enumerate(combined):
            node_element.insert(insert_position + i, child)

    def _combine_text_segments(self, text_segments: list[TextSegment]) -> Element | None:
        """Combine *text_segments*, after ``strip_block_parents()``, and return item 0 of the first result.

        Returns None when ``combine_text_segments`` yields nothing.
        """
        segments = (t.strip_block_parents() for t in text_segments)
        combined = next(combine_text_segments(segments), None)
        if combined is None:
            return None
        else:
            return combined[0]

    def _append_text_in_element(
        self,
        origin_text: str | None,
        append_text: str,
        will_inject_space: bool,
    ) -> str:
        """Return *origin_text* followed by *append_text*.

        With no original text, *append_text* is returned as is. When
        *will_inject_space* is set, trailing whitespace of the original and
        leading whitespace of the appended text are replaced by one space.
        """
        if origin_text is None:
            return append_text
        elif will_inject_space:
            return origin_text.rstrip() + " " + append_text.lstrip()
        else:
            return origin_text + append_text
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _nest_nodes(mappings: list[InlineSegmentMapping]) -> Generator[_Node, None, None]:
    """Rebuild the flat *mappings* into nested ``_Node`` trees and yield each root.

    A mapping whose block element is already on the stack adds its segments
    to that node's tail. Any other mapping becomes a new node, nested under
    the deepest stacked node whose element contains it.
    """
    # The text to translate ends up nested in two different kinds of structure.
    # The most common one is the "peak": like the example below, it has no
    # sub-structure (inline tags do not count as sub-structure).
    # Its text can be replaced or appended to directly.
    #   <div>Some text <b>bold text</b> more text.</div>
    #
    # There is also a rarer "platform" structure, whose interior is cut up by
    # other peaks/platforms.
    #   <div>
    #     Some text before.
    #     <!-- the peak below cuts its reading flow into pieces -->
    #     <div>Paragraph 1.</div>
    #     Some text in between.
    #   </div>
    # Replacing or appending to it directly would break the reader's reading
    # flow and make the result read oddly.
    # Because such structures exist, the tree shape has to be restored and
    # platforms have to be handled in a special way.
    #
    # In short, we assume 95% of the reading experience comes from peaks, and
    # this step is added to also cover the remaining platform structures.
    stack: list[_Node] = []

    for block_element, text_segments in mappings:
        keep_depth: int = 0
        upwards: bool = False
        # First look for the very same element on the stack: that means we are
        # returning "upwards" to a node that is already open.
        for i in range(len(stack) - 1, -1, -1):
            if stack[i].raw_element is block_element:
                keep_depth = i + 1
                upwards = True
                break

        if not upwards:
            # Otherwise keep the stack up to the deepest node containing the
            # new element.
            for i in range(len(stack) - 1, -1, -1):
                if _check_includes(stack[i].raw_element, block_element):
                    keep_depth = i + 1
                    break

        while len(stack) > keep_depth:
            child_node = _fold_top_of_stack(stack)
            if not upwards and child_node is not None:
                yield child_node

        if upwards:
            stack[keep_depth - 1].tail_text_segments.extend(text_segments)
        else:
            stack.append(
                _Node(
                    raw_element=block_element,
                    items=[],
                    tail_text_segments=list(text_segments),
                )
            )
    # Fold whatever is still open; only a root comes back as non-None.
    while stack:
        child_node = _fold_top_of_stack(stack)
        if child_node is not None:
            yield child_node
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def _find_anchor_in_parent(parent: Element, descendant: Element) -> Element | None:
    """Find the direct child of *parent* that is, or contains, *descendant*.

    A direct child identical to *descendant* takes precedence. Otherwise the
    first direct child for which ``_check_includes`` holds is returned, or
    None when there is no such child.
    """
    direct_children = list(parent)
    if any(candidate is descendant for candidate in direct_children):
        return descendant
    return next(
        (candidate for candidate in direct_children if _check_includes(candidate, descendant)),
        None,
    )
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _fold_top_of_stack(stack: list[_Node]):
|
|
368
|
+
child_node = stack.pop()
|
|
369
|
+
if not stack:
|
|
370
|
+
return child_node
|
|
371
|
+
parent_node = stack[-1]
|
|
372
|
+
parent_node.items.append((parent_node.tail_text_segments, child_node))
|
|
373
|
+
parent_node.tail_text_segments = []
|
|
374
|
+
return None
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _check_includes(parent: Element, child: Element) -> bool:
    """Tell whether *child* is, by identity, one of the elements yielded by ``iter_with_stack(parent)``."""
    return any(visited is child for _, visited in iter_with_stack(parent))
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from collections.abc import Callable, Generator, Iterable
|
|
2
|
-
from
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Generic, TypeVar
|
|
3
4
|
from xml.etree.ElementTree import Element
|
|
4
5
|
|
|
5
6
|
from ..llm import LLM, Message, MessageRole
|
|
@@ -8,11 +9,18 @@ from ..xml import decode_friendly, encode_friendly
|
|
|
8
9
|
from .callbacks import Callbacks, FillFailedEvent, warp_callbacks
|
|
9
10
|
from .hill_climbing import HillClimbing
|
|
10
11
|
from .stream_mapper import InlineSegmentMapping, XMLStreamMapper
|
|
11
|
-
from .submitter import
|
|
12
|
+
from .submitter import SubmitKind, submit
|
|
12
13
|
|
|
13
14
|
T = TypeVar("T")
|
|
14
15
|
|
|
15
16
|
|
|
17
|
+
@dataclass
class TranslationTask(Generic[T]):
    """One element to translate, with how to submit it and a caller-owned payload."""

    # The element whose content is translated.
    element: Element
    # How the translation is written back into the element.
    action: SubmitKind
    # Opaque caller data; yielded back together with the translated element.
    payload: T
|
|
22
|
+
|
|
23
|
+
|
|
16
24
|
class XMLTranslator:
|
|
17
25
|
def __init__(
|
|
18
26
|
self,
|
|
@@ -41,14 +49,14 @@ class XMLTranslator:
|
|
|
41
49
|
|
|
42
50
|
def translate_element(
|
|
43
51
|
self,
|
|
44
|
-
|
|
52
|
+
task: TranslationTask[T],
|
|
45
53
|
interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
46
54
|
interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
|
|
47
55
|
interrupt_block_element: Callable[[Element], Element] | None = None,
|
|
48
56
|
on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
|
|
49
|
-
) -> Element:
|
|
57
|
+
) -> tuple[Element, T]:
|
|
50
58
|
for translated in self.translate_elements(
|
|
51
|
-
|
|
59
|
+
tasks=((task),),
|
|
52
60
|
interrupt_source_text_segments=interrupt_source_text_segments,
|
|
53
61
|
interrupt_translated_text_segments=interrupt_translated_text_segments,
|
|
54
62
|
interrupt_block_element=interrupt_block_element,
|
|
@@ -60,30 +68,41 @@ class XMLTranslator:
|
|
|
60
68
|
|
|
61
69
|
def translate_elements(
    self,
    tasks: Iterable[TranslationTask[T]],
    interrupt_source_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
    interrupt_translated_text_segments: Callable[[Iterable[TextSegment]], Iterable[TextSegment]] | None = None,
    interrupt_block_element: Callable[[Element], Element] | None = None,
    on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
) -> Generator[tuple[Element, T], None, None]:
    """Translate the element of every task and yield ``(element, payload)`` pairs.

    Tasks are consumed lazily. Each task's element is fed to the stream
    mapper; when the mapper hands an element back with its mappings, the
    translation is submitted with the task's ``action`` and the resulting
    element is yielded with the task's ``payload``.

    Args:
        tasks: The translation tasks, consumed one at a time.
        interrupt_source_text_segments: Optional hook passed to ``warp_callbacks``.
        interrupt_translated_text_segments: Optional hook passed to ``warp_callbacks``.
        interrupt_block_element: Optional hook passed to ``warp_callbacks``.
        on_fill_failed: Optional hook passed to ``warp_callbacks``.

    Yields:
        The element returned by ``submit`` together with the task payload.
    """
    # Tasks still waiting for their element to come back from the mapper,
    # keyed by the identity of that element.
    pending_tasks: dict[int, TranslationTask[T]] = {}
    callbacks = warp_callbacks(
        interrupt_source_text_segments=interrupt_source_text_segments,
        interrupt_translated_text_segments=interrupt_translated_text_segments,
        interrupt_block_element=interrupt_block_element,
        on_fill_failed=on_fill_failed,
    )

    def generate_elements():
        for task in tasks:
            pending_tasks[id(task.element)] = task
            yield task.element

    for element, mappings in self._stream_mapper.map_stream(
        elements=generate_elements(),
        callbacks=callbacks,
        map=lambda inline_segments: self._translate_inline_segments(
            inline_segments=inline_segments,
            callbacks=callbacks,
        ),
    ):
        # Pop rather than get: once an element has been handed back, its task
        # (and the whole XML tree it references) must not stay alive for the
        # rest of the run.
        task = pending_tasks.pop(id(element), None)
        if task is None:
            continue
        translated_element = submit(
            element=element,
            action=task.action,
            mappings=mappings,
        )
        yield translated_element, task.payload
|
|
87
106
|
|
|
88
107
|
def _translate_inline_segments(
|
|
89
108
|
self,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: epub,llm,translation,translator
|
|
@@ -78,8 +78,7 @@ The easiest way to use EPUB Translator is through OOMOL Studio with a visual int
|
|
|
78
78
|
### Using Python API
|
|
79
79
|
|
|
80
80
|
```python
|
|
81
|
-
from
|
|
82
|
-
from epub_translator import LLM, translate, language
|
|
81
|
+
from epub_translator import LLM, translate, language, SubmitKind
|
|
83
82
|
|
|
84
83
|
# Initialize LLM with your API credentials
|
|
85
84
|
llm = LLM(
|
|
@@ -91,9 +90,10 @@ llm = LLM(
|
|
|
91
90
|
|
|
92
91
|
# Translate EPUB file using language constants
|
|
93
92
|
translate(
|
|
94
|
-
source_path=
|
|
95
|
-
target_path=
|
|
93
|
+
source_path="source.epub",
|
|
94
|
+
target_path="translated.epub",
|
|
96
95
|
target_language=language.ENGLISH,
|
|
96
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
97
97
|
llm=llm,
|
|
98
98
|
)
|
|
99
99
|
```
|
|
@@ -113,9 +113,10 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
|
|
|
113
113
|
last_progress = progress
|
|
114
114
|
|
|
115
115
|
translate(
|
|
116
|
-
source_path=
|
|
117
|
-
target_path=
|
|
116
|
+
source_path="source.epub",
|
|
117
|
+
target_path="translated.epub",
|
|
118
118
|
target_language="English",
|
|
119
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
119
120
|
llm=llm,
|
|
120
121
|
on_progress=on_progress,
|
|
121
122
|
)
|
|
@@ -152,6 +153,7 @@ translate(
|
|
|
152
153
|
source_path: PathLike | str, # Source EPUB file path
|
|
153
154
|
target_path: PathLike | str, # Output EPUB file path
|
|
154
155
|
target_language: str, # Target language (e.g., "English", "Chinese")
|
|
156
|
+
submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
|
|
155
157
|
user_prompt: str | None = None, # Custom translation instructions
|
|
156
158
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
157
159
|
max_group_tokens: int = 1200, # Maximum tokens per translation group
|
|
@@ -165,6 +167,49 @@ translate(
|
|
|
165
167
|
|
|
166
168
|
**Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
|
|
167
169
|
|
|
170
|
+
#### Submit Modes
|
|
171
|
+
|
|
172
|
+
The `submit` parameter controls how translated content is inserted into the document. Use the `SubmitKind` enum to specify the insertion mode:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from epub_translator import SubmitKind
|
|
176
|
+
|
|
177
|
+
# Three available modes:
|
|
178
|
+
# - SubmitKind.REPLACE: Replace original content with translation (single-language output)
|
|
179
|
+
# - SubmitKind.APPEND_TEXT: Append translation as inline text (bilingual output)
|
|
180
|
+
# - SubmitKind.APPEND_BLOCK: Append translation as block elements (bilingual output, recommended)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Mode Comparison:**
|
|
184
|
+
|
|
185
|
+
- **`SubmitKind.REPLACE`**: Creates a single-language translation by replacing original text with translated content. Useful for creating books in the target language only.
|
|
186
|
+
|
|
187
|
+
- **`SubmitKind.APPEND_TEXT`**: Appends translations as inline text immediately after the original content. Both languages appear in the same paragraph, creating a continuous reading flow.
|
|
188
|
+
|
|
189
|
+
- **`SubmitKind.APPEND_BLOCK`** (Recommended): Appends translations as separate block elements (paragraphs) after the original. This creates clear visual separation between languages, making it ideal for side-by-side bilingual reading.
|
|
190
|
+
|
|
191
|
+
**Example:**
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# For bilingual books (recommended)
|
|
195
|
+
translate(
|
|
196
|
+
source_path="source.epub",
|
|
197
|
+
target_path="translated.epub",
|
|
198
|
+
target_language=language.ENGLISH,
|
|
199
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
200
|
+
llm=llm,
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
# For single-language translation
|
|
204
|
+
translate(
|
|
205
|
+
source_path="source.epub",
|
|
206
|
+
target_path="translated.epub",
|
|
207
|
+
target_language=language.ENGLISH,
|
|
208
|
+
submit=SubmitKind.REPLACE,
|
|
209
|
+
llm=llm,
|
|
210
|
+
)
|
|
211
|
+
```
|
|
212
|
+
|
|
168
213
|
#### Language Constants
|
|
169
214
|
|
|
170
215
|
EPUB Translator provides predefined language constants for convenience. You can use these constants instead of writing language names as strings:
|
|
@@ -174,47 +219,73 @@ from epub_translator import language
|
|
|
174
219
|
|
|
175
220
|
# Usage example:
|
|
176
221
|
translate(
|
|
177
|
-
source_path=
|
|
178
|
-
target_path=
|
|
222
|
+
source_path="source.epub",
|
|
223
|
+
target_path="translated.epub",
|
|
179
224
|
target_language=language.ENGLISH,
|
|
225
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
180
226
|
llm=llm,
|
|
181
227
|
)
|
|
182
228
|
|
|
183
229
|
# You can also use custom language strings:
|
|
184
230
|
translate(
|
|
185
|
-
source_path=
|
|
186
|
-
target_path=
|
|
231
|
+
source_path="source.epub",
|
|
232
|
+
target_path="translated.epub",
|
|
187
233
|
target_language="Icelandic", # For languages not in the constants
|
|
234
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
188
235
|
llm=llm,
|
|
189
236
|
)
|
|
190
237
|
```
|
|
191
238
|
|
|
192
239
|
### Error Handling with `on_fill_failed`
|
|
193
240
|
|
|
194
|
-
Monitor
|
|
241
|
+
Monitor translation errors using the `on_fill_failed` callback. The system automatically retries failed translations up to `max_retries` times (default: 5). Most errors are recovered during retries and don't affect the final output.
|
|
195
242
|
|
|
196
243
|
```python
|
|
197
244
|
from epub_translator import FillFailedEvent
|
|
198
245
|
|
|
199
246
|
def handle_fill_error(event: FillFailedEvent):
|
|
200
|
-
|
|
201
|
-
print(f" {event.error_message}")
|
|
247
|
+
# Only log critical errors that will affect the final EPUB
|
|
202
248
|
if event.over_maximum_retries:
|
|
203
|
-
print("
|
|
249
|
+
print(f"Critical error after {event.retried_count} attempts:")
|
|
250
|
+
print(f" {event.error_message}")
|
|
251
|
+
print(" This error will be present in the final EPUB file!")
|
|
204
252
|
|
|
205
253
|
translate(
|
|
206
|
-
source_path=
|
|
207
|
-
target_path=
|
|
254
|
+
source_path="source.epub",
|
|
255
|
+
target_path="translated.epub",
|
|
208
256
|
target_language=language.ENGLISH,
|
|
257
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
209
258
|
llm=llm,
|
|
210
259
|
on_fill_failed=handle_fill_error,
|
|
211
260
|
)
|
|
212
261
|
```
|
|
213
262
|
|
|
263
|
+
**Understanding Error Severity:**
|
|
264
|
+
|
|
214
265
|
The `FillFailedEvent` contains:
|
|
215
266
|
- `error_message: str` - Description of the error
|
|
216
|
-
- `retried_count: int` - Current retry attempt number
|
|
217
|
-
- `over_maximum_retries: bool` - Whether
|
|
267
|
+
- `retried_count: int` - Current retry attempt number (1 to max_retries)
|
|
268
|
+
- `over_maximum_retries: bool` - Whether the error persisted after all retry attempts (i.e. it is critical and will appear in the final EPUB)
|
|
269
|
+
|
|
270
|
+
**Error Categories:**
|
|
271
|
+
|
|
272
|
+
- **Recoverable errors** (`over_maximum_retries=False`): Errors during retry attempts. The system will continue retrying and may resolve these automatically. Safe to ignore in most cases.
|
|
273
|
+
|
|
274
|
+
- **Critical errors** (`over_maximum_retries=True`): Errors that persist after all retry attempts. These will appear in the final EPUB file and should be investigated.
|
|
275
|
+
|
|
276
|
+
**Advanced Usage:**
|
|
277
|
+
|
|
278
|
+
For verbose logging during translation debugging:
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
def handle_fill_error(event: FillFailedEvent):
|
|
282
|
+
if event.over_maximum_retries:
|
|
283
|
+
# Critical: affects final output
|
|
284
|
+
print(f"❌ CRITICAL: {event.error_message}")
|
|
285
|
+
else:
|
|
286
|
+
# Informational: system is retrying
|
|
287
|
+
print(f"⚠️ Retry {event.retried_count}: {event.error_message}")
|
|
288
|
+
```
|
|
218
289
|
|
|
219
290
|
### Dual-LLM Architecture
|
|
220
291
|
|
|
@@ -239,9 +310,10 @@ fill_llm = LLM(
|
|
|
239
310
|
)
|
|
240
311
|
|
|
241
312
|
translate(
|
|
242
|
-
source_path=
|
|
243
|
-
target_path=
|
|
313
|
+
source_path="source.epub",
|
|
314
|
+
target_path="translated.epub",
|
|
244
315
|
target_language=language.ENGLISH,
|
|
316
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
245
317
|
translation_llm=translation_llm,
|
|
246
318
|
fill_llm=fill_llm,
|
|
247
319
|
)
|
|
@@ -299,9 +371,10 @@ Provide specific translation instructions:
|
|
|
299
371
|
|
|
300
372
|
```python
|
|
301
373
|
translate(
|
|
302
|
-
source_path=
|
|
303
|
-
target_path=
|
|
374
|
+
source_path="source.epub",
|
|
375
|
+
target_path="translated.epub",
|
|
304
376
|
target_language="English",
|
|
377
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
305
378
|
llm=llm,
|
|
306
379
|
user_prompt="Use formal language and preserve technical terminology",
|
|
307
380
|
)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
epub_translator/__init__.py,sha256=
|
|
1
|
+
epub_translator/__init__.py,sha256=m2uTGNmBmZhRWQjjYQ1TVrjOuFXJhzQnuuTOq5-t29U,234
|
|
2
2
|
epub_translator/data/fill.jinja,sha256=zSytA8Vhp2i6YBZ09F1z9iPJq1-jUaiphoXqTNZwnvo,6964
|
|
3
3
|
epub_translator/data/mmltex/README.md,sha256=wwhe5yW1U_7_YZIFKnQVnCOmUl7Mu3gsr3lNnDSJ5Qs,2953
|
|
4
4
|
epub_translator/data/mmltex/cmarkup.xsl,sha256=DkhimAATM0XSCfVOfY41-qTPoddqzOHjZ00Pynr4zQE,37707
|
|
@@ -25,20 +25,21 @@ epub_translator/llm/error.py,sha256=4efAIQL14DFSvAnSTUfgdAbZRqaWBqOfUGsSfvxa5zM,
|
|
|
25
25
|
epub_translator/llm/executor.py,sha256=A0IjQ-s9wBJuhAZAAydneb9zBXWnu2J9inR2Q8F-GDE,5533
|
|
26
26
|
epub_translator/llm/increasable.py,sha256=8XkKeI1hiHlpMHj8dQ4fW0BkViSx4hH8QfbQsy-5SDw,1297
|
|
27
27
|
epub_translator/llm/types.py,sha256=c-dMAIvlG4R3la3mUTWEw5xei-sIYKmQeBja7mirxcI,219
|
|
28
|
+
epub_translator/punctuation.py,sha256=Yrf3b_Pl36FPBaK96LR-EBjnztlcZZTWLSNaYoWIUSc,812
|
|
28
29
|
epub_translator/segment/__init__.py,sha256=UYTv_IKQbEB0DzhFeiuqCvjoJLvB-7XRwlaFS90KmIw,573
|
|
29
30
|
epub_translator/segment/block_segment.py,sha256=psNKA_HMIcwZtoug8AtnAcV9_mQ2WXLnXqFsekHzt2g,4570
|
|
30
31
|
epub_translator/segment/common.py,sha256=gGWYQaJ0tGnWCuF1me9TOo-Q_DrZVakCu2patyFIOs0,714
|
|
31
32
|
epub_translator/segment/inline_segment.py,sha256=_ZgSlZmGxzIvaPs01hreoUfnaXz8Yq7naksT34dGfds,14221
|
|
32
|
-
epub_translator/segment/text_segment.py,sha256=
|
|
33
|
+
epub_translator/segment/text_segment.py,sha256=Fos3tTuTcpnm-NmqPftzqov1_Rwr57PBv8AIgjKNYcg,6389
|
|
33
34
|
epub_translator/segment/utils.py,sha256=qMqUt33pDRN5Tnuydkodzu2gaQrwTzAnQmXpDuHen1o,1036
|
|
34
35
|
epub_translator/serial/__init__.py,sha256=b3IMVmWcUwEqHKcGmey88b057pyz5ct946CaUZi4LB4,67
|
|
35
36
|
epub_translator/serial/chunk.py,sha256=FrTaHikVOd6bLYumnEriTaAQ_DIDLjHm16gh-wBVR9k,1495
|
|
36
37
|
epub_translator/serial/segment.py,sha256=uEz-ke1KcYrON-68FaUEzMG2CzHlMjvbC11F3ZT4yH0,446
|
|
37
38
|
epub_translator/serial/splitter.py,sha256=Nq0sxPXos8ez7QBG01sOKjnYKbeBWUBHflZGtqenVm8,1726
|
|
38
39
|
epub_translator/template.py,sha256=0CqRmj3nTtPshw0NmTr2ECqelops2MMyX94fMrE-HKs,1587
|
|
39
|
-
epub_translator/translator.py,sha256=
|
|
40
|
+
epub_translator/translator.py,sha256=SL0Qh49QaZD3bKKkf5xM0hF2MkPqzxKO8uyo8rn9wTQ,6421
|
|
40
41
|
epub_translator/utils.py,sha256=BfZWrYjzDNQ4cFrgvRNzd4i1CKLtPxS8Z4LBHhqEV78,914
|
|
41
|
-
epub_translator/xml/__init__.py,sha256=
|
|
42
|
+
epub_translator/xml/__init__.py,sha256=1sBLICHtNNw0UNMOXCZzrZ7uGfOwnPf_m4MmmMNzakY,160
|
|
42
43
|
epub_translator/xml/const.py,sha256=Re2TYmpwG7-jVVgSq3R_K-uYhvAYzcXcRmLFkwCPD9Y,19
|
|
43
44
|
epub_translator/xml/deduplication.py,sha256=TaMbzeA70VvUQV0X1wcQFVbuMEPJUtj9Hq6iWlUmtAQ,1152
|
|
44
45
|
epub_translator/xml/firendly/__init__.py,sha256=I5jhnhFWoHvojLsYXH4jfR4Gi8lKFZ3yQ56ze5hEe1M,74
|
|
@@ -47,20 +48,21 @@ epub_translator/xml/firendly/encoder.py,sha256=evjvw6oE-oCud44IsJ-YZVHn6dtUzjNYX
|
|
|
47
48
|
epub_translator/xml/firendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42NXCauIFV-o,6560
|
|
48
49
|
epub_translator/xml/firendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
|
|
49
50
|
epub_translator/xml/firendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
|
|
51
|
+
epub_translator/xml/inline.py,sha256=mwFho6wq2gYWmWcg5Cw6OQeteV-a-i6X9OE63fzblpE,1274
|
|
50
52
|
epub_translator/xml/self_closing.py,sha256=41ofGUdss9yU51IVwI4It6hKfzh8YcxIR_j-ohD19LE,5240
|
|
51
53
|
epub_translator/xml/utils.py,sha256=7tQ6L5P0_JXhxONeG64hEeeL5mKjA6NKS1H1Q9B1Cac,1062
|
|
52
54
|
epub_translator/xml/xml.py,sha256=qQ5Wk1-KVVHE4TX25zGOR7fINsGkXnoq-qyKKNl5no4,1675
|
|
53
55
|
epub_translator/xml/xml_like.py,sha256=jBK4UUgXXWRYnfYlCH1MUAjGHWBQAbUj8HsYqvTTWvA,8890
|
|
54
56
|
epub_translator/xml_interrupter.py,sha256=IGLATr7zTIdhE54Gnroab4Xu_vLJ7kzPiQgk7WMXKTc,7403
|
|
55
|
-
epub_translator/xml_translator/__init__.py,sha256=
|
|
57
|
+
epub_translator/xml_translator/__init__.py,sha256=lqts1mJL_WfojDnMAQ5OM7TbT6u9X3H-X4C_avHzvXM,128
|
|
56
58
|
epub_translator/xml_translator/callbacks.py,sha256=IoZrsaivd2W76cHFupwv6auVxgEWHcBN2MHQJYcWoJ8,1324
|
|
57
59
|
epub_translator/xml_translator/common.py,sha256=hSPptgPp7j6dm47imELB5DgmEbzTEyJD6WEeELOOc50,38
|
|
58
60
|
epub_translator/xml_translator/hill_climbing.py,sha256=1jvilOkTLzwljJA4Nrel8yU2XGvOXpueUJTK7RAp-XY,4272
|
|
59
61
|
epub_translator/xml_translator/stream_mapper.py,sha256=tbMc2vyPUn9zEkJZ7-OVYuKaYyn2pPPwjcAdQ8HLzNs,10179
|
|
60
|
-
epub_translator/xml_translator/submitter.py,sha256=
|
|
61
|
-
epub_translator/xml_translator/translator.py,sha256=
|
|
62
|
+
epub_translator/xml_translator/submitter.py,sha256=6PGQTnEcOgL3zseDpSzDmU5d9Eg3eO5OfPIGmQp2DVY,14155
|
|
63
|
+
epub_translator/xml_translator/translator.py,sha256=eIvniqKtNoqFFvfvxK4oA-W02y5ZTpmPQ8wFAJlvOUU,9752
|
|
62
64
|
epub_translator/xml_translator/validation.py,sha256=-OKlSZuD__sjAiEpGAO93YQme4ZDSPmoPjRsAMOCEjc,16668
|
|
63
|
-
epub_translator-0.1.
|
|
64
|
-
epub_translator-0.1.
|
|
65
|
-
epub_translator-0.1.
|
|
66
|
-
epub_translator-0.1.
|
|
65
|
+
epub_translator-0.1.5.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
|
|
66
|
+
epub_translator-0.1.5.dist-info/METADATA,sha256=IT5MBdl68pICDYmk5tn3CwvdnZ5QxlVoaSzw-VhKf3c,14603
|
|
67
|
+
epub_translator-0.1.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
68
|
+
epub_translator-0.1.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|