epub-translator 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +9 -2
- epub_translator/data/fill.jinja +143 -38
- epub_translator/epub/__init__.py +1 -1
- epub_translator/epub/metadata.py +122 -0
- epub_translator/epub/spines.py +3 -2
- epub_translator/epub/zip.py +11 -9
- epub_translator/epub_transcode.py +108 -0
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/context.py +109 -0
- epub_translator/llm/core.py +32 -113
- epub_translator/llm/executor.py +25 -31
- epub_translator/llm/increasable.py +1 -1
- epub_translator/llm/types.py +0 -3
- epub_translator/punctuation.py +34 -0
- epub_translator/segment/__init__.py +26 -0
- epub_translator/segment/block_segment.py +124 -0
- epub_translator/segment/common.py +29 -0
- epub_translator/segment/inline_segment.py +356 -0
- epub_translator/{xml_translator → segment}/text_segment.py +7 -72
- epub_translator/segment/utils.py +43 -0
- epub_translator/translator.py +152 -184
- epub_translator/utils.py +33 -0
- epub_translator/xml/__init__.py +3 -0
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/deduplication.py +3 -3
- epub_translator/xml/inline.py +67 -0
- epub_translator/xml/self_closing.py +182 -0
- epub_translator/xml/utils.py +42 -0
- epub_translator/xml/xml.py +7 -0
- epub_translator/xml/xml_like.py +8 -33
- epub_translator/xml_interrupter.py +165 -0
- epub_translator/xml_translator/__init__.py +3 -3
- epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator/xml_translator/{const.py → common.py} +0 -1
- epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator/xml_translator/submitter.py +352 -91
- epub_translator/xml_translator/translator.py +182 -114
- epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/METADATA +134 -21
- epub_translator-0.1.4.dist-info/RECORD +68 -0
- epub_translator/epub/placeholder.py +0 -53
- epub_translator/iter_sync.py +0 -24
- epub_translator/xml_translator/fill.py +0 -128
- epub_translator/xml_translator/format.py +0 -282
- epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator/xml_translator/group.py +0 -183
- epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator/xml_translator/utils.py +0 -29
- epub_translator-0.1.1.dist-info/RECORD +0 -58
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
from collections.abc import Generator, Iterable
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Generic, TypeVar, cast
|
|
4
|
+
from xml.etree.ElementTree import Element
|
|
5
|
+
|
|
6
|
+
from tiktoken import Encoding
|
|
7
|
+
|
|
8
|
+
from ..segment import (
|
|
9
|
+
BlockContentError,
|
|
10
|
+
BlockError,
|
|
11
|
+
BlockExpectedIDsError,
|
|
12
|
+
BlockUnexpectedIDError,
|
|
13
|
+
BlockWrongTagError,
|
|
14
|
+
FoundInvalidIDError,
|
|
15
|
+
InlineError,
|
|
16
|
+
InlineExpectedIDsError,
|
|
17
|
+
InlineLostIDError,
|
|
18
|
+
InlineUnexpectedIDError,
|
|
19
|
+
InlineWrongTagCountError,
|
|
20
|
+
)
|
|
21
|
+
from ..utils import ensure_list
|
|
22
|
+
from ..xml import plain_text
|
|
23
|
+
|
|
24
|
+
# Base of the exponential scoring: an error at level N weighs _LEVEL_WEIGHT ** N
# (see _calculate_error_weight).
_LEVEL_WEIGHT = 3
# Maximum number of tokens quoted when a text snippet is used as a location hint.
_MAX_TEXT_HINT_TOKENS_COUNT = 6


# Severity levels for block-scoped errors; a higher level is reported first.
_BLOCK_EXPECTED_IDS_LEVEL = 6
_BLOCK_WRONG_TAG_LEVEL = 5
_BLOCK_FOUND_INVALID_ID_LEVEL = 4
_BLOCK_UNEXPECTED_ID_LEVEL = 3

# Severity levels for inline-scoped errors (errors found inside one block).
_INLINE_EXPECTED_IDS_LEVEL = 3
_INLINE_LOST_ID_LEVEL = 2
_INLINE_FOUND_INVALID_ID_LEVEL = 1
_INLINE_WRONG_TAG_COUNT_LEVEL = 0
_INLINE_UNEXPECTED_ID_LEVEL = 0

# Type parameter for the error payload carried by ErrorItem.
ERROR = TypeVar("ERROR")
# NOTE(review): appears to be the number of distinct severity levels (0-6) — confirm against callers.
LEVEL_DEPTH = 7
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class ErrorItem(Generic[ERROR]):
    """A single validation error together with the data used to rank it."""

    # The wrapped error object.
    error: ERROR
    # Position of the originating error in the input sequence
    # (set by _transform_errors_to_items).
    index1: int
    # Position inside the owning block's inline errors; 0 for block-level errors.
    index2: int
    # Severity level; higher levels are sorted first.
    level: int
    # Numeric score derived from the level (see _calculate_error_weight).
    weight: int
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class BlockErrorsGroup:
    """All errors that were attributed to one block element."""

    # Numeric ID of the block the errors belong to.
    block_id: int
    # The block element itself (its tag is used when rendering messages).
    block_element: Element
    # Errors found for this block.
    errors: list[ErrorItem[BlockError | FoundInvalidIDError] | ErrorItem[InlineError | FoundInvalidIDError]]

    @property
    def weight(self) -> int:
        """Return the combined weight of every error in this group."""
        total = 0
        for item in self.errors:
            total += item.weight
        return total
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class ErrorsGroup:
    """Errors split into those without an owning block and those grouped per block."""

    # Errors that are not attached to any block.
    upper_errors: list[ErrorItem[BlockError | FoundInvalidIDError]]
    # Errors grouped by the block they were found in.
    block_groups: list[BlockErrorsGroup]

    @property
    def errors_count(self) -> int:
        """Return the number of errors across the upper list and every block group."""
        return len(self.upper_errors) + sum(len(group.errors) for group in self.block_groups)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def nest_as_errors_group(errors: Iterable[BlockError | FoundInvalidIDError]) -> ErrorsGroup | None:
    """Convert a flat stream of validation errors into a nested ErrorsGroup.

    Returns None when the stream produces no error items at all.
    """
    tagged_items = _transform_errors_to_items(errors)
    return _create_errors_group(error_items=tagged_items)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def truncate_errors_group(errors_group: ErrorsGroup, max_errors: int) -> ErrorsGroup | None:
    """Keep at most ``max_errors`` errors, preferring the highest severity levels.

    The group is returned unchanged when it already fits. Otherwise the errors
    are ranked by descending level, then by their original positions, and a new
    group is rebuilt from the first ``max_errors`` of them (None if nothing is kept).
    """
    flattened = [*_flatten_errors_group(errors_group)]
    if len(flattened) <= max_errors:
        return errors_group

    def priority(pair):
        # pair is (owning block or None, ErrorItem)
        item = pair[1]
        return (-item.level, item.index1, item.index2)

    kept = sorted(flattened, key=priority)[:max_errors]
    return _create_errors_group(error_items=kept)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def generate_error_message(encoding: Encoding, errors_group: ErrorsGroup, omitted_count: int = 0) -> None | str:
    """Render an errors group as a correction request for the model.

    The message consists of a header, the upper (block-less) errors, one section
    per block group, and a closing instruction. Sections are separated by exactly
    one blank line.

    Args:
        encoding: Token encoding forwarded to the inline formatter, which uses it
            to shorten quoted text hints.
        errors_group: The errors to report.
        omitted_count: Number of errors left out of ``errors_group``; when
            positive, a notice about the omitted errors is appended.

    Returns:
        The message text, or None when the group holds neither upper errors nor
        block groups.

    Raises:
        RuntimeError: If a block group contains an error of an unrecognized kind.
    """
    sections: list[list[str]] = []

    upper_lines = [_format_block_error(item.error) for item in errors_group.upper_errors]
    if upper_lines:
        sections.append(upper_lines)

    for block_group in errors_group.block_groups:
        block_tag = block_group.block_element.tag
        error_count = len(block_group.errors)
        count_suffix = f" ({error_count} error{'s' if error_count != 1 else ''})"
        group_lines = [f"In {block_tag}#{block_group.block_id}:{count_suffix}"]

        for block_error in block_group.errors:
            error = block_error.error
            message: str
            if isinstance(error, BlockError):
                message = _format_block_error(error)
            elif isinstance(error, InlineError):
                message = _format_inline_error(encoding, error, block_group.block_id)
            elif isinstance(error, FoundInvalidIDError):
                # Invalid IDs reported inside a block come from its inline content,
                # so they are rendered by the inline formatter instead of crashing.
                message = _format_inline_error(encoding, error, block_group.block_id)
            else:
                raise RuntimeError(f"Unsupported error type: {type(error).__name__}")
            group_lines.append(f" - {message}")

        sections.append(group_lines)

    if not sections:
        return None

    header = (
        f"Found {errors_group.errors_count} error(s). Fix them and return "
        "the COMPLETE corrected XML (not just the changed parts):"
    )
    sections.insert(0, [header])

    if omitted_count > 0:
        sections.append(
            [
                f"... and {omitted_count} more error(s) omitted. "
                f"Fix the above errors first, then resubmit for remaining issues."
            ]
        )
        sections.append(["Remember: Return the entire <xml>...</xml> block with all corrections applied."])
    else:
        sections.append(["Return the entire <xml>...</xml> block with corrections."])

    # Joining sections with an empty line keeps block groups visually separated
    # and never produces two consecutive blank lines.
    return "\n\n".join("\n".join(section) for section in sections)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@dataclass
class _Block:
    """Identity of the block an error is attached to while errors are regrouped."""

    # Numeric ID of the block.
    id: int
    # The block's element.
    element: Element
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _transform_errors_to_items(errors: Iterable[BlockError | FoundInvalidIDError]):
    """Yield ``(owning block or None, ErrorItem)`` pairs for every reported error.

    A BlockContentError is expanded into one item per inline error it carries,
    all attached to that block. Any other error becomes a single item; it is
    attached to a block only when it is a BlockWrongTagError that names one.
    """
    for outer_index, reported in enumerate(errors):
        if not isinstance(reported, BlockContentError):
            severity = _get_block_error_level(reported)
            score = _calculate_error_weight(reported, severity)
            item: ErrorItem[BlockError | FoundInvalidIDError] = ErrorItem(
                error=reported,
                index1=outer_index,
                index2=0,
                level=severity,
                weight=score,
            )
            owner: _Block | None = None
            if isinstance(reported, BlockWrongTagError) and reported.block is not None:
                # reported.block is indexed as (block id, block element)
                owner = _Block(
                    id=reported.block[0],
                    element=reported.block[1],
                )
            yield owner, item
            continue

        # Every inline error of this block shares the same owner object.
        owner = _Block(
            id=reported.id,
            element=reported.element,
        )
        for inner_index, inline_error in enumerate(reported.errors):
            severity = _get_inline_error_level(inline_error)
            score = _calculate_error_weight(inline_error, severity)
            yield (
                owner,
                ErrorItem(
                    error=inline_error,
                    index1=outer_index,
                    index2=inner_index,
                    level=severity,
                    weight=score,
                ),
            )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _flatten_errors_group(
    errors_group: ErrorsGroup,
) -> Generator[
    tuple[
        _Block | None,
        ErrorItem[BlockError | FoundInvalidIDError] | ErrorItem[InlineError | FoundInvalidIDError],
    ],
    None,
    None,
]:
    """Yield every error of the group as an ``(owning block or None, item)`` pair.

    Upper errors come first with None as owner, followed by the errors of each
    block group in order.
    """
    yield from ((None, upper_item) for upper_item in errors_group.upper_errors)

    for group in errors_group.block_groups:
        owner = _Block(
            id=group.block_id,
            element=group.block_element,
        )
        yield from ((owner, item) for item in group.errors)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _create_errors_group(
    error_items: Iterable[
        tuple[
            _Block | None,
            ErrorItem[BlockError | FoundInvalidIDError] | ErrorItem[InlineError | FoundInvalidIDError],
        ]
    ],
) -> ErrorsGroup | None:
    """Assemble ``(owning block or None, item)`` pairs into an ErrorsGroup.

    Items without an owner become upper errors (ordered by descending level);
    the rest are grouped per block ID, ordered by descending weight inside each
    group, and the groups themselves are ordered by descending total weight.
    Returns None when there are no items at all.
    """
    top_level: list[ErrorItem[BlockError | FoundInvalidIDError]] = []
    element_by_block: dict[int, Element] = {}
    items_by_block: dict[
        int, list[ErrorItem[BlockError | FoundInvalidIDError] | ErrorItem[InlineError | FoundInvalidIDError]]
    ] = {}

    for owner, item in error_items:
        if owner is not None:
            # ensure_list presumably returns the list stored under the key,
            # creating it when absent — verify against ..utils.
            ensure_list(items_by_block, owner.id).append(item)
            element_by_block[owner.id] = owner.element
            continue
        top_level.append(cast(ErrorItem[BlockError | FoundInvalidIDError], item))

    if not (top_level or items_by_block):
        return None

    def by_weight(entry):
        return (-entry.weight, entry.index1, entry.index2)

    def by_level(entry):
        return (-entry.level, entry.index1, entry.index2)

    groups: list[BlockErrorsGroup] = []
    for block_id, items in items_by_block.items():
        element = element_by_block.get(block_id)
        if element is not None:
            groups.append(
                BlockErrorsGroup(
                    block_id=block_id,
                    block_element=element,
                    errors=sorted(items, key=by_weight),
                )
            )

    top_level.sort(key=by_level)
    groups.sort(key=lambda group: -group.weight)

    return ErrorsGroup(
        upper_errors=top_level,
        block_groups=groups,
    )
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _calculate_error_weight(error: BlockError | InlineError | FoundInvalidIDError, level: int) -> int:
    """Return the ranking weight of an error at the given severity level.

    The base weight is ``_LEVEL_WEIGHT ** level``. For BlockExpectedIDsError and
    InlineExpectedIDsError it is multiplied by the number of entries in
    ``id2element``.
    """
    base_weight = _LEVEL_WEIGHT**level
    if not isinstance(error, (BlockExpectedIDsError, InlineExpectedIDsError)):
        return base_weight
    return base_weight * len(error.id2element)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _get_block_error_level(error: BlockError | FoundInvalidIDError) -> int:
    """Return the severity level of a block-scoped error (0 for unknown kinds)."""
    # Checked in order; the first matching type decides the level.
    level_by_type = (
        (BlockWrongTagError, _BLOCK_WRONG_TAG_LEVEL),
        (BlockExpectedIDsError, _BLOCK_EXPECTED_IDS_LEVEL),
        (BlockUnexpectedIDError, _BLOCK_UNEXPECTED_ID_LEVEL),
        (FoundInvalidIDError, _BLOCK_FOUND_INVALID_ID_LEVEL),
    )
    for error_type, level in level_by_type:
        if isinstance(error, error_type):
            return level
    return 0
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _get_inline_error_level(error: InlineError | FoundInvalidIDError) -> int:
    """Return the severity level of an inline-scoped error (0 for unknown kinds)."""
    # Checked in order; the first matching type decides the level.
    level_by_type = (
        (InlineLostIDError, _INLINE_LOST_ID_LEVEL),
        (InlineExpectedIDsError, _INLINE_EXPECTED_IDS_LEVEL),
        (InlineUnexpectedIDError, _INLINE_UNEXPECTED_ID_LEVEL),
        (InlineWrongTagCountError, _INLINE_WRONG_TAG_COUNT_LEVEL),
        (FoundInvalidIDError, _INLINE_FOUND_INVALID_ID_LEVEL),
    )
    for error_type, level in level_by_type:
        if isinstance(error, error_type):
            return level
    return 0
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _format_block_error(error: BlockError | FoundInvalidIDError) -> str:
    """Describe a block-scoped error, including a suggested fix, as one message."""
    if isinstance(error, BlockWrongTagError):
        expected_tag = error.expected_tag
        found_tag = error.instead_tag
        if error.block is not None:
            block_id = error.block[0]
            return (
                f"Wrong tag for block at `{found_tag}#{block_id}`: "
                f'expected `<{expected_tag} id="{block_id}">`, '
                f'but found `<{found_tag} id="{block_id}">`. '
                f"Fix: Change the tag to `<{expected_tag}>`."
            )
        # No block attached: the mismatch concerns the root tag.
        return (
            f"Root tag mismatch: expected `<{expected_tag}>`, but found `<{found_tag}>`. "
            f"Fix: Change the root tag to `<{expected_tag}>`."
        )

    if isinstance(error, BlockExpectedIDsError):
        missing = sorted(error.id2element.items())

        # Quote the start of each missing block's original text as a hint.
        hints: list[str] = []
        for element_id, element in missing:
            source_text = plain_text(element).strip()
            if not source_text:
                continue
            # Block-level hints are cut to the first 30 characters.
            preview = source_text if len(source_text) <= 30 else source_text[:30] + "..."
            hints.append(f' - `<{element.tag} id="{element_id}">`: "{preview}"')

        if hints:
            return "Missing block elements (find translation and wrap):\n" + "\n".join(hints)

        # No text available for any element: list the bare tags instead.
        tags = ", ".join(f'<{element.tag} id="{element_id}">' for element_id, element in missing)
        return f"Missing expected blocks: {tags}. Fix: Add these missing blocks with the correct IDs."

    if isinstance(error, BlockUnexpectedIDError):
        location = f"{error.element.tag}#{error.id}"
        return f"Unexpected block found at `{location}`. Fix: Remove this unexpected block."

    if isinstance(error, FoundInvalidIDError):
        tag_name = error.element.tag
        if error.invalid_id is not None:
            sample = f'<{tag_name} id="{error.invalid_id}">'
        else:
            sample = f"<{tag_name}>"
        return f"Invalid or missing ID attribute: {sample}. Fix: Ensure all blocks have valid numeric IDs."

    return "Unknown block error. Fix: Review the block structure."
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _format_inline_error(encoding: Encoding, error: InlineError | FoundInvalidIDError, block_id: int) -> str:
    """Describe an inline-scoped error of block ``block_id``, with a suggested fix.

    ``encoding`` is only used to shorten the text snippets quoted as hints.
    """
    if isinstance(error, InlineLostIDError):
        location = _build_inline_selector(encoding, error.stack, block_id, element=error.element)
        return f"Element at `{location}` is missing an ID attribute. Fix: Add the required ID attribute."

    if isinstance(error, InlineExpectedIDsError):
        missing = sorted(error.id2element.items())

        # Quote a short piece of each missing element's original text as a hint.
        hints: list[str] = []
        for element_id, element in missing:
            if plain_text(element).strip():
                snippet = _extract_text_hint(encoding, element)
                hints.append(f' - `<{element.tag} id="{element_id}">`: "{snippet}"')

        if hints:
            return "Missing inline elements (find translation and wrap):\n" + "\n".join(hints)

        # No text available for any element: list the bare tags instead.
        tags = ", ".join(f'<{element.tag} id="{element_id}">' for element_id, element in missing)
        return f"Missing expected inline elements: {tags}. Fix: Add these missing inline elements."

    if isinstance(error, InlineUnexpectedIDError):
        location = f"{error.element.tag}#{error.id}"
        return f"Unexpected inline element at `{location}`. Fix: Remove this unexpected element."

    if isinstance(error, InlineWrongTagCountError):
        found_elements = error.found_elements
        tag = found_elements[0].tag if found_elements else "unknown"
        location = _build_inline_selector(encoding, error.stack, block_id, tag=tag)
        expected = error.expected_count
        found = len(found_elements)

        if expected == 0 and found > 0:
            # Case 1: none should exist, but some were found.
            return (
                f"Found unexpected `<{tag}>` elements at `{location}`. "
                f"There should be none, but {found} were found. "
                f"Fix: Remove all `<{tag}>` elements from this location."
            )
        if expected > 0 and found == 0:
            # Case 2: some should exist, but none were found.
            return (
                f"Missing `<{tag}>` elements at `{location}`. "
                f"Expected {expected}, but none were found. "
                f"Fix: Add {expected} `<{tag}>` element(s) to this location."
            )
        if found > expected:
            # Case 3: too many.
            surplus = found - expected
            return (
                f"Too many `<{tag}>` elements at `{location}`. "
                f"Expected {expected}, but found {found} ({surplus} extra). "
                f"Fix: Remove {surplus} `<{tag}>` element(s)."
            )
        # Case 4: too few.
        shortfall = expected - found
        return (
            f"Too few `<{tag}>` elements at `{location}`. "
            f"Expected {expected}, but only found {found} ({shortfall} missing). "
            f"Fix: Add {shortfall} more `<{tag}>` element(s)."
        )

    if isinstance(error, FoundInvalidIDError):
        tag_name = error.element.tag
        if error.invalid_id is not None:
            sample = f'<{tag_name} id="{error.invalid_id}">'
        else:
            sample = f"<{tag_name}>"
        return f"Invalid inline ID: {sample}. Fix: Ensure inline elements have valid numeric IDs."

    return "Unknown inline error. Fix: Review the inline structure."
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _build_inline_selector(
    encoding: Encoding,
    stack: list[Element],
    block_id: int,
    element: Element | None = None,
    tag: str | None = None,
) -> str:
    """Build a CSS-like selector that points at an inline location.

    When ``element`` carries an ``id`` attribute the selector is simply
    ``tag#id``. Otherwise it is a path ``block#id > parent > ... > tag`` built
    from ``stack`` (whose first entry supplies the block tag), optionally
    followed by a short quote of the element's text.
    """
    has_element = element is not None
    if has_element:
        own_id = element.get("id")
        if own_id is not None:
            # An ID locates the element directly; no path is needed.
            return f"{element.tag}#{own_id}"
        tag = element.tag

    # Path: block#id > parent > ... > tag
    root_tag = stack[0].tag if stack else "unknown"
    segments = [f"{root_tag}#{block_id}", *(ancestor.tag for ancestor in stack[1:])]
    if tag:
        segments.append(tag)
    selector = " > ".join(segments)

    if has_element:
        hint = _extract_text_hint(encoding, element)
        if hint:
            selector = f'{selector} (contains text: "{hint}")'
    return selector
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def _extract_text_hint(encoding: Encoding, element: Element) -> str:
    """Return a short quote of the element's text for use as a location hint.

    Args:
        encoding: Token encoding used to measure and cut the text.
        element: Element whose plain text is quoted.

    Returns:
        The stripped text when it has at most ``_MAX_TEXT_HINT_TOKENS_COUNT``
        tokens (possibly an empty string); otherwise the text of the first
        ``_MAX_TEXT_HINT_TOKENS_COUNT`` tokens followed by ``" ..."``.
    """
    text = plain_text(element).strip()
    if not text:
        return text

    tokens = encoding.encode(text)
    if len(tokens) <= _MAX_TEXT_HINT_TOKENS_COUNT:
        return text

    truncated = encoding.decode(tokens[:_MAX_TEXT_HINT_TOKENS_COUNT])
    # A token boundary may fall inside a multi-byte UTF-8 character. tiktoken's
    # decode is lossy by default (errors="replace"), so the dangling bytes come
    # back as U+FFFD; drop them so the hint ends on real text.
    truncated = truncated.rstrip("\ufffd").strip()
    return truncated + " ..."
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: epub,llm,translation,translator
|
|
@@ -78,8 +78,7 @@ The easiest way to use EPUB Translator is through OOMOL Studio with a visual int
|
|
|
78
78
|
### Using Python API
|
|
79
79
|
|
|
80
80
|
```python
|
|
81
|
-
from
|
|
82
|
-
from epub_translator import LLM, translate, language
|
|
81
|
+
from epub_translator import LLM, translate, language, SubmitKind
|
|
83
82
|
|
|
84
83
|
# Initialize LLM with your API credentials
|
|
85
84
|
llm = LLM(
|
|
@@ -91,10 +90,11 @@ llm = LLM(
|
|
|
91
90
|
|
|
92
91
|
# Translate EPUB file using language constants
|
|
93
92
|
translate(
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
target_path=Path("translated.epub"),
|
|
93
|
+
source_path="source.epub",
|
|
94
|
+
target_path="translated.epub",
|
|
97
95
|
target_language=language.ENGLISH,
|
|
96
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
97
|
+
llm=llm,
|
|
98
98
|
)
|
|
99
99
|
```
|
|
100
100
|
|
|
@@ -113,10 +113,11 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
|
|
|
113
113
|
last_progress = progress
|
|
114
114
|
|
|
115
115
|
translate(
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
target_path=Path("translated.epub"),
|
|
116
|
+
source_path="source.epub",
|
|
117
|
+
target_path="translated.epub",
|
|
119
118
|
target_language="English",
|
|
119
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
120
|
+
llm=llm,
|
|
120
121
|
on_progress=on_progress,
|
|
121
122
|
)
|
|
122
123
|
```
|
|
@@ -149,14 +150,63 @@ Translate an EPUB file:
|
|
|
149
150
|
|
|
150
151
|
```python
|
|
151
152
|
translate(
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
target_path: Path, # Output EPUB file path
|
|
153
|
+
source_path: PathLike | str, # Source EPUB file path
|
|
154
|
+
target_path: PathLike | str, # Output EPUB file path
|
|
155
155
|
target_language: str, # Target language (e.g., "English", "Chinese")
|
|
156
|
+
submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
|
|
156
157
|
user_prompt: str | None = None, # Custom translation instructions
|
|
157
158
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
158
159
|
max_group_tokens: int = 1200, # Maximum tokens per translation group
|
|
160
|
+
llm: LLM | None = None, # Single LLM instance for both translation and filling
|
|
161
|
+
translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
|
|
162
|
+
fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
|
|
159
163
|
on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
|
|
164
|
+
on_fill_failed: Callable[[FillFailedEvent], None] | None = None, # Error callback
|
|
165
|
+
)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
**Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
|
|
169
|
+
|
|
170
|
+
#### Submit Modes
|
|
171
|
+
|
|
172
|
+
The `submit` parameter controls how translated content is inserted into the document. Use `SubmitKind` enum to specify the insertion mode:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from epub_translator import SubmitKind
|
|
176
|
+
|
|
177
|
+
# Three available modes:
|
|
178
|
+
# - SubmitKind.REPLACE: Replace original content with translation (single-language output)
|
|
179
|
+
# - SubmitKind.APPEND_TEXT: Append translation as inline text (bilingual output)
|
|
180
|
+
# - SubmitKind.APPEND_BLOCK: Append translation as block elements (bilingual output, recommended)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Mode Comparison:**
|
|
184
|
+
|
|
185
|
+
- **`SubmitKind.REPLACE`**: Creates a single-language translation by replacing original text with translated content. Useful for creating books in the target language only.
|
|
186
|
+
|
|
187
|
+
- **`SubmitKind.APPEND_TEXT`**: Appends translations as inline text immediately after the original content. Both languages appear in the same paragraph, creating a continuous reading flow.
|
|
188
|
+
|
|
189
|
+
- **`SubmitKind.APPEND_BLOCK`** (Recommended): Appends translations as separate block elements (paragraphs) after the original. This creates clear visual separation between languages, making it ideal for side-by-side bilingual reading.
|
|
190
|
+
|
|
191
|
+
**Example:**
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# For bilingual books (recommended)
|
|
195
|
+
translate(
|
|
196
|
+
source_path="source.epub",
|
|
197
|
+
target_path="translated.epub",
|
|
198
|
+
target_language=language.ENGLISH,
|
|
199
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
200
|
+
llm=llm,
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
# For single-language translation
|
|
204
|
+
translate(
|
|
205
|
+
source_path="source.epub",
|
|
206
|
+
target_path="translated.epub",
|
|
207
|
+
target_language=language.ENGLISH,
|
|
208
|
+
submit=SubmitKind.REPLACE,
|
|
209
|
+
llm=llm,
|
|
160
210
|
)
|
|
161
211
|
```
|
|
162
212
|
|
|
@@ -169,18 +219,80 @@ from epub_translator import language
|
|
|
169
219
|
|
|
170
220
|
# Usage example:
|
|
171
221
|
translate(
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
target_path=Path("translated.epub"),
|
|
222
|
+
source_path="source.epub",
|
|
223
|
+
target_path="translated.epub",
|
|
175
224
|
target_language=language.ENGLISH,
|
|
225
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
226
|
+
llm=llm,
|
|
176
227
|
)
|
|
177
228
|
|
|
178
229
|
# You can also use custom language strings:
|
|
179
230
|
translate(
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
target_path=Path("translated.epub"),
|
|
231
|
+
source_path="source.epub",
|
|
232
|
+
target_path="translated.epub",
|
|
183
233
|
target_language="Icelandic", # For languages not in the constants
|
|
234
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
235
|
+
llm=llm,
|
|
236
|
+
)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Error Handling with `on_fill_failed`
|
|
240
|
+
|
|
241
|
+
Monitor and handle translation errors using the `on_fill_failed` callback:
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
from epub_translator import FillFailedEvent
|
|
245
|
+
|
|
246
|
+
def handle_fill_error(event: FillFailedEvent):
|
|
247
|
+
print(f"Translation error (attempt {event.retried_count}):")
|
|
248
|
+
print(f" {event.error_message}")
|
|
249
|
+
if event.over_maximum_retries:
|
|
250
|
+
print(" Maximum retries exceeded!")
|
|
251
|
+
|
|
252
|
+
translate(
|
|
253
|
+
source_path="source.epub",
|
|
254
|
+
target_path="translated.epub",
|
|
255
|
+
target_language=language.ENGLISH,
|
|
256
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
257
|
+
llm=llm,
|
|
258
|
+
on_fill_failed=handle_fill_error,
|
|
259
|
+
)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
The `FillFailedEvent` contains:
|
|
263
|
+
- `error_message: str` - Description of the error
|
|
264
|
+
- `retried_count: int` - Current retry attempt number
|
|
265
|
+
- `over_maximum_retries: bool` - Whether max retries has been exceeded
|
|
266
|
+
|
|
267
|
+
### Dual-LLM Architecture
|
|
268
|
+
|
|
269
|
+
Use separate LLM instances for translation and XML structure filling with different optimization parameters:
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
# Create two LLM instances with different temperatures
|
|
273
|
+
translation_llm = LLM(
|
|
274
|
+
key="your-api-key",
|
|
275
|
+
url="https://api.openai.com/v1",
|
|
276
|
+
model="gpt-4",
|
|
277
|
+
token_encoding="o200k_base",
|
|
278
|
+
temperature=0.8, # Higher temperature for creative translation
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
fill_llm = LLM(
|
|
282
|
+
key="your-api-key",
|
|
283
|
+
url="https://api.openai.com/v1",
|
|
284
|
+
model="gpt-4",
|
|
285
|
+
token_encoding="o200k_base",
|
|
286
|
+
temperature=0.3, # Lower temperature for structure preservation
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
translate(
|
|
290
|
+
source_path="source.epub",
|
|
291
|
+
target_path="translated.epub",
|
|
292
|
+
target_language=language.ENGLISH,
|
|
293
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
294
|
+
translation_llm=translation_llm,
|
|
295
|
+
fill_llm=fill_llm,
|
|
184
296
|
)
|
|
185
297
|
```
|
|
186
298
|
|
|
@@ -236,10 +348,11 @@ Provide specific translation instructions:
|
|
|
236
348
|
|
|
237
349
|
```python
|
|
238
350
|
translate(
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
target_path=Path("translated.epub"),
|
|
351
|
+
source_path="source.epub",
|
|
352
|
+
target_path="translated.epub",
|
|
242
353
|
target_language="English",
|
|
354
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
355
|
+
llm=llm,
|
|
243
356
|
user_prompt="Use formal language and preserve technical terminology",
|
|
244
357
|
)
|
|
245
358
|
```
|