epub-translator 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +2 -2
- epub_translator/data/fill.jinja +143 -38
- epub_translator/epub/__init__.py +1 -1
- epub_translator/epub/metadata.py +122 -0
- epub_translator/epub/spines.py +3 -2
- epub_translator/epub/zip.py +11 -9
- epub_translator/epub_transcode.py +108 -0
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/context.py +109 -0
- epub_translator/llm/core.py +39 -62
- epub_translator/llm/executor.py +25 -31
- epub_translator/llm/increasable.py +1 -1
- epub_translator/llm/types.py +0 -3
- epub_translator/segment/__init__.py +26 -0
- epub_translator/segment/block_segment.py +124 -0
- epub_translator/segment/common.py +29 -0
- epub_translator/segment/inline_segment.py +356 -0
- epub_translator/{xml_translator → segment}/text_segment.py +8 -8
- epub_translator/segment/utils.py +43 -0
- epub_translator/translator.py +150 -183
- epub_translator/utils.py +33 -0
- epub_translator/xml/__init__.py +2 -0
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/deduplication.py +3 -3
- epub_translator/xml/self_closing.py +182 -0
- epub_translator/xml/utils.py +42 -0
- epub_translator/xml/xml.py +7 -0
- epub_translator/xml/xml_like.py +145 -115
- epub_translator/xml_interrupter.py +165 -0
- epub_translator/xml_translator/__init__.py +1 -2
- epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator/xml_translator/{const.py → common.py} +0 -1
- epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator/xml_translator/submitter.py +26 -72
- epub_translator/xml_translator/translator.py +157 -107
- epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
- epub_translator-0.1.3.dist-info/RECORD +66 -0
- epub_translator/epub/placeholder.py +0 -53
- epub_translator/iter_sync.py +0 -24
- epub_translator/xml_translator/fill.py +0 -128
- epub_translator/xml_translator/format.py +0 -282
- epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator/xml_translator/group.py +0 -183
- epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator/xml_translator/utils.py +0 -29
- epub_translator-0.1.0.dist-info/RECORD +0 -58
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
from collections.abc import Generator, Iterable
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Generic, TypeVar, cast
|
|
4
|
+
from xml.etree.ElementTree import Element
|
|
5
|
+
|
|
6
|
+
from tiktoken import Encoding
|
|
7
|
+
|
|
8
|
+
from ..segment import (
|
|
9
|
+
BlockContentError,
|
|
10
|
+
BlockError,
|
|
11
|
+
BlockExpectedIDsError,
|
|
12
|
+
BlockUnexpectedIDError,
|
|
13
|
+
BlockWrongTagError,
|
|
14
|
+
FoundInvalidIDError,
|
|
15
|
+
InlineError,
|
|
16
|
+
InlineExpectedIDsError,
|
|
17
|
+
InlineLostIDError,
|
|
18
|
+
InlineUnexpectedIDError,
|
|
19
|
+
InlineWrongTagCountError,
|
|
20
|
+
)
|
|
21
|
+
from ..utils import ensure_list
|
|
22
|
+
from ..xml import plain_text
|
|
23
|
+
|
|
24
|
+
# Base of the exponential weighting: an error at level L weighs _LEVEL_WEIGHT ** L
# (see _calculate_error_weight).
_LEVEL_WEIGHT = 3
# Maximum number of tokens kept when an element's text is quoted as a hint.
_MAX_TEXT_HINT_TOKENS_COUNT = 6


# Severity levels of block-level errors; a higher level sorts earlier.
_BLOCK_EXPECTED_IDS_LEVEL = 6
_BLOCK_WRONG_TAG_LEVEL = 5
_BLOCK_FOUND_INVALID_ID_LEVEL = 4
_BLOCK_UNEXPECTED_ID_LEVEL = 3

# Severity levels of inline errors reported inside a block.
_INLINE_EXPECTED_IDS_LEVEL = 3
_INLINE_LOST_ID_LEVEL = 2
_INLINE_FOUND_INVALID_ID_LEVEL = 1
_INLINE_WRONG_TAG_COUNT_LEVEL = 0
_INLINE_UNEXPECTED_ID_LEVEL = 0

# Type parameter for the error object carried by ErrorItem.
ERROR = TypeVar("ERROR")
# The levels above span 0..6, i.e. seven distinct values.
# NOTE(review): not referenced inside this module - presumably read by importers; confirm.
LEVEL_DEPTH = 7
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class ErrorItem(Generic[ERROR]):
    """A validation error together with the data used to order and rank it."""

    # The wrapped error object.
    error: ERROR
    # Position of the originating block error in the input sequence.
    index1: int
    # Position among the inline errors of that block error; 0 for block-level errors.
    index2: int
    # Severity level assigned to the error (higher is more severe).
    level: int
    # Ranking weight computed from the level by _calculate_error_weight.
    weight: int
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class BlockErrorsGroup:
    """Every error attributed to a single block element."""

    # Numeric ID of the block the errors belong to.
    block_id: int
    # The block element itself; its tag is used when rendering messages.
    block_element: Element
    # Block-level and inline error items collected for this block.
    errors: list[ErrorItem[BlockError | FoundInvalidIDError] | ErrorItem[InlineError | FoundInvalidIDError]]

    @property
    def weight(self) -> int:
        """Return the combined weight of all errors in this group."""
        total = 0
        for item in self.errors:
            total += item.weight
        return total
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class ErrorsGroup:
    """Validation errors split into top-level errors and per-block groups."""

    # Errors that are not attached to any specific block.
    upper_errors: list[ErrorItem[BlockError | FoundInvalidIDError]]
    # One group per block that has errors.
    block_groups: list[BlockErrorsGroup]

    @property
    def errors_count(self) -> int:
        """Return the number of top-level errors plus all errors held by the block groups."""
        return len(self.upper_errors) + sum(len(group.errors) for group in self.block_groups)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def nest_as_errors_group(errors: Iterable[BlockError | FoundInvalidIDError]) -> ErrorsGroup | None:
    """Build an ``ErrorsGroup`` from a flat sequence of block errors.

    Each error is first converted to ``(block, ErrorItem)`` pairs and the pairs
    are then grouped. Returns ``None`` when no error item is produced.
    """
    paired_items = _transform_errors_to_items(errors)
    return _create_errors_group(error_items=paired_items)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def truncate_errors_group(errors_group: ErrorsGroup, max_errors: int) -> ErrorsGroup | None:
    """Limit ``errors_group`` to at most ``max_errors`` errors.

    When the group already fits, the very same object is returned. Otherwise
    the errors are ranked by descending level (ties broken by their original
    positions), the first ``max_errors`` are kept and regrouped. The result is
    ``None`` when nothing is kept.
    """
    flattened = list(_flatten_errors_group(errors_group))
    if len(flattened) <= max_errors:
        return errors_group

    def priority(pair):
        # pair is (block, ErrorItem); only the item decides the ranking.
        item = pair[1]
        return (-item.level, item.index1, item.index2)

    kept = sorted(flattened, key=priority)[:max_errors]
    return _create_errors_group(error_items=kept)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def generate_error_message(encoding: Encoding, errors_group: ErrorsGroup, omitted_count: int = 0) -> None | str:
    """Render ``errors_group`` as a plain-text correction request.

    The message consists of a header with the error count, the top-level
    errors, one ``In <tag>#<id>:`` section per block group, and a closing
    instruction. Sections are separated by exactly one blank line.

    Args:
        encoding: Token encoding forwarded to the inline formatter, which uses
            it to shorten text hints.
        errors_group: The grouped errors to describe.
        omitted_count: Number of errors left out of ``errors_group``; when
            positive, a note about the omitted errors is appended.

    Returns:
        The message, or ``None`` when the group has neither top-level errors
        nor block groups.

    Raises:
        RuntimeError: If a block group holds an error that is neither a block
            error, an inline error, nor a ``FoundInvalidIDError``.
    """
    message_lines: list[str] = [_format_block_error(item.error) for item in errors_group.upper_errors]

    for block_group in errors_group.block_groups:
        # One blank line between consecutive sections; none directly after the
        # header, which brings its own separator below.
        if message_lines:
            message_lines.append("")

        error_count = len(block_group.errors)
        plural = "" if error_count == 1 else "s"
        message_lines.append(
            f"In {block_group.block_element.tag}#{block_group.block_id}: ({error_count} error{plural})"
        )

        for item in block_group.errors:
            error = item.error
            if isinstance(error, BlockError):
                message = _format_block_error(error)
            elif isinstance(error, InlineError) or isinstance(error, FoundInvalidIDError):
                # Invalid-ID errors inside a block originate from inline validation
                # and have a dedicated branch in the inline formatter.
                message = _format_inline_error(encoding, error, block_group.block_id)
            else:
                raise RuntimeError(f"Unsupported error type: {type(error).__name__}")
            message_lines.append(f" - {message}")

    if not message_lines:
        return None

    header = (
        f"Found {errors_group.errors_count} error(s). Fix them and return "
        "the COMPLETE corrected XML (not just the changed parts):"
    )
    if omitted_count > 0:
        footer = [
            "",
            f"... and {omitted_count} more error(s) omitted. "
            "Fix the above errors first, then resubmit for remaining issues.",
            "",
            "Remember: Return the entire <xml>...</xml> block with all corrections applied.",
        ]
    else:
        footer = ["", "Return the entire <xml>...</xml> block with corrections."]

    return "\n".join([header, "", *message_lines, *footer])
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@dataclass
class _Block:
    """Identifies the block an error belongs to while errors are being grouped."""

    # Numeric ID of the block.
    id: int
    # The block element carrying that ID.
    element: Element
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _transform_errors_to_items(errors: Iterable[BlockError | FoundInvalidIDError]):
    """Yield ``(block, ErrorItem)`` pairs for every error in ``errors``.

    A ``BlockContentError`` is expanded into one pair per inline error it
    contains, all attached to that block. Any other error yields a single
    pair; its block is ``None`` unless it is a ``BlockWrongTagError`` that
    carries a block.
    """
    for outer_index, source_error in enumerate(errors):
        if not isinstance(source_error, BlockContentError):
            block_level = _get_block_error_level(source_error)
            block_weight = _calculate_error_weight(source_error, block_level)
            wrapped: ErrorItem[BlockError | FoundInvalidIDError] = ErrorItem(
                error=source_error,
                index1=outer_index,
                index2=0,
                level=block_level,
                weight=block_weight,
            )
            owner: _Block | None = None
            if isinstance(source_error, BlockWrongTagError) and source_error.block is not None:
                # source_error.block is indexed as (id, element).
                owner = _Block(id=source_error.block[0], element=source_error.block[1])
            yield owner, wrapped
            continue

        # All inline errors of one content error share the same owning block.
        content_owner = _Block(id=source_error.id, element=source_error.element)
        for inner_index, inline_error in enumerate(source_error.errors):
            inline_level = _get_inline_error_level(inline_error)
            inline_weight = _calculate_error_weight(inline_error, inline_level)
            yield (
                content_owner,
                ErrorItem(
                    error=inline_error,
                    index1=outer_index,
                    index2=inner_index,
                    level=inline_level,
                    weight=inline_weight,
                ),
            )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _flatten_errors_group(
    errors_group: ErrorsGroup,
) -> Generator[
    tuple[
        _Block | None,
        ErrorItem[BlockError | FoundInvalidIDError] | ErrorItem[InlineError | FoundInvalidIDError],
    ],
    None,
    None,
]:
    """Yield every error of ``errors_group`` as a ``(block, ErrorItem)`` pair.

    Top-level errors come first with ``None`` as their block, followed by the
    errors of each block group paired with a ``_Block`` built from that group.
    """
    yield from ((None, item) for item in errors_group.upper_errors)

    for group in errors_group.block_groups:
        owner = _Block(id=group.block_id, element=group.block_element)
        yield from ((owner, item) for item in group.errors)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _create_errors_group(
    error_items: Iterable[
        tuple[
            _Block | None,
            ErrorItem[BlockError | FoundInvalidIDError] | ErrorItem[InlineError | FoundInvalidIDError],
        ]
    ],
) -> ErrorsGroup | None:
    """Group ``(block, ErrorItem)`` pairs into an ``ErrorsGroup``.

    Pairs without a block become top-level errors, sorted by descending level.
    The remaining pairs are bucketed by block ID; each bucket is sorted by
    descending weight and the buckets themselves by descending total weight.
    Ties keep their original ``index1``/``index2`` order. Returns ``None``
    when ``error_items`` is empty.
    """
    top_level: list[ErrorItem[BlockError | FoundInvalidIDError]] = []
    elements_by_id: dict[int, Element] = {}
    items_by_id: dict[
        int, list[ErrorItem[BlockError | FoundInvalidIDError] | ErrorItem[InlineError | FoundInvalidIDError]]
    ] = {}

    for owner, item in error_items:
        if owner is not None:
            ensure_list(items_by_id, owner.id).append(item)
            elements_by_id[owner.id] = owner.element
        else:
            top_level.append(cast(ErrorItem[BlockError | FoundInvalidIDError], item))

    if not (top_level or items_by_id):
        return None

    def by_weight(entry):
        return (-entry.weight, entry.index1, entry.index2)

    def by_level(entry):
        return (-entry.level, entry.index1, entry.index2)

    groups: list[BlockErrorsGroup] = []
    for block_id, items in items_by_id.items():
        element = elements_by_id.get(block_id)
        if element is None:
            continue
        groups.append(
            BlockErrorsGroup(
                block_id=block_id,
                block_element=element,
                errors=sorted(items, key=by_weight),
            )
        )

    top_level.sort(key=by_level)
    # Stable sort: groups of equal weight keep their first-seen order.
    groups.sort(key=lambda group: -group.weight)

    return ErrorsGroup(upper_errors=top_level, block_groups=groups)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _calculate_error_weight(error: BlockError | InlineError | FoundInvalidIDError, level: int) -> int:
    """Return the ranking weight of ``error`` at the given severity ``level``.

    The base weight is ``_LEVEL_WEIGHT ** level``. For ``BlockExpectedIDsError``
    and ``InlineExpectedIDsError`` it is multiplied by the number of entries in
    ``error.id2element``.
    """
    base_weight = _LEVEL_WEIGHT**level
    if not isinstance(error, (BlockExpectedIDsError, InlineExpectedIDsError)):
        return base_weight
    # One "expected IDs" error stands for several missing elements.
    return base_weight * len(error.id2element)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _get_block_error_level(error: BlockError | FoundInvalidIDError) -> int:
    """Return the severity level of a block-level error, or 0 for an unrecognised type."""
    # Checked in order; the first matching type decides.
    levels_by_type = (
        (BlockWrongTagError, _BLOCK_WRONG_TAG_LEVEL),
        (BlockExpectedIDsError, _BLOCK_EXPECTED_IDS_LEVEL),
        (BlockUnexpectedIDError, _BLOCK_UNEXPECTED_ID_LEVEL),
        (FoundInvalidIDError, _BLOCK_FOUND_INVALID_ID_LEVEL),
    )
    for error_type, level in levels_by_type:
        if isinstance(error, error_type):
            return level
    return 0
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _get_inline_error_level(error: InlineError | FoundInvalidIDError) -> int:
    """Return the severity level of an inline error, or 0 for an unrecognised type."""
    # Checked in order; the first matching type decides.
    levels_by_type = (
        (InlineLostIDError, _INLINE_LOST_ID_LEVEL),
        (InlineExpectedIDsError, _INLINE_EXPECTED_IDS_LEVEL),
        (InlineUnexpectedIDError, _INLINE_UNEXPECTED_ID_LEVEL),
        (InlineWrongTagCountError, _INLINE_WRONG_TAG_COUNT_LEVEL),
        (FoundInvalidIDError, _INLINE_FOUND_INVALID_ID_LEVEL),
    )
    for error_type, level in levels_by_type:
        if isinstance(error, error_type):
            return level
    return 0
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _format_block_error(error: BlockError | FoundInvalidIDError) -> str:
    """Describe a block-level error and how to fix it.

    Returns a one-line message, except for ``BlockExpectedIDsError`` with text
    hints available, where the message lists one hint per missing element on
    separate lines. Unrecognised error types produce a generic message.
    """
    if isinstance(error, BlockWrongTagError):
        if error.block is None:
            return (
                f"Root tag mismatch: expected `<{error.expected_tag}>`, but found `<{error.instead_tag}>`. "
                f"Fix: Change the root tag to `<{error.expected_tag}>`."
            )
        wrong_block_id = error.block[0]
        return (
            f"Wrong tag for block at `{error.instead_tag}#{wrong_block_id}`: "
            f'expected `<{error.expected_tag} id="{wrong_block_id}">`, '
            f'but found `<{error.instead_tag} id="{wrong_block_id}">`. '
            f"Fix: Change the tag to `<{error.expected_tag}>`."
        )

    if isinstance(error, BlockExpectedIDsError):
        missing = sorted(error.id2element.items())

        # Quote the start of each missing element's original text as a hint.
        hints: list[str] = []
        for element_id, elem in missing:
            text = plain_text(elem).strip()
            if not text:
                continue
            # Block-level hints are cut after the first 30 characters.
            preview = text if len(text) <= 30 else text[:30] + "..."
            hints.append(f' - `<{elem.tag} id="{element_id}">`: "{preview}"')

        if hints:
            return "Missing block elements (find translation and wrap):\n" + "\n".join(hints)

        # No element had any text: fall back to listing the bare tags.
        elements_str = ", ".join(f'<{elem.tag} id="{element_id}">' for element_id, elem in missing)
        return f"Missing expected blocks: {elements_str}. Fix: Add these missing blocks with the correct IDs."

    if isinstance(error, BlockUnexpectedIDError):
        selector = f"{error.element.tag}#{error.id}"
        return f"Unexpected block found at `{selector}`. Fix: Remove this unexpected block."

    if isinstance(error, FoundInvalidIDError):
        if error.invalid_id is None:
            example = f"<{error.element.tag}>"
        else:
            example = f'<{error.element.tag} id="{error.invalid_id}">'
        return f"Invalid or missing ID attribute: {example}. Fix: Ensure all blocks have valid numeric IDs."

    return "Unknown block error. Fix: Review the block structure."
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _format_inline_error(encoding: Encoding, error: InlineError | FoundInvalidIDError, block_id: int) -> str:
    """Describe an inline error found inside block ``block_id`` and how to fix it.

    ``encoding`` is passed on to the helpers that shorten quoted text.
    Unrecognised error types produce a generic message.
    """
    if isinstance(error, InlineLostIDError):
        selector = _build_inline_selector(encoding, error.stack, block_id, element=error.element)
        return f"Element at `{selector}` is missing an ID attribute. Fix: Add the required ID attribute."

    if isinstance(error, InlineExpectedIDsError):
        missing = sorted(error.id2element.items())

        # Quote the start of each missing element's original text as a hint.
        hints: list[str] = []
        for element_id, elem in missing:
            if plain_text(elem).strip():
                text_hint = _extract_text_hint(encoding, elem)
                hints.append(f' - `<{elem.tag} id="{element_id}">`: "{text_hint}"')

        if hints:
            return "Missing inline elements (find translation and wrap):\n" + "\n".join(hints)

        # No element had any text: fall back to listing the bare tags.
        elements_str = ", ".join(f'<{elem.tag} id="{element_id}">' for element_id, elem in missing)
        return f"Missing expected inline elements: {elements_str}. Fix: Add these missing inline elements."

    if isinstance(error, InlineUnexpectedIDError):
        selector = f"{error.element.tag}#{error.id}"
        return f"Unexpected inline element at `{selector}`. Fix: Remove this unexpected element."

    if isinstance(error, InlineWrongTagCountError):
        tag = error.found_elements[0].tag if error.found_elements else "unknown"
        selector = _build_inline_selector(encoding, error.stack, block_id, tag=tag)
        expected = error.expected_count
        found = len(error.found_elements)

        if found > expected:
            if expected == 0:
                # Case 1: none should be present, but some were found.
                return (
                    f"Found unexpected `<{tag}>` elements at `{selector}`. "
                    f"There should be none, but {found} were found. "
                    f"Fix: Remove all `<{tag}>` elements from this location."
                )
            # Case 3: too many.
            extra = found - expected
            return (
                f"Too many `<{tag}>` elements at `{selector}`. "
                f"Expected {expected}, but found {found} ({extra} extra). "
                f"Fix: Remove {extra} `<{tag}>` element(s)."
            )

        if found == 0 and expected > 0:
            # Case 2: some should be present, but none were found.
            return (
                f"Missing `<{tag}>` elements at `{selector}`. "
                f"Expected {expected}, but none were found. "
                f"Fix: Add {expected} `<{tag}>` element(s) to this location."
            )

        # Case 4: too few.
        missing_count = expected - found
        return (
            f"Too few `<{tag}>` elements at `{selector}`. "
            f"Expected {expected}, but only found {found} ({missing_count} missing). "
            f"Fix: Add {missing_count} more `<{tag}>` element(s)."
        )

    if isinstance(error, FoundInvalidIDError):
        if error.invalid_id is None:
            example = f"<{error.element.tag}>"
        else:
            example = f'<{error.element.tag} id="{error.invalid_id}">'
        return f"Invalid inline ID: {example}. Fix: Ensure inline elements have valid numeric IDs."

    return "Unknown inline error. Fix: Review the inline structure."
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _build_inline_selector(
    encoding: Encoding,
    stack: list[Element],
    block_id: int,
    element: Element | None = None,
    tag: str | None = None,
) -> str:
    """Build a CSS-like selector that points at an inline location.

    If ``element`` carries an ``id`` attribute the selector is simply
    ``tag#id``. Otherwise it is a path of the form
    ``block#id > parent > ... > tag``, where the first entry of ``stack``
    supplies the block tag (``"unknown"`` for an empty stack) and the final
    segment is ``element.tag`` or, without an element, ``tag``. When an element
    is given and has text, a shortened quote of that text is appended.
    """
    if element is not None:
        own_id = element.get("id")
        if own_id is not None:
            # An ID locates the element directly, so no path is needed.
            return f"{element.tag}#{own_id}"
        tag = element.tag

    root_tag = stack[0].tag if stack else "unknown"
    segments = [f"{root_tag}#{block_id}", *(ancestor.tag for ancestor in stack[1:])]
    if tag:
        segments.append(tag)
    selector = " > ".join(segments)

    if element is None:
        return selector

    hint = _extract_text_hint(encoding, element)
    return f'{selector} (contains text: "{hint}")' if hint else selector
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def _extract_text_hint(encoding: Encoding, element: Element) -> str:
    """Return a short quote of ``element``'s plain text.

    The stripped text is returned unchanged when it encodes to at most
    ``_MAX_TEXT_HINT_TOKENS_COUNT`` tokens (an empty string stays empty).
    Longer text is cut to that many tokens, decoded, stripped and suffixed
    with ``" ..."``.
    """
    text = plain_text(element).strip()
    if not text:
        return text

    tokens = encoding.encode(text)
    if len(tokens) <= _MAX_TEXT_HINT_TOKENS_COUNT:
        return text

    head = encoding.decode(tokens[:_MAX_TEXT_HINT_TOKENS_COUNT])
    return head.strip() + " ..."
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: epub,llm,translation,translator
|
|
@@ -91,10 +91,10 @@ llm = LLM(
|
|
|
91
91
|
|
|
92
92
|
# Translate EPUB file using language constants
|
|
93
93
|
translate(
|
|
94
|
-
llm=llm,
|
|
95
94
|
source_path=Path("source.epub"),
|
|
96
95
|
target_path=Path("translated.epub"),
|
|
97
96
|
target_language=language.ENGLISH,
|
|
97
|
+
llm=llm,
|
|
98
98
|
)
|
|
99
99
|
```
|
|
100
100
|
|
|
@@ -113,10 +113,10 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
|
|
|
113
113
|
last_progress = progress
|
|
114
114
|
|
|
115
115
|
translate(
|
|
116
|
-
llm=llm,
|
|
117
116
|
source_path=Path("source.epub"),
|
|
118
117
|
target_path=Path("translated.epub"),
|
|
119
118
|
target_language="English",
|
|
119
|
+
llm=llm,
|
|
120
120
|
on_progress=on_progress,
|
|
121
121
|
)
|
|
122
122
|
```
|
|
@@ -149,17 +149,22 @@ Translate an EPUB file:
|
|
|
149
149
|
|
|
150
150
|
```python
|
|
151
151
|
translate(
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
target_path: Path, # Output EPUB file path
|
|
152
|
+
source_path: PathLike | str, # Source EPUB file path
|
|
153
|
+
target_path: PathLike | str, # Output EPUB file path
|
|
155
154
|
target_language: str, # Target language (e.g., "English", "Chinese")
|
|
156
155
|
user_prompt: str | None = None, # Custom translation instructions
|
|
157
156
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
158
157
|
max_group_tokens: int = 1200, # Maximum tokens per translation group
|
|
158
|
+
llm: LLM | None = None, # Single LLM instance for both translation and filling
|
|
159
|
+
translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
|
|
160
|
+
fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
|
|
159
161
|
on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
|
|
162
|
+
on_fill_failed: Callable[[FillFailedEvent], None] | None = None, # Error callback
|
|
160
163
|
)
|
|
161
164
|
```
|
|
162
165
|
|
|
166
|
+
**Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
|
|
167
|
+
|
|
163
168
|
#### Language Constants
|
|
164
169
|
|
|
165
170
|
EPUB Translator provides predefined language constants for convenience. You can use these constants instead of writing language names as strings:
|
|
@@ -169,18 +174,76 @@ from epub_translator import language
|
|
|
169
174
|
|
|
170
175
|
# Usage example:
|
|
171
176
|
translate(
|
|
172
|
-
llm=llm,
|
|
173
177
|
source_path=Path("source.epub"),
|
|
174
178
|
target_path=Path("translated.epub"),
|
|
175
179
|
target_language=language.ENGLISH,
|
|
180
|
+
llm=llm,
|
|
176
181
|
)
|
|
177
182
|
|
|
178
183
|
# You can also use custom language strings:
|
|
179
184
|
translate(
|
|
180
|
-
llm=llm,
|
|
181
185
|
source_path=Path("source.epub"),
|
|
182
186
|
target_path=Path("translated.epub"),
|
|
183
187
|
target_language="Icelandic", # For languages not in the constants
|
|
188
|
+
llm=llm,
|
|
189
|
+
)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Error Handling with `on_fill_failed`
|
|
193
|
+
|
|
194
|
+
Monitor and handle translation errors using the `on_fill_failed` callback:
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
from epub_translator import FillFailedEvent
|
|
198
|
+
|
|
199
|
+
def handle_fill_error(event: FillFailedEvent):
|
|
200
|
+
print(f"Translation error (attempt {event.retried_count}):")
|
|
201
|
+
print(f" {event.error_message}")
|
|
202
|
+
if event.over_maximum_retries:
|
|
203
|
+
print(" Maximum retries exceeded!")
|
|
204
|
+
|
|
205
|
+
translate(
|
|
206
|
+
source_path=Path("source.epub"),
|
|
207
|
+
target_path=Path("translated.epub"),
|
|
208
|
+
target_language=language.ENGLISH,
|
|
209
|
+
llm=llm,
|
|
210
|
+
on_fill_failed=handle_fill_error,
|
|
211
|
+
)
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
The `FillFailedEvent` contains:
|
|
215
|
+
- `error_message: str` - Description of the error
|
|
216
|
+
- `retried_count: int` - Current retry attempt number
|
|
217
|
+
- `over_maximum_retries: bool` - Whether max retries has been exceeded
|
|
218
|
+
|
|
219
|
+
### Dual-LLM Architecture
|
|
220
|
+
|
|
221
|
+
Use separate LLM instances for translation and XML structure filling with different optimization parameters:
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
# Create two LLM instances with different temperatures
|
|
225
|
+
translation_llm = LLM(
|
|
226
|
+
key="your-api-key",
|
|
227
|
+
url="https://api.openai.com/v1",
|
|
228
|
+
model="gpt-4",
|
|
229
|
+
token_encoding="o200k_base",
|
|
230
|
+
temperature=0.8, # Higher temperature for creative translation
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
fill_llm = LLM(
|
|
234
|
+
key="your-api-key",
|
|
235
|
+
url="https://api.openai.com/v1",
|
|
236
|
+
model="gpt-4",
|
|
237
|
+
token_encoding="o200k_base",
|
|
238
|
+
temperature=0.3, # Lower temperature for structure preservation
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
translate(
|
|
242
|
+
source_path=Path("source.epub"),
|
|
243
|
+
target_path=Path("translated.epub"),
|
|
244
|
+
target_language=language.ENGLISH,
|
|
245
|
+
translation_llm=translation_llm,
|
|
246
|
+
fill_llm=fill_llm,
|
|
184
247
|
)
|
|
185
248
|
```
|
|
186
249
|
|
|
@@ -236,10 +299,10 @@ Provide specific translation instructions:
|
|
|
236
299
|
|
|
237
300
|
```python
|
|
238
301
|
translate(
|
|
239
|
-
llm=llm,
|
|
240
302
|
source_path=Path("source.epub"),
|
|
241
303
|
target_path=Path("translated.epub"),
|
|
242
304
|
target_language="English",
|
|
305
|
+
llm=llm,
|
|
243
306
|
user_prompt="Use formal language and preserve technical terminology",
|
|
244
307
|
)
|
|
245
308
|
```
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
epub_translator/__init__.py,sha256=2FZPJyQdTgy_X7oOCxDpSqCZCGEcFqlulmhztLWuCIk,158
|
|
2
|
+
epub_translator/data/fill.jinja,sha256=zSytA8Vhp2i6YBZ09F1z9iPJq1-jUaiphoXqTNZwnvo,6964
|
|
3
|
+
epub_translator/data/mmltex/README.md,sha256=wwhe5yW1U_7_YZIFKnQVnCOmUl7Mu3gsr3lNnDSJ5Qs,2953
|
|
4
|
+
epub_translator/data/mmltex/cmarkup.xsl,sha256=DkhimAATM0XSCfVOfY41-qTPoddqzOHjZ00Pynr4zQE,37707
|
|
5
|
+
epub_translator/data/mmltex/entities.xsl,sha256=TYZ5iGg0u9XlDDBBGuZiHL7MsxKc-3OsTIBAVM1GDek,107742
|
|
6
|
+
epub_translator/data/mmltex/glayout.xsl,sha256=Ztc7N1wiHaYZlo9u9iuROrIl3uIIIoo1VFIuojXq7TM,6879
|
|
7
|
+
epub_translator/data/mmltex/mmltex.xsl,sha256=BVXFbApz-9W2qRKKtBTxptK5vxG2bfB8tv9W1MP5iBI,1384
|
|
8
|
+
epub_translator/data/mmltex/scripts.xsl,sha256=f4ei0cDCW3cV-Ra7rC3kC5tRcKdjJxbSpCeQLoohtgo,13697
|
|
9
|
+
epub_translator/data/mmltex/tables.xsl,sha256=RxtNo8qDtVAg8_6BuYsafraB_0z7YDAB9D__fT9gmWs,4327
|
|
10
|
+
epub_translator/data/mmltex/tokens.xsl,sha256=j3JZRcBhAiiY8o5K3640phfLwxO8JVspCFlSttwBzJk,12373
|
|
11
|
+
epub_translator/data/translate.jinja,sha256=93d8kschm5HV-EfXd1kFSIVMObDqTMdoUrwDfce2bhU,820
|
|
12
|
+
epub_translator/epub/__init__.py,sha256=ZddRHrLNVzgaSVrYflGnrq8tffmlKPhBbz9ok7sp8PY,149
|
|
13
|
+
epub_translator/epub/common.py,sha256=4-SpTe8iot9hMfyXILmlUFvYVNYqPAHL5hn1fr2wgis,1180
|
|
14
|
+
epub_translator/epub/math.py,sha256=-Q2LJQxxjgQZQUe_WlJA9tjzLqgqtw2ZmbGbHsPRp2U,5422
|
|
15
|
+
epub_translator/epub/metadata.py,sha256=DXSimY2iZNBA2juIaKtB-4CHHSYJiDK7PPhfenV4dto,3511
|
|
16
|
+
epub_translator/epub/spines.py,sha256=bP2IsobZm7zs4z10iXGc9SmgAFSIq9pJc8HE-V0aW9Y,1331
|
|
17
|
+
epub_translator/epub/toc.py,sha256=TKJfyDT4svFkXd6JCNZk2ZEYc9q-5DXnV3zY2UKo8nE,14891
|
|
18
|
+
epub_translator/epub/zip.py,sha256=-3LI8f-ksgU8xCy28NjBOKyQPE8PhPEUPqIKZE1p8dw,2364
|
|
19
|
+
epub_translator/epub_transcode.py,sha256=NzuvXXEZfAhIoMOSrgQRF0DPtaSpz4OY-NMSdC0Y2RM,2749
|
|
20
|
+
epub_translator/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
|
|
21
|
+
epub_translator/llm/__init__.py,sha256=YcFYYnXmXyX0RUyC-PDbj5k7Woygp_XOpTI3vDiNSPM,75
|
|
22
|
+
epub_translator/llm/context.py,sha256=73paN3V66LQ6muKUSMCKEHEmMYBylK-dXOF8LmaQo5M,3885
|
|
23
|
+
epub_translator/llm/core.py,sha256=AorV4ss4Hr-IbAk8FmGhV2hgI2tKxQmW2Vz2WwUd0Ms,5110
|
|
24
|
+
epub_translator/llm/error.py,sha256=4efAIQL14DFSvAnSTUfgdAbZRqaWBqOfUGsSfvxa5zM,1503
|
|
25
|
+
epub_translator/llm/executor.py,sha256=A0IjQ-s9wBJuhAZAAydneb9zBXWnu2J9inR2Q8F-GDE,5533
|
|
26
|
+
epub_translator/llm/increasable.py,sha256=8XkKeI1hiHlpMHj8dQ4fW0BkViSx4hH8QfbQsy-5SDw,1297
|
|
27
|
+
epub_translator/llm/types.py,sha256=c-dMAIvlG4R3la3mUTWEw5xei-sIYKmQeBja7mirxcI,219
|
|
28
|
+
epub_translator/segment/__init__.py,sha256=UYTv_IKQbEB0DzhFeiuqCvjoJLvB-7XRwlaFS90KmIw,573
|
|
29
|
+
epub_translator/segment/block_segment.py,sha256=psNKA_HMIcwZtoug8AtnAcV9_mQ2WXLnXqFsekHzt2g,4570
|
|
30
|
+
epub_translator/segment/common.py,sha256=gGWYQaJ0tGnWCuF1me9TOo-Q_DrZVakCu2patyFIOs0,714
|
|
31
|
+
epub_translator/segment/inline_segment.py,sha256=_ZgSlZmGxzIvaPs01hreoUfnaXz8Yq7naksT34dGfds,14221
|
|
32
|
+
epub_translator/segment/text_segment.py,sha256=qKp646lAqsrI7CP7KYyXgRD2bY0dCR78i6TMBCzklrM,7614
|
|
33
|
+
epub_translator/segment/utils.py,sha256=qMqUt33pDRN5Tnuydkodzu2gaQrwTzAnQmXpDuHen1o,1036
|
|
34
|
+
epub_translator/serial/__init__.py,sha256=b3IMVmWcUwEqHKcGmey88b057pyz5ct946CaUZi4LB4,67
|
|
35
|
+
epub_translator/serial/chunk.py,sha256=FrTaHikVOd6bLYumnEriTaAQ_DIDLjHm16gh-wBVR9k,1495
|
|
36
|
+
epub_translator/serial/segment.py,sha256=uEz-ke1KcYrON-68FaUEzMG2CzHlMjvbC11F3ZT4yH0,446
|
|
37
|
+
epub_translator/serial/splitter.py,sha256=Nq0sxPXos8ez7QBG01sOKjnYKbeBWUBHflZGtqenVm8,1726
|
|
38
|
+
epub_translator/template.py,sha256=0CqRmj3nTtPshw0NmTr2ECqelops2MMyX94fMrE-HKs,1587
|
|
39
|
+
epub_translator/translator.py,sha256=Uy1dRBPA9hoNh3BE40M2XukK2VvcWRCvMvGwvhQtIaA,6212
|
|
40
|
+
epub_translator/utils.py,sha256=BfZWrYjzDNQ4cFrgvRNzd4i1CKLtPxS8Z4LBHhqEV78,914
|
|
41
|
+
epub_translator/xml/__init__.py,sha256=pxSRKPBQ7D8BCIzXceSad_1MFgN1Dou0BZz9trK47wU,138
|
|
42
|
+
epub_translator/xml/const.py,sha256=Re2TYmpwG7-jVVgSq3R_K-uYhvAYzcXcRmLFkwCPD9Y,19
|
|
43
|
+
epub_translator/xml/deduplication.py,sha256=TaMbzeA70VvUQV0X1wcQFVbuMEPJUtj9Hq6iWlUmtAQ,1152
|
|
44
|
+
epub_translator/xml/firendly/__init__.py,sha256=I5jhnhFWoHvojLsYXH4jfR4Gi8lKFZ3yQ56ze5hEe1M,74
|
|
45
|
+
epub_translator/xml/firendly/decoder.py,sha256=xRQ5LnSunmYbba_0oT39oUr86-sLYAHYMUGmlseIu2U,2467
|
|
46
|
+
epub_translator/xml/firendly/encoder.py,sha256=evjvw6oE-oCud44IsJ-YZVHn6dtUzjNYX25ljaZP6vY,2417
|
|
47
|
+
epub_translator/xml/firendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42NXCauIFV-o,6560
|
|
48
|
+
epub_translator/xml/firendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
|
|
49
|
+
epub_translator/xml/firendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
|
|
50
|
+
epub_translator/xml/self_closing.py,sha256=41ofGUdss9yU51IVwI4It6hKfzh8YcxIR_j-ohD19LE,5240
|
|
51
|
+
epub_translator/xml/utils.py,sha256=7tQ6L5P0_JXhxONeG64hEeeL5mKjA6NKS1H1Q9B1Cac,1062
|
|
52
|
+
epub_translator/xml/xml.py,sha256=qQ5Wk1-KVVHE4TX25zGOR7fINsGkXnoq-qyKKNl5no4,1675
|
|
53
|
+
epub_translator/xml/xml_like.py,sha256=jBK4UUgXXWRYnfYlCH1MUAjGHWBQAbUj8HsYqvTTWvA,8890
|
|
54
|
+
epub_translator/xml_interrupter.py,sha256=IGLATr7zTIdhE54Gnroab4Xu_vLJ7kzPiQgk7WMXKTc,7403
|
|
55
|
+
epub_translator/xml_translator/__init__.py,sha256=7aswnFGtuj97l7RQd4ka976WCKC7OPs2gFnJFdS74Ug,77
|
|
56
|
+
epub_translator/xml_translator/callbacks.py,sha256=IoZrsaivd2W76cHFupwv6auVxgEWHcBN2MHQJYcWoJ8,1324
|
|
57
|
+
epub_translator/xml_translator/common.py,sha256=hSPptgPp7j6dm47imELB5DgmEbzTEyJD6WEeELOOc50,38
|
|
58
|
+
epub_translator/xml_translator/hill_climbing.py,sha256=1jvilOkTLzwljJA4Nrel8yU2XGvOXpueUJTK7RAp-XY,4272
|
|
59
|
+
epub_translator/xml_translator/stream_mapper.py,sha256=tbMc2vyPUn9zEkJZ7-OVYuKaYyn2pPPwjcAdQ8HLzNs,10179
|
|
60
|
+
epub_translator/xml_translator/submitter.py,sha256=Ihp6DvvVMLNZLJkRccYppt_2I2CM7wvkkSAam9B2o2s,2268
|
|
61
|
+
epub_translator/xml_translator/translator.py,sha256=3Lu56vRkAbbnsWK5fOYkVoO-7b6TXCrFYSVYLOjqhw0,9169
|
|
62
|
+
epub_translator/xml_translator/validation.py,sha256=-OKlSZuD__sjAiEpGAO93YQme4ZDSPmoPjRsAMOCEjc,16668
|
|
63
|
+
epub_translator-0.1.3.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
|
|
64
|
+
epub_translator-0.1.3.dist-info/METADATA,sha256=ruyJKZI669xCDIYL6YKoc8ojBsqbO_7Ebe15KkTjLS0,11699
|
|
65
|
+
epub_translator-0.1.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
66
|
+
epub_translator-0.1.3.dist-info/RECORD,,
|