epub-translator 0.1.1__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epub_translator-0.1.1 → epub_translator-0.1.4}/PKG-INFO +134 -21
- {epub_translator-0.1.1 → epub_translator-0.1.4}/README.md +133 -20
- epub_translator-0.1.4/epub_translator/__init__.py +12 -0
- epub_translator-0.1.4/epub_translator/data/fill.jinja +171 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/__init__.py +1 -1
- epub_translator-0.1.4/epub_translator/epub/metadata.py +122 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/spines.py +3 -2
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/zip.py +11 -9
- epub_translator-0.1.4/epub_translator/epub_transcode.py +108 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/__init__.py +1 -0
- epub_translator-0.1.4/epub_translator/llm/context.py +109 -0
- epub_translator-0.1.4/epub_translator/llm/core.py +152 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/executor.py +25 -31
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/increasable.py +1 -1
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/types.py +0 -3
- epub_translator-0.1.4/epub_translator/punctuation.py +34 -0
- epub_translator-0.1.4/epub_translator/segment/__init__.py +26 -0
- epub_translator-0.1.4/epub_translator/segment/block_segment.py +124 -0
- epub_translator-0.1.4/epub_translator/segment/common.py +29 -0
- epub_translator-0.1.4/epub_translator/segment/inline_segment.py +356 -0
- {epub_translator-0.1.1/epub_translator/xml_translator → epub_translator-0.1.4/epub_translator/segment}/text_segment.py +7 -72
- epub_translator-0.1.4/epub_translator/segment/utils.py +43 -0
- epub_translator-0.1.4/epub_translator/translator.py +182 -0
- epub_translator-0.1.4/epub_translator/utils.py +40 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/__init__.py +3 -0
- epub_translator-0.1.4/epub_translator/xml/const.py +1 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/deduplication.py +3 -3
- epub_translator-0.1.4/epub_translator/xml/inline.py +67 -0
- epub_translator-0.1.4/epub_translator/xml/self_closing.py +182 -0
- epub_translator-0.1.4/epub_translator/xml/utils.py +42 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/xml.py +7 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/xml_like.py +8 -33
- epub_translator-0.1.4/epub_translator/xml_interrupter.py +165 -0
- epub_translator-0.1.4/epub_translator/xml_translator/__init__.py +3 -0
- epub_translator-0.1.4/epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator-0.1.1/epub_translator/xml_translator/const.py → epub_translator-0.1.4/epub_translator/xml_translator/common.py +0 -1
- epub_translator-0.1.4/epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator-0.1.4/epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator-0.1.4/epub_translator/xml_translator/submitter.py +363 -0
- epub_translator-0.1.4/epub_translator/xml_translator/translator.py +247 -0
- epub_translator-0.1.4/epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/pyproject.toml +1 -1
- epub_translator-0.1.1/epub_translator/__init__.py +0 -5
- epub_translator-0.1.1/epub_translator/data/fill.jinja +0 -66
- epub_translator-0.1.1/epub_translator/epub/placeholder.py +0 -53
- epub_translator-0.1.1/epub_translator/iter_sync.py +0 -24
- epub_translator-0.1.1/epub_translator/llm/core.py +0 -233
- epub_translator-0.1.1/epub_translator/translator.py +0 -214
- epub_translator-0.1.1/epub_translator/utils.py +0 -7
- epub_translator-0.1.1/epub_translator/xml_translator/__init__.py +0 -3
- epub_translator-0.1.1/epub_translator/xml_translator/fill.py +0 -128
- epub_translator-0.1.1/epub_translator/xml_translator/format.py +0 -282
- epub_translator-0.1.1/epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator-0.1.1/epub_translator/xml_translator/group.py +0 -183
- epub_translator-0.1.1/epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator-0.1.1/epub_translator/xml_translator/submitter.py +0 -102
- epub_translator-0.1.1/epub_translator/xml_translator/translator.py +0 -179
- epub_translator-0.1.1/epub_translator/xml_translator/utils.py +0 -29
- {epub_translator-0.1.1 → epub_translator-0.1.4}/LICENSE +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/README.md +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/cmarkup.xsl +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/entities.xsl +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/glayout.xsl +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/mmltex.xsl +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/scripts.xsl +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/tables.xsl +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/tokens.xsl +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/translate.jinja +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/common.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/math.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/toc.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/language.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/error.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/serial/__init__.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/serial/chunk.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/serial/segment.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/serial/splitter.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/template.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/__init__.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/decoder.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/encoder.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/parser.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/tag.py +0 -0
- {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/transform.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: epub,llm,translation,translator
|
|
@@ -78,8 +78,7 @@ The easiest way to use EPUB Translator is through OOMOL Studio with a visual int
|
|
|
78
78
|
### Using Python API
|
|
79
79
|
|
|
80
80
|
```python
|
|
81
|
-
from pathlib import Path
|
|
82
|
-
from epub_translator import LLM, translate, language
|
|
81
|
+
from epub_translator import LLM, translate, language, SubmitKind
|
|
83
82
|
|
|
84
83
|
# Initialize LLM with your API credentials
|
|
85
84
|
llm = LLM(
|
|
@@ -91,10 +90,11 @@ llm = LLM(
|
|
|
91
90
|
|
|
92
91
|
# Translate EPUB file using language constants
|
|
93
92
|
translate(
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
target_path=Path("translated.epub"),
|
|
93
|
+
source_path="source.epub",
|
|
94
|
+
target_path="translated.epub",
|
|
97
95
|
target_language=language.ENGLISH,
|
|
96
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
97
|
+
llm=llm,
|
|
98
98
|
)
|
|
99
99
|
```
|
|
100
100
|
|
|
@@ -113,10 +113,11 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
|
|
|
113
113
|
last_progress = progress
|
|
114
114
|
|
|
115
115
|
translate(
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
target_path=Path("translated.epub"),
|
|
116
|
+
source_path="source.epub",
|
|
117
|
+
target_path="translated.epub",
|
|
119
118
|
target_language="English",
|
|
119
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
120
|
+
llm=llm,
|
|
120
121
|
on_progress=on_progress,
|
|
121
122
|
)
|
|
122
123
|
```
|
|
@@ -149,14 +150,63 @@ Translate an EPUB file:
|
|
|
149
150
|
|
|
150
151
|
```python
|
|
151
152
|
translate(
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
target_path: Path, # Output EPUB file path
|
|
153
|
+
source_path: PathLike | str, # Source EPUB file path
|
|
154
|
+
target_path: PathLike | str, # Output EPUB file path
|
|
155
155
|
target_language: str, # Target language (e.g., "English", "Chinese")
|
|
156
|
+
submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
|
|
156
157
|
user_prompt: str | None = None, # Custom translation instructions
|
|
157
158
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
158
159
|
max_group_tokens: int = 1200, # Maximum tokens per translation group
|
|
160
|
+
llm: LLM | None = None, # Single LLM instance for both translation and filling
|
|
161
|
+
translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
|
|
162
|
+
fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
|
|
159
163
|
on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
|
|
164
|
+
on_fill_failed: Callable[[FillFailedEvent], None] | None = None, # Error callback
|
|
165
|
+
)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
**Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
|
|
169
|
+
|
|
170
|
+
#### Submit Modes
|
|
171
|
+
|
|
172
|
+
The `submit` parameter controls how translated content is inserted into the document. Use `SubmitKind` enum to specify the insertion mode:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from epub_translator import SubmitKind
|
|
176
|
+
|
|
177
|
+
# Three available modes:
|
|
178
|
+
# - SubmitKind.REPLACE: Replace original content with translation (single-language output)
|
|
179
|
+
# - SubmitKind.APPEND_TEXT: Append translation as inline text (bilingual output)
|
|
180
|
+
# - SubmitKind.APPEND_BLOCK: Append translation as block elements (bilingual output, recommended)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Mode Comparison:**
|
|
184
|
+
|
|
185
|
+
- **`SubmitKind.REPLACE`**: Creates a single-language translation by replacing original text with translated content. Useful for creating books in the target language only.
|
|
186
|
+
|
|
187
|
+
- **`SubmitKind.APPEND_TEXT`**: Appends translations as inline text immediately after the original content. Both languages appear in the same paragraph, creating a continuous reading flow.
|
|
188
|
+
|
|
189
|
+
- **`SubmitKind.APPEND_BLOCK`** (Recommended): Appends translations as separate block elements (paragraphs) after the original. This creates clear visual separation between languages, making it ideal for side-by-side bilingual reading.
|
|
190
|
+
|
|
191
|
+
**Example:**
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# For bilingual books (recommended)
|
|
195
|
+
translate(
|
|
196
|
+
source_path="source.epub",
|
|
197
|
+
target_path="translated.epub",
|
|
198
|
+
target_language=language.ENGLISH,
|
|
199
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
200
|
+
llm=llm,
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
# For single-language translation
|
|
204
|
+
translate(
|
|
205
|
+
source_path="source.epub",
|
|
206
|
+
target_path="translated.epub",
|
|
207
|
+
target_language=language.ENGLISH,
|
|
208
|
+
submit=SubmitKind.REPLACE,
|
|
209
|
+
llm=llm,
|
|
160
210
|
)
|
|
161
211
|
```
|
|
162
212
|
|
|
@@ -169,18 +219,80 @@ from epub_translator import language
|
|
|
169
219
|
|
|
170
220
|
# Usage example:
|
|
171
221
|
translate(
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
target_path=Path("translated.epub"),
|
|
222
|
+
source_path="source.epub",
|
|
223
|
+
target_path="translated.epub",
|
|
175
224
|
target_language=language.ENGLISH,
|
|
225
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
226
|
+
llm=llm,
|
|
176
227
|
)
|
|
177
228
|
|
|
178
229
|
# You can also use custom language strings:
|
|
179
230
|
translate(
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
target_path=Path("translated.epub"),
|
|
231
|
+
source_path="source.epub",
|
|
232
|
+
target_path="translated.epub",
|
|
183
233
|
target_language="Icelandic", # For languages not in the constants
|
|
234
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
235
|
+
llm=llm,
|
|
236
|
+
)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Error Handling with `on_fill_failed`
|
|
240
|
+
|
|
241
|
+
Monitor and handle translation errors using the `on_fill_failed` callback:
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
from epub_translator import FillFailedEvent
|
|
245
|
+
|
|
246
|
+
def handle_fill_error(event: FillFailedEvent):
|
|
247
|
+
print(f"Translation error (attempt {event.retried_count}):")
|
|
248
|
+
print(f" {event.error_message}")
|
|
249
|
+
if event.over_maximum_retries:
|
|
250
|
+
print(" Maximum retries exceeded!")
|
|
251
|
+
|
|
252
|
+
translate(
|
|
253
|
+
source_path="source.epub",
|
|
254
|
+
target_path="translated.epub",
|
|
255
|
+
target_language=language.ENGLISH,
|
|
256
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
257
|
+
llm=llm,
|
|
258
|
+
on_fill_failed=handle_fill_error,
|
|
259
|
+
)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
The `FillFailedEvent` contains:
|
|
263
|
+
- `error_message: str` - Description of the error
|
|
264
|
+
- `retried_count: int` - Current retry attempt number
|
|
265
|
+
- `over_maximum_retries: bool` - Whether max retries has been exceeded
|
|
266
|
+
|
|
267
|
+
### Dual-LLM Architecture
|
|
268
|
+
|
|
269
|
+
Use separate LLM instances for translation and XML structure filling with different optimization parameters:
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
# Create two LLM instances with different temperatures
|
|
273
|
+
translation_llm = LLM(
|
|
274
|
+
key="your-api-key",
|
|
275
|
+
url="https://api.openai.com/v1",
|
|
276
|
+
model="gpt-4",
|
|
277
|
+
token_encoding="o200k_base",
|
|
278
|
+
temperature=0.8, # Higher temperature for creative translation
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
fill_llm = LLM(
|
|
282
|
+
key="your-api-key",
|
|
283
|
+
url="https://api.openai.com/v1",
|
|
284
|
+
model="gpt-4",
|
|
285
|
+
token_encoding="o200k_base",
|
|
286
|
+
temperature=0.3, # Lower temperature for structure preservation
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
translate(
|
|
290
|
+
source_path="source.epub",
|
|
291
|
+
target_path="translated.epub",
|
|
292
|
+
target_language=language.ENGLISH,
|
|
293
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
294
|
+
translation_llm=translation_llm,
|
|
295
|
+
fill_llm=fill_llm,
|
|
184
296
|
)
|
|
185
297
|
```
|
|
186
298
|
|
|
@@ -236,10 +348,11 @@ Provide specific translation instructions:
|
|
|
236
348
|
|
|
237
349
|
```python
|
|
238
350
|
translate(
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
target_path=Path("translated.epub"),
|
|
351
|
+
source_path="source.epub",
|
|
352
|
+
target_path="translated.epub",
|
|
242
353
|
target_language="English",
|
|
354
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
355
|
+
llm=llm,
|
|
243
356
|
user_prompt="Use formal language and preserve technical terminology",
|
|
244
357
|
)
|
|
245
358
|
```
|
|
@@ -45,8 +45,7 @@ The easiest way to use EPUB Translator is through OOMOL Studio with a visual int
|
|
|
45
45
|
### Using Python API
|
|
46
46
|
|
|
47
47
|
```python
|
|
48
|
-
from pathlib import Path
|
|
49
|
-
from epub_translator import LLM, translate, language
|
|
48
|
+
from epub_translator import LLM, translate, language, SubmitKind
|
|
50
49
|
|
|
51
50
|
# Initialize LLM with your API credentials
|
|
52
51
|
llm = LLM(
|
|
@@ -58,10 +57,11 @@ llm = LLM(
|
|
|
58
57
|
|
|
59
58
|
# Translate EPUB file using language constants
|
|
60
59
|
translate(
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
target_path=Path("translated.epub"),
|
|
60
|
+
source_path="source.epub",
|
|
61
|
+
target_path="translated.epub",
|
|
64
62
|
target_language=language.ENGLISH,
|
|
63
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
64
|
+
llm=llm,
|
|
65
65
|
)
|
|
66
66
|
```
|
|
67
67
|
|
|
@@ -80,10 +80,11 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
|
|
|
80
80
|
last_progress = progress
|
|
81
81
|
|
|
82
82
|
translate(
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
target_path=Path("translated.epub"),
|
|
83
|
+
source_path="source.epub",
|
|
84
|
+
target_path="translated.epub",
|
|
86
85
|
target_language="English",
|
|
86
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
87
|
+
llm=llm,
|
|
87
88
|
on_progress=on_progress,
|
|
88
89
|
)
|
|
89
90
|
```
|
|
@@ -116,14 +117,63 @@ Translate an EPUB file:
|
|
|
116
117
|
|
|
117
118
|
```python
|
|
118
119
|
translate(
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
target_path: Path, # Output EPUB file path
|
|
120
|
+
source_path: PathLike | str, # Source EPUB file path
|
|
121
|
+
target_path: PathLike | str, # Output EPUB file path
|
|
122
122
|
target_language: str, # Target language (e.g., "English", "Chinese")
|
|
123
|
+
submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
|
|
123
124
|
user_prompt: str | None = None, # Custom translation instructions
|
|
124
125
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
125
126
|
max_group_tokens: int = 1200, # Maximum tokens per translation group
|
|
127
|
+
llm: LLM | None = None, # Single LLM instance for both translation and filling
|
|
128
|
+
translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
|
|
129
|
+
fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
|
|
126
130
|
on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
|
|
131
|
+
on_fill_failed: Callable[[FillFailedEvent], None] | None = None, # Error callback
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
|
|
136
|
+
|
|
137
|
+
#### Submit Modes
|
|
138
|
+
|
|
139
|
+
The `submit` parameter controls how translated content is inserted into the document. Use `SubmitKind` enum to specify the insertion mode:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from epub_translator import SubmitKind
|
|
143
|
+
|
|
144
|
+
# Three available modes:
|
|
145
|
+
# - SubmitKind.REPLACE: Replace original content with translation (single-language output)
|
|
146
|
+
# - SubmitKind.APPEND_TEXT: Append translation as inline text (bilingual output)
|
|
147
|
+
# - SubmitKind.APPEND_BLOCK: Append translation as block elements (bilingual output, recommended)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Mode Comparison:**
|
|
151
|
+
|
|
152
|
+
- **`SubmitKind.REPLACE`**: Creates a single-language translation by replacing original text with translated content. Useful for creating books in the target language only.
|
|
153
|
+
|
|
154
|
+
- **`SubmitKind.APPEND_TEXT`**: Appends translations as inline text immediately after the original content. Both languages appear in the same paragraph, creating a continuous reading flow.
|
|
155
|
+
|
|
156
|
+
- **`SubmitKind.APPEND_BLOCK`** (Recommended): Appends translations as separate block elements (paragraphs) after the original. This creates clear visual separation between languages, making it ideal for side-by-side bilingual reading.
|
|
157
|
+
|
|
158
|
+
**Example:**
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
# For bilingual books (recommended)
|
|
162
|
+
translate(
|
|
163
|
+
source_path="source.epub",
|
|
164
|
+
target_path="translated.epub",
|
|
165
|
+
target_language=language.ENGLISH,
|
|
166
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
167
|
+
llm=llm,
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# For single-language translation
|
|
171
|
+
translate(
|
|
172
|
+
source_path="source.epub",
|
|
173
|
+
target_path="translated.epub",
|
|
174
|
+
target_language=language.ENGLISH,
|
|
175
|
+
submit=SubmitKind.REPLACE,
|
|
176
|
+
llm=llm,
|
|
127
177
|
)
|
|
128
178
|
```
|
|
129
179
|
|
|
@@ -136,18 +186,80 @@ from epub_translator import language
|
|
|
136
186
|
|
|
137
187
|
# Usage example:
|
|
138
188
|
translate(
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
target_path=Path("translated.epub"),
|
|
189
|
+
source_path="source.epub",
|
|
190
|
+
target_path="translated.epub",
|
|
142
191
|
target_language=language.ENGLISH,
|
|
192
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
193
|
+
llm=llm,
|
|
143
194
|
)
|
|
144
195
|
|
|
145
196
|
# You can also use custom language strings:
|
|
146
197
|
translate(
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
target_path=Path("translated.epub"),
|
|
198
|
+
source_path="source.epub",
|
|
199
|
+
target_path="translated.epub",
|
|
150
200
|
target_language="Icelandic", # For languages not in the constants
|
|
201
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
202
|
+
llm=llm,
|
|
203
|
+
)
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Error Handling with `on_fill_failed`
|
|
207
|
+
|
|
208
|
+
Monitor and handle translation errors using the `on_fill_failed` callback:
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
from epub_translator import FillFailedEvent
|
|
212
|
+
|
|
213
|
+
def handle_fill_error(event: FillFailedEvent):
|
|
214
|
+
print(f"Translation error (attempt {event.retried_count}):")
|
|
215
|
+
print(f" {event.error_message}")
|
|
216
|
+
if event.over_maximum_retries:
|
|
217
|
+
print(" Maximum retries exceeded!")
|
|
218
|
+
|
|
219
|
+
translate(
|
|
220
|
+
source_path="source.epub",
|
|
221
|
+
target_path="translated.epub",
|
|
222
|
+
target_language=language.ENGLISH,
|
|
223
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
224
|
+
llm=llm,
|
|
225
|
+
on_fill_failed=handle_fill_error,
|
|
226
|
+
)
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
The `FillFailedEvent` contains:
|
|
230
|
+
- `error_message: str` - Description of the error
|
|
231
|
+
- `retried_count: int` - Current retry attempt number
|
|
232
|
+
- `over_maximum_retries: bool` - Whether max retries has been exceeded
|
|
233
|
+
|
|
234
|
+
### Dual-LLM Architecture
|
|
235
|
+
|
|
236
|
+
Use separate LLM instances for translation and XML structure filling with different optimization parameters:
|
|
237
|
+
|
|
238
|
+
```python
|
|
239
|
+
# Create two LLM instances with different temperatures
|
|
240
|
+
translation_llm = LLM(
|
|
241
|
+
key="your-api-key",
|
|
242
|
+
url="https://api.openai.com/v1",
|
|
243
|
+
model="gpt-4",
|
|
244
|
+
token_encoding="o200k_base",
|
|
245
|
+
temperature=0.8, # Higher temperature for creative translation
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
fill_llm = LLM(
|
|
249
|
+
key="your-api-key",
|
|
250
|
+
url="https://api.openai.com/v1",
|
|
251
|
+
model="gpt-4",
|
|
252
|
+
token_encoding="o200k_base",
|
|
253
|
+
temperature=0.3, # Lower temperature for structure preservation
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
translate(
|
|
257
|
+
source_path="source.epub",
|
|
258
|
+
target_path="translated.epub",
|
|
259
|
+
target_language=language.ENGLISH,
|
|
260
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
261
|
+
translation_llm=translation_llm,
|
|
262
|
+
fill_llm=fill_llm,
|
|
151
263
|
)
|
|
152
264
|
```
|
|
153
265
|
|
|
@@ -203,10 +315,11 @@ Provide specific translation instructions:
|
|
|
203
315
|
|
|
204
316
|
```python
|
|
205
317
|
translate(
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
target_path=Path("translated.epub"),
|
|
318
|
+
source_path="source.epub",
|
|
319
|
+
target_path="translated.epub",
|
|
209
320
|
target_language="English",
|
|
321
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
322
|
+
llm=llm,
|
|
210
323
|
user_prompt="Use formal language and preserve technical terminology",
|
|
211
324
|
)
|
|
212
325
|
```
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
You are an XML structure validator. Your ONLY task is to preserve the exact XML structure from the template while filling in translated text.
|
|
2
|
+
|
|
3
|
+
CRITICAL RULES:
|
|
4
|
+
|
|
5
|
+
1. Structure Preservation: The output XML MUST have the EXACT SAME structure as the template
|
|
6
|
+
- Same tags in the same order
|
|
7
|
+
- Same nesting hierarchy
|
|
8
|
+
- Same attributes (especially id attributes)
|
|
9
|
+
|
|
10
|
+
IMPORTANT: Translation fluency is SECONDARY to structure preservation.
|
|
11
|
+
If the translated text flows naturally but doesn't match template structure,
|
|
12
|
+
you MUST break the flow to insert required tags.
|
|
13
|
+
|
|
14
|
+
2. ID Handling:
|
|
15
|
+
- Tags WITH id="X": Disambiguation markers for structurally similar elements
|
|
16
|
+
- Tags WITHOUT id: Structurally unique, match by position and tag name
|
|
17
|
+
- NEVER add, remove, or change id attributes
|
|
18
|
+
|
|
19
|
+
3. Text Filling Strategy:
|
|
20
|
+
- Compare source text with translated text
|
|
21
|
+
- Identify how source maps to template structure
|
|
22
|
+
- Apply the same mapping to translated text
|
|
23
|
+
- Preserve paragraph breaks (elements are natural separators)
|
|
24
|
+
- IMPORTANT: Translation may change word order - use SEMANTIC matching, not position
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
COMMON ERRORS TO AVOID:
|
|
29
|
+
|
|
30
|
+
Error Type 1: Missing expected blocks
|
|
31
|
+
❌ WRONG: Omitting elements with id attributes
|
|
32
|
+
✓ CORRECT: Every <tag id="X"> in template MUST appear in output
|
|
33
|
+
|
|
34
|
+
Error Type 2: Tag count mismatch for non-id elements
|
|
35
|
+
Example template:
|
|
36
|
+
<p id="1">
|
|
37
|
+
<span>text1</span>
|
|
38
|
+
<span>text2</span>
|
|
39
|
+
</p>
|
|
40
|
+
|
|
41
|
+
❌ WRONG: <p id="1"><span>merged text</span></p> (only 1 span, expected 2)
|
|
42
|
+
✓ CORRECT: <p id="1"><span>text1</span><span>text2</span></p>
|
|
43
|
+
|
|
44
|
+
Error Type 3: Adding unexpected IDs
|
|
45
|
+
❌ WRONG: Adding id="99" to a tag that didn't have an id in template
|
|
46
|
+
✓ CORRECT: If template has <span>text</span>, output should be <span>译文</span> (no id)
|
|
47
|
+
|
|
48
|
+
Error Type 4: Wrong tag names
|
|
49
|
+
❌ WRONG: Changing <em id="5"> to <i id="5">
|
|
50
|
+
✓ CORRECT: Keep exact tag name from template
|
|
51
|
+
|
|
52
|
+
Error Type 5: Missing ID on required elements
|
|
53
|
+
❌ WRONG: <span>text</span> when template has <span id="5">text</span>
|
|
54
|
+
✓ CORRECT: <span id="5">译文</span>
|
|
55
|
+
|
|
56
|
+
Error Type 6: Wrong text mapping when word order changes
|
|
57
|
+
Example 1: Template has "reviewer of <span id="5">Book</span> in <span id="6">Journal</span>"
|
|
58
|
+
Translation: "Journal 上对 Book 的评论者"
|
|
59
|
+
|
|
60
|
+
❌ WRONG: Journal 上对 <span id="5">Book</span> 的评论者<span id="6">Journal</span>
|
|
61
|
+
(appending original text at end)
|
|
62
|
+
✓ CORRECT: <span id="6">Journal</span> 上对 <span id="5">Book</span> 的评论者
|
|
63
|
+
(wrapping semantic equivalents in translated positions)
|
|
64
|
+
|
|
65
|
+
Example 2: Breaking fluent translation to preserve structure
|
|
66
|
+
Template: "published in <span id="5">Book Title</span> in 1990"
|
|
67
|
+
Translation: "于1990年出版的《书名》" (flows naturally, but loses structure)
|
|
68
|
+
|
|
69
|
+
❌ WRONG: 于1990年出版的《书名》 (fluent but missing <span id="5">)
|
|
70
|
+
✓ CORRECT: 于1990年出版的<span id="5">《书名》</span>
|
|
71
|
+
(Break fluency to preserve structure - this is REQUIRED)
|
|
72
|
+
|
|
73
|
+
Error Type 7: Wrong semantic matching when word order changes
|
|
74
|
+
When translation changes word order, match elements by SEMANTIC TYPE, not position.
|
|
75
|
+
|
|
76
|
+
Example: Book title and year
|
|
77
|
+
Template: "<span id="3">Book Title</span> in <span id="4"><a>1990</a></span>"
|
|
78
|
+
Translation: "《书名》于1990年出版"
|
|
79
|
+
|
|
80
|
+
❌ WRONG: 《书名》于<span id="3">1990</span>年出版...
|
|
81
|
+
(Matching by position: "1990" appears after "于", so wrapping it with id="3")
|
|
82
|
+
(WRONG because you matched a YEAR to a slot expecting BOOK TITLE)
|
|
83
|
+
|
|
84
|
+
✓ CORRECT: <span id="3">《书名》</span>于<span id="4"><a>1990</a></span>年出版
|
|
85
|
+
(Matching by semantic type: book title → book title, year → year)
|
|
86
|
+
|
|
87
|
+
KEY PRINCIPLE: Semantic type matching beats position matching!
|
|
88
|
+
- Identify semantic types: book titles, journal names, years (4-digit numbers), person names, etc.
|
|
89
|
+
- Match each to its corresponding slot, regardless of position in translation
|
|
90
|
+
- data-orig-len hints at length: book/journal titles usually longer than years/numbers
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
FILLING ALGORITHM:
|
|
95
|
+
|
|
96
|
+
1. Analyze template structure: count elements at each level, note id attributes
|
|
97
|
+
|
|
98
|
+
2. Segment source text by elements (elements are natural separators)
|
|
99
|
+
|
|
100
|
+
3. Apply to translation - STRICT STRUCTURAL MATCHING:
|
|
101
|
+
|
|
102
|
+
A. For elements WITH id:
|
|
103
|
+
- Locate semantic equivalent in translation
|
|
104
|
+
- Wrap with same tag+id
|
|
105
|
+
- If translation merged multiple spans: You MUST still output all original spans separately
|
|
106
|
+
Example: Template has id="1" and id="2", translation merged both
|
|
107
|
+
→ Output BOTH spans, use source text fallback for missing one
|
|
108
|
+
|
|
109
|
+
B. For elements WITHOUT id:
|
|
110
|
+
- Match by STRUCTURAL POSITION only (template order)
|
|
111
|
+
- Count MUST be exact: 7 spans in template = 7 spans in output
|
|
112
|
+
- Even if content repeats (e.g., 3 instances of "x"), each gets its own span
|
|
113
|
+
- Process sequentially: wrap 1st occurrence with 1st span, 2nd with 2nd span, etc.
|
|
114
|
+
- DO NOT merge, skip, or add extra elements
|
|
115
|
+
|
|
116
|
+
CRITICAL for repeated content:
|
|
117
|
+
If template has: "...<span>Word</span>...more text...<span>Word</span>"
|
|
118
|
+
And translation has: "...词...更多文字...词"
|
|
119
|
+
→ Wrap 1st occurrence of "词" with 1st span, 2nd occurrence with 2nd span
|
|
120
|
+
→ Even if the words are identical, treat each span position independently
|
|
121
|
+
|
|
122
|
+
4. Verify: same element counts, all ids preserved, tag names match
|
|
123
|
+
|
|
124
|
+
CRITICAL: Template structure is LAW. Translation fluency is secondary.
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
SPECIAL CASES:
|
|
129
|
+
|
|
130
|
+
1. data-orig-len attribute: Token count hint. Longer counts usually = titles/names, shorter = numbers/symbols.
|
|
131
|
+
|
|
132
|
+
2. Name+Number as single unit (e.g., "JournalName42" with NO space):
|
|
133
|
+
- In translation, find the name's equivalent and any adjacent number
|
|
134
|
+
- Wrap them together: <span id="X">《期刊名》42</span> or <span id="X">《期刊名》第42期</span>
|
|
135
|
+
- Key: If template treats them as one span, keep them in one span in translation
|
|
136
|
+
|
|
137
|
+
3. Translation merges adjacent spans:
|
|
138
|
+
Template: "<span id="A">Word1</span> & <span id="B">Word2</span>"
|
|
139
|
+
Translation: "复合词" (one inseparable term)
|
|
140
|
+
|
|
141
|
+
Solution: You MUST output BOTH spans even if translation merged them
|
|
142
|
+
- Try to split translation if possible
|
|
143
|
+
- If truly inseparable: Keep translation for one span, use source text for the other
|
|
144
|
+
- Example: <span id="A">复合词</span>与<span id="B">Word2</span>
|
|
145
|
+
|
|
146
|
+
4. Missing semantic match:
|
|
147
|
+
- Exhaust all possibilities first (synonyms, paraphrases, context)
|
|
148
|
+
- Last resort: Use source text as fallback
|
|
149
|
+
- Mixed language is acceptable to preserve structure
|
|
150
|
+
|
|
151
|
+
WRONG fallback approaches:
|
|
152
|
+
❌ Empty: <span id="2"></span>
|
|
153
|
+
❌ Placeholder: <span id="2">内容</span>
|
|
154
|
+
❌ Duplicate: <span id="2">中文名称</span> (when id="1" has this)
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
OUTPUT FORMAT:
|
|
159
|
+
```xml
|
|
160
|
+
<xml>
|
|
161
|
+
... your filled XML here ...
|
|
162
|
+
</xml>
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
CRITICAL:
|
|
166
|
+
- Return ONLY the XML block, no explanations
|
|
167
|
+
- Do NOT include example blocks or alternatives
|
|
168
|
+
- If unsure, make best attempt based on pattern
|
|
169
|
+
- System will provide detailed error messages if corrections needed
|
|
170
|
+
|
|
171
|
+
Begin.
|