epub-translator 0.1.1__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {epub_translator-0.1.1 → epub_translator-0.1.4}/PKG-INFO +134 -21
  2. {epub_translator-0.1.1 → epub_translator-0.1.4}/README.md +133 -20
  3. epub_translator-0.1.4/epub_translator/__init__.py +12 -0
  4. epub_translator-0.1.4/epub_translator/data/fill.jinja +171 -0
  5. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/__init__.py +1 -1
  6. epub_translator-0.1.4/epub_translator/epub/metadata.py +122 -0
  7. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/spines.py +3 -2
  8. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/zip.py +11 -9
  9. epub_translator-0.1.4/epub_translator/epub_transcode.py +108 -0
  10. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/__init__.py +1 -0
  11. epub_translator-0.1.4/epub_translator/llm/context.py +109 -0
  12. epub_translator-0.1.4/epub_translator/llm/core.py +152 -0
  13. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/executor.py +25 -31
  14. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/increasable.py +1 -1
  15. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/types.py +0 -3
  16. epub_translator-0.1.4/epub_translator/punctuation.py +34 -0
  17. epub_translator-0.1.4/epub_translator/segment/__init__.py +26 -0
  18. epub_translator-0.1.4/epub_translator/segment/block_segment.py +124 -0
  19. epub_translator-0.1.4/epub_translator/segment/common.py +29 -0
  20. epub_translator-0.1.4/epub_translator/segment/inline_segment.py +356 -0
  21. {epub_translator-0.1.1/epub_translator/xml_translator → epub_translator-0.1.4/epub_translator/segment}/text_segment.py +7 -72
  22. epub_translator-0.1.4/epub_translator/segment/utils.py +43 -0
  23. epub_translator-0.1.4/epub_translator/translator.py +182 -0
  24. epub_translator-0.1.4/epub_translator/utils.py +40 -0
  25. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/__init__.py +3 -0
  26. epub_translator-0.1.4/epub_translator/xml/const.py +1 -0
  27. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/deduplication.py +3 -3
  28. epub_translator-0.1.4/epub_translator/xml/inline.py +67 -0
  29. epub_translator-0.1.4/epub_translator/xml/self_closing.py +182 -0
  30. epub_translator-0.1.4/epub_translator/xml/utils.py +42 -0
  31. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/xml.py +7 -0
  32. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/xml_like.py +8 -33
  33. epub_translator-0.1.4/epub_translator/xml_interrupter.py +165 -0
  34. epub_translator-0.1.4/epub_translator/xml_translator/__init__.py +3 -0
  35. epub_translator-0.1.4/epub_translator/xml_translator/callbacks.py +34 -0
  36. epub_translator-0.1.1/epub_translator/xml_translator/const.py → epub_translator-0.1.4/epub_translator/xml_translator/common.py +0 -1
  37. epub_translator-0.1.4/epub_translator/xml_translator/hill_climbing.py +104 -0
  38. epub_translator-0.1.4/epub_translator/xml_translator/stream_mapper.py +253 -0
  39. epub_translator-0.1.4/epub_translator/xml_translator/submitter.py +363 -0
  40. epub_translator-0.1.4/epub_translator/xml_translator/translator.py +247 -0
  41. epub_translator-0.1.4/epub_translator/xml_translator/validation.py +458 -0
  42. {epub_translator-0.1.1 → epub_translator-0.1.4}/pyproject.toml +1 -1
  43. epub_translator-0.1.1/epub_translator/__init__.py +0 -5
  44. epub_translator-0.1.1/epub_translator/data/fill.jinja +0 -66
  45. epub_translator-0.1.1/epub_translator/epub/placeholder.py +0 -53
  46. epub_translator-0.1.1/epub_translator/iter_sync.py +0 -24
  47. epub_translator-0.1.1/epub_translator/llm/core.py +0 -233
  48. epub_translator-0.1.1/epub_translator/translator.py +0 -214
  49. epub_translator-0.1.1/epub_translator/utils.py +0 -7
  50. epub_translator-0.1.1/epub_translator/xml_translator/__init__.py +0 -3
  51. epub_translator-0.1.1/epub_translator/xml_translator/fill.py +0 -128
  52. epub_translator-0.1.1/epub_translator/xml_translator/format.py +0 -282
  53. epub_translator-0.1.1/epub_translator/xml_translator/fragmented.py +0 -125
  54. epub_translator-0.1.1/epub_translator/xml_translator/group.py +0 -183
  55. epub_translator-0.1.1/epub_translator/xml_translator/progressive_locking.py +0 -256
  56. epub_translator-0.1.1/epub_translator/xml_translator/submitter.py +0 -102
  57. epub_translator-0.1.1/epub_translator/xml_translator/translator.py +0 -179
  58. epub_translator-0.1.1/epub_translator/xml_translator/utils.py +0 -29
  59. {epub_translator-0.1.1 → epub_translator-0.1.4}/LICENSE +0 -0
  60. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/README.md +0 -0
  61. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/cmarkup.xsl +0 -0
  62. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/entities.xsl +0 -0
  63. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/glayout.xsl +0 -0
  64. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/mmltex.xsl +0 -0
  65. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/scripts.xsl +0 -0
  66. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/tables.xsl +0 -0
  67. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/mmltex/tokens.xsl +0 -0
  68. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/data/translate.jinja +0 -0
  69. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/common.py +0 -0
  70. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/math.py +0 -0
  71. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/epub/toc.py +0 -0
  72. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/language.py +0 -0
  73. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/llm/error.py +0 -0
  74. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/serial/__init__.py +0 -0
  75. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/serial/chunk.py +0 -0
  76. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/serial/segment.py +0 -0
  77. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/serial/splitter.py +0 -0
  78. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/template.py +0 -0
  79. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/__init__.py +0 -0
  80. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/decoder.py +0 -0
  81. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/encoder.py +0 -0
  82. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/parser.py +0 -0
  83. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/tag.py +0 -0
  84. {epub_translator-0.1.1 → epub_translator-0.1.4}/epub_translator/xml/firendly/transform.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: epub-translator
3
- Version: 0.1.1
3
+ Version: 0.1.4
4
4
  Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
5
5
  License: MIT
6
6
  Keywords: epub,llm,translation,translator
@@ -78,8 +78,7 @@ The easiest way to use EPUB Translator is through OOMOL Studio with a visual int
78
78
  ### Using Python API
79
79
 
80
80
  ```python
81
- from pathlib import Path
82
- from epub_translator import LLM, translate, language
81
+ from epub_translator import LLM, translate, language, SubmitKind
83
82
 
84
83
  # Initialize LLM with your API credentials
85
84
  llm = LLM(
@@ -91,10 +90,11 @@ llm = LLM(
91
90
 
92
91
  # Translate EPUB file using language constants
93
92
  translate(
94
- llm=llm,
95
- source_path=Path("source.epub"),
96
- target_path=Path("translated.epub"),
93
+ source_path="source.epub",
94
+ target_path="translated.epub",
97
95
  target_language=language.ENGLISH,
96
+ submit=SubmitKind.APPEND_BLOCK,
97
+ llm=llm,
98
98
  )
99
99
  ```
100
100
 
@@ -113,10 +113,11 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
113
113
  last_progress = progress
114
114
 
115
115
  translate(
116
- llm=llm,
117
- source_path=Path("source.epub"),
118
- target_path=Path("translated.epub"),
116
+ source_path="source.epub",
117
+ target_path="translated.epub",
119
118
  target_language="English",
119
+ submit=SubmitKind.APPEND_BLOCK,
120
+ llm=llm,
120
121
  on_progress=on_progress,
121
122
  )
122
123
  ```
@@ -149,14 +150,63 @@ Translate an EPUB file:
149
150
 
150
151
  ```python
151
152
  translate(
152
- llm: LLM, # LLM instance
153
- source_path: Path, # Source EPUB file path
154
- target_path: Path, # Output EPUB file path
153
+ source_path: PathLike | str, # Source EPUB file path
154
+ target_path: PathLike | str, # Output EPUB file path
155
155
  target_language: str, # Target language (e.g., "English", "Chinese")
156
+ submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
156
157
  user_prompt: str | None = None, # Custom translation instructions
157
158
  max_retries: int = 5, # Maximum retries for failed translations
158
159
  max_group_tokens: int = 1200, # Maximum tokens per translation group
160
+ llm: LLM | None = None, # Single LLM instance for both translation and filling
161
+ translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
162
+ fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
159
163
  on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
164
+ on_fill_failed: Callable[[FillFailedEvent], None] | None = None, # Error callback
165
+ )
166
+ ```
167
+
168
+ **Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
169
+
170
+ #### Submit Modes
171
+
172
+ The `submit` parameter controls how translated content is inserted into the document. Use the `SubmitKind` enum to specify the insertion mode:
173
+
174
+ ```python
175
+ from epub_translator import SubmitKind
176
+
177
+ # Three available modes:
178
+ # - SubmitKind.REPLACE: Replace original content with translation (single-language output)
179
+ # - SubmitKind.APPEND_TEXT: Append translation as inline text (bilingual output)
180
+ # - SubmitKind.APPEND_BLOCK: Append translation as block elements (bilingual output, recommended)
181
+ ```
182
+
183
+ **Mode Comparison:**
184
+
185
+ - **`SubmitKind.REPLACE`**: Creates a single-language translation by replacing original text with translated content. Useful for creating books in the target language only.
186
+
187
+ - **`SubmitKind.APPEND_TEXT`**: Appends translations as inline text immediately after the original content. Both languages appear in the same paragraph, creating a continuous reading flow.
188
+
189
+ - **`SubmitKind.APPEND_BLOCK`** (Recommended): Appends translations as separate block elements (paragraphs) after the original. This creates clear visual separation between languages, making it ideal for side-by-side bilingual reading.
190
+
191
+ **Example:**
192
+
193
+ ```python
194
+ # For bilingual books (recommended)
195
+ translate(
196
+ source_path="source.epub",
197
+ target_path="translated.epub",
198
+ target_language=language.ENGLISH,
199
+ submit=SubmitKind.APPEND_BLOCK,
200
+ llm=llm,
201
+ )
202
+
203
+ # For single-language translation
204
+ translate(
205
+ source_path="source.epub",
206
+ target_path="translated.epub",
207
+ target_language=language.ENGLISH,
208
+ submit=SubmitKind.REPLACE,
209
+ llm=llm,
160
210
  )
161
211
  ```
162
212
 
@@ -169,18 +219,80 @@ from epub_translator import language
169
219
 
170
220
  # Usage example:
171
221
  translate(
172
- llm=llm,
173
- source_path=Path("source.epub"),
174
- target_path=Path("translated.epub"),
222
+ source_path="source.epub",
223
+ target_path="translated.epub",
175
224
  target_language=language.ENGLISH,
225
+ submit=SubmitKind.APPEND_BLOCK,
226
+ llm=llm,
176
227
  )
177
228
 
178
229
  # You can also use custom language strings:
179
230
  translate(
180
- llm=llm,
181
- source_path=Path("source.epub"),
182
- target_path=Path("translated.epub"),
231
+ source_path="source.epub",
232
+ target_path="translated.epub",
183
233
  target_language="Icelandic", # For languages not in the constants
234
+ submit=SubmitKind.APPEND_BLOCK,
235
+ llm=llm,
236
+ )
237
+ ```
238
+
239
+ ### Error Handling with `on_fill_failed`
240
+
241
+ Monitor and handle translation errors using the `on_fill_failed` callback:
242
+
243
+ ```python
244
+ from epub_translator import FillFailedEvent
245
+
246
+ def handle_fill_error(event: FillFailedEvent):
247
+ print(f"Translation error (attempt {event.retried_count}):")
248
+ print(f" {event.error_message}")
249
+ if event.over_maximum_retries:
250
+ print(" Maximum retries exceeded!")
251
+
252
+ translate(
253
+ source_path="source.epub",
254
+ target_path="translated.epub",
255
+ target_language=language.ENGLISH,
256
+ submit=SubmitKind.APPEND_BLOCK,
257
+ llm=llm,
258
+ on_fill_failed=handle_fill_error,
259
+ )
260
+ ```
261
+
262
+ The `FillFailedEvent` contains:
263
+ - `error_message: str` - Description of the error
264
+ - `retried_count: int` - Current retry attempt number
265
+ - `over_maximum_retries: bool` - Whether the maximum number of retries has been exceeded
266
+
267
+ ### Dual-LLM Architecture
268
+
269
+ Use separate LLM instances for translation and XML structure filling with different optimization parameters:
270
+
271
+ ```python
272
+ # Create two LLM instances with different temperatures
273
+ translation_llm = LLM(
274
+ key="your-api-key",
275
+ url="https://api.openai.com/v1",
276
+ model="gpt-4",
277
+ token_encoding="o200k_base",
278
+ temperature=0.8, # Higher temperature for creative translation
279
+ )
280
+
281
+ fill_llm = LLM(
282
+ key="your-api-key",
283
+ url="https://api.openai.com/v1",
284
+ model="gpt-4",
285
+ token_encoding="o200k_base",
286
+ temperature=0.3, # Lower temperature for structure preservation
287
+ )
288
+
289
+ translate(
290
+ source_path="source.epub",
291
+ target_path="translated.epub",
292
+ target_language=language.ENGLISH,
293
+ submit=SubmitKind.APPEND_BLOCK,
294
+ translation_llm=translation_llm,
295
+ fill_llm=fill_llm,
184
296
  )
185
297
  ```
186
298
 
@@ -236,10 +348,11 @@ Provide specific translation instructions:
236
348
 
237
349
  ```python
238
350
  translate(
239
- llm=llm,
240
- source_path=Path("source.epub"),
241
- target_path=Path("translated.epub"),
351
+ source_path="source.epub",
352
+ target_path="translated.epub",
242
353
  target_language="English",
354
+ submit=SubmitKind.APPEND_BLOCK,
355
+ llm=llm,
243
356
  user_prompt="Use formal language and preserve technical terminology",
244
357
  )
245
358
  ```
@@ -45,8 +45,7 @@ The easiest way to use EPUB Translator is through OOMOL Studio with a visual int
45
45
  ### Using Python API
46
46
 
47
47
  ```python
48
- from pathlib import Path
49
- from epub_translator import LLM, translate, language
48
+ from epub_translator import LLM, translate, language, SubmitKind
50
49
 
51
50
  # Initialize LLM with your API credentials
52
51
  llm = LLM(
@@ -58,10 +57,11 @@ llm = LLM(
58
57
 
59
58
  # Translate EPUB file using language constants
60
59
  translate(
61
- llm=llm,
62
- source_path=Path("source.epub"),
63
- target_path=Path("translated.epub"),
60
+ source_path="source.epub",
61
+ target_path="translated.epub",
64
62
  target_language=language.ENGLISH,
63
+ submit=SubmitKind.APPEND_BLOCK,
64
+ llm=llm,
65
65
  )
66
66
  ```
67
67
 
@@ -80,10 +80,11 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
80
80
  last_progress = progress
81
81
 
82
82
  translate(
83
- llm=llm,
84
- source_path=Path("source.epub"),
85
- target_path=Path("translated.epub"),
83
+ source_path="source.epub",
84
+ target_path="translated.epub",
86
85
  target_language="English",
86
+ submit=SubmitKind.APPEND_BLOCK,
87
+ llm=llm,
87
88
  on_progress=on_progress,
88
89
  )
89
90
  ```
@@ -116,14 +117,63 @@ Translate an EPUB file:
116
117
 
117
118
  ```python
118
119
  translate(
119
- llm: LLM, # LLM instance
120
- source_path: Path, # Source EPUB file path
121
- target_path: Path, # Output EPUB file path
120
+ source_path: PathLike | str, # Source EPUB file path
121
+ target_path: PathLike | str, # Output EPUB file path
122
122
  target_language: str, # Target language (e.g., "English", "Chinese")
123
+ submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
123
124
  user_prompt: str | None = None, # Custom translation instructions
124
125
  max_retries: int = 5, # Maximum retries for failed translations
125
126
  max_group_tokens: int = 1200, # Maximum tokens per translation group
127
+ llm: LLM | None = None, # Single LLM instance for both translation and filling
128
+ translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
129
+ fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
126
130
  on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
131
+ on_fill_failed: Callable[[FillFailedEvent], None] | None = None, # Error callback
132
+ )
133
+ ```
134
+
135
+ **Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
136
+
137
+ #### Submit Modes
138
+
139
+ The `submit` parameter controls how translated content is inserted into the document. Use the `SubmitKind` enum to specify the insertion mode:
140
+
141
+ ```python
142
+ from epub_translator import SubmitKind
143
+
144
+ # Three available modes:
145
+ # - SubmitKind.REPLACE: Replace original content with translation (single-language output)
146
+ # - SubmitKind.APPEND_TEXT: Append translation as inline text (bilingual output)
147
+ # - SubmitKind.APPEND_BLOCK: Append translation as block elements (bilingual output, recommended)
148
+ ```
149
+
150
+ **Mode Comparison:**
151
+
152
+ - **`SubmitKind.REPLACE`**: Creates a single-language translation by replacing original text with translated content. Useful for creating books in the target language only.
153
+
154
+ - **`SubmitKind.APPEND_TEXT`**: Appends translations as inline text immediately after the original content. Both languages appear in the same paragraph, creating a continuous reading flow.
155
+
156
+ - **`SubmitKind.APPEND_BLOCK`** (Recommended): Appends translations as separate block elements (paragraphs) after the original. This creates clear visual separation between languages, making it ideal for side-by-side bilingual reading.
157
+
158
+ **Example:**
159
+
160
+ ```python
161
+ # For bilingual books (recommended)
162
+ translate(
163
+ source_path="source.epub",
164
+ target_path="translated.epub",
165
+ target_language=language.ENGLISH,
166
+ submit=SubmitKind.APPEND_BLOCK,
167
+ llm=llm,
168
+ )
169
+
170
+ # For single-language translation
171
+ translate(
172
+ source_path="source.epub",
173
+ target_path="translated.epub",
174
+ target_language=language.ENGLISH,
175
+ submit=SubmitKind.REPLACE,
176
+ llm=llm,
127
177
  )
128
178
  ```
129
179
 
@@ -136,18 +186,80 @@ from epub_translator import language
136
186
 
137
187
  # Usage example:
138
188
  translate(
139
- llm=llm,
140
- source_path=Path("source.epub"),
141
- target_path=Path("translated.epub"),
189
+ source_path="source.epub",
190
+ target_path="translated.epub",
142
191
  target_language=language.ENGLISH,
192
+ submit=SubmitKind.APPEND_BLOCK,
193
+ llm=llm,
143
194
  )
144
195
 
145
196
  # You can also use custom language strings:
146
197
  translate(
147
- llm=llm,
148
- source_path=Path("source.epub"),
149
- target_path=Path("translated.epub"),
198
+ source_path="source.epub",
199
+ target_path="translated.epub",
150
200
  target_language="Icelandic", # For languages not in the constants
201
+ submit=SubmitKind.APPEND_BLOCK,
202
+ llm=llm,
203
+ )
204
+ ```
205
+
206
+ ### Error Handling with `on_fill_failed`
207
+
208
+ Monitor and handle translation errors using the `on_fill_failed` callback:
209
+
210
+ ```python
211
+ from epub_translator import FillFailedEvent
212
+
213
+ def handle_fill_error(event: FillFailedEvent):
214
+ print(f"Translation error (attempt {event.retried_count}):")
215
+ print(f" {event.error_message}")
216
+ if event.over_maximum_retries:
217
+ print(" Maximum retries exceeded!")
218
+
219
+ translate(
220
+ source_path="source.epub",
221
+ target_path="translated.epub",
222
+ target_language=language.ENGLISH,
223
+ submit=SubmitKind.APPEND_BLOCK,
224
+ llm=llm,
225
+ on_fill_failed=handle_fill_error,
226
+ )
227
+ ```
228
+
229
+ The `FillFailedEvent` contains:
230
+ - `error_message: str` - Description of the error
231
+ - `retried_count: int` - Current retry attempt number
232
+ - `over_maximum_retries: bool` - Whether the maximum number of retries has been exceeded
233
+
234
+ ### Dual-LLM Architecture
235
+
236
+ Use separate LLM instances for translation and XML structure filling with different optimization parameters:
237
+
238
+ ```python
239
+ # Create two LLM instances with different temperatures
240
+ translation_llm = LLM(
241
+ key="your-api-key",
242
+ url="https://api.openai.com/v1",
243
+ model="gpt-4",
244
+ token_encoding="o200k_base",
245
+ temperature=0.8, # Higher temperature for creative translation
246
+ )
247
+
248
+ fill_llm = LLM(
249
+ key="your-api-key",
250
+ url="https://api.openai.com/v1",
251
+ model="gpt-4",
252
+ token_encoding="o200k_base",
253
+ temperature=0.3, # Lower temperature for structure preservation
254
+ )
255
+
256
+ translate(
257
+ source_path="source.epub",
258
+ target_path="translated.epub",
259
+ target_language=language.ENGLISH,
260
+ submit=SubmitKind.APPEND_BLOCK,
261
+ translation_llm=translation_llm,
262
+ fill_llm=fill_llm,
151
263
  )
152
264
  ```
153
265
 
@@ -203,10 +315,11 @@ Provide specific translation instructions:
203
315
 
204
316
  ```python
205
317
  translate(
206
- llm=llm,
207
- source_path=Path("source.epub"),
208
- target_path=Path("translated.epub"),
318
+ source_path="source.epub",
319
+ target_path="translated.epub",
209
320
  target_language="English",
321
+ submit=SubmitKind.APPEND_BLOCK,
322
+ llm=llm,
210
323
  user_prompt="Use formal language and preserve technical terminology",
211
324
  )
212
325
  ```
@@ -0,0 +1,12 @@
1
+ from . import language
2
+ from .llm import LLM
3
+ from .translator import FillFailedEvent, translate
4
+ from .xml_translator import SubmitKind
5
+
6
+ __all__ = [
7
+ "LLM",
8
+ "translate",
9
+ "language",
10
+ "FillFailedEvent",
11
+ "SubmitKind",
12
+ ]
@@ -0,0 +1,171 @@
1
+ You are an XML structure validator. Your ONLY task is to preserve the exact XML structure from the template while filling in translated text.
2
+
3
+ CRITICAL RULES:
4
+
5
+ 1. Structure Preservation: The output XML MUST have the EXACT SAME structure as the template
6
+ - Same tags in the same order
7
+ - Same nesting hierarchy
8
+ - Same attributes (especially id attributes)
9
+
10
+ IMPORTANT: Translation fluency is SECONDARY to structure preservation.
11
+ If the translated text flows naturally but doesn't match template structure,
12
+ you MUST break the flow to insert required tags.
13
+
14
+ 2. ID Handling:
15
+ - Tags WITH id="X": Disambiguation markers for structurally similar elements
16
+ - Tags WITHOUT id: Structurally unique, match by position and tag name
17
+ - NEVER add, remove, or change id attributes
18
+
19
+ 3. Text Filling Strategy:
20
+ - Compare source text with translated text
21
+ - Identify how source maps to template structure
22
+ - Apply the same mapping to translated text
23
+ - Preserve paragraph breaks (elements are natural separators)
24
+ - IMPORTANT: Translation may change word order - use SEMANTIC matching, not position
25
+
26
+ ---
27
+
28
+ COMMON ERRORS TO AVOID:
29
+
30
+ Error Type 1: Missing expected blocks
31
+ ❌ WRONG: Omitting elements with id attributes
32
+ ✓ CORRECT: Every <tag id="X"> in template MUST appear in output
33
+
34
+ Error Type 2: Tag count mismatch for non-id elements
35
+ Example template:
36
+ <p id="1">
37
+ <span>text1</span>
38
+ <span>text2</span>
39
+ </p>
40
+
41
+ ❌ WRONG: <p id="1"><span>merged text</span></p> (only 1 span, expected 2)
42
+ ✓ CORRECT: <p id="1"><span>text1</span><span>text2</span></p>
43
+
44
+ Error Type 3: Adding unexpected IDs
45
+ ❌ WRONG: Adding id="99" to a tag that didn't have an id in template
46
+ ✓ CORRECT: If template has <span>text</span>, output should be <span>译文</span> (no id)
47
+
48
+ Error Type 4: Wrong tag names
49
+ ❌ WRONG: Changing <em id="5"> to <i id="5">
50
+ ✓ CORRECT: Keep exact tag name from template
51
+
52
+ Error Type 5: Missing ID on required elements
53
+ ❌ WRONG: <span>text</span> when template has <span id="5">text</span>
54
+ ✓ CORRECT: <span id="5">译文</span>
55
+
56
+ Error Type 6: Wrong text mapping when word order changes
57
+ Example 1: Template has "reviewer of <span id="5">Book</span> in <span id="6">Journal</span>"
58
+ Translation: "Journal 上对 Book 的评论者"
59
+
60
+ ❌ WRONG: Journal 上对 <span id="5">Book</span> 的评论者<span id="6">Journal</span>
61
+ (appending original text at end)
62
+ ✓ CORRECT: <span id="6">Journal</span> 上对 <span id="5">Book</span> 的评论者
63
+ (wrapping semantic equivalents in translated positions)
64
+
65
+ Example 2: Breaking fluent translation to preserve structure
66
+ Template: "published in <span id="5">Book Title</span> in 1990"
67
+ Translation: "于1990年出版的《书名》" (flows naturally, but loses structure)
68
+
69
+ ❌ WRONG: 于1990年出版的《书名》 (fluent but missing <span id="5">)
70
+ ✓ CORRECT: 于1990年出版的<span id="5">《书名》</span>
71
+ (Break fluency to preserve structure - this is REQUIRED)
72
+
73
+ Error Type 7: Wrong semantic matching when word order changes
74
+ When translation changes word order, match elements by SEMANTIC TYPE, not position.
75
+
76
+ Example: Book title and year
77
+ Template: "<span id="3">Book Title</span> in <span id="4"><a>1990</a></span>"
78
+ Translation: "《书名》于1990年出版"
79
+
80
+ ❌ WRONG: 《书名》于<span id="3">1990</span>年出版...
81
+ (Matching by position: "1990" appears after "于", so wrapping it with id="3")
82
+ (WRONG because you matched a YEAR to a slot expecting BOOK TITLE)
83
+
84
+ ✓ CORRECT: <span id="3">《书名》</span>于<span id="4"><a>1990</a></span>年出版
85
+ (Matching by semantic type: book title → book title, year → year)
86
+
87
+ KEY PRINCIPLE: Semantic type matching beats position matching!
88
+ - Identify semantic types: book titles, journal names, years (4-digit numbers), person names, etc.
89
+ - Match each to its corresponding slot, regardless of position in translation
90
+ - data-orig-len hints at length: book/journal titles usually longer than years/numbers
91
+
92
+ ---
93
+
94
+ FILLING ALGORITHM:
95
+
96
+ 1. Analyze template structure: count elements at each level, note id attributes
97
+
98
+ 2. Segment source text by elements (elements are natural separators)
99
+
100
+ 3. Apply to translation - STRICT STRUCTURAL MATCHING:
101
+
102
+ A. For elements WITH id:
103
+ - Locate semantic equivalent in translation
104
+ - Wrap with same tag+id
105
+ - If translation merged multiple spans: You MUST still output all original spans separately
106
+ Example: Template has id="1" and id="2", translation merged both
107
+ → Output BOTH spans, use source text fallback for missing one
108
+
109
+ B. For elements WITHOUT id:
110
+ - Match by STRUCTURAL POSITION only (template order)
111
+ - Count MUST be exact: 7 spans in template = 7 spans in output
112
+ - Even if content repeats (e.g., 3 instances of "x"), each gets its own span
113
+ - Process sequentially: wrap 1st occurrence with 1st span, 2nd with 2nd span, etc.
114
+ - DO NOT merge, skip, or add extra elements
115
+
116
+ CRITICAL for repeated content:
117
+ If template has: "...<span>Word</span>...more text...<span>Word</span>"
118
+ And translation has: "...词...更多文字...词"
119
+ → Wrap 1st occurrence of "词" with 1st span, 2nd occurrence with 2nd span
120
+ → Even if the words are identical, treat each span position independently
121
+
122
+ 4. Verify: same element counts, all ids preserved, tag names match
123
+
124
+ CRITICAL: Template structure is LAW. Translation fluency is secondary.
125
+
126
+ ---
127
+
128
+ SPECIAL CASES:
129
+
130
+ 1. data-orig-len attribute: Token count hint. Longer counts usually = titles/names, shorter = numbers/symbols.
131
+
132
+ 2. Name+Number as single unit (e.g., "JournalName42" with NO space):
133
+ - In translation, find the name's equivalent and any adjacent number
134
+ - Wrap them together: <span id="X">《期刊名》42</span> or <span id="X">《期刊名》第42期</span>
135
+ - Key: If template treats them as one span, keep them in one span in translation
136
+
137
+ 3. Translation merges adjacent spans:
138
+ Template: "<span id="A">Word1</span> & <span id="B">Word2</span>"
139
+ Translation: "复合词" (one inseparable term)
140
+
141
+ Solution: You MUST output BOTH spans even if translation merged them
142
+ - Try to split translation if possible
143
+ - If truly inseparable: Keep translation for one span, use source text for the other
144
+ - Example: <span id="A">复合词</span>与<span id="B">Word2</span>
145
+
146
+ 4. Missing semantic match:
147
+ - Exhaust all possibilities first (synonyms, paraphrases, context)
148
+ - Last resort: Use source text as fallback
149
+ - Mixed language is acceptable to preserve structure
150
+
151
+ WRONG fallback approaches:
152
+ ❌ Empty: <span id="2"></span>
153
+ ❌ Placeholder: <span id="2">内容</span>
154
+ ❌ Duplicate: <span id="2">中文名称</span> (when id="1" has this)
155
+
156
+ ---
157
+
158
+ OUTPUT FORMAT:
159
+ ```xml
160
+ <xml>
161
+ ... your filled XML here ...
162
+ </xml>
163
+ ```
164
+
165
+ CRITICAL:
166
+ - Return ONLY the XML block, no explanations
167
+ - Do NOT include example blocks or alternatives
168
+ - If unsure, make best attempt based on pattern
169
+ - System will provide detailed error messages if corrections needed
170
+
171
+ Begin.
@@ -1,4 +1,4 @@
1
- from .placeholder import Placeholder, is_placeholder_tag
1
+ from .metadata import read_metadata, write_metadata
2
2
  from .spines import search_spine_paths
3
3
  from .toc import read_toc, write_toc
4
4
  from .zip import Zip