epub-translator 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epub_translator-0.1.3 → epub_translator-0.1.4}/PKG-INFO +67 -17
- {epub_translator-0.1.3 → epub_translator-0.1.4}/README.md +66 -16
- epub_translator-0.1.4/epub_translator/__init__.py +12 -0
- epub_translator-0.1.4/epub_translator/punctuation.py +34 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/segment/text_segment.py +2 -67
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/translator.py +33 -29
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/__init__.py +1 -0
- epub_translator-0.1.4/epub_translator/xml/inline.py +67 -0
- epub_translator-0.1.4/epub_translator/xml_translator/__init__.py +3 -0
- epub_translator-0.1.4/epub_translator/xml_translator/submitter.py +363 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml_translator/translator.py +31 -12
- {epub_translator-0.1.3 → epub_translator-0.1.4}/pyproject.toml +1 -1
- epub_translator-0.1.3/epub_translator/__init__.py +0 -5
- epub_translator-0.1.3/epub_translator/xml_translator/__init__.py +0 -2
- epub_translator-0.1.3/epub_translator/xml_translator/submitter.py +0 -56
- {epub_translator-0.1.3 → epub_translator-0.1.4}/LICENSE +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/fill.jinja +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/mmltex/README.md +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/mmltex/cmarkup.xsl +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/mmltex/entities.xsl +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/mmltex/glayout.xsl +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/mmltex/mmltex.xsl +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/mmltex/scripts.xsl +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/mmltex/tables.xsl +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/mmltex/tokens.xsl +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/data/translate.jinja +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/epub/__init__.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/epub/common.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/epub/math.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/epub/metadata.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/epub/spines.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/epub/toc.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/epub/zip.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/epub_transcode.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/language.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/llm/__init__.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/llm/context.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/llm/core.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/llm/error.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/llm/executor.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/llm/increasable.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/llm/types.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/segment/__init__.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/segment/block_segment.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/segment/common.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/segment/inline_segment.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/segment/utils.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/serial/__init__.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/serial/chunk.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/serial/segment.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/serial/splitter.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/template.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/utils.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/const.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/deduplication.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/firendly/__init__.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/firendly/decoder.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/firendly/encoder.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/firendly/parser.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/firendly/tag.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/firendly/transform.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/self_closing.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/utils.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/xml.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml/xml_like.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml_interrupter.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml_translator/callbacks.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml_translator/common.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml_translator/hill_climbing.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml_translator/stream_mapper.py +0 -0
- {epub_translator-0.1.3 → epub_translator-0.1.4}/epub_translator/xml_translator/validation.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: epub,llm,translation,translator
|
|
@@ -78,8 +78,7 @@ The easiest way to use EPUB Translator is through OOMOL Studio with a visual int
|
|
|
78
78
|
### Using Python API
|
|
79
79
|
|
|
80
80
|
```python
|
|
81
|
-
from
|
|
82
|
-
from epub_translator import LLM, translate, language
|
|
81
|
+
from epub_translator import LLM, translate, language, SubmitKind
|
|
83
82
|
|
|
84
83
|
# Initialize LLM with your API credentials
|
|
85
84
|
llm = LLM(
|
|
@@ -91,9 +90,10 @@ llm = LLM(
|
|
|
91
90
|
|
|
92
91
|
# Translate EPUB file using language constants
|
|
93
92
|
translate(
|
|
94
|
-
source_path=
|
|
95
|
-
target_path=
|
|
93
|
+
source_path="source.epub",
|
|
94
|
+
target_path="translated.epub",
|
|
96
95
|
target_language=language.ENGLISH,
|
|
96
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
97
97
|
llm=llm,
|
|
98
98
|
)
|
|
99
99
|
```
|
|
@@ -113,9 +113,10 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
|
|
|
113
113
|
last_progress = progress
|
|
114
114
|
|
|
115
115
|
translate(
|
|
116
|
-
source_path=
|
|
117
|
-
target_path=
|
|
116
|
+
source_path="source.epub",
|
|
117
|
+
target_path="translated.epub",
|
|
118
118
|
target_language="English",
|
|
119
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
119
120
|
llm=llm,
|
|
120
121
|
on_progress=on_progress,
|
|
121
122
|
)
|
|
@@ -152,6 +153,7 @@ translate(
|
|
|
152
153
|
source_path: PathLike | str, # Source EPUB file path
|
|
153
154
|
target_path: PathLike | str, # Output EPUB file path
|
|
154
155
|
target_language: str, # Target language (e.g., "English", "Chinese")
|
|
156
|
+
submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
|
|
155
157
|
user_prompt: str | None = None, # Custom translation instructions
|
|
156
158
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
157
159
|
max_group_tokens: int = 1200, # Maximum tokens per translation group
|
|
@@ -165,6 +167,49 @@ translate(
|
|
|
165
167
|
|
|
166
168
|
**Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
|
|
167
169
|
|
|
170
|
+
#### Submit Modes
|
|
171
|
+
|
|
172
|
+
The `submit` parameter controls how translated content is inserted into the document. Use `SubmitKind` enum to specify the insertion mode:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from epub_translator import SubmitKind
|
|
176
|
+
|
|
177
|
+
# Three available modes:
|
|
178
|
+
# - SubmitKind.REPLACE: Replace original content with translation (single-language output)
|
|
179
|
+
# - SubmitKind.APPEND_TEXT: Append translation as inline text (bilingual output)
|
|
180
|
+
# - SubmitKind.APPEND_BLOCK: Append translation as block elements (bilingual output, recommended)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Mode Comparison:**
|
|
184
|
+
|
|
185
|
+
- **`SubmitKind.REPLACE`**: Creates a single-language translation by replacing original text with translated content. Useful for creating books in the target language only.
|
|
186
|
+
|
|
187
|
+
- **`SubmitKind.APPEND_TEXT`**: Appends translations as inline text immediately after the original content. Both languages appear in the same paragraph, creating a continuous reading flow.
|
|
188
|
+
|
|
189
|
+
- **`SubmitKind.APPEND_BLOCK`** (Recommended): Appends translations as separate block elements (paragraphs) after the original. This creates clear visual separation between languages, making it ideal for side-by-side bilingual reading.
|
|
190
|
+
|
|
191
|
+
**Example:**
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# For bilingual books (recommended)
|
|
195
|
+
translate(
|
|
196
|
+
source_path="source.epub",
|
|
197
|
+
target_path="translated.epub",
|
|
198
|
+
target_language=language.ENGLISH,
|
|
199
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
200
|
+
llm=llm,
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
# For single-language translation
|
|
204
|
+
translate(
|
|
205
|
+
source_path="source.epub",
|
|
206
|
+
target_path="translated.epub",
|
|
207
|
+
target_language=language.ENGLISH,
|
|
208
|
+
submit=SubmitKind.REPLACE,
|
|
209
|
+
llm=llm,
|
|
210
|
+
)
|
|
211
|
+
```
|
|
212
|
+
|
|
168
213
|
#### Language Constants
|
|
169
214
|
|
|
170
215
|
EPUB Translator provides predefined language constants for convenience. You can use these constants instead of writing language names as strings:
|
|
@@ -174,17 +219,19 @@ from epub_translator import language
|
|
|
174
219
|
|
|
175
220
|
# Usage example:
|
|
176
221
|
translate(
|
|
177
|
-
source_path=
|
|
178
|
-
target_path=
|
|
222
|
+
source_path="source.epub",
|
|
223
|
+
target_path="translated.epub",
|
|
179
224
|
target_language=language.ENGLISH,
|
|
225
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
180
226
|
llm=llm,
|
|
181
227
|
)
|
|
182
228
|
|
|
183
229
|
# You can also use custom language strings:
|
|
184
230
|
translate(
|
|
185
|
-
source_path=
|
|
186
|
-
target_path=
|
|
231
|
+
source_path="source.epub",
|
|
232
|
+
target_path="translated.epub",
|
|
187
233
|
target_language="Icelandic", # For languages not in the constants
|
|
234
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
188
235
|
llm=llm,
|
|
189
236
|
)
|
|
190
237
|
```
|
|
@@ -203,9 +250,10 @@ def handle_fill_error(event: FillFailedEvent):
|
|
|
203
250
|
print(" Maximum retries exceeded!")
|
|
204
251
|
|
|
205
252
|
translate(
|
|
206
|
-
source_path=
|
|
207
|
-
target_path=
|
|
253
|
+
source_path="source.epub",
|
|
254
|
+
target_path="translated.epub",
|
|
208
255
|
target_language=language.ENGLISH,
|
|
256
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
209
257
|
llm=llm,
|
|
210
258
|
on_fill_failed=handle_fill_error,
|
|
211
259
|
)
|
|
@@ -239,9 +287,10 @@ fill_llm = LLM(
|
|
|
239
287
|
)
|
|
240
288
|
|
|
241
289
|
translate(
|
|
242
|
-
source_path=
|
|
243
|
-
target_path=
|
|
290
|
+
source_path="source.epub",
|
|
291
|
+
target_path="translated.epub",
|
|
244
292
|
target_language=language.ENGLISH,
|
|
293
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
245
294
|
translation_llm=translation_llm,
|
|
246
295
|
fill_llm=fill_llm,
|
|
247
296
|
)
|
|
@@ -299,9 +348,10 @@ Provide specific translation instructions:
|
|
|
299
348
|
|
|
300
349
|
```python
|
|
301
350
|
translate(
|
|
302
|
-
source_path=
|
|
303
|
-
target_path=
|
|
351
|
+
source_path="source.epub",
|
|
352
|
+
target_path="translated.epub",
|
|
304
353
|
target_language="English",
|
|
354
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
305
355
|
llm=llm,
|
|
306
356
|
user_prompt="Use formal language and preserve technical terminology",
|
|
307
357
|
)
|
|
@@ -45,8 +45,7 @@ The easiest way to use EPUB Translator is through OOMOL Studio with a visual int
|
|
|
45
45
|
### Using Python API
|
|
46
46
|
|
|
47
47
|
```python
|
|
48
|
-
from
|
|
49
|
-
from epub_translator import LLM, translate, language
|
|
48
|
+
from epub_translator import LLM, translate, language, SubmitKind
|
|
50
49
|
|
|
51
50
|
# Initialize LLM with your API credentials
|
|
52
51
|
llm = LLM(
|
|
@@ -58,9 +57,10 @@ llm = LLM(
|
|
|
58
57
|
|
|
59
58
|
# Translate EPUB file using language constants
|
|
60
59
|
translate(
|
|
61
|
-
source_path=
|
|
62
|
-
target_path=
|
|
60
|
+
source_path="source.epub",
|
|
61
|
+
target_path="translated.epub",
|
|
63
62
|
target_language=language.ENGLISH,
|
|
63
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
64
64
|
llm=llm,
|
|
65
65
|
)
|
|
66
66
|
```
|
|
@@ -80,9 +80,10 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
|
|
|
80
80
|
last_progress = progress
|
|
81
81
|
|
|
82
82
|
translate(
|
|
83
|
-
source_path=
|
|
84
|
-
target_path=
|
|
83
|
+
source_path="source.epub",
|
|
84
|
+
target_path="translated.epub",
|
|
85
85
|
target_language="English",
|
|
86
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
86
87
|
llm=llm,
|
|
87
88
|
on_progress=on_progress,
|
|
88
89
|
)
|
|
@@ -119,6 +120,7 @@ translate(
|
|
|
119
120
|
source_path: PathLike | str, # Source EPUB file path
|
|
120
121
|
target_path: PathLike | str, # Output EPUB file path
|
|
121
122
|
target_language: str, # Target language (e.g., "English", "Chinese")
|
|
123
|
+
submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
|
|
122
124
|
user_prompt: str | None = None, # Custom translation instructions
|
|
123
125
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
124
126
|
max_group_tokens: int = 1200, # Maximum tokens per translation group
|
|
@@ -132,6 +134,49 @@ translate(
|
|
|
132
134
|
|
|
133
135
|
**Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
|
|
134
136
|
|
|
137
|
+
#### Submit Modes
|
|
138
|
+
|
|
139
|
+
The `submit` parameter controls how translated content is inserted into the document. Use `SubmitKind` enum to specify the insertion mode:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from epub_translator import SubmitKind
|
|
143
|
+
|
|
144
|
+
# Three available modes:
|
|
145
|
+
# - SubmitKind.REPLACE: Replace original content with translation (single-language output)
|
|
146
|
+
# - SubmitKind.APPEND_TEXT: Append translation as inline text (bilingual output)
|
|
147
|
+
# - SubmitKind.APPEND_BLOCK: Append translation as block elements (bilingual output, recommended)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Mode Comparison:**
|
|
151
|
+
|
|
152
|
+
- **`SubmitKind.REPLACE`**: Creates a single-language translation by replacing original text with translated content. Useful for creating books in the target language only.
|
|
153
|
+
|
|
154
|
+
- **`SubmitKind.APPEND_TEXT`**: Appends translations as inline text immediately after the original content. Both languages appear in the same paragraph, creating a continuous reading flow.
|
|
155
|
+
|
|
156
|
+
- **`SubmitKind.APPEND_BLOCK`** (Recommended): Appends translations as separate block elements (paragraphs) after the original. This creates clear visual separation between languages, making it ideal for side-by-side bilingual reading.
|
|
157
|
+
|
|
158
|
+
**Example:**
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
# For bilingual books (recommended)
|
|
162
|
+
translate(
|
|
163
|
+
source_path="source.epub",
|
|
164
|
+
target_path="translated.epub",
|
|
165
|
+
target_language=language.ENGLISH,
|
|
166
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
167
|
+
llm=llm,
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# For single-language translation
|
|
171
|
+
translate(
|
|
172
|
+
source_path="source.epub",
|
|
173
|
+
target_path="translated.epub",
|
|
174
|
+
target_language=language.ENGLISH,
|
|
175
|
+
submit=SubmitKind.REPLACE,
|
|
176
|
+
llm=llm,
|
|
177
|
+
)
|
|
178
|
+
```
|
|
179
|
+
|
|
135
180
|
#### Language Constants
|
|
136
181
|
|
|
137
182
|
EPUB Translator provides predefined language constants for convenience. You can use these constants instead of writing language names as strings:
|
|
@@ -141,17 +186,19 @@ from epub_translator import language
|
|
|
141
186
|
|
|
142
187
|
# Usage example:
|
|
143
188
|
translate(
|
|
144
|
-
source_path=
|
|
145
|
-
target_path=
|
|
189
|
+
source_path="source.epub",
|
|
190
|
+
target_path="translated.epub",
|
|
146
191
|
target_language=language.ENGLISH,
|
|
192
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
147
193
|
llm=llm,
|
|
148
194
|
)
|
|
149
195
|
|
|
150
196
|
# You can also use custom language strings:
|
|
151
197
|
translate(
|
|
152
|
-
source_path=
|
|
153
|
-
target_path=
|
|
198
|
+
source_path="source.epub",
|
|
199
|
+
target_path="translated.epub",
|
|
154
200
|
target_language="Icelandic", # For languages not in the constants
|
|
201
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
155
202
|
llm=llm,
|
|
156
203
|
)
|
|
157
204
|
```
|
|
@@ -170,9 +217,10 @@ def handle_fill_error(event: FillFailedEvent):
|
|
|
170
217
|
print(" Maximum retries exceeded!")
|
|
171
218
|
|
|
172
219
|
translate(
|
|
173
|
-
source_path=
|
|
174
|
-
target_path=
|
|
220
|
+
source_path="source.epub",
|
|
221
|
+
target_path="translated.epub",
|
|
175
222
|
target_language=language.ENGLISH,
|
|
223
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
176
224
|
llm=llm,
|
|
177
225
|
on_fill_failed=handle_fill_error,
|
|
178
226
|
)
|
|
@@ -206,9 +254,10 @@ fill_llm = LLM(
|
|
|
206
254
|
)
|
|
207
255
|
|
|
208
256
|
translate(
|
|
209
|
-
source_path=
|
|
210
|
-
target_path=
|
|
257
|
+
source_path="source.epub",
|
|
258
|
+
target_path="translated.epub",
|
|
211
259
|
target_language=language.ENGLISH,
|
|
260
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
212
261
|
translation_llm=translation_llm,
|
|
213
262
|
fill_llm=fill_llm,
|
|
214
263
|
)
|
|
@@ -266,9 +315,10 @@ Provide specific translation instructions:
|
|
|
266
315
|
|
|
267
316
|
```python
|
|
268
317
|
translate(
|
|
269
|
-
source_path=
|
|
270
|
-
target_path=
|
|
318
|
+
source_path="source.epub",
|
|
319
|
+
target_path="translated.epub",
|
|
271
320
|
target_language="English",
|
|
321
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
272
322
|
llm=llm,
|
|
273
323
|
user_prompt="Use formal language and preserve technical terminology",
|
|
274
324
|
)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from xml.etree.ElementTree import Element
|
|
2
|
+
|
|
3
|
+
from .xml import iter_with_stack
|
|
4
|
+
|
|
5
|
+
_QUOTE_MAPPING = {
|
|
6
|
+
# 法语引号
|
|
7
|
+
"«": "",
|
|
8
|
+
"»": "",
|
|
9
|
+
"‹": "«",
|
|
10
|
+
"›": "»",
|
|
11
|
+
# 中文书书名号
|
|
12
|
+
"《": "",
|
|
13
|
+
"》": "",
|
|
14
|
+
"〈": "《",
|
|
15
|
+
"〉": "》",
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _strip_quotes(text: str):
|
|
20
|
+
for char in text:
|
|
21
|
+
mapped = _QUOTE_MAPPING.get(char, None)
|
|
22
|
+
if mapped is None:
|
|
23
|
+
yield char
|
|
24
|
+
elif mapped:
|
|
25
|
+
yield mapped
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def unwrap_french_quotes(element: Element) -> Element:
|
|
29
|
+
for _, child_element in iter_with_stack(element):
|
|
30
|
+
if child_element.text:
|
|
31
|
+
child_element.text = "".join(_strip_quotes(child_element.text))
|
|
32
|
+
if child_element.tail:
|
|
33
|
+
child_element.tail = "".join(_strip_quotes(child_element.tail))
|
|
34
|
+
return element
|
|
@@ -4,71 +4,7 @@ from enum import Enum, auto
|
|
|
4
4
|
from typing import Self
|
|
5
5
|
from xml.etree.ElementTree import Element
|
|
6
6
|
|
|
7
|
-
from ..xml import expand_left_element_texts, expand_right_element_texts, normalize_text_in_element
|
|
8
|
-
|
|
9
|
-
# HTML inline-level elements
|
|
10
|
-
# Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
|
|
11
|
-
# Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
|
|
12
|
-
_HTML_INLINE_TAGS = frozenset(
|
|
13
|
-
(
|
|
14
|
-
# Inline text semantics
|
|
15
|
-
"a",
|
|
16
|
-
"abbr",
|
|
17
|
-
"b",
|
|
18
|
-
"bdi",
|
|
19
|
-
"bdo",
|
|
20
|
-
"br",
|
|
21
|
-
"cite",
|
|
22
|
-
"code",
|
|
23
|
-
"data",
|
|
24
|
-
"dfn",
|
|
25
|
-
"em",
|
|
26
|
-
"i",
|
|
27
|
-
"kbd",
|
|
28
|
-
"mark",
|
|
29
|
-
"q",
|
|
30
|
-
"rp",
|
|
31
|
-
"rt",
|
|
32
|
-
"ruby",
|
|
33
|
-
"s",
|
|
34
|
-
"samp",
|
|
35
|
-
"small",
|
|
36
|
-
"span",
|
|
37
|
-
"strong",
|
|
38
|
-
"sub",
|
|
39
|
-
"sup",
|
|
40
|
-
"time",
|
|
41
|
-
"u",
|
|
42
|
-
"var",
|
|
43
|
-
"wbr",
|
|
44
|
-
# Image and multimedia
|
|
45
|
-
"img",
|
|
46
|
-
"svg",
|
|
47
|
-
"canvas",
|
|
48
|
-
"audio",
|
|
49
|
-
"video",
|
|
50
|
-
"map",
|
|
51
|
-
"area",
|
|
52
|
-
# Form elements
|
|
53
|
-
"input",
|
|
54
|
-
"button",
|
|
55
|
-
"select",
|
|
56
|
-
"textarea",
|
|
57
|
-
"label",
|
|
58
|
-
"output",
|
|
59
|
-
"progress",
|
|
60
|
-
"meter",
|
|
61
|
-
# Embedded content
|
|
62
|
-
"iframe",
|
|
63
|
-
"embed",
|
|
64
|
-
"object",
|
|
65
|
-
# Other inline elements
|
|
66
|
-
"script",
|
|
67
|
-
"del",
|
|
68
|
-
"ins",
|
|
69
|
-
"slot",
|
|
70
|
-
)
|
|
71
|
-
)
|
|
7
|
+
from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_tag, normalize_text_in_element
|
|
72
8
|
|
|
73
9
|
|
|
74
10
|
class TextPosition(Enum):
|
|
@@ -196,8 +132,7 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
|
|
|
196
132
|
def _find_block_depth(parent_stack: list[Element]) -> int:
|
|
197
133
|
index: int = 0
|
|
198
134
|
for i in range(len(parent_stack) - 1, -1, -1):
|
|
199
|
-
|
|
200
|
-
if checked_tag not in _HTML_INLINE_TAGS:
|
|
135
|
+
if not is_inline_tag(parent_stack[i].tag):
|
|
201
136
|
index = i
|
|
202
137
|
break
|
|
203
138
|
return index + 1 # depth is a count not index
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from collections.abc import Callable
|
|
1
|
+
from collections.abc import Callable, Generator
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from enum import Enum, auto
|
|
4
4
|
from importlib.metadata import version as get_package_version
|
|
@@ -15,9 +15,10 @@ from .epub import (
|
|
|
15
15
|
)
|
|
16
16
|
from .epub_transcode import decode_metadata, decode_toc_list, encode_metadata, encode_toc_list
|
|
17
17
|
from .llm import LLM
|
|
18
|
+
from .punctuation import unwrap_french_quotes
|
|
18
19
|
from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
|
|
19
20
|
from .xml_interrupter import XMLInterrupter
|
|
20
|
-
from .xml_translator import FillFailedEvent, XMLTranslator
|
|
21
|
+
from .xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
class _ElementType(Enum):
|
|
@@ -36,6 +37,7 @@ def translate(
|
|
|
36
37
|
source_path: PathLike | str,
|
|
37
38
|
target_path: PathLike | str,
|
|
38
39
|
target_language: str,
|
|
40
|
+
submit: SubmitKind,
|
|
39
41
|
user_prompt: str | None = None,
|
|
40
42
|
max_retries: int = 5,
|
|
41
43
|
max_group_tokens: int = 1200,
|
|
@@ -83,33 +85,26 @@ def translate(
|
|
|
83
85
|
return
|
|
84
86
|
|
|
85
87
|
interrupter = XMLInterrupter()
|
|
86
|
-
element_contexts: dict[int, _ElementContext] = {}
|
|
87
|
-
|
|
88
88
|
toc_weight = 0.05 if toc_has_items else 0
|
|
89
89
|
metadata_weight = 0.05 if metadata_has_items else 0
|
|
90
90
|
chapters_weight = 1.0 - toc_weight - metadata_weight
|
|
91
91
|
progress_per_chapter = chapters_weight / total_chapters if total_chapters > 0 else 0
|
|
92
92
|
current_progress = 0.0
|
|
93
93
|
|
|
94
|
-
for translated_elem in translator.translate_elements(
|
|
94
|
+
for translated_elem, context in translator.translate_elements(
|
|
95
95
|
interrupt_source_text_segments=interrupter.interrupt_source_text_segments,
|
|
96
96
|
interrupt_translated_text_segments=interrupter.interrupt_translated_text_segments,
|
|
97
97
|
interrupt_block_element=interrupter.interrupt_block_element,
|
|
98
98
|
on_fill_failed=on_fill_failed,
|
|
99
|
-
|
|
99
|
+
tasks=_generate_tasks_from_book(
|
|
100
100
|
zip=zip,
|
|
101
101
|
toc_list=toc_list,
|
|
102
102
|
metadata_fields=metadata_fields,
|
|
103
|
-
|
|
103
|
+
submit=submit,
|
|
104
104
|
),
|
|
105
105
|
):
|
|
106
|
-
elem_id = id(translated_elem)
|
|
107
|
-
context = element_contexts.pop(elem_id, None)
|
|
108
|
-
|
|
109
|
-
if context is None:
|
|
110
|
-
continue
|
|
111
|
-
|
|
112
106
|
if context.element_type == _ElementType.TOC:
|
|
107
|
+
translated_elem = unwrap_french_quotes(translated_elem)
|
|
113
108
|
decoded_toc = decode_toc_list(translated_elem)
|
|
114
109
|
write_toc(zip, decoded_toc)
|
|
115
110
|
|
|
@@ -118,6 +113,7 @@ def translate(
|
|
|
118
113
|
on_progress(current_progress)
|
|
119
114
|
|
|
120
115
|
elif context.element_type == _ElementType.METADATA:
|
|
116
|
+
translated_elem = unwrap_french_quotes(translated_elem)
|
|
121
117
|
decoded_metadata = decode_metadata(translated_elem)
|
|
122
118
|
write_metadata(zip, decoded_metadata)
|
|
123
119
|
|
|
@@ -137,23 +133,29 @@ def translate(
|
|
|
137
133
|
on_progress(current_progress)
|
|
138
134
|
|
|
139
135
|
|
|
140
|
-
def _generate_elements_from_book(
|
|
136
|
+
def _generate_tasks_from_book(
|
|
141
137
|
zip: Zip,
|
|
142
138
|
toc_list: list,
|
|
143
139
|
metadata_fields: list,
|
|
144
|
-
|
|
145
|
-
):
|
|
140
|
+
submit: SubmitKind,
|
|
141
|
+
) -> Generator[TranslationTask[_ElementContext], None, None]:
|
|
142
|
+
head_submit = submit
|
|
143
|
+
if head_submit == SubmitKind.APPEND_BLOCK:
|
|
144
|
+
head_submit = SubmitKind.APPEND_TEXT
|
|
145
|
+
|
|
146
146
|
if toc_list:
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
147
|
+
yield TranslationTask(
|
|
148
|
+
element=encode_toc_list(toc_list),
|
|
149
|
+
action=head_submit,
|
|
150
|
+
payload=_ElementContext(element_type=_ElementType.TOC),
|
|
151
|
+
)
|
|
151
152
|
|
|
152
153
|
if metadata_fields:
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
154
|
+
yield TranslationTask(
|
|
155
|
+
element=encode_metadata(metadata_fields),
|
|
156
|
+
action=head_submit,
|
|
157
|
+
payload=_ElementContext(element_type=_ElementType.METADATA),
|
|
158
|
+
)
|
|
157
159
|
|
|
158
160
|
for chapter_path, media_type in search_spine_paths(zip):
|
|
159
161
|
with zip.read(chapter_path) as chapter_file:
|
|
@@ -163,12 +165,14 @@ def _generate_elements_from_book(
|
|
|
163
165
|
)
|
|
164
166
|
body_element = find_first(xml.element, "body")
|
|
165
167
|
if body_element is not None:
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
168
|
+
yield TranslationTask(
|
|
169
|
+
element=body_element,
|
|
170
|
+
action=submit,
|
|
171
|
+
payload=_ElementContext(
|
|
172
|
+
element_type=_ElementType.CHAPTER,
|
|
173
|
+
chapter_data=(chapter_path, xml),
|
|
174
|
+
),
|
|
170
175
|
)
|
|
171
|
-
yield body_element
|
|
172
176
|
|
|
173
177
|
|
|
174
178
|
def _get_version() -> str:
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# HTML inline-level elements
|
|
2
|
+
# Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
|
|
3
|
+
# Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
|
|
4
|
+
_HTML_INLINE_TAGS = frozenset(
|
|
5
|
+
(
|
|
6
|
+
# Inline text semantics
|
|
7
|
+
"a",
|
|
8
|
+
"abbr",
|
|
9
|
+
"b",
|
|
10
|
+
"bdi",
|
|
11
|
+
"bdo",
|
|
12
|
+
"br",
|
|
13
|
+
"cite",
|
|
14
|
+
"code",
|
|
15
|
+
"data",
|
|
16
|
+
"dfn",
|
|
17
|
+
"em",
|
|
18
|
+
"i",
|
|
19
|
+
"kbd",
|
|
20
|
+
"mark",
|
|
21
|
+
"q",
|
|
22
|
+
"rp",
|
|
23
|
+
"rt",
|
|
24
|
+
"ruby",
|
|
25
|
+
"s",
|
|
26
|
+
"samp",
|
|
27
|
+
"small",
|
|
28
|
+
"span",
|
|
29
|
+
"strong",
|
|
30
|
+
"sub",
|
|
31
|
+
"sup",
|
|
32
|
+
"time",
|
|
33
|
+
"u",
|
|
34
|
+
"var",
|
|
35
|
+
"wbr",
|
|
36
|
+
# Image and multimedia
|
|
37
|
+
"img",
|
|
38
|
+
"svg",
|
|
39
|
+
"canvas",
|
|
40
|
+
"audio",
|
|
41
|
+
"video",
|
|
42
|
+
"map",
|
|
43
|
+
"area",
|
|
44
|
+
# Form elements
|
|
45
|
+
"input",
|
|
46
|
+
"button",
|
|
47
|
+
"select",
|
|
48
|
+
"textarea",
|
|
49
|
+
"label",
|
|
50
|
+
"output",
|
|
51
|
+
"progress",
|
|
52
|
+
"meter",
|
|
53
|
+
# Embedded content
|
|
54
|
+
"iframe",
|
|
55
|
+
"embed",
|
|
56
|
+
"object",
|
|
57
|
+
# Other inline elements
|
|
58
|
+
"script",
|
|
59
|
+
"del",
|
|
60
|
+
"ins",
|
|
61
|
+
"slot",
|
|
62
|
+
)
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def is_inline_tag(tag: str) -> bool:
|
|
67
|
+
return tag.lower() in _HTML_INLINE_TAGS
|