epub-translator 0.0.7__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +4 -2
- epub_translator/data/fill.jinja +66 -0
- epub_translator/data/mmltex/README.md +67 -0
- epub_translator/data/mmltex/cmarkup.xsl +1106 -0
- epub_translator/data/mmltex/entities.xsl +459 -0
- epub_translator/data/mmltex/glayout.xsl +222 -0
- epub_translator/data/mmltex/mmltex.xsl +36 -0
- epub_translator/data/mmltex/scripts.xsl +375 -0
- epub_translator/data/mmltex/tables.xsl +130 -0
- epub_translator/data/mmltex/tokens.xsl +328 -0
- epub_translator/data/translate.jinja +15 -12
- epub_translator/epub/__init__.py +4 -2
- epub_translator/epub/common.py +43 -0
- epub_translator/epub/math.py +193 -0
- epub_translator/epub/placeholder.py +53 -0
- epub_translator/epub/spines.py +42 -0
- epub_translator/epub/toc.py +505 -0
- epub_translator/epub/zip.py +67 -0
- epub_translator/iter_sync.py +24 -0
- epub_translator/language.py +23 -0
- epub_translator/llm/__init__.py +2 -1
- epub_translator/llm/core.py +233 -0
- epub_translator/llm/error.py +38 -35
- epub_translator/llm/executor.py +159 -136
- epub_translator/llm/increasable.py +28 -28
- epub_translator/llm/types.py +17 -0
- epub_translator/serial/__init__.py +2 -0
- epub_translator/serial/chunk.py +52 -0
- epub_translator/serial/segment.py +17 -0
- epub_translator/serial/splitter.py +50 -0
- epub_translator/template.py +35 -33
- epub_translator/translator.py +208 -178
- epub_translator/utils.py +7 -0
- epub_translator/xml/__init__.py +4 -3
- epub_translator/xml/deduplication.py +38 -0
- epub_translator/xml/firendly/__init__.py +2 -0
- epub_translator/xml/firendly/decoder.py +75 -0
- epub_translator/xml/firendly/encoder.py +84 -0
- epub_translator/xml/firendly/parser.py +177 -0
- epub_translator/xml/firendly/tag.py +118 -0
- epub_translator/xml/firendly/transform.py +36 -0
- epub_translator/xml/xml.py +52 -0
- epub_translator/xml/xml_like.py +231 -0
- epub_translator/xml_translator/__init__.py +3 -0
- epub_translator/xml_translator/const.py +2 -0
- epub_translator/xml_translator/fill.py +128 -0
- epub_translator/xml_translator/format.py +282 -0
- epub_translator/xml_translator/fragmented.py +125 -0
- epub_translator/xml_translator/group.py +183 -0
- epub_translator/xml_translator/progressive_locking.py +256 -0
- epub_translator/xml_translator/submitter.py +102 -0
- epub_translator/xml_translator/text_segment.py +263 -0
- epub_translator/xml_translator/translator.py +179 -0
- epub_translator/xml_translator/utils.py +29 -0
- epub_translator-0.1.1.dist-info/METADATA +283 -0
- epub_translator-0.1.1.dist-info/RECORD +58 -0
- epub_translator/data/format.jinja +0 -33
- epub_translator/epub/content_parser.py +0 -162
- epub_translator/epub/html/__init__.py +0 -1
- epub_translator/epub/html/dom_operator.py +0 -68
- epub_translator/epub/html/empty_tags.py +0 -23
- epub_translator/epub/html/file.py +0 -80
- epub_translator/epub/html/texts_searcher.py +0 -46
- epub_translator/llm/node.py +0 -201
- epub_translator/translation/__init__.py +0 -2
- epub_translator/translation/chunk.py +0 -118
- epub_translator/translation/splitter.py +0 -78
- epub_translator/translation/store.py +0 -36
- epub_translator/translation/translation.py +0 -231
- epub_translator/translation/types.py +0 -45
- epub_translator/translation/utils.py +0 -11
- epub_translator/xml/decoder.py +0 -71
- epub_translator/xml/encoder.py +0 -95
- epub_translator/xml/parser.py +0 -172
- epub_translator/xml/tag.py +0 -93
- epub_translator/xml/transform.py +0 -34
- epub_translator/xml/utils.py +0 -12
- epub_translator/zip_context.py +0 -74
- epub_translator-0.0.7.dist-info/METADATA +0 -170
- epub_translator-0.0.7.dist-info/RECORD +0 -36
- {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: epub-translator
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: epub,llm,translation,translator
|
|
7
|
+
Author: Tao Zeyu
|
|
8
|
+
Author-email: i@taozeyu.com
|
|
9
|
+
Maintainer: Tao Zeyu
|
|
10
|
+
Maintainer-email: i@taozeyu.com
|
|
11
|
+
Requires-Python: >=3.11,<3.14
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Education
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Topic :: Software Development :: Localization
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
26
|
+
Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
|
|
27
|
+
Requires-Dist: openai (>=2.14.0,<3.0.0)
|
|
28
|
+
Requires-Dist: resource-segmentation (>=0.0.7,<0.1.0)
|
|
29
|
+
Requires-Dist: tiktoken (>=0.12.0,<1.0.0)
|
|
30
|
+
Project-URL: Homepage, https://hub.oomol.com/package/books-translator
|
|
31
|
+
Project-URL: Repository, https://github.com/oomol-lab/epub-translator
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
<div align=center>
|
|
35
|
+
<h1>EPUB Translator</h1>
|
|
36
|
+
<p>
|
|
37
|
+
<a href="https://github.com/oomol-lab/epub-translator/actions/workflows/merge-build.yml" target="_blank"><img src="https://img.shields.io/github/actions/workflow/status/oomol-lab/epub-translator/merge-build.yml" alt="ci" /></a>
|
|
38
|
+
<a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/badge/pip_install-epub--translator-blue" alt="pip install epub-translator" /></a>
|
|
39
|
+
<a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/pypi/v/epub-translator.svg" alt="pypi epub-translator" /></a>
|
|
40
|
+
<a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/pypi/pyversions/epub-translator.svg" alt="python versions" /></a>
|
|
41
|
+
<a href="https://github.com/oomol-lab/epub-translator/blob/main/LICENSE" target="_blank"><img src="https://img.shields.io/github/license/oomol-lab/epub-translator" alt="license" /></a>
|
|
42
|
+
</p>
|
|
43
|
+
<p><a href="https://hub.oomol.com/package/books-translator?open=true" target="_blank"><img src="https://static.oomol.com/assets/button.svg" alt="Open in OOMOL Studio" /></a></p>
|
|
44
|
+
<p>English | <a href="./README_zh-CN.md">中文</a></p>
|
|
45
|
+
</div>
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
Translate EPUB books using Large Language Models while preserving the original text. The translated content is displayed side-by-side with the original, creating bilingual books perfect for language learning and cross-reference reading.
|
|
49
|
+
|
|
50
|
+

|
|
51
|
+
|
|
52
|
+
## Features
|
|
53
|
+
|
|
54
|
+
- **Bilingual Output**: Preserves original text alongside translations for easy comparison
|
|
55
|
+
- **LLM-Powered**: Leverages large language models for high-quality, context-aware translations
|
|
56
|
+
- **Format Preservation**: Maintains EPUB structure, styles, images, and formatting
|
|
57
|
+
- **Complete Translation**: Translates chapter content, table of contents, and metadata
|
|
58
|
+
- **Progress Tracking**: Monitor translation progress with built-in callbacks
|
|
59
|
+
- **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
|
|
60
|
+
- **Caching**: Built-in caching for progress recovery when translation fails
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install epub-translator
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Requirements**: Python 3.11, 3.12, or 3.13
|
|
69
|
+
|
|
70
|
+
## Quick Start
|
|
71
|
+
|
|
72
|
+
### Using OOMOL Studio (Recommended)
|
|
73
|
+
|
|
74
|
+
The easiest way to use EPUB Translator is through OOMOL Studio with a visual interface:
|
|
75
|
+
|
|
76
|
+
[Watch the video on YouTube](https://www.youtube.com/watch?v=QsAdiskxfXI)
|
|
77
|
+
|
|
78
|
+
### Using Python API
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from pathlib import Path
|
|
82
|
+
from epub_translator import LLM, translate, language
|
|
83
|
+
|
|
84
|
+
# Initialize LLM with your API credentials
|
|
85
|
+
llm = LLM(
|
|
86
|
+
key="your-api-key",
|
|
87
|
+
url="https://api.openai.com/v1",
|
|
88
|
+
model="gpt-4",
|
|
89
|
+
token_encoding="o200k_base",
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
# Translate EPUB file using language constants
|
|
93
|
+
translate(
|
|
94
|
+
llm=llm,
|
|
95
|
+
source_path=Path("source.epub"),
|
|
96
|
+
target_path=Path("translated.epub"),
|
|
97
|
+
target_language=language.ENGLISH,
|
|
98
|
+
)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### With Progress Tracking
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from tqdm import tqdm
|
|
105
|
+
|
|
106
|
+
with tqdm(total=100, desc="Translating", unit="%") as pbar:
|
|
107
|
+
last_progress = 0.0
|
|
108
|
+
|
|
109
|
+
def on_progress(progress: float):
|
|
110
|
+
global last_progress
|
|
111
|
+
increment = (progress - last_progress) * 100
|
|
112
|
+
pbar.update(increment)
|
|
113
|
+
last_progress = progress
|
|
114
|
+
|
|
115
|
+
translate(
|
|
116
|
+
llm=llm,
|
|
117
|
+
source_path=Path("source.epub"),
|
|
118
|
+
target_path=Path("translated.epub"),
|
|
119
|
+
target_language="English",
|
|
120
|
+
on_progress=on_progress,
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## API Reference
|
|
125
|
+
|
|
126
|
+
### `LLM` Class
|
|
127
|
+
|
|
128
|
+
Initialize the LLM client for translation:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
LLM(
|
|
132
|
+
key: str, # API key
|
|
133
|
+
url: str, # API endpoint URL
|
|
134
|
+
model: str, # Model name (e.g., "gpt-4")
|
|
135
|
+
token_encoding: str, # Token encoding (e.g., "o200k_base")
|
|
136
|
+
cache_path: PathLike | None = None, # Cache directory path
|
|
137
|
+
timeout: float | None = None, # Request timeout in seconds
|
|
138
|
+
top_p: float | tuple[float, float] | None = None,
|
|
139
|
+
temperature: float | tuple[float, float] | None = None,
|
|
140
|
+
retry_times: int = 5, # Number of retries on failure
|
|
141
|
+
retry_interval_seconds: float = 6.0, # Interval between retries
|
|
142
|
+
log_dir_path: PathLike | None = None, # Log directory path
|
|
143
|
+
)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### `translate` Function
|
|
147
|
+
|
|
148
|
+
Translate an EPUB file:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
translate(
|
|
152
|
+
llm: LLM, # LLM instance
|
|
153
|
+
source_path: Path, # Source EPUB file path
|
|
154
|
+
target_path: Path, # Output EPUB file path
|
|
155
|
+
target_language: str, # Target language (e.g., "English", "Chinese")
|
|
156
|
+
user_prompt: str | None = None, # Custom translation instructions
|
|
157
|
+
max_retries: int = 5, # Maximum retries for failed translations
|
|
158
|
+
max_group_tokens: int = 1200, # Maximum tokens per translation group
|
|
159
|
+
on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
#### Language Constants
|
|
164
|
+
|
|
165
|
+
EPUB Translator provides predefined language constants for convenience. You can use these constants instead of writing language names as strings:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
from epub_translator import language
|
|
169
|
+
|
|
170
|
+
# Usage example:
|
|
171
|
+
translate(
|
|
172
|
+
llm=llm,
|
|
173
|
+
source_path=Path("source.epub"),
|
|
174
|
+
target_path=Path("translated.epub"),
|
|
175
|
+
target_language=language.ENGLISH,
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
# You can also use custom language strings:
|
|
179
|
+
translate(
|
|
180
|
+
llm=llm,
|
|
181
|
+
source_path=Path("source.epub"),
|
|
182
|
+
target_path=Path("translated.epub"),
|
|
183
|
+
target_language="Icelandic", # For languages not in the constants
|
|
184
|
+
)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## Configuration Examples
|
|
188
|
+
|
|
189
|
+
### OpenAI
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
llm = LLM(
|
|
193
|
+
key="sk-...",
|
|
194
|
+
url="https://api.openai.com/v1",
|
|
195
|
+
model="gpt-4",
|
|
196
|
+
token_encoding="o200k_base",
|
|
197
|
+
)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### Azure OpenAI
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
llm = LLM(
|
|
204
|
+
key="your-azure-key",
|
|
205
|
+
url="https://your-resource.openai.azure.com/openai/deployments/your-deployment",
|
|
206
|
+
model="gpt-4",
|
|
207
|
+
token_encoding="o200k_base",
|
|
208
|
+
)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Other OpenAI-Compatible Services
|
|
212
|
+
|
|
213
|
+
Any service with an OpenAI-compatible API can be used:
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
llm = LLM(
|
|
217
|
+
key="your-api-key",
|
|
218
|
+
url="https://your-service.com/v1",
|
|
219
|
+
model="your-model",
|
|
220
|
+
token_encoding="o200k_base", # Match your model's encoding
|
|
221
|
+
)
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## Use Cases
|
|
225
|
+
|
|
226
|
+
- **Language Learning**: Read books in their original language with side-by-side translations
|
|
227
|
+
- **Academic Research**: Access foreign literature with bilingual references
|
|
228
|
+
- **Content Localization**: Prepare books for international audiences
|
|
229
|
+
- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
|
|
230
|
+
|
|
231
|
+
## Advanced Features
|
|
232
|
+
|
|
233
|
+
### Custom Translation Prompts
|
|
234
|
+
|
|
235
|
+
Provide specific translation instructions:
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
translate(
|
|
239
|
+
llm=llm,
|
|
240
|
+
source_path=Path("source.epub"),
|
|
241
|
+
target_path=Path("translated.epub"),
|
|
242
|
+
target_language="English",
|
|
243
|
+
user_prompt="Use formal language and preserve technical terminology",
|
|
244
|
+
)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### Caching for Progress Recovery
|
|
248
|
+
|
|
249
|
+
Enable caching to resume translation progress after failures:
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
llm = LLM(
|
|
253
|
+
key="your-api-key",
|
|
254
|
+
url="https://api.openai.com/v1",
|
|
255
|
+
model="gpt-4",
|
|
256
|
+
token_encoding="o200k_base",
|
|
257
|
+
cache_path="./translation_cache", # Translations are cached here
|
|
258
|
+
)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
## Related Projects
|
|
262
|
+
|
|
263
|
+
### PDF Craft
|
|
264
|
+
|
|
265
|
+
[PDF Craft](https://github.com/oomol-lab/pdf-craft) converts PDF files into EPUB and other formats, with a focus on scanned books. Combine PDF Craft with EPUB Translator to convert and translate scanned PDF books into bilingual EPUB format.
|
|
266
|
+
|
|
267
|
+
**Workflow**: Scanned PDF → [PDF Craft] → EPUB → [EPUB Translator] → Bilingual EPUB
|
|
268
|
+
|
|
269
|
+
For a complete tutorial, watch: [Convert scanned PDF books to EPUB format and translate them into bilingual books](https://www.bilibili.com/video/BV1tMQZY5EYY/)
|
|
270
|
+
|
|
271
|
+
## Contributing
|
|
272
|
+
|
|
273
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
274
|
+
|
|
275
|
+
## License
|
|
276
|
+
|
|
277
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
278
|
+
|
|
279
|
+
## Support
|
|
280
|
+
|
|
281
|
+
- **Issues**: [GitHub Issues](https://github.com/oomol-lab/epub-translator/issues)
|
|
282
|
+
- **OOMOL Studio**: [Open in OOMOL Studio](https://hub.oomol.com/package/books-translator?open=true)
|
|
283
|
+
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
epub_translator/__init__.py,sha256=CDj5ZDWZreLKF3LdHf6QmGbUABytunhUBPJwYbpeIKc,122
|
|
2
|
+
epub_translator/data/fill.jinja,sha256=Rk8EodbDOEHS-W20CGJbhSNBHdZI1EIz414mQf_B4tY,1390
|
|
3
|
+
epub_translator/data/mmltex/README.md,sha256=wwhe5yW1U_7_YZIFKnQVnCOmUl7Mu3gsr3lNnDSJ5Qs,2953
|
|
4
|
+
epub_translator/data/mmltex/cmarkup.xsl,sha256=DkhimAATM0XSCfVOfY41-qTPoddqzOHjZ00Pynr4zQE,37707
|
|
5
|
+
epub_translator/data/mmltex/entities.xsl,sha256=TYZ5iGg0u9XlDDBBGuZiHL7MsxKc-3OsTIBAVM1GDek,107742
|
|
6
|
+
epub_translator/data/mmltex/glayout.xsl,sha256=Ztc7N1wiHaYZlo9u9iuROrIl3uIIIoo1VFIuojXq7TM,6879
|
|
7
|
+
epub_translator/data/mmltex/mmltex.xsl,sha256=BVXFbApz-9W2qRKKtBTxptK5vxG2bfB8tv9W1MP5iBI,1384
|
|
8
|
+
epub_translator/data/mmltex/scripts.xsl,sha256=f4ei0cDCW3cV-Ra7rC3kC5tRcKdjJxbSpCeQLoohtgo,13697
|
|
9
|
+
epub_translator/data/mmltex/tables.xsl,sha256=RxtNo8qDtVAg8_6BuYsafraB_0z7YDAB9D__fT9gmWs,4327
|
|
10
|
+
epub_translator/data/mmltex/tokens.xsl,sha256=j3JZRcBhAiiY8o5K3640phfLwxO8JVspCFlSttwBzJk,12373
|
|
11
|
+
epub_translator/data/translate.jinja,sha256=93d8kschm5HV-EfXd1kFSIVMObDqTMdoUrwDfce2bhU,820
|
|
12
|
+
epub_translator/epub/__init__.py,sha256=KpGWmHS4y0tBAIYp3v8G3k1u1KGaKNeloOvt3osAL5c,154
|
|
13
|
+
epub_translator/epub/common.py,sha256=4-SpTe8iot9hMfyXILmlUFvYVNYqPAHL5hn1fr2wgis,1180
|
|
14
|
+
epub_translator/epub/math.py,sha256=-Q2LJQxxjgQZQUe_WlJA9tjzLqgqtw2ZmbGbHsPRp2U,5422
|
|
15
|
+
epub_translator/epub/placeholder.py,sha256=ywBrFo4ZgwtTZpl_mNwWVhT1xyk3JZGD0qrpQi4u1DM,1613
|
|
16
|
+
epub_translator/epub/spines.py,sha256=bkLgalqJ2sId4enmrnYnGUMs74vohxRkXPbxqbOQuyw,1277
|
|
17
|
+
epub_translator/epub/toc.py,sha256=TKJfyDT4svFkXd6JCNZk2ZEYc9q-5DXnV3zY2UKo8nE,14891
|
|
18
|
+
epub_translator/epub/zip.py,sha256=CUE50LrrVNeQVecNm2ZFionJz4k_vMTXTi8an7BiQ_c,2314
|
|
19
|
+
epub_translator/iter_sync.py,sha256=56m-bRPqc731alGenqLvCIM99J8NzNuie86FDGtJj8k,588
|
|
20
|
+
epub_translator/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
|
|
21
|
+
epub_translator/llm/__init__.py,sha256=QcAuTQpH0T7IMf-J3bRdtf8Tvyu6Z2CAe-wSzLJRLLw,43
|
|
22
|
+
epub_translator/llm/core.py,sha256=nRNAVDQD7kxSl2EN7m5OQ7CvlBL4ENbzQThUcJSzMsk,8123
|
|
23
|
+
epub_translator/llm/error.py,sha256=4efAIQL14DFSvAnSTUfgdAbZRqaWBqOfUGsSfvxa5zM,1503
|
|
24
|
+
epub_translator/llm/executor.py,sha256=Oax07rwivDbB0T3i_waLAvXvfQoR9dnWPTvw475C9vQ,6081
|
|
25
|
+
epub_translator/llm/increasable.py,sha256=vQka-bysKuFR-Vu-GziGZfQCasLn9q2GxGEoV2RiCec,1289
|
|
26
|
+
epub_translator/llm/types.py,sha256=-VPfl1qjTZ8s8rQfqZ44H_txZfVmx49TZdQSvp2vUU4,264
|
|
27
|
+
epub_translator/serial/__init__.py,sha256=b3IMVmWcUwEqHKcGmey88b057pyz5ct946CaUZi4LB4,67
|
|
28
|
+
epub_translator/serial/chunk.py,sha256=FrTaHikVOd6bLYumnEriTaAQ_DIDLjHm16gh-wBVR9k,1495
|
|
29
|
+
epub_translator/serial/segment.py,sha256=uEz-ke1KcYrON-68FaUEzMG2CzHlMjvbC11F3ZT4yH0,446
|
|
30
|
+
epub_translator/serial/splitter.py,sha256=Nq0sxPXos8ez7QBG01sOKjnYKbeBWUBHflZGtqenVm8,1726
|
|
31
|
+
epub_translator/template.py,sha256=0CqRmj3nTtPshw0NmTr2ECqelops2MMyX94fMrE-HKs,1587
|
|
32
|
+
epub_translator/translator.py,sha256=vEccCEFc-mArX4DzvUz09W_WFOxUv6dlQkwWDkbbVFs,6976
|
|
33
|
+
epub_translator/utils.py,sha256=7lBWHNyv4GQiutqqqUhbAxc8gqVIkhS7B4rkL1EKOFs,144
|
|
34
|
+
epub_translator/xml/__init__.py,sha256=te8vIRgG-2n1fEcTmNzCLc-WH9G0JUr_lJncJQvRbgw,96
|
|
35
|
+
epub_translator/xml/deduplication.py,sha256=Vc7BtXXnAMQHNtE--o2Qkm_sYrjnJSh33reKFh9YUjo,1143
|
|
36
|
+
epub_translator/xml/firendly/__init__.py,sha256=I5jhnhFWoHvojLsYXH4jfR4Gi8lKFZ3yQ56ze5hEe1M,74
|
|
37
|
+
epub_translator/xml/firendly/decoder.py,sha256=xRQ5LnSunmYbba_0oT39oUr86-sLYAHYMUGmlseIu2U,2467
|
|
38
|
+
epub_translator/xml/firendly/encoder.py,sha256=evjvw6oE-oCud44IsJ-YZVHn6dtUzjNYX25ljaZP6vY,2417
|
|
39
|
+
epub_translator/xml/firendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42NXCauIFV-o,6560
|
|
40
|
+
epub_translator/xml/firendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
|
|
41
|
+
epub_translator/xml/firendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
|
|
42
|
+
epub_translator/xml/xml.py,sha256=7NPinMOFGBeOHCG-hw0iQjL-p-_I4DmYL8lq0Ar8rag,1498
|
|
43
|
+
epub_translator/xml/xml_like.py,sha256=tgzqDQFfql9-QMSRbLf9SVlNsvyZXJTCEWmksxd3TuI,9489
|
|
44
|
+
epub_translator/xml_translator/__init__.py,sha256=yNgwIermFXaRfAfnqXaNFCEf5I95cBVUDxha-6xkLq0,117
|
|
45
|
+
epub_translator/xml_translator/const.py,sha256=Q9pmLplUR71TqF4MN5oLtPNl_pBRWoOJwsC5eIQOOWE,57
|
|
46
|
+
epub_translator/xml_translator/fill.py,sha256=LxkPxlfbDDB3gP1rciXEBFyi1QRj5vXWzdca5SBcd5o,4839
|
|
47
|
+
epub_translator/xml_translator/format.py,sha256=xupnymuvr0hNJnjZ3-M4x3WiHN7LdgvHTQCUZkAug5U,10524
|
|
48
|
+
epub_translator/xml_translator/fragmented.py,sha256=DMueQlGNLbW70dwoZHRNLypzVmxyXAGxkbsc6K5115M,4569
|
|
49
|
+
epub_translator/xml_translator/group.py,sha256=2GxJl3RojyHyMuTZ5cn5PITT-F2fdaBlvAhfsn_o08Q,6977
|
|
50
|
+
epub_translator/xml_translator/progressive_locking.py,sha256=2eoCzVNeV4e4TziYTk4UgKmBUGuFQFj7X24ejO75lUA,9613
|
|
51
|
+
epub_translator/xml_translator/submitter.py,sha256=bIoxhUIDMScgnxnqfCKR8d3u1DaISXqIM2WuHzrNU7M,4022
|
|
52
|
+
epub_translator/xml_translator/text_segment.py,sha256=Aue5XHKYKzTuinFExcdu0CqGY5TiuJoIIhbP9t5ubPg,7673
|
|
53
|
+
epub_translator/xml_translator/translator.py,sha256=FGSXo2UWtcoIOWGzkI4emyqp1Q2Z8EoOBCBmdtty18A,7063
|
|
54
|
+
epub_translator/xml_translator/utils.py,sha256=AIJOcB7Btad0yxxLwD3UC9NTk2gOPEM8qqx7sNO6tDc,626
|
|
55
|
+
epub_translator-0.1.1.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
|
|
56
|
+
epub_translator-0.1.1.dist-info/METADATA,sha256=BJDV44wO93Nw7e1hqBV33HXK8KUa_JO2XJ1qQ22RGmc,9655
|
|
57
|
+
epub_translator-0.1.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
58
|
+
epub_translator-0.1.1.dist-info/RECORD,,
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
你是一个校对员,需要帮助用户校对译文与原文的对应关系。用户随后会提交一段文本,先是一段表示原文的XML格式内容,接下来是一段纯文本的{{ target_language }}译文。原文XML文本以<request>标签作为根,<fragment>标签作为子元素。原文将按顺序拆分在各个<fragment>标签中,每个<fragment>标签将包含一个id属性,以唯一标识该片段。
|
|
2
|
-
|
|
3
|
-
你要将{{ target_language }}译文正确分割成片段,并与原文一一对应。并模仿用户提交的格式,将根节点由<request>替换成<response>节点,再将<fragment>的内容由原文替换成{{ target_language }}译文,但保留id不变。最终将整个XML格式内容输出。你的输出必须满足如下规则:
|
|
4
|
-
- 分割后的片段对应标准是语义一致。即对应片段的原文与译文互相翻译后,是完全相同的内容。
|
|
5
|
-
- 替换后的译文必须严格与用户提交的译文对应,不得增加、修改、删除内容。
|
|
6
|
-
- 将你输出<fragment>中的译文单独提取出来按顺序读出来,应与用户提交的{{ target_language }}译文一字不差。
|
|
7
|
-
- 译文必须是其对应的原文的直接翻译。
|
|
8
|
-
- 绝大部分情况下,译文<fragment>的id能与原文<fragment>的id一一配对,不会出现错位、新增、遗漏的情况。但若发现无论如何都无法一一对应,应该尽可能将文字更多的片段对应上,跳过那些由几个字或几个单词构成的短小、碎片化的片段。此时你输出的<fragment>的id可能不连续,也是没关系的,通过跳过短小片段来保证整体对应关系完整,是可接受的。决不可接受的是,因为遗漏短小片段,导致后面大段大段内容直接错位。
|
|
9
|
-
|
|
10
|
-
特别注意,用户提交的译文也会分自然段,这个自然段与原文的<fragment>**没有任何关系**。这个分段是翻译家仅看过原文文本,但不知其<fragment>划分的情况下自行作出的。你绝对不可参考这个译文的分段,更不要被它误导。匹配标准只有一条,就是语义一致。
|
|
11
|
-
|
|
12
|
-
这里举个例子,假设用户提交的原文是英文,译文是中文。用户提交的内容如下:
|
|
13
|
-
```XML
|
|
14
|
-
<request>
|
|
15
|
-
<fragment id="1">Although fermentation was an idea dear to the heart of many an alchemist, the particular notion of fermenting water in order to produce the specified materials of the world perceived by the senses is at heart Helmontian.</fragment>
|
|
16
|
-
<fragment id="2">In the following it will therefore be useful to give a brief overview of van Helmont’s matter-theory.</fragment>
|
|
17
|
-
<fragment id="3">Reference</fragment>
|
|
18
|
-
<fragment id="4">[1] Newman, Gehennical Fire, pp. 58–78, 171–96.</fragment>
|
|
19
|
-
</request>
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
尽管发酵是许多炼金术士所珍视的理念,但通过发酵水来生成感官所感知的特定物质这一特定概念,其核心却是海尔蒙特式的。因此,下文将简要概述范·海尔蒙特的物质理论引用。[1]纽曼,《地底之火》,第 58-78 页、第 171-96 页。
|
|
23
|
-
|
|
24
|
-
你应该返回如下内容。
|
|
25
|
-
```XML
|
|
26
|
-
<response>
|
|
27
|
-
<fragment id="1">尽管发酵是许多炼金术士所珍视的理念,但通过发酵水来生成感官所感知的特定物质这一特定概念,其核心却是海尔蒙特式的。</fragment>
|
|
28
|
-
<fragment id="2">因此,下文将简要概述范·海尔蒙特的物质理论引用。</fragment>
|
|
29
|
-
<fragment id="4">[1]纽曼,《地底之火》,第 58-78 页、第 171-96 页。</fragment>
|
|
30
|
-
</response>
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
在该例子中,仅仅演示如何将片段对应以及输出XML的具体格式。不要参考到底从哪种语言翻译到哪种语言,也不要参考具体内容。
|
|
@@ -1,162 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import re
|
|
3
|
-
|
|
4
|
-
from lxml.etree import parse, Element, QName
|
|
5
|
-
from html import escape
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
# TODO replace with XML
|
|
9
|
-
class Spine:
|
|
10
|
-
def __init__(self, folder_path, base_path, item):
|
|
11
|
-
self._folder_path = folder_path
|
|
12
|
-
self._base_path = base_path
|
|
13
|
-
self.href = item.get("href")
|
|
14
|
-
self.media_type = item.get("media-type")
|
|
15
|
-
|
|
16
|
-
@property
|
|
17
|
-
def path(self) -> str:
|
|
18
|
-
path = os.path.join(self._base_path, self.href)
|
|
19
|
-
path = os.path.abspath(path)
|
|
20
|
-
|
|
21
|
-
if os.path.exists(path):
|
|
22
|
-
return path
|
|
23
|
-
|
|
24
|
-
path = os.path.join(self._folder_path, self.href)
|
|
25
|
-
path = os.path.abspath(path)
|
|
26
|
-
return path
|
|
27
|
-
|
|
28
|
-
class EpubContent:
|
|
29
|
-
def __init__(self, path: str):
|
|
30
|
-
self.folder_path = path
|
|
31
|
-
self._content_path = self._find_content_path(path)
|
|
32
|
-
self._tree = parse(self._content_path)
|
|
33
|
-
self._namespaces = { "ns": self._tree.getroot().nsmap.get(None) }
|
|
34
|
-
self._spine = self._tree.xpath("//ns:spine", namespaces=self._namespaces)[0]
|
|
35
|
-
self._metadata = self._tree.xpath("//ns:metadata", namespaces=self._namespaces)[0]
|
|
36
|
-
self._manifest = self._tree.xpath("//ns:manifest", namespaces=self._namespaces)[0]
|
|
37
|
-
|
|
38
|
-
def save(self):
|
|
39
|
-
self._tree.write(self._content_path, pretty_print=True)
|
|
40
|
-
|
|
41
|
-
def _find_content_path(self, path: str) -> str:
|
|
42
|
-
root = parse(os.path.join(path, "META-INF", "container.xml")).getroot()
|
|
43
|
-
rootfile = root.xpath(
|
|
44
|
-
"//ns:container/ns:rootfiles/ns:rootfile",
|
|
45
|
-
namespaces={ "ns": root.nsmap.get(None) },
|
|
46
|
-
)[0]
|
|
47
|
-
full_path = rootfile.attrib["full-path"]
|
|
48
|
-
joined_path = os.path.join(path, full_path)
|
|
49
|
-
|
|
50
|
-
return os.path.abspath(joined_path)
|
|
51
|
-
|
|
52
|
-
@property
|
|
53
|
-
def ncx_path(self):
|
|
54
|
-
ncx_dom = self._manifest.find(".//*[@id=\"ncx\"]")
|
|
55
|
-
if ncx_dom is not None:
|
|
56
|
-
href_path = ncx_dom.get("href")
|
|
57
|
-
base_path = os.path.dirname(self._content_path)
|
|
58
|
-
path = os.path.join(base_path, href_path)
|
|
59
|
-
path = os.path.abspath(path)
|
|
60
|
-
|
|
61
|
-
if os.path.exists(path):
|
|
62
|
-
return path
|
|
63
|
-
|
|
64
|
-
path = os.path.join(self.folder_path, path)
|
|
65
|
-
path = os.path.abspath(path)
|
|
66
|
-
return path
|
|
67
|
-
|
|
68
|
-
@property
|
|
69
|
-
def spines(self) -> list[Spine]:
|
|
70
|
-
idref_dict = {}
|
|
71
|
-
index = 0
|
|
72
|
-
|
|
73
|
-
for child in self._spine.iterchildren():
|
|
74
|
-
id = child.get("idref")
|
|
75
|
-
idref_dict[id] = index
|
|
76
|
-
index += 1
|
|
77
|
-
|
|
78
|
-
items = [None for _ in range(index)]
|
|
79
|
-
spines = []
|
|
80
|
-
|
|
81
|
-
for child in self._manifest.iterchildren():
|
|
82
|
-
id = child.get("id")
|
|
83
|
-
if id in idref_dict:
|
|
84
|
-
index = idref_dict[id]
|
|
85
|
-
items[index] = child
|
|
86
|
-
|
|
87
|
-
base_path = os.path.dirname(self._content_path)
|
|
88
|
-
|
|
89
|
-
for item in items:
|
|
90
|
-
if item is not None:
|
|
91
|
-
spines.append(Spine(
|
|
92
|
-
folder_path=self.folder_path,
|
|
93
|
-
base_path=base_path,
|
|
94
|
-
item=item,
|
|
95
|
-
))
|
|
96
|
-
|
|
97
|
-
return spines
|
|
98
|
-
|
|
99
|
-
@property
|
|
100
|
-
def title(self):
|
|
101
|
-
title_dom = self._get_title()
|
|
102
|
-
if title_dom is None:
|
|
103
|
-
return None
|
|
104
|
-
return title_dom.text
|
|
105
|
-
|
|
106
|
-
@title.setter
|
|
107
|
-
def title(self, title: str):
|
|
108
|
-
title_dom = self._get_title()
|
|
109
|
-
if title_dom is not None:
|
|
110
|
-
title_dom.text = _escape_ascii(title)
|
|
111
|
-
|
|
112
|
-
def _get_title(self):
|
|
113
|
-
titles = self._metadata.xpath(
|
|
114
|
-
"./dc:title",
|
|
115
|
-
namespaces={
|
|
116
|
-
"dc": self._metadata.nsmap.get("dc"),
|
|
117
|
-
},
|
|
118
|
-
)
|
|
119
|
-
if len(titles) == 0:
|
|
120
|
-
return None
|
|
121
|
-
return titles[0]
|
|
122
|
-
|
|
123
|
-
@property
|
|
124
|
-
def authors(self) -> list[str]:
|
|
125
|
-
return list(map(lambda x: x.text, self._get_creators()))
|
|
126
|
-
|
|
127
|
-
@authors.setter
|
|
128
|
-
def authors(self, authors):
|
|
129
|
-
creator_doms = self._get_creators()
|
|
130
|
-
if len(creator_doms) == 0:
|
|
131
|
-
return
|
|
132
|
-
parent_dom = creator_doms[0].getparent()
|
|
133
|
-
index_at_parent = parent_dom.index(creator_doms[0])
|
|
134
|
-
ns={
|
|
135
|
-
"dc": self._metadata.nsmap.get("dc"),
|
|
136
|
-
"opf": self._metadata.nsmap.get("opf"),
|
|
137
|
-
}
|
|
138
|
-
for author in reversed(authors):
|
|
139
|
-
creator_dom = Element(QName(ns["dc"], "creator"))
|
|
140
|
-
creator_dom.set(QName(ns["opf"], "file-as"), author)
|
|
141
|
-
creator_dom.set(QName(ns["opf"], "role"), "aut")
|
|
142
|
-
creator_dom.text = _escape_ascii(author)
|
|
143
|
-
parent_dom.insert(index_at_parent, creator_dom)
|
|
144
|
-
|
|
145
|
-
for creator_dom in creator_doms:
|
|
146
|
-
parent_dom.remove(creator_dom)
|
|
147
|
-
|
|
148
|
-
def _get_creators(self):
|
|
149
|
-
return self._metadata.xpath(
|
|
150
|
-
"./dc:creator",
|
|
151
|
-
namespaces={
|
|
152
|
-
"dc": self._metadata.nsmap.get("dc"),
|
|
153
|
-
},
|
|
154
|
-
)
|
|
155
|
-
|
|
156
|
-
def _escape_ascii(content: str) -> str:
|
|
157
|
-
content = escape(content)
|
|
158
|
-
content = re.sub(
|
|
159
|
-
r"\\u([\da-fA-F]{4})",
|
|
160
|
-
lambda x: chr(int(x.group(1), 16)), content,
|
|
161
|
-
)
|
|
162
|
-
return content
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
from .file import HTMLFile
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
from io import StringIO
|
|
2
|
-
from typing import cast, Generator, Iterable
|
|
3
|
-
from xml.etree.ElementTree import Element
|
|
4
|
-
from .texts_searcher import search_texts, TextPosition
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def read_texts(root: Element) -> Generator[str, None, None]:
|
|
8
|
-
for element, position, _ in search_texts(root):
|
|
9
|
-
if position == TextPosition.WHOLE_DOM:
|
|
10
|
-
yield _plain_text(element)
|
|
11
|
-
elif position == TextPosition.TEXT:
|
|
12
|
-
yield cast(str, element.text)
|
|
13
|
-
elif position == TextPosition.TAIL:
|
|
14
|
-
yield cast(str, element.tail)
|
|
15
|
-
|
|
16
|
-
def write_texts(root: Element, texts: Iterable[str | Iterable[str] | None], append: bool):
|
|
17
|
-
zip_list = list(zip(texts, search_texts(root)))
|
|
18
|
-
for text, (element, position, parent) in reversed(zip_list):
|
|
19
|
-
if text is None:
|
|
20
|
-
continue
|
|
21
|
-
if not isinstance(text, str):
|
|
22
|
-
# TODO: implements split text
|
|
23
|
-
text = "".join(text)
|
|
24
|
-
if position == TextPosition.WHOLE_DOM:
|
|
25
|
-
if parent is not None:
|
|
26
|
-
_write_dom(parent, element, text, append)
|
|
27
|
-
elif position == TextPosition.TEXT:
|
|
28
|
-
element.text = _write_text(element.text, text, append)
|
|
29
|
-
elif position == TextPosition.TAIL:
|
|
30
|
-
element.tail = _write_text(element.tail, text, append)
|
|
31
|
-
|
|
32
|
-
def _write_dom(parent: Element, origin: Element, text: str, append: bool):
|
|
33
|
-
if append:
|
|
34
|
-
appended = Element(origin.tag, {**origin.attrib})
|
|
35
|
-
for index, child in enumerate(parent):
|
|
36
|
-
if child == origin:
|
|
37
|
-
parent.insert(index + 1, appended)
|
|
38
|
-
break
|
|
39
|
-
appended.attrib.pop("id", None)
|
|
40
|
-
appended.text = text
|
|
41
|
-
appended.tail = origin.tail
|
|
42
|
-
origin.tail = None
|
|
43
|
-
else:
|
|
44
|
-
for child in origin:
|
|
45
|
-
origin.remove(child)
|
|
46
|
-
origin.text = text
|
|
47
|
-
|
|
48
|
-
def _write_text(left: str | None, right: str, append: bool) -> str:
|
|
49
|
-
if not append:
|
|
50
|
-
return right
|
|
51
|
-
elif left is None:
|
|
52
|
-
return right
|
|
53
|
-
else:
|
|
54
|
-
return left + right
|
|
55
|
-
|
|
56
|
-
def _plain_text(target: Element):
|
|
57
|
-
buffer = StringIO()
|
|
58
|
-
for text in _iter_text(target):
|
|
59
|
-
buffer.write(text)
|
|
60
|
-
return buffer.getvalue()
|
|
61
|
-
|
|
62
|
-
def _iter_text(parent: Element):
|
|
63
|
-
if parent.text is not None:
|
|
64
|
-
yield parent.text
|
|
65
|
-
for child in parent:
|
|
66
|
-
yield from _iter_text(child)
|
|
67
|
-
if parent.tail is not None:
|
|
68
|
-
yield parent.tail
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
# HTML 规定了一系列自闭标签,这些标签需要改成非自闭的,因为 EPub 格式不支持
|
|
4
|
-
# https://www.tutorialspoint.com/which-html-tags-are-self-closing
|
|
5
|
-
_EMPTY_TAGS = (
|
|
6
|
-
"br",
|
|
7
|
-
"hr",
|
|
8
|
-
"input",
|
|
9
|
-
"col",
|
|
10
|
-
"base",
|
|
11
|
-
"meta",
|
|
12
|
-
"area",
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
_EMPTY_TAG_PATTERN = re.compile(
|
|
16
|
-
r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/?>"
|
|
17
|
-
)
|
|
18
|
-
|
|
19
|
-
def to_html(content: str) -> str:
|
|
20
|
-
return re.sub(_EMPTY_TAG_PATTERN, lambda m: f"<{m.group(1)}{m.group(2)}>", content)
|
|
21
|
-
|
|
22
|
-
def to_xml(content: str) -> str:
|
|
23
|
-
return re.sub(_EMPTY_TAG_PATTERN, lambda m: f"<{m.group(1)}{m.group(2)} />", content)
|