epub-translator 0.0.7__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. epub_translator/__init__.py +4 -2
  2. epub_translator/data/fill.jinja +66 -0
  3. epub_translator/data/mmltex/README.md +67 -0
  4. epub_translator/data/mmltex/cmarkup.xsl +1106 -0
  5. epub_translator/data/mmltex/entities.xsl +459 -0
  6. epub_translator/data/mmltex/glayout.xsl +222 -0
  7. epub_translator/data/mmltex/mmltex.xsl +36 -0
  8. epub_translator/data/mmltex/scripts.xsl +375 -0
  9. epub_translator/data/mmltex/tables.xsl +130 -0
  10. epub_translator/data/mmltex/tokens.xsl +328 -0
  11. epub_translator/data/translate.jinja +15 -12
  12. epub_translator/epub/__init__.py +4 -2
  13. epub_translator/epub/common.py +43 -0
  14. epub_translator/epub/math.py +193 -0
  15. epub_translator/epub/placeholder.py +53 -0
  16. epub_translator/epub/spines.py +42 -0
  17. epub_translator/epub/toc.py +505 -0
  18. epub_translator/epub/zip.py +67 -0
  19. epub_translator/iter_sync.py +24 -0
  20. epub_translator/language.py +23 -0
  21. epub_translator/llm/__init__.py +2 -1
  22. epub_translator/llm/core.py +233 -0
  23. epub_translator/llm/error.py +38 -35
  24. epub_translator/llm/executor.py +159 -136
  25. epub_translator/llm/increasable.py +28 -28
  26. epub_translator/llm/types.py +17 -0
  27. epub_translator/serial/__init__.py +2 -0
  28. epub_translator/serial/chunk.py +52 -0
  29. epub_translator/serial/segment.py +17 -0
  30. epub_translator/serial/splitter.py +50 -0
  31. epub_translator/template.py +35 -33
  32. epub_translator/translator.py +208 -178
  33. epub_translator/utils.py +7 -0
  34. epub_translator/xml/__init__.py +4 -3
  35. epub_translator/xml/deduplication.py +38 -0
  36. epub_translator/xml/firendly/__init__.py +2 -0
  37. epub_translator/xml/firendly/decoder.py +75 -0
  38. epub_translator/xml/firendly/encoder.py +84 -0
  39. epub_translator/xml/firendly/parser.py +177 -0
  40. epub_translator/xml/firendly/tag.py +118 -0
  41. epub_translator/xml/firendly/transform.py +36 -0
  42. epub_translator/xml/xml.py +52 -0
  43. epub_translator/xml/xml_like.py +231 -0
  44. epub_translator/xml_translator/__init__.py +3 -0
  45. epub_translator/xml_translator/const.py +2 -0
  46. epub_translator/xml_translator/fill.py +128 -0
  47. epub_translator/xml_translator/format.py +282 -0
  48. epub_translator/xml_translator/fragmented.py +125 -0
  49. epub_translator/xml_translator/group.py +183 -0
  50. epub_translator/xml_translator/progressive_locking.py +256 -0
  51. epub_translator/xml_translator/submitter.py +102 -0
  52. epub_translator/xml_translator/text_segment.py +263 -0
  53. epub_translator/xml_translator/translator.py +179 -0
  54. epub_translator/xml_translator/utils.py +29 -0
  55. epub_translator-0.1.1.dist-info/METADATA +283 -0
  56. epub_translator-0.1.1.dist-info/RECORD +58 -0
  57. epub_translator/data/format.jinja +0 -33
  58. epub_translator/epub/content_parser.py +0 -162
  59. epub_translator/epub/html/__init__.py +0 -1
  60. epub_translator/epub/html/dom_operator.py +0 -68
  61. epub_translator/epub/html/empty_tags.py +0 -23
  62. epub_translator/epub/html/file.py +0 -80
  63. epub_translator/epub/html/texts_searcher.py +0 -46
  64. epub_translator/llm/node.py +0 -201
  65. epub_translator/translation/__init__.py +0 -2
  66. epub_translator/translation/chunk.py +0 -118
  67. epub_translator/translation/splitter.py +0 -78
  68. epub_translator/translation/store.py +0 -36
  69. epub_translator/translation/translation.py +0 -231
  70. epub_translator/translation/types.py +0 -45
  71. epub_translator/translation/utils.py +0 -11
  72. epub_translator/xml/decoder.py +0 -71
  73. epub_translator/xml/encoder.py +0 -95
  74. epub_translator/xml/parser.py +0 -172
  75. epub_translator/xml/tag.py +0 -93
  76. epub_translator/xml/transform.py +0 -34
  77. epub_translator/xml/utils.py +0 -12
  78. epub_translator/zip_context.py +0 -74
  79. epub_translator-0.0.7.dist-info/METADATA +0 -170
  80. epub_translator-0.0.7.dist-info/RECORD +0 -36
  81. {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/LICENSE +0 -0
  82. {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,283 @@
1
+ Metadata-Version: 2.3
2
+ Name: epub-translator
3
+ Version: 0.1.1
4
+ Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
5
+ License: MIT
6
+ Keywords: epub,llm,translation,translator
7
+ Author: Tao Zeyu
8
+ Author-email: i@taozeyu.com
9
+ Maintainer: Tao Zeyu
10
+ Maintainer-email: i@taozeyu.com
11
+ Requires-Python: >=3.11,<3.14
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Education
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Classifier: Topic :: Software Development :: Localization
24
+ Classifier: Topic :: Text Processing :: Markup
25
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
26
+ Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
27
+ Requires-Dist: openai (>=2.14.0,<3.0.0)
28
+ Requires-Dist: resource-segmentation (>=0.0.7,<0.1.0)
29
+ Requires-Dist: tiktoken (>=0.12.0,<1.0.0)
30
+ Project-URL: Homepage, https://hub.oomol.com/package/books-translator
31
+ Project-URL: Repository, https://github.com/oomol-lab/epub-translator
32
+ Description-Content-Type: text/markdown
33
+
34
+ <div align=center>
35
+ <h1>EPUB Translator</h1>
36
+ <p>
37
+ <a href="https://github.com/oomol-lab/epub-translator/actions/workflows/merge-build.yml" target="_blank"><img src="https://img.shields.io/github/actions/workflow/status/oomol-lab/epub-translator/merge-build.yml" alt="ci" /></a>
38
+ <a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/badge/pip_install-epub--translator-blue" alt="pip install epub-translator" /></a>
39
+ <a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/pypi/v/epub-translator.svg" alt="pypi epub-translator" /></a>
40
+ <a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/pypi/pyversions/epub-translator.svg" alt="python versions" /></a>
41
+ <a href="https://github.com/oomol-lab/epub-translator/blob/main/LICENSE" target="_blank"><img src="https://img.shields.io/github/license/oomol-lab/epub-translator" alt="license" /></a>
42
+ </p>
43
+ <p><a href="https://hub.oomol.com/package/books-translator?open=true" target="_blank"><img src="https://static.oomol.com/assets/button.svg" alt="Open in OOMOL Studio" /></a></p>
44
+ <p>English | <a href="./README_zh-CN.md">中文</a></p>
45
+ </div>
46
+
47
+
48
+ Translate EPUB books using Large Language Models while preserving the original text. The translated content is displayed side-by-side with the original, creating bilingual books perfect for language learning and cross-reference reading.
49
+
50
+ ![Translation Effect](./docs/images/translation.png)
51
+
52
+ ## Features
53
+
54
+ - **Bilingual Output**: Preserves original text alongside translations for easy comparison
55
+ - **LLM-Powered**: Leverages large language models for high-quality, context-aware translations
56
+ - **Format Preservation**: Maintains EPUB structure, styles, images, and formatting
57
+ - **Complete Translation**: Translates chapter content, table of contents, and metadata
58
+ - **Progress Tracking**: Monitor translation progress with built-in callbacks
59
+ - **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
60
+ - **Caching**: Built-in caching for progress recovery when translation fails
61
+
62
+ ## Installation
63
+
64
+ ```bash
65
+ pip install epub-translator
66
+ ```
67
+
68
+ **Requirements**: Python 3.11, 3.12, or 3.13
69
+
70
+ ## Quick Start
71
+
72
+ ### Using OOMOL Studio (Recommended)
73
+
74
+ The easiest way to use EPUB Translator is through OOMOL Studio with a visual interface:
75
+
76
+ [![Watch the Tutorial](./docs/images/link2youtube.png)](https://www.youtube.com/watch?v=QsAdiskxfXI)
77
+
78
+ ### Using Python API
79
+
80
+ ```python
81
+ from pathlib import Path
82
+ from epub_translator import LLM, translate, language
83
+
84
+ # Initialize LLM with your API credentials
85
+ llm = LLM(
86
+ key="your-api-key",
87
+ url="https://api.openai.com/v1",
88
+ model="gpt-4",
89
+ token_encoding="o200k_base",
90
+ )
91
+
92
+ # Translate EPUB file using language constants
93
+ translate(
94
+ llm=llm,
95
+ source_path=Path("source.epub"),
96
+ target_path=Path("translated.epub"),
97
+ target_language=language.ENGLISH,
98
+ )
99
+ ```
100
+
101
+ ### With Progress Tracking
102
+
103
+ ```python
104
+ from tqdm import tqdm
105
+
106
+ with tqdm(total=100, desc="Translating", unit="%") as pbar:
107
+ last_progress = 0.0
108
+
109
+ def on_progress(progress: float):
110
+ global last_progress
111
+ increment = (progress - last_progress) * 100
112
+ pbar.update(increment)
113
+ last_progress = progress
114
+
115
+ translate(
116
+ llm=llm,
117
+ source_path=Path("source.epub"),
118
+ target_path=Path("translated.epub"),
119
+ target_language="English",
120
+ on_progress=on_progress,
121
+ )
122
+ ```
123
+
124
+ ## API Reference
125
+
126
+ ### `LLM` Class
127
+
128
+ Initialize the LLM client for translation:
129
+
130
+ ```python
131
+ LLM(
132
+ key: str, # API key
133
+ url: str, # API endpoint URL
134
+ model: str, # Model name (e.g., "gpt-4")
135
+ token_encoding: str, # Token encoding (e.g., "o200k_base")
136
+ cache_path: PathLike | None = None, # Cache directory path
137
+ timeout: float | None = None, # Request timeout in seconds
138
+ top_p: float | tuple[float, float] | None = None,
139
+ temperature: float | tuple[float, float] | None = None,
140
+ retry_times: int = 5, # Number of retries on failure
141
+ retry_interval_seconds: float = 6.0, # Interval between retries
142
+ log_dir_path: PathLike | None = None, # Log directory path
143
+ )
144
+ ```
145
+
146
+ ### `translate` Function
147
+
148
+ Translate an EPUB file:
149
+
150
+ ```python
151
+ translate(
152
+ llm: LLM, # LLM instance
153
+ source_path: Path, # Source EPUB file path
154
+ target_path: Path, # Output EPUB file path
155
+ target_language: str, # Target language (e.g., "English", "Chinese")
156
+ user_prompt: str | None = None, # Custom translation instructions
157
+ max_retries: int = 5, # Maximum retries for failed translations
158
+ max_group_tokens: int = 1200, # Maximum tokens per translation group
159
+ on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
160
+ )
161
+ ```
162
+
163
+ #### Language Constants
164
+
165
+ EPUB Translator provides predefined language constants for convenience. You can use these constants instead of writing language names as strings:
166
+
167
+ ```python
168
+ from epub_translator import language
169
+
170
+ # Usage example:
171
+ translate(
172
+ llm=llm,
173
+ source_path=Path("source.epub"),
174
+ target_path=Path("translated.epub"),
175
+ target_language=language.ENGLISH,
176
+ )
177
+
178
+ # You can also use custom language strings:
179
+ translate(
180
+ llm=llm,
181
+ source_path=Path("source.epub"),
182
+ target_path=Path("translated.epub"),
183
+ target_language="Icelandic", # For languages not in the constants
184
+ )
185
+ ```
186
+
187
+ ## Configuration Examples
188
+
189
+ ### OpenAI
190
+
191
+ ```python
192
+ llm = LLM(
193
+ key="sk-...",
194
+ url="https://api.openai.com/v1",
195
+ model="gpt-4",
196
+ token_encoding="o200k_base",
197
+ )
198
+ ```
199
+
200
+ ### Azure OpenAI
201
+
202
+ ```python
203
+ llm = LLM(
204
+ key="your-azure-key",
205
+ url="https://your-resource.openai.azure.com/openai/deployments/your-deployment",
206
+ model="gpt-4",
207
+ token_encoding="o200k_base",
208
+ )
209
+ ```
210
+
211
+ ### Other OpenAI-Compatible Services
212
+
213
+ Any service with an OpenAI-compatible API can be used:
214
+
215
+ ```python
216
+ llm = LLM(
217
+ key="your-api-key",
218
+ url="https://your-service.com/v1",
219
+ model="your-model",
220
+ token_encoding="o200k_base", # Match your model's encoding
221
+ )
222
+ ```
223
+
224
+ ## Use Cases
225
+
226
+ - **Language Learning**: Read books in their original language with side-by-side translations
227
+ - **Academic Research**: Access foreign literature with bilingual references
228
+ - **Content Localization**: Prepare books for international audiences
229
+ - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
230
+
231
+ ## Advanced Features
232
+
233
+ ### Custom Translation Prompts
234
+
235
+ Provide specific translation instructions:
236
+
237
+ ```python
238
+ translate(
239
+ llm=llm,
240
+ source_path=Path("source.epub"),
241
+ target_path=Path("translated.epub"),
242
+ target_language="English",
243
+ user_prompt="Use formal language and preserve technical terminology",
244
+ )
245
+ ```
246
+
247
+ ### Caching for Progress Recovery
248
+
249
+ Enable caching to resume translation progress after failures:
250
+
251
+ ```python
252
+ llm = LLM(
253
+ key="your-api-key",
254
+ url="https://api.openai.com/v1",
255
+ model="gpt-4",
256
+ token_encoding="o200k_base",
257
+ cache_path="./translation_cache", # Translations are cached here
258
+ )
259
+ ```
260
+
261
+ ## Related Projects
262
+
263
+ ### PDF Craft
264
+
265
+ [PDF Craft](https://github.com/oomol-lab/pdf-craft) converts PDF files into EPUB and other formats, with a focus on scanned books. Combine PDF Craft with EPUB Translator to convert and translate scanned PDF books into bilingual EPUB format.
266
+
267
+ **Workflow**: Scanned PDF → [PDF Craft] → EPUB → [EPUB Translator] → Bilingual EPUB
268
+
269
+ For a complete tutorial, watch: [Convert scanned PDF books to EPUB format and translate them into bilingual books](https://www.bilibili.com/video/BV1tMQZY5EYY/)
270
+
271
+ ## Contributing
272
+
273
+ Contributions are welcome! Please feel free to submit a Pull Request.
274
+
275
+ ## License
276
+
277
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
278
+
279
+ ## Support
280
+
281
+ - **Issues**: [GitHub Issues](https://github.com/oomol-lab/epub-translator/issues)
282
+ - **OOMOL Studio**: [Open in OOMOL Studio](https://hub.oomol.com/package/books-translator?open=true)
283
+
@@ -0,0 +1,58 @@
1
+ epub_translator/__init__.py,sha256=CDj5ZDWZreLKF3LdHf6QmGbUABytunhUBPJwYbpeIKc,122
2
+ epub_translator/data/fill.jinja,sha256=Rk8EodbDOEHS-W20CGJbhSNBHdZI1EIz414mQf_B4tY,1390
3
+ epub_translator/data/mmltex/README.md,sha256=wwhe5yW1U_7_YZIFKnQVnCOmUl7Mu3gsr3lNnDSJ5Qs,2953
4
+ epub_translator/data/mmltex/cmarkup.xsl,sha256=DkhimAATM0XSCfVOfY41-qTPoddqzOHjZ00Pynr4zQE,37707
5
+ epub_translator/data/mmltex/entities.xsl,sha256=TYZ5iGg0u9XlDDBBGuZiHL7MsxKc-3OsTIBAVM1GDek,107742
6
+ epub_translator/data/mmltex/glayout.xsl,sha256=Ztc7N1wiHaYZlo9u9iuROrIl3uIIIoo1VFIuojXq7TM,6879
7
+ epub_translator/data/mmltex/mmltex.xsl,sha256=BVXFbApz-9W2qRKKtBTxptK5vxG2bfB8tv9W1MP5iBI,1384
8
+ epub_translator/data/mmltex/scripts.xsl,sha256=f4ei0cDCW3cV-Ra7rC3kC5tRcKdjJxbSpCeQLoohtgo,13697
9
+ epub_translator/data/mmltex/tables.xsl,sha256=RxtNo8qDtVAg8_6BuYsafraB_0z7YDAB9D__fT9gmWs,4327
10
+ epub_translator/data/mmltex/tokens.xsl,sha256=j3JZRcBhAiiY8o5K3640phfLwxO8JVspCFlSttwBzJk,12373
11
+ epub_translator/data/translate.jinja,sha256=93d8kschm5HV-EfXd1kFSIVMObDqTMdoUrwDfce2bhU,820
12
+ epub_translator/epub/__init__.py,sha256=KpGWmHS4y0tBAIYp3v8G3k1u1KGaKNeloOvt3osAL5c,154
13
+ epub_translator/epub/common.py,sha256=4-SpTe8iot9hMfyXILmlUFvYVNYqPAHL5hn1fr2wgis,1180
14
+ epub_translator/epub/math.py,sha256=-Q2LJQxxjgQZQUe_WlJA9tjzLqgqtw2ZmbGbHsPRp2U,5422
15
+ epub_translator/epub/placeholder.py,sha256=ywBrFo4ZgwtTZpl_mNwWVhT1xyk3JZGD0qrpQi4u1DM,1613
16
+ epub_translator/epub/spines.py,sha256=bkLgalqJ2sId4enmrnYnGUMs74vohxRkXPbxqbOQuyw,1277
17
+ epub_translator/epub/toc.py,sha256=TKJfyDT4svFkXd6JCNZk2ZEYc9q-5DXnV3zY2UKo8nE,14891
18
+ epub_translator/epub/zip.py,sha256=CUE50LrrVNeQVecNm2ZFionJz4k_vMTXTi8an7BiQ_c,2314
19
+ epub_translator/iter_sync.py,sha256=56m-bRPqc731alGenqLvCIM99J8NzNuie86FDGtJj8k,588
20
+ epub_translator/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
21
+ epub_translator/llm/__init__.py,sha256=QcAuTQpH0T7IMf-J3bRdtf8Tvyu6Z2CAe-wSzLJRLLw,43
22
+ epub_translator/llm/core.py,sha256=nRNAVDQD7kxSl2EN7m5OQ7CvlBL4ENbzQThUcJSzMsk,8123
23
+ epub_translator/llm/error.py,sha256=4efAIQL14DFSvAnSTUfgdAbZRqaWBqOfUGsSfvxa5zM,1503
24
+ epub_translator/llm/executor.py,sha256=Oax07rwivDbB0T3i_waLAvXvfQoR9dnWPTvw475C9vQ,6081
25
+ epub_translator/llm/increasable.py,sha256=vQka-bysKuFR-Vu-GziGZfQCasLn9q2GxGEoV2RiCec,1289
26
+ epub_translator/llm/types.py,sha256=-VPfl1qjTZ8s8rQfqZ44H_txZfVmx49TZdQSvp2vUU4,264
27
+ epub_translator/serial/__init__.py,sha256=b3IMVmWcUwEqHKcGmey88b057pyz5ct946CaUZi4LB4,67
28
+ epub_translator/serial/chunk.py,sha256=FrTaHikVOd6bLYumnEriTaAQ_DIDLjHm16gh-wBVR9k,1495
29
+ epub_translator/serial/segment.py,sha256=uEz-ke1KcYrON-68FaUEzMG2CzHlMjvbC11F3ZT4yH0,446
30
+ epub_translator/serial/splitter.py,sha256=Nq0sxPXos8ez7QBG01sOKjnYKbeBWUBHflZGtqenVm8,1726
31
+ epub_translator/template.py,sha256=0CqRmj3nTtPshw0NmTr2ECqelops2MMyX94fMrE-HKs,1587
32
+ epub_translator/translator.py,sha256=vEccCEFc-mArX4DzvUz09W_WFOxUv6dlQkwWDkbbVFs,6976
33
+ epub_translator/utils.py,sha256=7lBWHNyv4GQiutqqqUhbAxc8gqVIkhS7B4rkL1EKOFs,144
34
+ epub_translator/xml/__init__.py,sha256=te8vIRgG-2n1fEcTmNzCLc-WH9G0JUr_lJncJQvRbgw,96
35
+ epub_translator/xml/deduplication.py,sha256=Vc7BtXXnAMQHNtE--o2Qkm_sYrjnJSh33reKFh9YUjo,1143
36
+ epub_translator/xml/firendly/__init__.py,sha256=I5jhnhFWoHvojLsYXH4jfR4Gi8lKFZ3yQ56ze5hEe1M,74
37
+ epub_translator/xml/firendly/decoder.py,sha256=xRQ5LnSunmYbba_0oT39oUr86-sLYAHYMUGmlseIu2U,2467
38
+ epub_translator/xml/firendly/encoder.py,sha256=evjvw6oE-oCud44IsJ-YZVHn6dtUzjNYX25ljaZP6vY,2417
39
+ epub_translator/xml/firendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42NXCauIFV-o,6560
40
+ epub_translator/xml/firendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
41
+ epub_translator/xml/firendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
42
+ epub_translator/xml/xml.py,sha256=7NPinMOFGBeOHCG-hw0iQjL-p-_I4DmYL8lq0Ar8rag,1498
43
+ epub_translator/xml/xml_like.py,sha256=tgzqDQFfql9-QMSRbLf9SVlNsvyZXJTCEWmksxd3TuI,9489
44
+ epub_translator/xml_translator/__init__.py,sha256=yNgwIermFXaRfAfnqXaNFCEf5I95cBVUDxha-6xkLq0,117
45
+ epub_translator/xml_translator/const.py,sha256=Q9pmLplUR71TqF4MN5oLtPNl_pBRWoOJwsC5eIQOOWE,57
46
+ epub_translator/xml_translator/fill.py,sha256=LxkPxlfbDDB3gP1rciXEBFyi1QRj5vXWzdca5SBcd5o,4839
47
+ epub_translator/xml_translator/format.py,sha256=xupnymuvr0hNJnjZ3-M4x3WiHN7LdgvHTQCUZkAug5U,10524
48
+ epub_translator/xml_translator/fragmented.py,sha256=DMueQlGNLbW70dwoZHRNLypzVmxyXAGxkbsc6K5115M,4569
49
+ epub_translator/xml_translator/group.py,sha256=2GxJl3RojyHyMuTZ5cn5PITT-F2fdaBlvAhfsn_o08Q,6977
50
+ epub_translator/xml_translator/progressive_locking.py,sha256=2eoCzVNeV4e4TziYTk4UgKmBUGuFQFj7X24ejO75lUA,9613
51
+ epub_translator/xml_translator/submitter.py,sha256=bIoxhUIDMScgnxnqfCKR8d3u1DaISXqIM2WuHzrNU7M,4022
52
+ epub_translator/xml_translator/text_segment.py,sha256=Aue5XHKYKzTuinFExcdu0CqGY5TiuJoIIhbP9t5ubPg,7673
53
+ epub_translator/xml_translator/translator.py,sha256=FGSXo2UWtcoIOWGzkI4emyqp1Q2Z8EoOBCBmdtty18A,7063
54
+ epub_translator/xml_translator/utils.py,sha256=AIJOcB7Btad0yxxLwD3UC9NTk2gOPEM8qqx7sNO6tDc,626
55
+ epub_translator-0.1.1.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
56
+ epub_translator-0.1.1.dist-info/METADATA,sha256=BJDV44wO93Nw7e1hqBV33HXK8KUa_JO2XJ1qQ22RGmc,9655
57
+ epub_translator-0.1.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
58
+ epub_translator-0.1.1.dist-info/RECORD,,
@@ -1,33 +0,0 @@
1
- 你是一个校对员,需要帮助用户校对译文与原文的对应关系。用户随后会提交一段文本,先是一段表示原文的XML格式内容,接下来是一段纯文本的{{ target_language }}译文。原文XML文本以<request>标签作为根,<fragment>标签作为子元素。原文将按顺序拆分在各个<fragment>标签中,每个<fragment>标签将包含一个id属性,以唯一标识该片段。
2
-
3
- 你要将{{ target_language }}译文正确分割成片段,并与原文一一对应。并模仿用户提交的格式,将根节点由<request>替换成<response>节点,再将<fragment>的内容由原文替换成{{ target_language }}译文,但保留id不变。最终将整个XML格式内容输出。你的输出必须满足如下规则:
4
- - 分割后的片段对应标准是语义一致。即对应片段的原文与译文互相翻译后,是完全相同的内容。
5
- - 替换后的译文必须严格与用户提交的译文对应,不得增加、修改、删除内容。
6
- - 将你输出<fragment>中的译文单独提取出来按顺序读出来,应与用户提交的{{ target_language }}译文一字不差。
7
- - 译文必须是其对应的原文的直接翻译。
8
- - 绝大部分情况下,译文<fragment>的id能与原文<fragment>的id一一配对,不会出现错位、新增、遗漏的情况。但若发现无论如何都无法一一对应,应该尽可能将文字更多的片段对应上,跳过那些由几个字或几个单词构成的短小、碎片化的片段。此时你输出的<fragment>的id可能不连续,也是没关系的,通过跳过短小片段来保证整体对应关系完整,是可接受的。决不可接受的是,因为遗漏短小片段,导致后面大段大段内容直接错位。
9
-
10
- 特别注意,用户提交的译文也会分自然段,这个自然段与原文的<fragment>**没有任何关系**。这个分段是翻译家仅看过原文文本,但不知其<fragment>划分的情况下自行作出的。你绝对不可参考这个译文的分段,更不要被它误导。匹配标准只有一条,就是语义一致。
11
-
12
- 这里举个例子,假设用户提交的原文是英文,译文是中文。用户提交的内容如下:
13
- ```XML
14
- <request>
15
- <fragment id="1">Although fermentation was an idea dear to the heart of many an alchemist, the particular notion of fermenting water in order to produce the specified materials of the world perceived by the senses is at heart Helmontian.</fragment>
16
- <fragment id="2">In the following it will therefore be useful to give a brief overview of van Helmont’s matter-theory.</fragment>
17
- <fragment id="3">Reference</fragment>
18
- <fragment id="4">[1] Newman, Gehennical Fire, pp. 58–78, 171–96.</fragment>
19
- </request>
20
- ```
21
-
22
- 尽管发酵是许多炼金术士所珍视的理念,但通过发酵水来生成感官所感知的特定物质这一特定概念,其核心却是海尔蒙特式的。因此,下文将简要概述范·海尔蒙特的物质理论引用。[1]纽曼,《地底之火》,第 58-78 页、第 171-96 页。
23
-
24
- 你应该返回如下内容。
25
- ```XML
26
- <response>
27
- <fragment id="1">尽管发酵是许多炼金术士所珍视的理念,但通过发酵水来生成感官所感知的特定物质这一特定概念,其核心却是海尔蒙特式的。</fragment>
28
- <fragment id="2">因此,下文将简要概述范·海尔蒙特的物质理论引用。</fragment>
29
- <fragment id="4">[1] 纽曼,《地底之火》,第 58-78 页、第 171-96 页。</fragment>
30
- </response>
31
- ```
32
-
33
- 在该例子中,仅仅演示如何将片段对应以及输出XML的具体格式。不要参考到底从哪种语言翻译到哪种语言,也不要参考具体内容。
@@ -1,162 +0,0 @@
1
- import os
2
- import re
3
-
4
- from lxml.etree import parse, Element, QName
5
- from html import escape
6
-
7
-
8
- # TODO replace with XML
9
class Spine:
    """One manifest item referenced by the spine, with its resolved file path.

    ``item`` only needs a ``get`` method; its ``href`` and ``media-type``
    values are copied onto the instance.
    """

    def __init__(self, folder_path, base_path, item):
        self._folder_path = folder_path
        self._base_path = base_path
        self.href = item.get("href")
        self.media_type = item.get("media-type")

    @property
    def path(self) -> str:
        """Absolute path of the item.

        The href is resolved against ``base_path`` first; when nothing exists
        at that location, the href resolved against ``folder_path`` is
        returned instead (without checking that it exists).
        """
        primary = os.path.abspath(os.path.join(self._base_path, self.href))
        if not os.path.exists(primary):
            return os.path.abspath(os.path.join(self._folder_path, self.href))
        return primary
27
-
28
class EpubContent:
    """Read and edit the package document of an EPUB unpacked into a folder.

    The package document is located through ``META-INF/container.xml``. Its
    ``spine``, ``metadata`` and ``manifest`` elements are looked up once in
    the constructor; a missing element raises ``IndexError`` there.
    """

    def __init__(self, path: str):
        self.folder_path = path
        self._content_path = self._find_content_path(path)
        self._tree = parse(self._content_path)
        # The package document's default namespace is bound to the "ns"
        # prefix so the XPath queries below can address its elements.
        self._namespaces = {"ns": self._tree.getroot().nsmap.get(None)}
        self._spine = self._tree.xpath("//ns:spine", namespaces=self._namespaces)[0]
        self._metadata = self._tree.xpath("//ns:metadata", namespaces=self._namespaces)[0]
        self._manifest = self._tree.xpath("//ns:manifest", namespaces=self._namespaces)[0]

    def save(self):
        """Write the (possibly modified) package document back to its file."""
        self._tree.write(self._content_path, pretty_print=True)

    def _find_content_path(self, path: str) -> str:
        """Return the absolute path named by the first ``rootfile`` entry of
        ``META-INF/container.xml`` under *path*."""
        root = parse(os.path.join(path, "META-INF", "container.xml")).getroot()
        rootfile = root.xpath(
            "//ns:container/ns:rootfiles/ns:rootfile",
            namespaces={"ns": root.nsmap.get(None)},
        )[0]
        full_path = rootfile.attrib["full-path"]
        joined_path = os.path.join(path, full_path)

        return os.path.abspath(joined_path)

    @property
    def ncx_path(self):
        """Absolute path of the manifest item whose id is ``ncx``.

        Returns ``None`` when the manifest has no such item. The href is
        resolved against the package document's directory first; when nothing
        exists there, the href resolved against ``folder_path`` is returned
        (without checking that it exists), mirroring ``Spine.path``.
        """
        ncx_dom = self._manifest.find(".//*[@id=\"ncx\"]")
        if ncx_dom is None:
            return None

        href_path = ncx_dom.get("href")
        base_path = os.path.dirname(self._content_path)
        path = os.path.abspath(os.path.join(base_path, href_path))
        if os.path.exists(path):
            return path

        # Join with the href, not with the absolute path computed above:
        # os.path.join drops every component that precedes an absolute one,
        # which would make this fallback return the very same path.
        return os.path.abspath(os.path.join(self.folder_path, href_path))

    @property
    def spines(self) -> "list[Spine]":
        """Manifest items referenced by the spine, in spine order.

        Spine entries whose ``idref`` matches no manifest ``id`` are skipped.
        """
        spine_children = list(self._spine.iterchildren())
        # idref -> position in the spine (a repeated idref keeps the last one).
        positions = {}
        for position, child in enumerate(spine_children):
            positions[child.get("idref")] = position

        items = [None] * len(spine_children)
        for child in self._manifest.iterchildren():
            item_id = child.get("id")
            if item_id in positions:
                items[positions[item_id]] = child

        base_path = os.path.dirname(self._content_path)
        return [
            Spine(
                folder_path=self.folder_path,
                base_path=base_path,
                item=item,
            )
            for item in items
            if item is not None
        ]

    @property
    def title(self):
        """Text of the first ``dc:title`` element, or ``None`` if there is none."""
        title_dom = self._get_title()
        if title_dom is None:
            return None
        return title_dom.text

    @title.setter
    def title(self, title: str):
        # Silently does nothing when the metadata has no dc:title element.
        title_dom = self._get_title()
        if title_dom is not None:
            # NOTE(review): the text is HTML-escaped before being assigned;
            # if the XML serializer escapes text again on save, "&" and "<"
            # would end up double-escaped - confirm this is intended.
            title_dom.text = _escape_ascii(title)

    def _get_title(self):
        """Return the first ``dc:title`` child of the metadata, or ``None``."""
        titles = self._metadata.xpath(
            "./dc:title",
            namespaces={
                "dc": self._metadata.nsmap.get("dc"),
            },
        )
        if len(titles) == 0:
            return None
        return titles[0]

    @property
    def authors(self) -> list[str]:
        """Texts of all ``dc:creator`` elements, in document order."""
        return [creator.text for creator in self._get_creators()]

    @authors.setter
    def authors(self, authors):
        """Replace the existing ``dc:creator`` elements with one per author.

        Does nothing when the metadata currently has no ``dc:creator``
        element, because the first existing one marks the insertion point.
        """
        creator_doms = self._get_creators()
        if len(creator_doms) == 0:
            return

        parent_dom = creator_doms[0].getparent()
        index_at_parent = parent_dom.index(creator_doms[0])
        ns = {
            "dc": self._metadata.nsmap.get("dc"),
            "opf": self._metadata.nsmap.get("opf"),
        }
        # Every new element is inserted at the same index, so walking the
        # authors backwards leaves them in their original order.
        for author in reversed(authors):
            creator_dom = Element(QName(ns["dc"], "creator"))
            creator_dom.set(QName(ns["opf"], "file-as"), author)
            creator_dom.set(QName(ns["opf"], "role"), "aut")
            creator_dom.text = _escape_ascii(author)
            parent_dom.insert(index_at_parent, creator_dom)

        for creator_dom in creator_doms:
            parent_dom.remove(creator_dom)

    def _get_creators(self):
        """Return all ``dc:creator`` children of the metadata element."""
        return self._metadata.xpath(
            "./dc:creator",
            namespaces={
                "dc": self._metadata.nsmap.get("dc"),
            },
        )
155
-
156
def _escape_ascii(content: str) -> str:
    """HTML-escape *content*, then decode literal unicode escapes.

    A backslash followed by ``u`` and four hex digits that is still present
    after escaping is replaced by the character with that code point.
    """
    def _decode(match):
        return chr(int(match.group(1), 16))

    escaped = escape(content)
    return re.sub(r"\\u([\da-fA-F]{4})", _decode, escaped)
@@ -1 +0,0 @@
1
- from .file import HTMLFile
@@ -1,68 +0,0 @@
1
- from io import StringIO
2
- from typing import cast, Generator, Iterable
3
- from xml.etree.ElementTree import Element
4
- from .texts_searcher import search_texts, TextPosition
5
-
6
-
7
def read_texts(root: Element) -> Generator[str, None, None]:
    """Yield one string per position reported by ``search_texts(root)``.

    A ``TEXT`` position yields the element's ``text``, a ``TAIL`` position
    its ``tail``, and a ``WHOLE_DOM`` position the flattened text produced
    by ``_plain_text``.
    """
    for node, where, _ in search_texts(root):
        if where == TextPosition.TEXT:
            yield cast(str, node.text)
        elif where == TextPosition.TAIL:
            yield cast(str, node.tail)
        elif where == TextPosition.WHOLE_DOM:
            yield _plain_text(node)
15
-
16
def write_texts(root: Element, texts: Iterable[str | Iterable[str] | None], append: bool):
    """Write *texts* onto the positions reported by ``search_texts(root)``.

    Texts and positions are paired in order and then processed from the last
    pair to the first. A ``None`` text leaves its position untouched. With
    ``append`` true the new text is added next to the existing content,
    otherwise it replaces it. A ``WHOLE_DOM`` position without a parent is
    skipped.
    """
    pairs = list(zip(texts, search_texts(root)))
    for text, (element, position, parent) in pairs[::-1]:
        if text is None:
            continue
        if not isinstance(text, str):
            # TODO: implements split text
            text = "".join(text)

        if position == TextPosition.TEXT:
            element.text = _write_text(element.text, text, append)
        elif position == TextPosition.TAIL:
            element.tail = _write_text(element.tail, text, append)
        elif position == TextPosition.WHOLE_DOM and parent is not None:
            _write_dom(parent, element, text, append)
31
-
32
def _write_dom(parent: Element, origin: Element, text: str, append: bool):
    """Write *text* for the whole element *origin*, a child of *parent*.

    With ``append`` true, a new sibling is inserted right after *origin*. It
    copies *origin*'s tag and attributes except ``id``, carries *text*, and
    takes over *origin*'s tail so it sits before the trailing text. If
    *origin* is not among *parent*'s children, nothing is inserted, but the
    tail is still moved off *origin*.

    With ``append`` false, all children of *origin* are removed and its text
    is replaced by *text*.
    """
    if append:
        appended = Element(origin.tag, {**origin.attrib})
        for index, child in enumerate(parent):
            if child == origin:
                parent.insert(index + 1, appended)
                break
        # presumably dropped so the copy does not duplicate the id - confirm
        appended.attrib.pop("id", None)
        appended.text = text
        appended.tail = origin.tail
        origin.tail = None
    else:
        # Iterate over a snapshot: removing children while iterating the
        # element itself skips every other child and leaves some behind.
        for child in list(origin):
            origin.remove(child)
        origin.text = text
47
-
48
- def _write_text(left: str | None, right: str, append: bool) -> str:
49
- if not append:
50
- return right
51
- elif left is None:
52
- return right
53
- else:
54
- return left + right
55
-
56
def _plain_text(target: Element):
    """Concatenate every string yielded by ``_iter_text(target)``."""
    return "".join(_iter_text(target))
61
-
62
def _iter_text(parent: Element):
    """Yield the text pieces of *parent* in document order.

    The element's own ``text`` comes first, then the pieces of each child
    (recursively), then the element's ``tail``. ``None`` values are skipped.
    """
    # NOTE(review): the tail of the outermost element is yielded as well,
    # although it sits after that element's closing tag - confirm callers
    # expect it.
    leading = parent.text
    if leading is not None:
        yield leading
    for node in parent:
        for piece in _iter_text(node):
            yield piece
    trailing = parent.tail
    if trailing is not None:
        yield trailing
@@ -1,23 +0,0 @@
1
- import re
2
-
3
# HTML defines a set of self-closing tags. According to the original note,
# these tags have to be converted to the non-self-closing form because the
# EPUB format does not support them.
# https://www.tutorialspoint.com/which-html-tags-are-self-closing
_EMPTY_TAGS = (
    "br",
    "hr",
    "input",
    "col",
    "base",
    "meta",
    "area",
)

# Group 1 is the tag name, group 2 the attribute text (possibly empty).
# The attribute group is optional so that bare tags such as "<br>" and
# "<br/>" match too, and lazy so that the whitespace in front of "/>" is not
# captured as attribute text (which would double the space in to_xml output).
_EMPTY_TAG_PATTERN = re.compile(
    r"<(" + "|".join(_EMPTY_TAGS) + r")((?:\s[^>]*?)??)\s*/?>"
)


def to_html(content: str) -> str:
    """Rewrite every tag listed in ``_EMPTY_TAGS`` as ``<tag attrs>``,
    dropping any trailing slash."""
    return re.sub(_EMPTY_TAG_PATTERN, lambda m: f"<{m.group(1)}{m.group(2)}>", content)


def to_xml(content: str) -> str:
    """Rewrite every tag listed in ``_EMPTY_TAGS`` as ``<tag attrs />``,
    whether or not it was already self-closed."""
    return re.sub(_EMPTY_TAG_PATTERN, lambda m: f"<{m.group(1)}{m.group(2)} />", content)