epub-translator 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator-0.0.6/PKG-INFO +170 -0
- epub_translator-0.0.6/README.md +145 -0
- epub_translator-0.0.6/epub_translator/translation/__init__.py +2 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translation/splitter.py +6 -5
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translation/store.py +3 -3
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translation/translation.py +31 -30
- epub_translator-0.0.6/epub_translator/translation/types.py +45 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translator.py +1 -2
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/zip_context.py +1 -1
- {epub_translator-0.0.4 → epub_translator-0.0.6}/pyproject.toml +2 -2
- epub_translator-0.0.4/PKG-INFO +0 -44
- epub_translator-0.0.4/README.md +0 -19
- epub_translator-0.0.4/epub_translator/translation/__init__.py +0 -2
- epub_translator-0.0.4/epub_translator/translation/types.py +0 -49
- {epub_translator-0.0.4 → epub_translator-0.0.6}/LICENSE +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/__init__.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/data/format.jinja +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/data/translate.jinja +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/epub/__init__.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/epub/content_parser.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/epub/html/__init__.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/epub/html/dom_operator.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/epub/html/empty_tags.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/epub/html/file.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/epub/html/texts_searcher.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/llm/__init__.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/llm/error.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/llm/executor.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/llm/increasable.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/llm/node.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/template.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translation/chunk.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translation/utils.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/xml/__init__.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/xml/decoder.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/xml/encoder.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/xml/parser.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/xml/tag.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/xml/transform.py +0 -0
- {epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/xml/utils.py +0 -0
epub_translator-0.0.6/PKG-INFO
ADDED
@@ -0,0 +1,170 @@
+Metadata-Version: 2.3
+Name: epub-translator
+Version: 0.0.6
+Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
+License: MIT
+Keywords: epub,llm,translation,translator
+Author: Tao Zeyu
+Author-email: i@taozeyu.com
+Maintainer: Tao Zeyu
+Maintainer-email: i@taozeyu.com
+Requires-Python: >=3.10,<3.13
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: jinja2 (>=3.1.5,<4.0.0)
+Requires-Dist: langchain[openai] (>=0.3.21,<0.4.0)
+Requires-Dist: lxml (>=6.0.0,<7.0.0)
+Requires-Dist: resource-segmentation (>=0.0.3,<0.1.0)
+Requires-Dist: tiktoken (>=0.9.0,<0.10.0)
+Project-URL: Homepage, https://hub.oomol.com/package/book-translator
+Project-URL: Repository, https://github.com/oomol-flows/books-translator
+Description-Content-Type: text/markdown
+
+<div align=center>
+<h1>EPUB Translator</h1>
+<p>
+<a href="https://github.com/oomol-lab/epub-translator/actions/workflows/build.yml" target="_blank"><img src="https://img.shields.io/github/actions/workflow/status/oomol-lab/epub-translator/build.yml" alt"ci" /></a>
+<a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/badge/pip_install-epub--translator-blue" alt="pip install epub-translator" /></a>
+<a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/pypi/v/epub-translator.svg" alt"pypi epub-translator" /></a>
+<a href="https://pypi.org/project/epub-translator/" target="_blank"><img src="https://img.shields.io/pypi/pyversions/epub-translator.svg" alt="python versions" /></a>
+<a href="https://github.com/oomol-lab/epub-translator/blob/main/LICENSE" target="_blank"><img src="https://img.shields.io/github/license/oomol-lab/epub-translator" alt"license" /></a>
+</p>
+<p><a href="https://hub.oomol.com/package/books-translator?open=true" target="_blank"><img src="https://static.oomol.com/assets/button.svg" alt="Open in OOMOL Studio" /></a></p>
+<p>English | <a href="./README_zh-CN.md">中文</a></p>
+</div>
+
+## Introduction
+
+epub-translator uses AI big models to automatically translate EPUB e-books, and retains 100% of the original book's format, illustrations, catalog and layout, while generating bilingual comparison versions for easy language learning or international sharing.
+
+Whether you are a developer, language learner, or e-book lover, epub-translator can help you easily overcome language barriers.
+
+- [x] **Multi-language translation**: Supports translation between mainstream languages such as English, Chinese, Japanese, Spanish, French, and German.
+- [x] **Bilingual comparison**: Generates bilingual EPUBs with top-down comparisons for easy comparison and learning.
+- [x] **Insert prompt words**: Guide AI translation, such as glossary, character name list, etc.
+- [x] **Optional AI model**: Supports mainstream big models such as DeepSeek and ChatGPT.
+- [x] **High-performance parallelism**: AI requests multiple concurrent channels to quickly translate the entire book.
+
+## Environment
+
+You can call EPUB Translator directly as a library, or use [OOMOL Studio](https://oomol.com/) to run it directly.
+
+### Run with OOMOL Studio
+
+OOMOL uses container technology to directly package the dependencies required by EPUB Translator, and it is ready to use out of the box.
+
+[](https://www.youtube.com/watch?v=QsAdiskxfXI)
+
+### Call directly as a library
+
+You can also write python code directly and call it as a library. At this time, you need python 3.10 or higher (3.10.16 is recommended).
+
+```shell
+pip install epub-translator
+```
+
+## Quick start
+
+First, construct the `LLM` object that calls the AI Large Language Model.
+
+```python
+from epub_translator import LLM
+
+llm = LLM(
+  key="<LLM-API-KEY>", # LLM's API key
+  url="https://api.deepseek.com", # LLM's base URL
+  model="deepseek-chat", # LLM's model name
+  token_encoding="o200k_base", # Local model for calculating the number of tokens
+)
+```
+
+Then, you can call the `translate` method to translate.
+
+```python
+from epub_translator import translate, Language
+
+translate(
+  llm=llm, # llm object constructed in the previous step
+  source_path="/path/to/epub/file", # Original EPUB file to be translated
+  translated_path="/path/to/translated/epub/file", # Path to save the translated EPUB
+  target_language=Language.ENGLISH, # Target language for translation, in this case English.
+)
+```
+
+After calling this method, the translation can be inserted under the original text while retaining the EPUB format.
+
+
+
+## Function
+
+### Save translation progress
+
+Calling `translate` to translate the entire EPUB e-book takes a long time, and this process may be interrupted for various reasons. For example, when calling LLM, an error is reported and the process is interrupted due to network reasons, or the user can't wait and manually interrupts the process.
+
+EPUB Translator can cache the translated content as a local file, so that when translating the same book, the translation progress can be saved and the progress can be restored from the last translation interruption.
+
+Just configure the `working_path` field when calling `translate` and specify a path to cache the files generated by the translation. The next time it is started, EPUB Translator will try to read the translation progress from this path in advance.
+
+```python
+translate(
+  ..., # other parameters
+  working_path="/path/to/cache/translating/files",
+)
+```
+
+Please note that each call to the `translate` method will write a cache file to the folder where the `workspace_path` is located. This will cause the folder to grow larger and larger. You need to handle it yourself, for example, automatically clear the folder after the translation is successful.
+
+### Monitor translation progress
+
+When calling `translate`, pass a callback function through `report_progress`, and receive a `float` type parameter representing the progress from 0.0 to 1.0, so that the translation progress of the whole book can be monitored.
+
+```python
+from tqdm import tqdm
+from epub_translator import translate
+
+with tqdm(total=1.0, desc="Translating") as bar:
+  def refresh_progress(progress: float) -> None:
+    bar.n = progress
+    bar.refresh()
+
+  translate(
+    ..., # other parameters
+    report_progress=refresh_progress,
+  )
+```
+
+### Insert prompt words
+
+Insert prompt words to guide the AI language model on how to translate. For example, you can insert a glossary so that AI can unify the terms when translating. Just add the `user_prompt` field when calling `translate`.
+
+```python
+translate(
+  ..., # other parameters
+  user_prompt='Le Petit Prince should be translated as "Little Prince".',
+)
+```
+
+### Large Language Model Parameters
+
+There are more configuration options when building the `LLM` object.
+
+```python
+llm = LLM(
+  key="<LLM-API-KEY>", # LLM's API key
+  url="https://api.deepseek.com", # LLM's base URL
+  model="deepseek-chat", # LLM's model name
+  token_encoding="o200k_base", # Local model for calculating the number of tokens
+  timeout=60.0, # Request timeout (in seconds)
+  top_p=0.6, # Creativity
+  temperature=0.85, # Temperature
+  retry_times=5, # Retry times. If the request still fails after this number, an error will be reported
+  retry_interval_seconds=6.0, # Retry interval (in seconds)
+)
+```
+
+## Related open source libraries
+
+[PDF Craft](https://github.com/oomol-lab/pdf-craft) can convert PDF files into various other formats. This project will focus on the processing of PDF files of scanned books. Use this library with the scanned PDF books to convert and translate them. For more information, please refer to [Video: Convert scanned PDF books to EPUB format and translate them into bilingual books](https://www.bilibili.com/video/BV1tMQZY5EYY/).
epub_translator-0.0.6/README.md
ADDED
@@ -0,0 +1,145 @@
+(145 added lines: the Markdown body of the new README, identical to lines 26-170 of the new PKG-INFO above, from "<div align=center>" through the "Related open source libraries" section.)
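The new README documents `LLM` construction, the `translate` call, and the optional `working_path`, `report_progress`, and `user_prompt` parameters in separate snippets. Below is a minimal end-to-end sketch assembled from those snippets; the API key, URLs, and file paths are placeholders, and nothing in it goes beyond what the README above documents.

```python
from epub_translator import LLM, translate, Language

# Placeholder credentials and paths; substitute real values.
llm = LLM(
    key="<LLM-API-KEY>",
    url="https://api.deepseek.com",
    model="deepseek-chat",
    token_encoding="o200k_base",
)

def report(progress: float) -> None:
    # The callback receives a float from 0.0 to 1.0, per the README.
    print(f"translated {progress * 100.0:.1f}%")

translate(
    llm=llm,
    source_path="/path/to/original.epub",
    translated_path="/path/to/translated.epub",
    target_language=Language.ENGLISH,
    working_path="/path/to/cache",   # optional: resumable progress cache
    report_progress=report,          # optional: progress callback
    user_prompt='Le Petit Prince should be translated as "Little Prince".',  # optional glossary hint
)
```

The code-level changes that back this 0.0.6 release follow.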
{epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translation/splitter.py
@@ -2,7 +2,7 @@ from typing import Iterator, Generator
 from resource_segmentation import split, Resource, Segment
 
 from ..llm import LLM
-from .types import Fragment
+from .types import Fragment, Incision
 from .chunk import ChunkRange
 
 
@@ -12,20 +12,21 @@ def split_into_chunks(llm: LLM, fragments_iter: Iterator[Fragment], max_chunk_to
     max_segment_count=max_chunk_tokens_count,
     gap_rate=0.15,
     tail_rate=0.5,
+    border_incision=Incision.IMPOSSIBLE,
   )):
     head_index: int
     tail_index: int
     fragments_count: int
-    body_index, body_end_index, body_tokens_count =
+    body_index, body_end_index, body_tokens_count = _group_part(group.body)
 
     if group.head:
-      head_index, head_end_index, _ =
+      head_index, head_end_index, _ = _group_part(group.head)
       assert head_end_index + 1 == body_index, "Head must be continuous with body"
     else:
       head_index = body_index
 
     if group.tail:
-      tail_index, tail_end_index, _ =
+      tail_index, tail_end_index, _ = _group_part(group.tail)
       fragments_count = tail_end_index - head_index + 1
       assert body_end_index + 1 == tail_index, "Body must be continuous with tail"
     else:
@@ -52,7 +53,7 @@ def _gen_resources(llm: LLM, fragments_iter: Iterator[Fragment]) -> Generator[Re
       payload=index,
     )
 
-def
+def _group_part(target: list[Resource[int] | Segment[int]]) -> tuple[int, int, int]:
  start_index: int | None = None
  previous_index: int = 0
  tokens_count: int = 0
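The behavioral change here is `border_incision=Incision.IMPOSSIBLE`, which (judging from the name) sets the incision value `resource_segmentation.split` assumes at the outer borders of the fragment stream. For orientation, a small sketch of the `Fragment`/`Incision` types this module now consumes; they are defined in the new `types.py` shown later in this diff, the import path follows that file's location, and the sample texts are invented for illustration.

```python
from epub_translator.translation.types import Fragment, Incision

# Two illustrative fragments: a cut is allowed after the first one,
# but the second must stay attached to whatever precedes it.
fragments = [
    Fragment(
        text="Chapter 1. The stranger arrived at dusk.",
        start_incision=Incision.MUST_BE,
        end_incision=Incision.MOST_LIKELY,
    ),
    Fragment(
        text="He carried a single, battered suitcase.",
        start_incision=Incision.IMPOSSIBLE,
        end_incision=Incision.UNCERTAIN,
    ),
]

for fragment in fragments:
    print(fragment.text, fragment.start_incision.name, fragment.end_incision.name)
```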
{epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translation/store.py
@@ -1,6 +1,6 @@
 from shutil import rmtree
 from pathlib import Path
-from typing import
+from typing import Iterable
 
 
 class Store:
@@ -14,7 +14,7 @@ class Store:
     with file_path.open("r", encoding="utf-8") as file:
       return file.read().split("\n")
 
-  def put(self, chunk_hash: bytes,
+  def put(self, chunk_hash: bytes, lines: Iterable[str]):
     file_path = self._file_path(chunk_hash)
     if file_path.exists():
       if file_path.is_file():
@@ -25,7 +25,7 @@ class Store:
     file_path.parent.mkdir(parents=True, exist_ok=True)
     with file_path.open("w", encoding="utf-8") as file:
       is_first_line = True
-      for line in
+      for line in lines:
        if is_first_line:
          is_first_line = False
        else:
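The `Store` change is purely about typing: `put` now declares `lines: Iterable[str]`, matching how `get` reads the cached file back and splits it on newlines. A rough usage sketch follows; only `put` and `get` appear in this diff, so the constructor argument is an assumption.

```python
from hashlib import sha256

from epub_translator.translation.store import Store

store = Store("/tmp/epub-translator-cache")  # constructor signature assumed, not shown in this diff
chunk_hash = sha256(b"chapter-1").digest()

# put() writes the lines separated by "\n"; get() reads the file and splits on "\n".
store.put(chunk_hash, ["first translated line", "second translated line"])
assert store.get(chunk_hash) == ["first translated line", "second translated line"]
```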
{epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translation/translation.py
@@ -1,3 +1,4 @@
+from math import ceil
 from typing import Callable, Iterator, Generator
 from pathlib import Path
 from concurrent.futures import as_completed, ThreadPoolExecutor
@@ -71,7 +72,7 @@ def _sort_translated_texts_by_chunk(
   target: Iterator[tuple[Chunk, list[str]]],
   total_tokens_count: int,
   report_progress: ProgressReporter,
-) ->
+) -> Generator[str, None, None]:
 
   buffer: list[tuple[Chunk, list[str]]] = []
   wanna_next_index: int = 0
@@ -100,39 +101,39 @@ def _sort_translated_texts_by_chunk(
 
 def _translate_chunk(
   llm: LLM,
-  store: Store,
+  store: Store | None,
   chunk: Chunk,
   target_language: Language,
   user_prompt: str | None,
 ) -> list[str]:
 
-
-
+  translated_texts: list[str] | None = None
+  source_texts = chunk.head + chunk.body + chunk.tail
+  if store is not None:
+    translated_texts = store.get(chunk.hash)
+    if translated_texts is not None and \
+       len(source_texts) != len(translated_texts):
+      translated_texts = None
+      print(f"Warning: Mismatched lengths in cached translation for chunk: {chunk.hash.hex()}",)
+
+  if translated_texts is None:
+    translated_texts = [
+      clean_spaces(text)
+      for text in _translate_texts(
+        llm=llm,
+        texts=source_texts,
+        texts_tokens=chunk.tokens_count,
+        target_language=target_language,
+        user_prompt=user_prompt,
+      )
+    ]
   if store is not None:
-
-
-
-
-
-
-  if translated_texts is None:
-    translated_texts = [
-      clean_spaces(text)
-      for text in _translate_texts(
-        llm=llm,
-        texts=source_texts,
-        texts_tokens=chunk.tokens_count,
-        target_language=target_language,
-        user_prompt=user_prompt,
-      )
-    ]
-    if store is not None:
-      store.put(chunk.hash, translated_texts)
-
-  head_length = len(chunk.head)
-  translated_texts = translated_texts[head_length:head_length + len(chunk.body)]
-
-  return translated_texts
+    store.put(chunk.hash, translated_texts)
+
+  head_length = len(chunk.head)
+  translated_texts = translated_texts[head_length:head_length + len(chunk.body)]
+
+  return translated_texts
 
 _PLAIN_TEXT_SCALE = 2.0
 _XML_TEXT_SCALE = 2.5
@@ -158,7 +159,7 @@ def _translate_texts(
     text_tag="TXT",
     user_data=user_data,
     parser=lambda r: r,
-    max_tokens=texts_tokens * _PLAIN_TEXT_SCALE,
+    max_tokens=ceil(texts_tokens * _PLAIN_TEXT_SCALE),
     params={
       "target_language": language_chinese_name(target_language),
       "user_prompt": user_prompt,
@@ -179,7 +180,7 @@ def _translate_texts(
   return llm.request_xml(
     template_name="format",
     user_data=request_text,
-    max_tokens=texts_tokens * _XML_TEXT_SCALE,
+    max_tokens=ceil(texts_tokens * _XML_TEXT_SCALE),
     parser=lambda r: _parse_translated_response(r, len(texts)),
     params={
       "target_language": language_chinese_name(target_language),
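The main change in `_translate_chunk` is that cached translations are now validated before use: `store` may be `None`, and a cached entry is discarded (with a warning) when its line count no longer matches the chunk's `head + body + tail`. Below is a standalone sketch of just that validation step; the function name and message are illustrative, not part of the package API.

```python
def use_cached_translation(
    source_texts: list[str],
    cached: list[str] | None,
) -> list[str] | None:
    """Return the cached lines only if they still line up 1:1 with the source texts."""
    if cached is None:
        return None
    if len(cached) != len(source_texts):
        # Mirrors the warning printed in _translate_chunk when a stale cache entry is found.
        print("Warning: mismatched lengths in cached translation, re-translating chunk")
        return None
    return cached

# A stale one-line cache entry for a two-line chunk is rejected; a matching one is reused.
assert use_cached_translation(["a", "b"], ["only one line"]) is None
assert use_cached_translation(["a", "b"], ["A", "B"]) == ["A", "B"]
```

The `ceil()` wrappers in the same file simply keep `max_tokens` an integer after the float scale factors (`_PLAIN_TEXT_SCALE`, `_XML_TEXT_SCALE`) are applied.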
epub_translator-0.0.6/epub_translator/translation/types.py
ADDED
@@ -0,0 +1,45 @@
+from enum import Enum, IntEnum
+from dataclasses import dataclass
+
+
+class Incision(IntEnum):
+  MUST_BE = 3
+  MOST_LIKELY = 2
+  IMPOSSIBLE = 0
+  UNCERTAIN = 1
+
+@dataclass
+class Fragment:
+  text: str
+  start_incision: Incision
+  end_incision: Incision
+
+class Language(Enum):
+  SIMPLIFIED_CHINESE = "zh-Hans"
+  TRADITIONAL_CHINESE = "zh-Hant"
+  ENGLISH = "en"
+  FRENCH = "fr"
+  GERMAN = "de"
+  SPANISH = "es"
+  RUSSIAN = "ru"
+  ITALIAN = "it"
+  PORTUGUESE = "pt"
+  JAPANESE = "ja"
+  KOREAN = "ko"
+
+_LANGUAGE_NAMES = {
+  Language.SIMPLIFIED_CHINESE: "简体中文",
+  Language.TRADITIONAL_CHINESE: "繁体中文",
+  Language.ENGLISH: "英语",
+  Language.FRENCH: "法语",
+  Language.GERMAN: "德语",
+  Language.SPANISH: "西班牙语",
+  Language.RUSSIAN: "俄语",
+  Language.ITALIAN: "意大利语",
+  Language.PORTUGUESE: "葡萄牙语",
+  Language.JAPANESE: "日语",
+  Language.KOREAN: "韩语",
+}
+
+def language_chinese_name(language: Language) -> str:
+  return _LANGUAGE_NAMES[language]
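The new `types.py` re-homes `Incision` as a package-local `IntEnum` (previously imported from `resource_segmentation`) and replaces the old if/elif chain with a lookup table. A quick sketch exercising the shown API; the import path follows the file's location in this diff.

```python
from epub_translator.translation.types import Incision, Language, language_chinese_name

# Incision is an IntEnum, so cut-point preferences compare numerically.
assert Incision.MUST_BE > Incision.UNCERTAIN > Incision.IMPOSSIBLE

# language_chinese_name maps each Language member to the Chinese label used in the prompt templates.
print(language_chinese_name(Language.ENGLISH))   # -> 英语
print(language_chinese_name(Language.JAPANESE))  # -> 日语
```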
{epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/translator.py
@@ -2,12 +2,11 @@ from os import PathLike
 from pathlib import Path
 from tempfile import mkdtemp
 from shutil import rmtree
-from resource_segmentation import Incision
 
 from .llm import LLM
 from .epub import HTMLFile
 from .zip_context import ZipContext
-from .translation import translate as _translate, Fragment, Language, ProgressReporter
+from .translation import translate as _translate, Incision, Fragment, Language, ProgressReporter
 
 
 def translate(
{epub_translator-0.0.4 → epub_translator-0.0.6}/epub_translator/zip_context.py
@@ -20,7 +20,7 @@ class ZipContext:
       file.write(source.read())
 
     self._temp_dir: Path = temp_dir
-    self._epub_content: EpubContent = EpubContent(temp_dir)
+    self._epub_content: EpubContent = EpubContent(str(temp_dir))
 
   def archive(self, saved_path: Path):
     with zipfile.ZipFile(saved_path, "w") as zip_file:
{epub_translator-0.0.4 → epub_translator-0.0.6}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "epub-translator"
-version = "0.0.4"
+version = "0.0.6"
 description = "Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text."
 keywords=["epub", "llm", "translation", "translator"]
 authors = [
@@ -16,7 +16,7 @@ dependencies = [
   "lxml (>=6.0.0,<7.0.0)",
   "tiktoken (>=0.9.0,<0.10.0)",
   "jinja2 (>=3.1.5,<4.0.0)",
-  "resource-segmentation (==0.0.2)",
+  "resource-segmentation (>=0.0.3,<0.1.0)",
   "langchain[openai] (>=0.3.21,<0.4.0)",
 ]
epub_translator-0.0.4/PKG-INFO
DELETED
@@ -1,44 +0,0 @@
-Metadata-Version: 2.3
-Name: epub-translator
-Version: 0.0.4
-Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
-License: MIT
-Keywords: epub,llm,translation,translator
-Author: Tao Zeyu
-Author-email: i@taozeyu.com
-Maintainer: Tao Zeyu
-Maintainer-email: i@taozeyu.com
-Requires-Python: >=3.10,<3.13
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: jinja2 (>=3.1.5,<4.0.0)
-Requires-Dist: langchain[openai] (>=0.3.21,<0.4.0)
-Requires-Dist: lxml (>=6.0.0,<7.0.0)
-Requires-Dist: resource-segmentation (==0.0.2)
-Requires-Dist: tiktoken (>=0.9.0,<0.10.0)
-Project-URL: Homepage, https://hub.oomol.com/package/book-translator
-Project-URL: Repository, https://github.com/oomol-flows/books-translator
-Description-Content-Type: text/markdown
-
-# epub-translator
-
-Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
-
-## Field Description
-
-- `file`: the epub file to be translated.
-- `title`: the title of the book to be translated (original language)
-- `max_translating_group`: the maximum amount of translation text submitted each time. Books will be submitted in chunks during translation, and this value will limit the maximum length of each chunk.
-- `max_translating_group_unit`: the unit of the `max_translating_group_unit`.
-- `source`: the language of the book to be translated.
-- `target`: the target language you want to translate it into.
-- `llm_api`: the LLM API format used for translation.
-- `model`: the model used for translation
-- `url`: the URL of the LLM
-- `api_key`: the Key of the LLM
-- `temperature`: the temperature of the LLM, which is a parameter used to control the randomness of the generated text. In simple terms, the lower the temperature value, the more certain and conservative the text generated by the model. The higher the temperature value, the more random and diverse the text generated by the model.
-- `timeout`: the request timeout, in seconds.
-- `binary`: the translated target epub file content.
epub_translator-0.0.4/README.md
DELETED
@@ -1,19 +0,0 @@
-(19 removed lines: the old README body, identical to lines 26-44 of the deleted PKG-INFO above, from "# epub-translator" through the "Field Description" list.)
epub_translator-0.0.4/epub_translator/translation/types.py
DELETED
@@ -1,49 +0,0 @@
-from enum import Enum
-from dataclasses import dataclass
-from resource_segmentation import Incision
-
-
-@dataclass
-class Fragment:
-  text: str
-  start_incision: Incision
-  end_incision: Incision
-
-class Language(Enum):
-  SIMPLIFIED_CHINESE = "zh-Hans"
-  TRADITIONAL_CHINESE = "zh-Hant"
-  ENGLISH = "en"
-  FRENCH = "fr"
-  GERMAN = "de"
-  SPANISH = "es"
-  RUSSIAN = "ru"
-  ITALIAN = "it"
-  PORTUGUESE = "pt"
-  JAPANESE = "ja"
-  KOREAN = "ko"
-
-def language_chinese_name(language: Language) -> str:
-  if language == Language.SIMPLIFIED_CHINESE:
-    return "简体中文"
-  elif language == Language.TRADITIONAL_CHINESE:
-    return "繁体中文"
-  elif language == Language.ENGLISH:
-    return "英语"
-  elif language == Language.FRENCH:
-    return "法语"
-  elif language == Language.GERMAN:
-    return "德语"
-  elif language == Language.SPANISH:
-    return "西班牙语"
-  elif language == Language.RUSSIAN:
-    return "俄语"
-  elif language == Language.ITALIAN:
-    return "意大利语"
-  elif language == Language.PORTUGUESE:
-    return "葡萄牙语"
-  elif language == Language.JAPANESE:
-    return "日语"
-  elif language == Language.KOREAN:
-    return "韩语"
-  else:
-    raise ValueError(f"Unknown language: {language}")
All remaining files listed above with +0 -0 are unchanged between 0.0.4 and 0.0.6.