epub-translator 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries. Note that the upstream diff view truncates some removed lines; those lines are reproduced below only as far as they are shown.
- {epub_translator-0.0.2 → epub_translator-0.0.4}/PKG-INFO +1 -1
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/llm/executor.py +4 -1
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/llm/node.py +4 -0
- epub_translator-0.0.4/epub_translator/translation/__init__.py +2 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translation/chunk.py +6 -8
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translation/store.py +2 -3
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translation/translation.py +55 -17
- epub_translator-0.0.4/epub_translator/translation/types.py +49 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translator.py +23 -15
- {epub_translator-0.0.2 → epub_translator-0.0.4}/pyproject.toml +1 -1
- epub_translator-0.0.2/epub_translator/translation/__init__.py +0 -2
- epub_translator-0.0.2/epub_translator/translation/types.py +0 -23
- {epub_translator-0.0.2 → epub_translator-0.0.4}/LICENSE +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/README.md +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/__init__.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/data/format.jinja +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/data/translate.jinja +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/epub/__init__.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/epub/content_parser.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/epub/html/__init__.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/epub/html/dom_operator.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/epub/html/empty_tags.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/epub/html/file.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/epub/html/texts_searcher.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/llm/__init__.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/llm/error.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/llm/increasable.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/template.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translation/splitter.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translation/utils.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/xml/__init__.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/xml/decoder.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/xml/encoder.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/xml/parser.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/xml/tag.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/xml/transform.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/xml/utils.py +0 -0
- {epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/zip_context.py +0 -0
{epub_translator-0.0.2 → epub_translator-0.0.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-translator
-Version: 0.0.2
+Version: 0.0.4
 Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
 License: MIT
 Keywords: epub,llm,translation,translator
{epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/llm/executor.py

@@ -38,7 +38,7 @@ class LLMExecutor:
             timeout=timeout,
         )

-    def request(self, input: LanguageModelInput, parser: Callable[[str], Any]) -> Any:
+    def request(self, input: LanguageModelInput, parser: Callable[[str], Any], max_tokens: int | None) -> Any:
         result: Any | None = None
         last_error: Exception | None = None
         did_success = False

@@ -56,6 +56,7 @@ class LLMExecutor:
                 input=input,
                 top_p=top_p.current,
                 temperature=temperature.current,
+                max_tokens=max_tokens,
             )
             if logger is not None:
                 logger.debug(f"[[Response]]:\n{response}\n")

@@ -133,12 +134,14 @@ class LLMExecutor:
         input: LanguageModelInput,
         top_p: float | None,
         temperature: float | None,
+        max_tokens: int | None,
     ):
         stream = self._model.stream(
             input=input,
             timeout=self._timeout,
             top_p=top_p,
             temperature=temperature,
+            max_tokens=max_tokens,
         )
         buffer = StringIO()
         for chunk in stream:
{epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/llm/node.py

@@ -80,6 +80,7 @@ class LLM:
         text_tag: str,
         user_data: Element | str,
         parser: Callable[[str], R],
+        max_tokens: int | None = None,
         params: dict[str, Any] | None = None,
     ) -> R:

@@ -95,6 +96,7 @@ class LLM:
         return self._executor.request(
             input=self._create_input(template_name, user_data, params),
             parser=parse_response,
+            max_tokens=max_tokens,
         )

     def request_xml(

@@ -102,6 +104,7 @@ class LLM:
         template_name: str,
         user_data: Element | str,
         parser: Callable[[Element], R],
+        max_tokens: int | None = None,
         params: dict[str, Any] | None = None,
     ) -> R:

@@ -117,6 +120,7 @@ class LLM:
         return self._executor.request(
             input=self._create_input(template_name, user_data, params),
             parser=parse_response,
+            max_tokens=max_tokens,
         )

     def _create_input(self, template_name: str, user_data: Element | str, params: dict[str, Any]):
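Both files above are plumbing for the same feature: `request_text` and `request_xml` now accept an optional `max_tokens`, which `LLMExecutor.request` forwards into the underlying `model.stream(...)` call, so the model itself caps output length on every retry attempt. A minimal, self-contained sketch of that flow (`FakeModel` and its word-count "tokens" are illustrative stand-ins, not epub-translator code):

```python
from io import StringIO
from typing import Any, Callable, Iterator


class FakeModel:
    """Illustrative stand-in for the underlying chat-model client."""

    def stream(self, input: str, max_tokens: int | None) -> Iterator[str]:
        # A real client stops decoding once max_tokens is reached; here we
        # fake tokens as whitespace-separated words.
        words = input.split()
        yield " ".join(words if max_tokens is None else words[:max_tokens])


def request(
    model: FakeModel,
    input: str,
    parser: Callable[[str], Any],
    max_tokens: int | None = None,
) -> Any:
    # Mirrors the 0.0.4 shape: the cap rides along with every attempt and is
    # enforced by the model, not by truncating the buffered response.
    buffer = StringIO()
    for chunk in model.stream(input=input, max_tokens=max_tokens):
        buffer.write(chunk)
    return parser(buffer.getvalue())


print(request(FakeModel(), "alpha beta gamma delta", lambda r: r, max_tokens=2))
# -> "alpha beta"
```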
{epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translation/chunk.py

@@ -2,7 +2,7 @@ from dataclasses import dataclass
 from typing import Iterator, Iterable, Generator
 from hashlib import sha512
 from ..llm import LLM
-from .types import Fragment
+from .types import Fragment, Language


 @dataclass

@@ -30,6 +30,7 @@ class ChunkRange:

 def match_fragments(
     llm: LLM,
+    target_language: Language,
     chunk_ranges_iter: Iterator[ChunkRange],
     fragments_iter: Iterator[Fragment],
 ) -> Generator[Chunk, None, None]:

@@ -44,7 +45,7 @@ def match_fragments(
         body = texts[head_length:head_length + body_length]
         tail = texts[head_length + body_length:]

-        hash = _hash_texts_list((head, body, tail))
+        hash = _hash_texts_list(target_language, (head, body, tail))
         head = _crop_extra_texts(llm, head, True, range.head_remain_tokens)
         tail = _crop_extra_texts(llm, tail, False, range.tail_remain_tokens)

@@ -88,15 +89,12 @@ def _match_range_and_texts(

     yield from matched_chunk_ranges

-def _hash_texts_list(texts_iterable: Iterable[list[str]]) -> bytes:
-    is_first = True
+def _hash_texts_list(target_language: Language, texts_iterable: Iterable[list[str]]) -> bytes:
     m = sha512()
+    m.update(target_language.value.encode("utf-8"))
     for texts in texts_iterable:
         for text in texts:
-            if is_first:
-                is_first = False
-            else:
-                m.update(b"\x00")
+            m.update(b"\x00")
             m.update(text.encode("utf-8"))
     return m.digest()
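The hashing change matters for the translation cache: in 0.0.2 the chunk digest covered only the source texts, so switching the target language could serve a cached translation in the wrong language. A sketch of the new keying, mirroring `_hash_texts_list` (the `language_tag` strings play the role of `Language.value`):

```python
from hashlib import sha512
from typing import Iterable


def hash_texts(language_tag: str, texts_iterable: Iterable[list[str]]) -> bytes:
    m = sha512()
    # Mixing the target language in first keys the cache per language.
    m.update(language_tag.encode("utf-8"))
    for texts in texts_iterable:
        for text in texts:
            m.update(b"\x00")  # separator, so ["ab"] and ["a", "b"] hash differently
            m.update(text.encode("utf-8"))
    return m.digest()


chunk = (["Hello"], ["world"], [])
assert hash_texts("fr", chunk) != hash_texts("de", chunk)
```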
{epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translation/store.py

@@ -1,7 +1,6 @@
 from shutil import rmtree
 from pathlib import Path
 from typing import Iterator
-from .utils import clean_spaces


 class Store:

@@ -13,7 +12,7 @@ class Store:
         if not file_path.exists() or not file_path.is_file():
             return None
         with file_path.open("r", encoding="utf-8") as file:
-            return
+            return file.read().split("\n")

     def put(self, chunk_hash: bytes, lines_iter: Iterator[str]):
         file_path = self._file_path(chunk_hash)

@@ -31,7 +30,7 @@ class Store:
                 is_first_line = False
             else:
                 file.write("\n")
-            file.write(
+            file.write(line)

     def _file_path(self, chunk_hash: bytes) -> Path:
         return self._directory / f"{chunk_hash.hex()}.chunk"
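These two fixes restore the cache round-trip contract: `put()` writes the lines joined by newlines, and `get()` must split the file back into exactly the same list. A quick check of that contract, using a throwaway file rather than the package's `Store`:

```python
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    path = Path(tmp) / "cafe1234.chunk"
    lines = ["first line", "second line", ""]  # an empty trailing line survives too
    path.write_text("\n".join(lines), encoding="utf-8")           # what put() produces
    assert path.read_text(encoding="utf-8").split("\n") == lines  # what get() returns
    print("round-trip ok")
```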
{epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translation/translation.py

@@ -6,7 +6,7 @@ from xml.etree.ElementTree import Element
 from ..llm import LLM
 from ..xml import encode_friendly

-from .types import Fragment, Language
+from .types import language_chinese_name, Fragment, Language
 from .store import Store
 from .splitter import split_into_chunks
 from .chunk import match_fragments, Chunk

@@ -46,12 +46,23 @@ def translate(
         )))
         for chunk in match_fragments(
             llm=llm,
+            target_language=target_language,
             chunk_ranges_iter=iter(chunk_ranges),
             fragments_iter=gen_fragments_iter(),
         )
     ]
+    def _generate_chunks_from_futures():
+        try:
+            for future in as_completed(futures):
+                yield future.result()
+        except Exception as err:
+            for future in futures:
+                if not future.done():
+                    future.cancel()
+            raise err
+
     yield from _sort_translated_texts_by_chunk(
-        target=(
+        target=_generate_chunks_from_futures(),
         total_tokens_count=sum(chunk.tokens_count for chunk in chunk_ranges),
         report_progress=report_progress,
     )

@@ -96,27 +107,40 @@ def _translate_chunk(
 ) -> list[str]:

     translated_texts: list[str] | None = None
+    source_texts = chunk.head + chunk.body + chunk.tail
     if store is not None:
         translated_texts = store.get(chunk.hash)
+        if translated_texts is not None and \
+           len(source_texts) != len(translated_texts):
+            translated_texts = None
+            print(f"Warning: Mismatched lengths in cached translation for chunk: {chunk.hash.hex()}",)

     if translated_texts is None:
-        translated_texts =
-
-
-
-
-
-
-
+        translated_texts = [
+            clean_spaces(text)
+            for text in _translate_texts(
+                llm=llm,
+                texts=source_texts,
+                texts_tokens=chunk.tokens_count,
+                target_language=target_language,
+                user_prompt=user_prompt,
+            )
+        ]
+        if store is not None:
+            store.put(chunk.hash, translated_texts)

     head_length = len(chunk.head)
     translated_texts = translated_texts[head_length:head_length + len(chunk.body)]

     return translated_texts

+_PLAIN_TEXT_SCALE = 2.0
+_XML_TEXT_SCALE = 2.5
+
 def _translate_texts(
     llm: LLM,
     texts: list[str],
+    texts_tokens: int,
     target_language: Language,
     user_prompt: str | None,
 ) -> list[str]:

@@ -134,8 +158,9 @@ def _translate_texts(
         text_tag="TXT",
         user_data=user_data,
         parser=lambda r: r,
+        max_tokens=texts_tokens * _PLAIN_TEXT_SCALE,
         params={
-            "target_language": target_language,
+            "target_language": language_chinese_name(target_language),
             "user_prompt": user_prompt,
         },
     )

@@ -154,12 +179,15 @@ def _translate_texts(
     return llm.request_xml(
         template_name="format",
         user_data=request_text,
-
+        max_tokens=texts_tokens * _XML_TEXT_SCALE,
         parser=lambda r: _parse_translated_response(r, len(texts)),
+        params={
+            "target_language": language_chinese_name(target_language),
+        },
     )

 def _parse_translated_response(resp_element: Element, sources_count: int) -> list[str]:
-
+    fragments: list[str | None] = [None] * sources_count
     for fragment_element in resp_element:
         if fragment_element.text is None:
             continue

@@ -167,11 +195,21 @@ def _parse_translated_response(resp_element: Element, sources_count: int) -> list[str]:
         if id is None:
             continue
         index = int(id) - 1
-        if index < 0 or index >= len(
+        if index < 0 or index >= len(fragments):
             raise ValueError(f"invalid fragment id: {id}")
-
-
-
+        fragments[index] = fragment_element.text.strip()
+
+    # Sometimes the LLM fuses several segments into one; push each translation as late as possible and leave the blank segments in front.
+    # That way one large translated block corresponds to several short source segments, which reads better.
+    for i in range(len(fragments)):
+        fragment = fragments[i]
+        if fragment is not None and i < len(fragments) - 1:
+            next_fragment = fragments[i + 1]
+            if next_fragment is None:
+                fragments[i] = None
+                fragments[i + 1] = fragment
+
+    return [f or "" for f in fragments]

 def _normalize_user_input(user_lines: list[str]) -> str | None:
     empty_lines_count: int = 0
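Two behaviors above deserve a closer look. The new `_generate_chunks_from_futures` cancels any still-pending futures as soon as one chunk fails, instead of letting the pool keep burning API calls. And the blank-segment pass in `_parse_translated_response` handles the case where the LLM fuses several source segments into one translation: a single left-to-right sweep carries each recovered translation across any run of empty slots, so blanks pile up in front. A standalone sketch of that sweep:

```python
def shift_translations_right(fragments: list[str | None]) -> list[str]:
    # Same single pass as _parse_translated_response: a value hops one slot
    # right per iteration, so it travels across a whole run of None gaps.
    for i in range(len(fragments)):
        fragment = fragments[i]
        if fragment is not None and i < len(fragments) - 1:
            if fragments[i + 1] is None:
                fragments[i] = None
                fragments[i + 1] = fragment
    return [f or "" for f in fragments]


print(shift_translations_right(["merged translation", None, None]))
# -> ["", "", "merged translation"]
print(shift_translations_right(["a", None, "b"]))
# -> ["", "a", "b"]
```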
epub_translator-0.0.4/epub_translator/translation/types.py (new file)

@@ -0,0 +1,49 @@
+from enum import Enum
+from dataclasses import dataclass
+from resource_segmentation import Incision
+
+
+@dataclass
+class Fragment:
+    text: str
+    start_incision: Incision
+    end_incision: Incision
+
+class Language(Enum):
+    SIMPLIFIED_CHINESE = "zh-Hans"
+    TRADITIONAL_CHINESE = "zh-Hant"
+    ENGLISH = "en"
+    FRENCH = "fr"
+    GERMAN = "de"
+    SPANISH = "es"
+    RUSSIAN = "ru"
+    ITALIAN = "it"
+    PORTUGUESE = "pt"
+    JAPANESE = "ja"
+    KOREAN = "ko"
+
+def language_chinese_name(language: Language) -> str:
+    if language == Language.SIMPLIFIED_CHINESE:
+        return "简体中文"
+    elif language == Language.TRADITIONAL_CHINESE:
+        return "繁体中文"
+    elif language == Language.ENGLISH:
+        return "英语"
+    elif language == Language.FRENCH:
+        return "法语"
+    elif language == Language.GERMAN:
+        return "德语"
+    elif language == Language.SPANISH:
+        return "西班牙语"
+    elif language == Language.RUSSIAN:
+        return "俄语"
+    elif language == Language.ITALIAN:
+        return "意大利语"
+    elif language == Language.PORTUGUESE:
+        return "葡萄牙语"
+    elif language == Language.JAPANESE:
+        return "日语"
+    elif language == Language.KOREAN:
+        return "韩语"
+    else:
+        raise ValueError(f"Unknown language: {language}")
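The rewritten module splits the two jobs the deleted 0.0.2 enum (further below) conflated: `Language` values are now stable BCP 47-style tags, which is what `_hash_texts_list` feeds into the cache digest, while the Chinese display names the prompt templates expect come from `language_chinese_name`. Assuming the layout in this diff, usage looks like:

```python
from epub_translator.translation.types import Language, language_chinese_name

print(Language.FRENCH.value)                   # "fr": a stable, cache-safe tag
print(language_chinese_name(Language.FRENCH))  # "法语": the display name for prompts
```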
{epub_translator-0.0.2 → epub_translator-0.0.4}/epub_translator/translator.py

@@ -2,11 +2,12 @@ from os import PathLike
 from pathlib import Path
 from tempfile import mkdtemp
 from shutil import rmtree
+from resource_segmentation import Incision

 from .llm import LLM
 from .epub import HTMLFile
 from .zip_context import ZipContext
-from .translation import translate as _translate, Fragment,
+from .translation import translate as _translate, Fragment, Language, ProgressReporter


 def translate(

@@ -63,6 +64,7 @@ class _Translator:
         try:
             temp_dir = _clean_path(working_path / "temp")
             temp_dir.mkdir(parents=True, exist_ok=True)
+            cache_path = working_path / "cache"

             context = ZipContext(
                 epub_path=Path(source_path),

@@ -70,11 +72,12 @@ class _Translator:
             )
             context.replace_ncx(lambda texts: self._translate_ncx(
                 texts=texts,
+                cache_path=cache_path,
                 report_progress=lambda p: self._report_progress(p * 0.1)),
             )
             self._translate_spine(
                 context=context,
-
+                cache_path=cache_path,
                 report_progress=lambda p: self._report_progress(0.1 + p * 0.8),
             )
             context.archive(translated_path)

@@ -84,10 +87,10 @@ class _Translator:
             if is_temp_workspace:
                 rmtree(working_path, ignore_errors=True)

-    def _translate_ncx(self, texts: list[str], report_progress: ProgressReporter) -> list[str]:
+    def _translate_ncx(self, texts: list[str], cache_path: Path, report_progress: ProgressReporter) -> list[str]:
         return list(_translate(
             llm=self._llm,
-            cache_path=
+            cache_path=cache_path,
             max_chunk_tokens_count=self._max_chunk_tokens_count,
             max_threads_count=1,
             target_language=self._target_language,

@@ -103,16 +106,16 @@ class _Translator:
             ),
         ))

-    def _translate_spine(self, context: ZipContext,
+    def _translate_spine(self, context: ZipContext, cache_path: Path, report_progress: ProgressReporter):
         spine_paths_iter = iter(list(context.search_spine_paths()))
-
+        spine: tuple[Path, HTMLFile] | None = None
         translated_texts: list[str] = []
         translated_count: int = 0

         for translated_text in _translate(
             llm=self._llm,
             gen_fragments_iter=lambda: _gen_fragments(context),
-            cache_path=
+            cache_path=cache_path,
             max_chunk_tokens_count=self._max_chunk_tokens_count,
             max_threads_count=self._max_threads_count,
             target_language=self._target_language,

@@ -121,32 +124,37 @@ class _Translator:
         ):
             did_touch_end = False

-            if
-
+            if spine and translated_count >= len(translated_texts):
+                spine_path, spine_file = spine
                 spine_file.write_texts(translated_texts)
-                spine_file
+                context.write_spine_file(spine_path, spine_file)
+                spine = None

-            while
+            while not spine:
                 spine_path = next(spine_paths_iter, None)
                 if spine_path is None:
+                    spine = None
                     did_touch_end = True
                     break
                 spine_file = context.read_spine_file(spine_path)
                 if spine_file.texts_length == 0:
-                    spine_file = None
                     continue
+                spine = (spine_path, spine_file)
                 translated_texts = [""] * spine_file.texts_length
                 translated_count = 0
+                break

             translated_texts[translated_count] = translated_text
             translated_count += 1

             if did_touch_end:
                 break
-            if spine_file and translated_count > 0:
-                spine_file.write_texts(translated_texts)

-
+        if spine:
+            spine_path, spine_file = spine
+            if translated_count > 0:
+                spine_file.write_texts(translated_texts)
+                context.write_spine_file(spine_path, spine_file)

 def _gen_fragments(context: ZipContext):
     for spine_path in context.search_spine_paths():
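The spine-loop refactor replaces a bare `spine_file` variable with a `(path, file)` pair so each finished file can be written back through `context.write_spine_file`, including a final partial flush after the loop. The buffering pattern itself, distilled into a self-contained sketch (`fan_out` and its list-of-lists return value are illustrative only; counts are per non-empty spine file):

```python
from typing import Iterator


def fan_out(counts: list[int], translated: Iterator[str]) -> list[list[str]]:
    # counts[i] = number of text slots in spine file i. Fill the current
    # file's slots in order and "flush" it (a stand-in for
    # context.write_spine_file) once every slot has a translation.
    flushed: list[list[str]] = []
    current: list[str] | None = None
    filled = 0
    index = 0
    for text in translated:
        if current is None:
            current = [""] * counts[index]
            index += 1
            filled = 0
        current[filled] = text
        filled += 1
        if filled == len(current):
            flushed.append(current)
            current = None
    if current is not None and filled > 0:
        flushed.append(current)  # partial flush, mirrors the final `if spine:` block
    return flushed


print(fan_out([2, 1], iter(["a", "b", "c"])))
# -> [["a", "b"], ["c"]]
```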
{epub_translator-0.0.2 → epub_translator-0.0.4}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "epub-translator"
-version = "0.0.2"
+version = "0.0.4"
 description = "Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text."
 keywords=["epub", "llm", "translation", "translator"]
 authors = [
epub_translator-0.0.2/epub_translator/translation/types.py (deleted)

@@ -1,23 +0,0 @@
-from enum import Enum
-from dataclasses import dataclass
-from resource_segmentation import Incision
-
-
-@dataclass
-class Fragment:
-    text: str
-    start_incision: Incision
-    end_incision: Incision
-
-class Language(Enum):
-    SIMPLIFIED_CHINESE = "简体中文"
-    TRADITIONAL_CHINESE = "繁体中文"
-    ENGLISH = "英语"
-    FRENCH = "法语"
-    GERMAN = "德语"
-    SPANISH = "西班牙语"
-    RUSSIAN = "俄语"
-    ITALIAN = "意大利语"
-    PORTUGUESE = "葡萄牙语"
-    JAPANESE = "日语"
-    KOREAN = "韩语"