epub-translator 0.0.3__tar.gz → 0.0.5__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (38)
  1. {epub_translator-0.0.3 → epub_translator-0.0.5}/PKG-INFO +1 -1
  2. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/llm/executor.py +4 -1
  3. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/llm/node.py +4 -0
  4. epub_translator-0.0.5/epub_translator/translation/__init__.py +2 -0
  5. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/translation/chunk.py +6 -8
  6. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/translation/store.py +2 -3
  7. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/translation/translation.py +56 -17
  8. epub_translator-0.0.5/epub_translator/translation/types.py +40 -0
  9. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/translator.py +16 -10
  10. {epub_translator-0.0.3 → epub_translator-0.0.5}/pyproject.toml +1 -1
  11. epub_translator-0.0.3/epub_translator/translation/__init__.py +0 -2
  12. epub_translator-0.0.3/epub_translator/translation/types.py +0 -23
  13. {epub_translator-0.0.3 → epub_translator-0.0.5}/LICENSE +0 -0
  14. {epub_translator-0.0.3 → epub_translator-0.0.5}/README.md +0 -0
  15. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/__init__.py +0 -0
  16. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/data/format.jinja +0 -0
  17. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/data/translate.jinja +0 -0
  18. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/epub/__init__.py +0 -0
  19. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/epub/content_parser.py +0 -0
  20. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/epub/html/__init__.py +0 -0
  21. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/epub/html/dom_operator.py +0 -0
  22. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/epub/html/empty_tags.py +0 -0
  23. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/epub/html/file.py +0 -0
  24. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/epub/html/texts_searcher.py +0 -0
  25. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/llm/__init__.py +0 -0
  26. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/llm/error.py +0 -0
  27. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/llm/increasable.py +0 -0
  28. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/template.py +0 -0
  29. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/translation/splitter.py +0 -0
  30. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/translation/utils.py +0 -0
  31. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/xml/__init__.py +0 -0
  32. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/xml/decoder.py +0 -0
  33. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/xml/encoder.py +0 -0
  34. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/xml/parser.py +0 -0
  35. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/xml/tag.py +0 -0
  36. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/xml/transform.py +0 -0
  37. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/xml/utils.py +0 -0
  38. {epub_translator-0.0.3 → epub_translator-0.0.5}/epub_translator/zip_context.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-translator
-Version: 0.0.3
+Version: 0.0.5
 Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
 License: MIT
 Keywords: epub,llm,translation,translator
@@ -38,7 +38,7 @@ class LLMExecutor:
       timeout=timeout,
     )

-  def request(self, input: LanguageModelInput, parser: Callable[[str], Any]) -> Any:
+  def request(self, input: LanguageModelInput, parser: Callable[[str], Any], max_tokens: int | None) -> Any:
     result: Any | None = None
     last_error: Exception | None = None
     did_success = False
@@ -56,6 +56,7 @@ class LLMExecutor:
         input=input,
         top_p=top_p.current,
         temperature=temperature.current,
+        max_tokens=max_tokens,
       )
       if logger is not None:
         logger.debug(f"[[Response]]:\n{response}\n")
@@ -133,12 +134,14 @@ class LLMExecutor:
     input: LanguageModelInput,
     top_p: float | None,
     temperature: float | None,
+    max_tokens: int | None,
   ):
     stream = self._model.stream(
       input=input,
       timeout=self._timeout,
       top_p=top_p,
       temperature=temperature,
+      max_tokens=max_tokens,
     )
     buffer = StringIO()
     for chunk in stream:
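
Taken together, these executor changes thread an output cap from the public request() entry point down to the underlying model.stream() call. A minimal sketch of the plumbing, assuming an injected model object with a stream(input=, timeout=, max_tokens=) generator method (everything other than the request/max_tokens shape shown in the diff is a stand-in, not the package's API):

    from io import StringIO
    from typing import Any, Callable

    class ExecutorSketch:
        """Hypothetical reduction of LLMExecutor: forward max_tokens verbatim."""

        def __init__(self, model: Any, timeout: float | None = None) -> None:
            self._model = model
            self._timeout = timeout

        def request(self, input: str, parser: Callable[[str], Any], max_tokens: int | None) -> Any:
            # None means "no explicit cap"; otherwise the model enforces the budget.
            return parser(self._stream_once(input, max_tokens))

        def _stream_once(self, input: str, max_tokens: int | None) -> str:
            buffer = StringIO()
            for chunk in self._model.stream(input=input, timeout=self._timeout, max_tokens=max_tokens):
                buffer.write(chunk)
            return buffer.getvalue()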
@@ -80,6 +80,7 @@ class LLM:
     text_tag: str,
     user_data: Element | str,
     parser: Callable[[str], R],
+    max_tokens: int | None = None,
     params: dict[str, Any] | None = None,
   ) -> R:

@@ -95,6 +96,7 @@ class LLM:
     return self._executor.request(
       input=self._create_input(template_name, user_data, params),
       parser=parse_response,
+      max_tokens=max_tokens,
     )

   def request_xml(
@@ -102,6 +104,7 @@ class LLM:
     template_name: str,
     user_data: Element | str,
     parser: Callable[[Element], R],
+    max_tokens: int | None = None,
     params: dict[str, Any] | None = None,
   ) -> R:

@@ -117,6 +120,7 @@ class LLM:
     return self._executor.request(
       input=self._create_input(template_name, user_data, params),
       parser=parse_response,
+      max_tokens=max_tokens,
     )

   def _create_input(self, template_name: str, user_data: Element | str, params: dict[str, Any]):
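
Because max_tokens defaults to None in both request() and request_xml(), existing callers keep working unchanged; a caller that wants a cap opts in per request. A hypothetical call (the user_data payload and params contents here are illustrative only):

    result = llm.request_xml(
        template_name="format",
        user_data="<request>...</request>",
        parser=lambda root: root,   # keep the parsed Element unchanged
        max_tokens=4096,            # optional output budget; omit to leave it unset
        params={"user_prompt": None},
    )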
@@ -0,0 +1,2 @@
+from .types import Fragment, Language
+from .translation import translate, ProgressReporter
@@ -2,7 +2,7 @@ from dataclasses import dataclass
 from typing import Iterator, Iterable, Generator
 from hashlib import sha512
 from ..llm import LLM
-from .types import Fragment
+from .types import Fragment, Language


 @dataclass
@@ -30,6 +30,7 @@ class ChunkRange:

 def match_fragments(
   llm: LLM,
+  target_language: Language,
   chunk_ranges_iter: Iterator[ChunkRange],
   fragments_iter: Iterator[Fragment],
 ) -> Generator[Chunk, None, None]:
@@ -44,7 +45,7 @@ def match_fragments(
     body = texts[head_length:head_length + body_length]
     tail = texts[head_length + body_length:]

-    hash = _hash_texts_list((head, body, tail))
+    hash = _hash_texts_list(target_language, (head, body, tail))
     head = _crop_extra_texts(llm, head, True, range.head_remain_tokens)
     tail = _crop_extra_texts(llm, tail, False, range.tail_remain_tokens)

@@ -88,15 +89,12 @@ def _match_range_and_texts(

   yield from matched_chunk_ranges

-def _hash_texts_list(texts_iterable: Iterable[list[str]]) -> bytes:
-  is_first = True
+def _hash_texts_list(target_language: Language, texts_iterable: Iterable[list[str]]) -> bytes:
   m = sha512()
+  m.update(target_language.value.encode("utf-8"))
   for texts in texts_iterable:
     for text in texts:
-      if is_first:
-        is_first = False
-      else:
-        m.update(b"\x00")
+      m.update(b"\x00")
       m.update(text.encode("utf-8"))
   return m.digest()

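Folding the target language into the chunk hash means a cached translation can only ever be replayed for the language it was produced in; as a side effect, the new NUL-before-every-text framing also changes every digest, so caches written by 0.0.3 are invalidated. A standalone sketch of the new keying:

    from hashlib import sha512
    from typing import Iterable

    def chunk_cache_key(language_code: str, texts_iterable: Iterable[list[str]]) -> bytes:
        # Mirrors the new _hash_texts_list: the language code is folded into the
        # digest so a cache entry can never be reused across target languages.
        m = sha512()
        m.update(language_code.encode("utf-8"))
        for texts in texts_iterable:
            for text in texts:
                m.update(b"\x00")  # NUL before every text keeps boundaries unambiguous
                m.update(text.encode("utf-8"))
        return m.digest()

    # Same texts, different language -> different key, so no stale cache hits.
    assert chunk_cache_key("zh-Hans", [["hello"]]) != chunk_cache_key("ja", [["hello"]])
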
@@ -1,7 +1,6 @@
 from shutil import rmtree
 from pathlib import Path
 from typing import Iterator
-from .utils import clean_spaces


 class Store:
@@ -13,7 +12,7 @@ class Store:
     if not file_path.exists() or not file_path.is_file():
       return None
     with file_path.open("r", encoding="utf-8") as file:
-      return list(line for line in file if line.strip())
+      return file.read().split("\n")

   def put(self, chunk_hash: bytes, lines_iter: Iterator[str]):
     file_path = self._file_path(chunk_hash)
@@ -31,7 +30,7 @@ class Store:
         is_first_line = False
       else:
         file.write("\n")
-      file.write(clean_spaces(line))
+      file.write(line)

   def _file_path(self, chunk_hash: bytes) -> Path:
     return self._directory / f"{chunk_hash.hex()}.chunk"
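
With clean_spaces() dropped from both ends, the store now round-trips lines verbatim: put() joins them with newlines and get() splits on "\n", so blank lines survive and the line count matches the source texts exactly (which the new length check in _translate_chunk below depends on). The invariant, as a one-liner check:

    # Round-trip invariant the new Store relies on: what put() writes,
    # get() must read back line-for-line, blank lines included.
    lines = ["first", "", "third"]
    assert "\n".join(lines).split("\n") == lines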
@@ -1,3 +1,4 @@
+from math import ceil
 from typing import Callable, Iterator, Generator
 from pathlib import Path
 from concurrent.futures import as_completed, ThreadPoolExecutor
@@ -6,7 +7,7 @@ from xml.etree.ElementTree import Element
 from ..llm import LLM
 from ..xml import encode_friendly

-from .types import Fragment, Language
+from .types import language_chinese_name, Fragment, Language
 from .store import Store
 from .splitter import split_into_chunks
 from .chunk import match_fragments, Chunk
@@ -46,12 +47,23 @@ def translate(
     )))
     for chunk in match_fragments(
       llm=llm,
+      target_language=target_language,
       chunk_ranges_iter=iter(chunk_ranges),
       fragments_iter=gen_fragments_iter(),
     )
   ]
+  def _generate_chunks_from_futures():
+    try:
+      for future in as_completed(futures):
+        yield future.result()
+    except Exception as err:
+      for future in futures:
+        if not future.done():
+          future.cancel()
+      raise err
+
   yield from _sort_translated_texts_by_chunk(
-    target=(f.result() for f in as_completed(futures)),
+    target=_generate_chunks_from_futures(),
     total_tokens_count=sum(chunk.tokens_count for chunk in chunk_ranges),
     report_progress=report_progress,
   )
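
The new wrapper replaces a bare generator over as_completed(): previously, if one chunk's translation raised, the exception propagated while every queued chunk kept running or still got started. Cancelling the not-yet-done futures before re-raising makes failure prompt and cheap. The same pattern in isolation (the squaring task is illustrative, not the package's code):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def drain_or_cancel(futures):
        """Yield results as they finish; on the first failure, cancel whatever
        has not started yet and re-raise instead of leaving work queued."""
        try:
            for future in as_completed(futures):
                yield future.result()
        except Exception:
            for future in futures:
                if not future.done():
                    future.cancel()  # only futures not yet running can be cancelled
            raise

    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(lambda x=x: x * x) for x in range(8)]
        print(sorted(drain_or_cancel(futures)))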
@@ -96,27 +108,40 @@ def _translate_chunk(
 ) -> list[str]:

   translated_texts: list[str] | None = None
+  source_texts = chunk.head + chunk.body + chunk.tail
   if store is not None:
     translated_texts = store.get(chunk.hash)
+    if translated_texts is not None and \
+       len(source_texts) != len(translated_texts):
+      translated_texts = None
+      print(f"Warning: Mismatched lengths in cached translation for chunk: {chunk.hash.hex()}",)

   if translated_texts is None:
-    translated_texts = _translate_texts(
-      llm=llm,
-      texts=chunk.head + chunk.body + chunk.tail,
-      target_language=target_language,
-      user_prompt=user_prompt,
-    )
-    if store is not None:
-      store.put(chunk.hash, translated_texts)
+    translated_texts = [
+      clean_spaces(text)
+      for text in _translate_texts(
+        llm=llm,
+        texts=source_texts,
+        texts_tokens=chunk.tokens_count,
+        target_language=target_language,
+        user_prompt=user_prompt,
+      )
+    ]
+    if store is not None:
+      store.put(chunk.hash, translated_texts)

   head_length = len(chunk.head)
   translated_texts = translated_texts[head_length:head_length + len(chunk.body)]

   return translated_texts

+_PLAIN_TEXT_SCALE = 2.0
+_XML_TEXT_SCALE = 2.5
+
 def _translate_texts(
   llm: LLM,
   texts: list[str],
+  texts_tokens: int,
   target_language: Language,
   user_prompt: str | None,
 ) -> list[str]:
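
The two scale constants turn the source token count into an output budget for max_tokens: a translation can legitimately run longer than its source, and the XML formatting pass needs extra headroom for tags, hence 2.5 over 2.0. The arithmetic, spelled out:

    from math import ceil

    # Output cap is a multiple of the source token count, not the count itself.
    _PLAIN_TEXT_SCALE = 2.0
    _XML_TEXT_SCALE = 2.5

    def output_budget(source_tokens: int, xml: bool = False) -> int:
        scale = _XML_TEXT_SCALE if xml else _PLAIN_TEXT_SCALE
        return ceil(source_tokens * scale)

    print(output_budget(1200))            # 2400
    print(output_budget(1200, xml=True))  # 3000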
@@ -134,8 +159,9 @@ def _translate_texts(
     text_tag="TXT",
     user_data=user_data,
     parser=lambda r: r,
+    max_tokens=ceil(texts_tokens * _PLAIN_TEXT_SCALE),
     params={
-      "target_language": target_language.value,
+      "target_language": language_chinese_name(target_language),
       "user_prompt": user_prompt,
     },
   )
@@ -154,12 +180,15 @@ def _translate_texts(
   return llm.request_xml(
     template_name="format",
     user_data=request_text,
-    params={ "target_language": target_language.value },
+    max_tokens=ceil(texts_tokens * _XML_TEXT_SCALE),
     parser=lambda r: _parse_translated_response(r, len(texts)),
+    params={
+      "target_language": language_chinese_name(target_language),
+    },
   )

 def _parse_translated_response(resp_element: Element, sources_count: int) -> list[str]:
-  translated_fragments = [""] * sources_count
+  fragments: list[str | None] = [None] * sources_count
   for fragment_element in resp_element:
     if fragment_element.text is None:
       continue
@@ -167,11 +196,21 @@ def _parse_translated_response(resp_element: Element, sources_count: int) -> lis
     if id is None:
       continue
     index = int(id) - 1
-    if index < 0 or index >= len(translated_fragments):
+    if index < 0 or index >= len(fragments):
       raise ValueError(f"invalid fragment id: {id}")
-    translated_fragments[index] = fragment_element.text.strip()
-
-  return translated_fragments
+    fragments[index] = fragment_element.text.strip()
+
+  # Sometimes the LLM fuses several fragments into one. Shift each translation
+  # as far toward the end as possible and leave the blank slots in front, so a
+  # single large translated block appears to cover its several short source
+  # fragments, which reads better.
+  for i in range(len(fragments)):
+    fragment = fragments[i]
+    if fragment is not None and i < len(fragments) - 1:
+      next_fragment = fragments[i + 1]
+      if next_fragment is None:
+        fragments[i] = None
+        fragments[i + 1] = fragment
+
+  return [f or "" for f in fragments]

 def _normalize_user_input(user_lines: list[str]) -> str | None:
   empty_lines_count: int = 0
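
Because the sweep runs left to right, a translation moved into slot i+1 is re-examined on the next iteration, so a single pass drifts each merged translation rightward across its whole run of empty slots. A distilled demo of the same loop:

    def settle_fragments(fragments: list[str | None]) -> list[str]:
        # Same pass as _parse_translated_response: each translation drifts
        # rightward across empty slots in one left-to-right sweep.
        for i in range(len(fragments)):
            fragment = fragments[i]
            if fragment is not None and i < len(fragments) - 1:
                if fragments[i + 1] is None:
                    fragments[i] = None
                    fragments[i + 1] = fragment
        return [f or "" for f in fragments]

    # The LLM merged three source fragments into one translation tagged id=1:
    print(settle_fragments(["merged translation", None, None]))
    # -> ['', '', 'merged translation']  (blanks lead, the big block trails)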
@@ -0,0 +1,40 @@
+from enum import Enum
+from dataclasses import dataclass
+from resource_segmentation import Incision
+
+
+@dataclass
+class Fragment:
+  text: str
+  start_incision: Incision
+  end_incision: Incision
+
+class Language(Enum):
+  SIMPLIFIED_CHINESE = "zh-Hans"
+  TRADITIONAL_CHINESE = "zh-Hant"
+  ENGLISH = "en"
+  FRENCH = "fr"
+  GERMAN = "de"
+  SPANISH = "es"
+  RUSSIAN = "ru"
+  ITALIAN = "it"
+  PORTUGUESE = "pt"
+  JAPANESE = "ja"
+  KOREAN = "ko"
+
+_LANGUAGE_NAMES = {
+  Language.SIMPLIFIED_CHINESE: "简体中文",
+  Language.TRADITIONAL_CHINESE: "繁体中文",
+  Language.ENGLISH: "英语",
+  Language.FRENCH: "法语",
+  Language.GERMAN: "德语",
+  Language.SPANISH: "西班牙语",
+  Language.RUSSIAN: "俄语",
+  Language.ITALIAN: "意大利语",
+  Language.PORTUGUESE: "葡萄牙语",
+  Language.JAPANESE: "日语",
+  Language.KOREAN: "韩语",
+}
+
+def language_chinese_name(language: Language) -> str:
+  return _LANGUAGE_NAMES[language]
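
The enum values change from Chinese display names to BCP 47-style codes, with the display names moved into a lookup table: the stable code is what ends up in chunk hashes and persisted state, while prompts receive the Chinese name via language_chinese_name(). For example:

    # assuming: from epub_translator.translation.types import Language, language_chinese_name
    lang = Language("zh-Hans")                  # stable code, safe for hashing/config
    print(lang is Language.SIMPLIFIED_CHINESE)  # True
    print(lang.value)                           # 'zh-Hans'  -> goes into the cache key
    print(language_chinese_name(lang))          # '简体中文' -> goes into the prompt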
@@ -2,11 +2,12 @@ from os import PathLike
 from pathlib import Path
 from tempfile import mkdtemp
 from shutil import rmtree
+from resource_segmentation import Incision

 from .llm import LLM
 from .epub import HTMLFile
 from .zip_context import ZipContext
-from .translation import translate as _translate, Fragment, Incision, Language, ProgressReporter
+from .translation import translate as _translate, Fragment, Language, ProgressReporter


 def translate(
@@ -107,7 +108,7 @@ class _Translator:

   def _translate_spine(self, context: ZipContext, cache_path: Path, report_progress: ProgressReporter):
     spine_paths_iter = iter(list(context.search_spine_paths()))
-    spine_file: HTMLFile | None = None
+    spine: tuple[Path, HTMLFile] | None = None
     translated_texts: list[str] = []
     translated_count: int = 0

@@ -123,32 +124,37 @@ class _Translator:
     ):
       did_touch_end = False

-      if spine_file is not None and \
-        translated_count >= len(translated_texts):
+      if spine and translated_count >= len(translated_texts):
+        spine_path, spine_file = spine
         spine_file.write_texts(translated_texts)
-        spine_file = None
+        context.write_spine_file(spine_path, spine_file)
+        spine = None

-      while spine_file is None:
+      while not spine:
         spine_path = next(spine_paths_iter, None)
         if spine_path is None:
+          spine = None
           did_touch_end = True
           break
         spine_file = context.read_spine_file(spine_path)
         if spine_file.texts_length == 0:
-          spine_file = None
           continue
+        spine = (spine_path, spine_file)
         translated_texts = [""] * spine_file.texts_length
         translated_count = 0
+        break

       translated_texts[translated_count] = translated_text
       translated_count += 1

       if did_touch_end:
         break
-    if spine_file and translated_count > 0:
-      spine_file.write_texts(translated_texts)

-      context.write_spine_file(spine_path, spine_file)
+    if spine:
+      spine_path, spine_file = spine
+      if translated_count > 0:
+        spine_file.write_texts(translated_texts)
+      context.write_spine_file(spine_path, spine_file)

 def _gen_fragments(context: ZipContext):
   for spine_path in context.search_spine_paths():
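
Pairing each spine path with its parsed HTMLFile in one tuple removes the bug where the final write could use whatever spine_path the exhausted iterator last produced, and the trailing if spine: block now flushes the last file even when the stream of translated texts ends mid-file. The control flow, reduced to a neutral sketch (the bucket names are hypothetical, not the package's API):

    from typing import Iterable, Iterator

    def fill_buckets(sizes: Iterator[int], values: Iterable[str]) -> list[list[str]]:
        """Distribute a flat value stream into fixed-size buckets, flushing each
        bucket as soon as it is full and the partial last bucket at the end --
        the same shape as _translate_spine's (path, file) state."""
        done: list[list[str]] = []
        current: tuple[int, list[str]] | None = None  # (capacity, filled) pair
        for value in values:
            if current and len(current[1]) >= current[0]:
                done.append(current[1])  # full bucket: flush before refilling
                current = None
            while not current:
                size = next(sizes, None)
                if size is None:
                    return done          # ran out of buckets
                if size == 0:
                    continue             # skip empty buckets, like spines with no text
                current = (size, [])
            current[1].append(value)
        if current:                      # final flush, even if only partially filled
            done.append(current[1])
        return done

    print(fill_buckets(iter([2, 0, 1]), ["a", "b", "c"]))  # [['a', 'b'], ['c']]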
@@ -1,6 +1,6 @@
 [project]
 name = "epub-translator"
-version = "0.0.3"
+version = "0.0.5"
 description = "Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text."
 keywords=["epub", "llm", "translation", "translator"]
 authors = [
@@ -1,2 +0,0 @@
-from .types import *
-from .translation import translate, ProgressReporter
@@ -1,23 +0,0 @@
-from enum import Enum
-from dataclasses import dataclass
-from resource_segmentation import Incision
-
-
-@dataclass
-class Fragment:
-  text: str
-  start_incision: Incision
-  end_incision: Incision
-
-class Language(Enum):
-  SIMPLIFIED_CHINESE = "简体中文"
-  TRADITIONAL_CHINESE = "繁体中文"
-  ENGLISH = "英语"
-  FRENCH = "法语"
-  GERMAN = "德语"
-  SPANISH = "西班牙语"
-  RUSSIAN = "俄语"
-  ITALIAN = "意大利语"
-  PORTUGUESE = "葡萄牙语"
-  JAPANESE = "日语"
-  KOREAN = "韩语"