epub-translator 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,7 +38,7 @@ class LLMExecutor:
38
38
  timeout=timeout,
39
39
  )
40
40
 
41
- def request(self, input: LanguageModelInput, parser: Callable[[str], Any]) -> Any:
41
+ def request(self, input: LanguageModelInput, parser: Callable[[str], Any], max_tokens: int | None) -> Any:
42
42
  result: Any | None = None
43
43
  last_error: Exception | None = None
44
44
  did_success = False
@@ -56,6 +56,7 @@ class LLMExecutor:
56
56
  input=input,
57
57
  top_p=top_p.current,
58
58
  temperature=temperature.current,
59
+ max_tokens=max_tokens,
59
60
  )
60
61
  if logger is not None:
61
62
  logger.debug(f"[[Response]]:\n{response}\n")
@@ -133,12 +134,14 @@ class LLMExecutor:
133
134
  input: LanguageModelInput,
134
135
  top_p: float | None,
135
136
  temperature: float | None,
137
+ max_tokens: int | None,
136
138
  ):
137
139
  stream = self._model.stream(
138
140
  input=input,
139
141
  timeout=self._timeout,
140
142
  top_p=top_p,
141
143
  temperature=temperature,
144
+ max_tokens=max_tokens,
142
145
  )
143
146
  buffer = StringIO()
144
147
  for chunk in stream:
@@ -80,6 +80,7 @@ class LLM:
80
80
  text_tag: str,
81
81
  user_data: Element | str,
82
82
  parser: Callable[[str], R],
83
+ max_tokens: int | None = None,
83
84
  params: dict[str, Any] | None = None,
84
85
  ) -> R:
85
86
 
@@ -95,6 +96,7 @@ class LLM:
95
96
  return self._executor.request(
96
97
  input=self._create_input(template_name, user_data, params),
97
98
  parser=parse_response,
99
+ max_tokens=max_tokens,
98
100
  )
99
101
 
100
102
  def request_xml(
@@ -102,6 +104,7 @@ class LLM:
102
104
  template_name: str,
103
105
  user_data: Element | str,
104
106
  parser: Callable[[Element], R],
107
+ max_tokens: int | None = None,
105
108
  params: dict[str, Any] | None = None,
106
109
  ) -> R:
107
110
 
@@ -117,6 +120,7 @@ class LLM:
117
120
  return self._executor.request(
118
121
  input=self._create_input(template_name, user_data, params),
119
122
  parser=parse_response,
123
+ max_tokens=max_tokens,
120
124
  )
121
125
 
122
126
  def _create_input(self, template_name: str, user_data: Element | str, params: dict[str, Any]):
@@ -1,2 +1,2 @@
1
- from .types import *
1
+ from .types import Fragment, Language
2
2
  from .translation import translate, ProgressReporter
@@ -2,7 +2,7 @@ from dataclasses import dataclass
2
2
  from typing import Iterator, Iterable, Generator
3
3
  from hashlib import sha512
4
4
  from ..llm import LLM
5
- from .types import Fragment
5
+ from .types import Fragment, Language
6
6
 
7
7
 
8
8
  @dataclass
@@ -30,6 +30,7 @@ class ChunkRange:
30
30
 
31
31
  def match_fragments(
32
32
  llm: LLM,
33
+ target_language: Language,
33
34
  chunk_ranges_iter: Iterator[ChunkRange],
34
35
  fragments_iter: Iterator[Fragment],
35
36
  ) -> Generator[Chunk, None, None]:
@@ -44,7 +45,7 @@ def match_fragments(
44
45
  body = texts[head_length:head_length + body_length]
45
46
  tail = texts[head_length + body_length:]
46
47
 
47
- hash = _hash_texts_list((head, body, tail))
48
+ hash = _hash_texts_list(target_language, (head, body, tail))
48
49
  head = _crop_extra_texts(llm, head, True, range.head_remain_tokens)
49
50
  tail = _crop_extra_texts(llm, tail, False, range.tail_remain_tokens)
50
51
 
@@ -88,15 +89,12 @@ def _match_range_and_texts(
88
89
 
89
90
  yield from matched_chunk_ranges
90
91
 
91
- def _hash_texts_list(texts_iterable: Iterable[list[str]]) -> bytes:
92
- is_first = True
92
+ def _hash_texts_list(target_language: Language, texts_iterable: Iterable[list[str]]) -> bytes:
93
93
  m = sha512()
94
+ m.update(target_language.value.encode("utf-8"))
94
95
  for texts in texts_iterable:
95
96
  for text in texts:
96
- if is_first:
97
- is_first = False
98
- else:
99
- m.update(b"\x00")
97
+ m.update(b"\x00")
100
98
  m.update(text.encode("utf-8"))
101
99
  return m.digest()
102
100
 
@@ -1,7 +1,6 @@
1
1
  from shutil import rmtree
2
2
  from pathlib import Path
3
3
  from typing import Iterator
4
- from .utils import clean_spaces
5
4
 
6
5
 
7
6
  class Store:
@@ -13,7 +12,7 @@ class Store:
13
12
  if not file_path.exists() or not file_path.is_file():
14
13
  return None
15
14
  with file_path.open("r", encoding="utf-8") as file:
16
- return list(line for line in file if line.strip())
15
+ return file.read().split("\n")
17
16
 
18
17
  def put(self, chunk_hash: bytes, lines_iter: Iterator[str]):
19
18
  file_path = self._file_path(chunk_hash)
@@ -31,7 +30,7 @@ class Store:
31
30
  is_first_line = False
32
31
  else:
33
32
  file.write("\n")
34
- file.write(clean_spaces(line))
33
+ file.write(line)
35
34
 
36
35
  def _file_path(self, chunk_hash: bytes) -> Path:
37
36
  return self._directory / f"{chunk_hash.hex()}.chunk"
@@ -6,7 +6,7 @@ from xml.etree.ElementTree import Element
6
6
  from ..llm import LLM
7
7
  from ..xml import encode_friendly
8
8
 
9
- from .types import Fragment, Language
9
+ from .types import language_chinese_name, Fragment, Language
10
10
  from .store import Store
11
11
  from .splitter import split_into_chunks
12
12
  from .chunk import match_fragments, Chunk
@@ -46,12 +46,23 @@ def translate(
46
46
  )))
47
47
  for chunk in match_fragments(
48
48
  llm=llm,
49
+ target_language=target_language,
49
50
  chunk_ranges_iter=iter(chunk_ranges),
50
51
  fragments_iter=gen_fragments_iter(),
51
52
  )
52
53
  ]
54
def _generate_chunks_from_futures():
    """Yield the result of each future in ``futures`` as it completes.

    ``futures`` and ``as_completed`` are taken from the enclosing scope.
    If iterating or fetching a result raises, every future is asked to
    cancel before the exception propagates to the consumer.

    Yields:
        Whatever ``future.result()`` returns, in completion order.

    Raises:
        Exception: The original error, re-raised unchanged after the
            cancellation pass.
    """
    try:
        for future in as_completed(futures):
            yield future.result()
    except Exception:
        # NOTE(review): assumes these are concurrent.futures.Future objects,
        # whose cancel() simply returns False once a future is running or
        # finished, so no done() pre-check is needed. Confirm against the
        # code that builds ``futures``.
        for future in futures:
            future.cancel()
        # A bare raise re-raises the active exception with its traceback.
        raise
63
+
53
64
  yield from _sort_translated_texts_by_chunk(
54
- target=(f.result() for f in as_completed(futures)),
65
+ target=_generate_chunks_from_futures(),
55
66
  total_tokens_count=sum(chunk.tokens_count for chunk in chunk_ranges),
56
67
  report_progress=report_progress,
57
68
  )
@@ -96,27 +107,40 @@ def _translate_chunk(
96
107
  ) -> list[str]:
97
108
 
98
109
  translated_texts: list[str] | None = None
110
+ source_texts = chunk.head + chunk.body + chunk.tail
99
111
  if store is not None:
100
112
  translated_texts = store.get(chunk.hash)
113
+ if translated_texts is not None and \
114
+ len(source_texts) != len(translated_texts):
115
+ translated_texts = None
116
+ print(f"Warning: Mismatched lengths in cached translation for chunk: {chunk.hash.hex()}",)
101
117
 
102
118
  if translated_texts is None:
103
- translated_texts = _translate_texts(
104
- llm=llm,
105
- texts=chunk.head + chunk.body + chunk.tail,
106
- target_language=target_language,
107
- user_prompt=user_prompt,
108
- )
109
- if store is not None:
110
- store.put(chunk.hash, translated_texts)
119
+ translated_texts = [
120
+ clean_spaces(text)
121
+ for text in _translate_texts(
122
+ llm=llm,
123
+ texts=source_texts,
124
+ texts_tokens=chunk.tokens_count,
125
+ target_language=target_language,
126
+ user_prompt=user_prompt,
127
+ )
128
+ ]
129
+ if store is not None:
130
+ store.put(chunk.hash, translated_texts)
111
131
 
112
132
  head_length = len(chunk.head)
113
133
  translated_texts = translated_texts[head_length:head_length + len(chunk.body)]
114
134
 
115
135
  return translated_texts
116
136
 
137
+ _PLAIN_TEXT_SCALE = 2.0
138
+ _XML_TEXT_SCALE = 2.5
139
+
117
140
  def _translate_texts(
118
141
  llm: LLM,
119
142
  texts: list[str],
143
+ texts_tokens: int,
120
144
  target_language: Language,
121
145
  user_prompt: str | None,
122
146
  ) -> list[str]:
@@ -134,8 +158,9 @@ def _translate_texts(
134
158
  text_tag="TXT",
135
159
  user_data=user_data,
136
160
  parser=lambda r: r,
161
+ max_tokens=texts_tokens * _PLAIN_TEXT_SCALE,
137
162
  params={
138
- "target_language": target_language.value,
163
+ "target_language": language_chinese_name(target_language),
139
164
  "user_prompt": user_prompt,
140
165
  },
141
166
  )
@@ -154,12 +179,15 @@ def _translate_texts(
154
179
  return llm.request_xml(
155
180
  template_name="format",
156
181
  user_data=request_text,
157
- params={ "target_language": target_language.value },
182
+ max_tokens=texts_tokens * _XML_TEXT_SCALE,
158
183
  parser=lambda r: _parse_translated_response(r, len(texts)),
184
+ params={
185
+ "target_language": language_chinese_name(target_language),
186
+ },
159
187
  )
160
188
 
161
189
  def _parse_translated_response(resp_element: Element, sources_count: int) -> list[str]:
162
- translated_fragments = [""] * sources_count
190
+ fragments: list[str | None] = [None] * sources_count
163
191
  for fragment_element in resp_element:
164
192
  if fragment_element.text is None:
165
193
  continue
@@ -167,11 +195,21 @@ def _parse_translated_response(resp_element: Element, sources_count: int) -> lis
167
195
  if id is None:
168
196
  continue
169
197
  index = int(id) - 1
170
- if index < 0 or index >= len(translated_fragments):
198
+ if index < 0 or index >= len(fragments):
171
199
  raise ValueError(f"invalid fragment id: {id}")
172
- translated_fragments[index] = fragment_element.text.strip()
173
-
174
- return translated_fragments
200
+ fragments[index] = fragment_element.text.strip()
201
+
202
+ # 有时 LLM 会将多段融合在一起,这里尽可能让译文靠后,将空白段留在前面。
203
+ # 这样看起来一大段的译文对应若干小段原文,观感更好。
204
+ for i in range(len(fragments)):
205
+ fragment = fragments[i]
206
+ if fragment is not None and i < len(fragments) - 1:
207
+ next_fragment = fragments[i + 1]
208
+ if next_fragment is None:
209
+ fragments[i] = None
210
+ fragments[i + 1] = fragment
211
+
212
+ return [f or "" for f in fragments]
175
213
 
176
214
  def _normalize_user_input(user_lines: list[str]) -> str | None:
177
215
  empty_lines_count: int = 0
@@ -10,14 +10,40 @@ class Fragment:
10
10
  end_incision: Incision
11
11
 
12
12
class Language(Enum):
    """Languages a book can be translated into.

    Each member's value is a short language code (for example ``"en"`` or
    ``"zh-Hans"``).
    """

    SIMPLIFIED_CHINESE = "zh-Hans"
    TRADITIONAL_CHINESE = "zh-Hant"
    ENGLISH = "en"
    FRENCH = "fr"
    GERMAN = "de"
    SPANISH = "es"
    RUSSIAN = "ru"
    ITALIAN = "it"
    PORTUGUESE = "pt"
    JAPANESE = "ja"
    KOREAN = "ko"


# Chinese display name of every Language member.
_LANGUAGE_CHINESE_NAMES = {
    Language.SIMPLIFIED_CHINESE: "简体中文",
    Language.TRADITIONAL_CHINESE: "繁体中文",
    Language.ENGLISH: "英语",
    Language.FRENCH: "法语",
    Language.GERMAN: "德语",
    Language.SPANISH: "西班牙语",
    Language.RUSSIAN: "俄语",
    Language.ITALIAN: "意大利语",
    Language.PORTUGUESE: "葡萄牙语",
    Language.JAPANESE: "日语",
    Language.KOREAN: "韩语",
}


def language_chinese_name(language: Language) -> str:
    """Return the Chinese name of ``language``.

    Args:
        language: The language to name.

    Returns:
        The Chinese name, e.g. ``"英语"`` for ``Language.ENGLISH``.

    Raises:
        ValueError: If ``language`` is not a known ``Language`` member.
    """
    # The isinstance guard keeps non-Language (even unhashable) arguments on
    # the ValueError path instead of failing inside the dict lookup.
    if isinstance(language, Language) and language in _LANGUAGE_CHINESE_NAMES:
        return _LANGUAGE_CHINESE_NAMES[language]
    raise ValueError(f"Unknown language: {language}")
@@ -2,11 +2,12 @@ from os import PathLike
2
2
  from pathlib import Path
3
3
  from tempfile import mkdtemp
4
4
  from shutil import rmtree
5
+ from resource_segmentation import Incision
5
6
 
6
7
  from .llm import LLM
7
8
  from .epub import HTMLFile
8
9
  from .zip_context import ZipContext
9
- from .translation import translate as _translate, Fragment, Incision, Language, ProgressReporter
10
+ from .translation import translate as _translate, Fragment, Language, ProgressReporter
10
11
 
11
12
 
12
13
  def translate(
@@ -107,7 +108,7 @@ class _Translator:
107
108
 
108
109
  def _translate_spine(self, context: ZipContext, cache_path: Path, report_progress: ProgressReporter):
109
110
  spine_paths_iter = iter(list(context.search_spine_paths()))
110
- spine_file: HTMLFile | None = None
111
+ spine: tuple[Path, HTMLFile] | None = None
111
112
  translated_texts: list[str] = []
112
113
  translated_count: int = 0
113
114
 
@@ -123,32 +124,37 @@ class _Translator:
123
124
  ):
124
125
  did_touch_end = False
125
126
 
126
- if spine_file is not None and \
127
- translated_count >= len(translated_texts):
127
+ if spine and translated_count >= len(translated_texts):
128
+ spine_path, spine_file = spine
128
129
  spine_file.write_texts(translated_texts)
129
- spine_file = None
130
+ context.write_spine_file(spine_path, spine_file)
131
+ spine = None
130
132
 
131
- while spine_file is None:
133
+ while not spine:
132
134
  spine_path = next(spine_paths_iter, None)
133
135
  if spine_path is None:
136
+ spine = None
134
137
  did_touch_end = True
135
138
  break
136
139
  spine_file = context.read_spine_file(spine_path)
137
140
  if spine_file.texts_length == 0:
138
- spine_file = None
139
141
  continue
142
+ spine = (spine_path, spine_file)
140
143
  translated_texts = [""] * spine_file.texts_length
141
144
  translated_count = 0
145
+ break
142
146
 
143
147
  translated_texts[translated_count] = translated_text
144
148
  translated_count += 1
145
149
 
146
150
  if did_touch_end:
147
151
  break
148
- if spine_file and translated_count > 0:
149
- spine_file.write_texts(translated_texts)
150
152
 
151
- context.write_spine_file(spine_path, spine_file)
153
+ if spine:
154
+ spine_path, spine_file = spine
155
+ if translated_count > 0:
156
+ spine_file.write_texts(translated_texts)
157
+ context.write_spine_file(spine_path, spine_file)
152
158
 
153
159
  def _gen_fragments(context: ZipContext):
154
160
  for spine_path in context.search_spine_paths():
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: epub-translator
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
5
5
  License: MIT
6
6
  Keywords: epub,llm,translation,translator
@@ -10,18 +10,18 @@ epub_translator/epub/html/file.py,sha256=AqUV-Tmptk5J2EYmw3oRVsLjGSqEMNz5rItnoRb
10
10
  epub_translator/epub/html/texts_searcher.py,sha256=vamO99pki6_sX2PeKCJk7mPwHdApZq1sOgSYDTPckx8,1376
11
11
  epub_translator/llm/__init__.py,sha256=wMBWLgh5iLNQBioniSOmWC83NS7RLM41hIs1V1uZiWI,21
12
12
  epub_translator/llm/error.py,sha256=fG0A3z69YoSNu0MNVWVFMtHCB_4fpOvAEb0Kajn9OHc,1401
13
- epub_translator/llm/executor.py,sha256=vwHqtlvCDHjDXLcvvKstlcQ5MfAGNPz1RKbq8W6WwKs,4378
13
+ epub_translator/llm/executor.py,sha256=Z8mpsTAEEfG80E43Pv5VJJzaDKrrClyUhr8GFA2yVxA,4498
14
14
  epub_translator/llm/increasable.py,sha256=Dpu5z4JK5h1OtLorZgsOAdRFeTH2LOkdroasgmCWAIo,1136
15
- epub_translator/llm/node.py,sha256=IKgdWoBwiejHOJ7akv8AiXUpyFlv2U4fGllt7ZLE3M0,5970
15
+ epub_translator/llm/node.py,sha256=qdkBY06OOt0mq8tnLgMJZS5dc3WFColzpeHmyWLE7KU,6104
16
16
  epub_translator/template.py,sha256=GdV3QnypProKFCMH1kBNfdt6wiShygP_-xGnE5EOUwU,1460
17
- epub_translator/translation/__init__.py,sha256=mudXLDVSIG0XTLoHUIos0-wtQCnL9ZreuHsTHcVKjnE,73
18
- epub_translator/translation/chunk.py,sha256=obrkx_yCeGMeikinfIx0NRvMo2kQBwXVbCdJbeT-ERA,3576
17
+ epub_translator/translation/__init__.py,sha256=QvHULHbBc7NKDlV6d0hyXAFXcAWI1tctsubzJD89tvw,90
18
+ epub_translator/translation/chunk.py,sha256=ERIDvhM7kZB_ZqWGw4UQfKgjIHn7TDWsxa-RuTpxRs0,3639
19
19
  epub_translator/translation/splitter.py,sha256=xOaP1p3lqY95CR0vDXdeGUMHYObiqs3y093EUAxJ-jI,2676
20
- epub_translator/translation/store.py,sha256=1FmksPAUj0mt3tN8Jdb_L1ovaI1p_5OhTWgxbIDl0SI,1133
21
- epub_translator/translation/translation.py,sha256=_qiw6s_z_Tv4VmIP1U-_YhDYNiKEDNfigHIIGKa41fU,5734
22
- epub_translator/translation/types.py,sha256=vDW5bVqYwngW_YUgf0SgfZ5zIFWUxcbBGO1U9Dsxc0o,499
20
+ epub_translator/translation/store.py,sha256=4sR3DYZuU56IaNcw3xIMf8ZQP19kcoP5MZUaM8j4gxw,1067
21
+ epub_translator/translation/translation.py,sha256=1ETNgP1zDEbJnUEPaMIWbw1vlh2jDVHmFEJ4fqDe8Ic,7078
22
+ epub_translator/translation/types.py,sha256=OUxqgdyvrDEUFz21b0tv28D_oIqrP77Yv_xKAoXI-7c,1231
23
23
  epub_translator/translation/utils.py,sha256=G6Gqq6mot3lgFA-jqUD0UqtDS0GC1wrb9DnK7rTxJNs,223
24
- epub_translator/translator.py,sha256=oh7PdCijUSGU6f72hzc6doJdMWnAqg6zHRFIj3aeTjc,5332
24
+ epub_translator/translator.py,sha256=U0ZBBpyRNdDEvrAM7AA3b3TatdUwlhM_MSTOt5Bclsw,5520
25
25
  epub_translator/xml/__init__.py,sha256=o2_qwUYU_MUcyfmfKkiOQ-cKUQyl4PiRL8YHVzCTAZU,106
26
26
  epub_translator/xml/decoder.py,sha256=UlqgmEKQDzxt3lvBeNGHgZP6jznmnq_1HLJuAe5X0C4,2181
27
27
  epub_translator/xml/encoder.py,sha256=p4A7GRSOM2i0WOh1lLtEdTTg2gXSQrxDdzMgUqbiV18,2428
@@ -30,7 +30,7 @@ epub_translator/xml/tag.py,sha256=QLZImF0PtYyiASI7swrB8DL_qUwcYpU6cL68jEXDnvg,23
30
30
  epub_translator/xml/transform.py,sha256=vS_a4d_o2Qqf9B6k2CovQVLUknp6TyUi3FyLOu21Vio,1126
31
31
  epub_translator/xml/utils.py,sha256=KDNGWHwaIiFKS27sjZF0e-bBSjeTxzceae_aeuj4wzI,384
32
32
  epub_translator/zip_context.py,sha256=7_05kycmADb4-vxHkw_DX__vkKOxT4zo9pr2a8F4L_U,2409
33
- epub_translator-0.0.3.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
34
- epub_translator-0.0.3.dist-info/METADATA,sha256=HuD7ogzPi96e8ZKjAJyTARhSeR4xmu3vZt0Q3Vmg0Hk,2342
35
- epub_translator-0.0.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
36
- epub_translator-0.0.3.dist-info/RECORD,,
33
+ epub_translator-0.0.4.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
34
+ epub_translator-0.0.4.dist-info/METADATA,sha256=rM-6yy977tIiJvpAXHCJYFGYTgjTuOxKLJhbSbYwJNs,2342
35
+ epub_translator-0.0.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
36
+ epub_translator-0.0.4.dist-info/RECORD,,